# Process First Pass Results

### Load Libraries and Datasets

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import os
import sys
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

sys.path.append("../")
from data_generation_scripts.utils import check_rate_limit, check_add_orgs, check_add_repos, check_add_users, check_for_joins_in_older_queries, check_return_error_file
from data_generation_scripts.generate_user_repos_interactions import get_user_repo_activities
from data_generation_scripts.generate_translations import check_detect_language

Once we have run the `ProcessDHRepos`, `ProcessDHUsers`, and `ProcessDHOrgs` notebooks, we can start to expand our dataset. This first-pass expansion adds the following information to our dataset:

- owners of `initial_core_repos` will be added to `core_users` and `core_orgs`
- repos of `core_users` and `core_orgs` will be added to `core_repos`

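To avoid adding too many users or repos, we apply a threshold: only accounts whose `public_repos` count falls between the 25th and 75th percentile are used to expand the repos.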

In [3]:
# Entity tables (one row per user / repo / org).
user_df = pd.read_csv("../data/entity_files/users_dataset.csv")
repo_df = pd.read_csv(
    "../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,
)
org_df = pd.read_csv(
    "../data/entity_files/orgs_dataset.csv",
    low_memory=False,
)

# Join tables linking search queries to the repos and users they returned.
search_queries_repo_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"
)
search_queries_user_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"
)

In [25]:
# Initial core sets; each is read from its own CSV under ../data/derived_files/.
initial_core_users = pd.read_csv(
    "../data/derived_files/initial_core_users.csv"
)
initial_core_repos = pd.read_csv(
    "../data/derived_files/initial_core_repos.csv"
)
initial_core_orgs = pd.read_csv(
    "../data/derived_files/initial_core_orgs.csv"
)

### Explore Potential New Materials

In [26]:
# Repo owners returned by the search queries that are absent from the entity tables,
# split by the owner type recorded on the repo row.
missing_users = search_queries_repo_join_df[
    (~search_queries_repo_join_df['owner.login'].isin(user_df.login))
    & (search_queries_repo_join_df['owner.type'] == 'User')
]
missing_orgs = search_queries_repo_join_df[
    (~search_queries_repo_join_df['owner.login'].isin(org_df.login))
    & (search_queries_repo_join_df['owner.type'] == 'Organization')
]
len(missing_users), len(missing_orgs)


(0, 3)

In [27]:
# Reduce the missing users to the (login, type, url) columns built here.
missing_users = (
    missing_users[['owner.login']]
    .rename(columns={'owner.login': 'login'})
    .assign(type='User')
)
missing_users['url'] = missing_users.login.apply(
    lambda x: f"https://api.github.com/users/{x}"
)

In [28]:
# Reduce the missing orgs to the (login, type, url) columns built here.
missing_orgs = (
    missing_orgs[['owner.login']]
    .rename(columns={'owner.login': 'login'})
    .assign(type='Organization')
)
missing_orgs['url'] = missing_orgs.login.apply(
    lambda x: f"https://api.github.com/orgs/{x}"
)

In [29]:
# Only call the project helpers when there is something to add; each call's return
# value replaces the corresponding in-memory entity table.
if not missing_orgs.empty:
    org_df = check_add_orgs(
        missing_orgs, '../data/entity_files/orgs_dataset.csv', True, False
    )
if not missing_users.empty:
    user_df = check_add_users(
        missing_users, '../data/entity_files/users_dataset.csv', True, False
    )

Cleaning Orgs: 100%|██████████| 3/3 [00:02<00:00,  1.01it/s]

                       file_name  file_size  \
27  repos_dataset_2022_11_04.csv   39255456   
9   users_dataset_2023_06_01.csv   25190404   

                            directory       date  
27  ../data/older_files/entity_files/ 2022-11-04  
9   ../data/older_files/entity_files/ 2023-06-01  





In [30]:
# Accounts in `user_df` that own a repo from the search queries and are not already
# initial core users.
new_owner_accounts = user_df[
    user_df.login.isin(search_queries_repo_join_df['owner.login'])
    & ~user_df.login.isin(initial_core_users.login)
]

# Split by account type; organizations additionally exclude the initial core orgs.
expanded_core_orgs = new_owner_accounts[
    (new_owner_accounts['type'] == 'Organization')
    & ~new_owner_accounts.login.isin(initial_core_orgs.login)
]
expanded_core_users = new_owner_accounts[new_owner_accounts['type'] == 'User']

len(expanded_core_orgs), len(expanded_core_users), len(initial_core_orgs), len(initial_core_users)

(244, 1531, 190, 736)

In [31]:
# Columns the quantile filter is applied to. Currently disabled candidates:
# 'public_gists', 'followers', 'following', 'star_count'.
filter_columns = ['public_repos']

In [32]:
filtered_initial_core_users = initial_core_users.copy()
filtered_initial_core_orgs = initial_core_orgs.copy()

# Totals before filtering.
print(filtered_initial_core_users.public_repos.sum())
print(filtered_initial_core_orgs.public_repos.sum())

# Keep only rows whose value lies between the 25th and 75th percentile (bounds included)
# of the frame as it stands when the column is processed.
for col in filter_columns:
    filtered_initial_core_users = filtered_initial_core_users[
        filtered_initial_core_users[col].between(
            filtered_initial_core_users[col].quantile(q=.25),
            filtered_initial_core_users[col].quantile(q=.75),
        )
    ]
    filtered_initial_core_orgs = filtered_initial_core_orgs[
        filtered_initial_core_orgs[col].between(
            filtered_initial_core_orgs[col].quantile(q=.25),
            filtered_initial_core_orgs[col].quantile(q=.75),
        )
    ]

# Totals after filtering.
print(filtered_initial_core_users.public_repos.sum(), filtered_initial_core_orgs.public_repos.sum())

8878.0
3134.0
1932.0 594.0


In [33]:
# Stack the filtered users and the filtered orgs into one frame of accounts.
filtered_users = pd.concat([filtered_initial_core_users, filtered_initial_core_orgs])

In [34]:
# Number of accounts (users and orgs combined) kept by the quantile filter.
len(filtered_users)

581

In [35]:
# For every filtered account, compare the number of repos already present in `repo_df`
# with the `public_repos` count stored on the account row, and total the shortfall.

# Count repos per owner once instead of rescanning `repo_df` for every account.
repo_counts_by_owner = repo_df['owner.login'].value_counts()

total_missing_repos = 0
for _, row in filtered_users.iterrows():
    reported_repos = row['public_repos']
    if pd.isna(reported_repos):
        # A missing count cannot be compared; NaN always compares unequal and
        # adding it would turn the running total into NaN.
        continue
    existing_repo_count = int(repo_counts_by_owner.get(row['login'], 0))
    if existing_repo_count != reported_repos:
        print(f"For user {row['login']} there are {existing_repo_count} repos but they have {reported_repos} public repos")
        total_missing_repos += reported_repos - existing_repo_count

For user lisateichmann there are 0 repos but they have 6.0 public repos
For user yueyue4359 there are 0 repos but they have 5.0 public repos
For user 2enyoasamoah there are 0 repos but they have 4.0 public repos
For user hvm-uu there are 0 repos but they have 1.0 public repos
For user imlabormitlea-code there are 0 repos but they have 2.0 public repos
For user RemoGrillo there are 1 repos but they have 4.0 public repos
For user kfitz there are 0 repos but they have 7.0 public repos
For user XiaoyanYangAlice there are 1 repos but they have 7.0 public repos
For user Sourasky-DHLAB there are 0 repos but they have 1.0 public repos
For user valrighe there are 0 repos but they have 8.0 public repos
For user kochanovskayaanna there are 0 repos but they have 3.0 public repos
For user andreaspataro there are 0 repos but they have 8.0 public repos
For user casglur there are 0 repos but they have 8.0 public repos
For user BeatriceVaienti there are 0 repos but they have 1.0 public repos
For user j

In [36]:
# NOTE(review): this cell is disabled. When enabled it would call
# `get_user_repo_activities` for `filtered_users` whenever `total_missing_repos > 0`
# and rebind `repo_df` — presumably to fetch the repos reported as missing above; confirm
# before re-enabling.
# if total_missing_repos > 0:
#     user_repos_output_path = "../data/large_files/join_files/user_repos_join_dataset.csv"
#     repos_output_path = "../data/large_files/entity_files/repos_dataset.csv"
#     get_url_field = "repos_url"
#     load_existing_files = False
#     overwrite_existing_temp_files = False

#     users_repos_df, repo_df = get_user_repo_activities(filtered_users,user_repos_output_path, repos_output_path, get_url_field, load_existing_files, overwrite_existing_temp_files)

In [37]:
# Repos owned by a filtered account that are not already among the initial core repos.
expanded_core_repos = repo_df[
    repo_df['owner.login'].isin(filtered_users.login)
    & ~repo_df.full_name.isin(initial_core_repos.full_name)
]

In [38]:
# Size of the expanded repo set next to the size of the initial core repo set.
len(expanded_core_repos), len(initial_core_repos)

(1668, 2485)

In [39]:
# Sum of (reported public repos - repos present in `repo_df`) over the mismatching accounts.
total_missing_repos

726.0

In [40]:
# Number of repos in the expanded set.
len(expanded_core_repos)

1668

In [57]:
existing_expanded_core_orgs_file_path = "../data/derived_files/firstpass_core_orgs.csv"
existing_expanded_core_users_file_path = "../data/derived_files/firstpass_core_users.csv"
existing_expanded_core_repos_file_path = "../data/derived_files/firstpass_core_repos.csv"


def _split_new_from_saved(candidates, saved_path, key, label):
    """Separate candidate rows that are not yet saved from the rows stored at ``saved_path``.

    Parameters
    ----------
    candidates : pd.DataFrame
        Rows computed in this run.
    saved_path : str
        CSV holding the rows saved by an earlier run, if any.
    key : str
        Column identifying a row (``'login'`` or ``'full_name'``).
    label : str
        Plural entity name used in the progress message.

    Returns
    -------
    tuple
        ``(new_rows, saved_rows)``. When ``saved_path`` does not exist, ``candidates`` is
        written there and ``(candidates, None)`` is returned.
    """
    if os.path.exists(saved_path):
        saved_rows = pd.read_csv(saved_path)
        new_rows = candidates[~candidates[key].isin(saved_rows[key])]
        print(f"Added {len(new_rows)} {label} to existing {len(saved_rows)} {label}")
        return new_rows, saved_rows
    candidates.to_csv(saved_path, index=False)
    # Nothing was saved before this run. Bind the "existing" name to None so that the
    # later `pd.concat([new, existing])` cells (which re-check os.path.exists and now
    # find the file just written) do not fail on an undefined name; pd.concat drops
    # None entries.
    return candidates, None


expanded_core_orgs, existing_expanded_core_orgs = _split_new_from_saved(
    expanded_core_orgs, existing_expanded_core_orgs_file_path, 'login', 'orgs')
expanded_core_users, existing_expanded_core_users = _split_new_from_saved(
    expanded_core_users, existing_expanded_core_users_file_path, 'login', 'users')
expanded_core_repos, existing_expanded_core_repos = _split_new_from_saved(
    expanded_core_repos, existing_expanded_core_repos_file_path, 'full_name', 'repos')

Added 8 orgs to existing 262 orgs
Added 239 users to existing 1293 users
Added 0 repos to existing 1668 repos


In [61]:
# Process every first-pass repo, not only the ones discovered in this run. The saved
# repos are combined with the newly found ones (instead of replacing them, which
# silently dropped new repos), and the file is re-read so the cell does not depend on
# a variable that is only defined when the file already existed.
if os.path.exists(existing_expanded_core_repos_file_path):
    existing_expanded_core_repos = pd.read_csv(existing_expanded_core_repos_file_path)
    expanded_core_repos = (
        pd.concat([expanded_core_repos, existing_expanded_core_repos])
        .drop_duplicates(subset=['full_name'], keep='first')
        .reset_index(drop=True)
    )

### Evaluate Languages of New Materials

In [62]:
tqdm.pandas(desc='Detecting language')

# Repos: blank out missing descriptions, run the detector on rows without a detected
# language, then put those rows back after the rows that already had one.
expanded_core_repos['description'] = expanded_core_repos['description'].fillna('')
needs_language_repos = expanded_core_repos[
    expanded_core_repos.detected_language.isna()
].progress_apply(check_detect_language, axis=1, is_repo=True)
expanded_core_repos = pd.concat(
    [expanded_core_repos[expanded_core_repos.detected_language.notna()], needs_language_repos]
).reset_index(drop=True)

# Users: same steps, using the bio column and is_repo=False.
expanded_core_users['bio'] = expanded_core_users['bio'].fillna('')
needs_language_users = expanded_core_users[
    expanded_core_users.detected_language.isna()
].progress_apply(check_detect_language, axis=1, is_repo=False)
expanded_core_users = pd.concat(
    [expanded_core_users[expanded_core_users.detected_language.notna()], needs_language_users]
).reset_index(drop=True)

# Orgs: same steps as for users.
expanded_core_orgs['bio'] = expanded_core_orgs['bio'].fillna('')
needs_language_orgs = expanded_core_orgs[
    expanded_core_orgs.detected_language.isna()
].progress_apply(check_detect_language, axis=1, is_repo=False)
expanded_core_orgs = pd.concat(
    [expanded_core_orgs[expanded_core_orgs.detected_language.notna()], needs_language_orgs]
).reset_index(drop=True)

Detecting language: 100%|██████████| 471/471 [00:00<00:00, 19166.01it/s]
Detecting language: 100%|██████████| 184/184 [00:00<00:00, 17057.55it/s]
Detecting language: 100%|██████████| 7/7 [00:00<00:00, 5351.83it/s]


In [63]:
# (without language, with language) counts for orgs, then users, then repos.
tuple(
    count
    for frame in (expanded_core_orgs, expanded_core_users, expanded_core_repos)
    for count in (
        len(frame[frame.detected_language.isna()]),
        len(frame[frame.detected_language.notna()]),
    )
)

(7, 1, 184, 55, 471, 1197)

In [64]:
# Rows without a detected language, summed over orgs, users and repos.
total_missing_languages = sum(
    len(frame[frame.detected_language.isna()])
    for frame in (expanded_core_orgs, expanded_core_users, expanded_core_repos)
)
total_missing_languages

662

In [65]:
# Rows that have non-empty text (bio / description) but still no detected language,
# for orgs, users and repos respectively.
tuple(
    len(frame[(frame.detected_language.isna()) & (frame[text_column].str.len() > 0)])
    for frame, text_column in (
        (expanded_core_orgs, 'bio'),
        (expanded_core_users, 'bio'),
        (expanded_core_repos, 'description'),
    )
)

(0, 0, 0)

In [66]:
# login -> finalized_language pairs taken from both the initial core orgs and users.
initial_users = pd.concat([initial_core_orgs[['login', 'finalized_language']], initial_core_users[['login', 'finalized_language']]])

In [67]:
# Attach the owner's finalized_language to each repo; repos whose owner is not in
# `initial_users` keep NaN (left join).
expanded_core_repos = expanded_core_repos.merge(
    initial_users,
    how='left',
    left_on='owner.login',
    right_on='login',
)

In [68]:
# Use the detected language when present, otherwise fall back to the finalized language
# brought in by the merge.
expanded_core_repos['potential_language'] = np.where(
    expanded_core_repos['detected_language'].isna(),
    expanded_core_repos['finalized_language'],
    expanded_core_repos['detected_language'],
)


In [69]:
# Remove the two helper columns added by the merge.
expanded_core_repos = expanded_core_repos.drop(
    ['login', 'finalized_language'],
    axis=1,
)

In [70]:
# Count fully duplicated rows (all columns equal) in the repo frame.
expanded_core_repos.duplicated().sum()

0

In [101]:
# Persist the first-pass repos. The saved file is re-read here rather than taken from a
# variable that is only defined when the file existed at the start of the run.
if os.path.exists(existing_expanded_core_repos_file_path):
    existing_expanded_core_repos = pd.read_csv(existing_expanded_core_repos_file_path)
    # `expanded_core_repos` can already contain the saved repos, so a plain concat would
    # write each of them twice. Keep the first occurrence per full_name, i.e. the row
    # processed in this run.
    expanded_core_repos = pd.concat(
        [expanded_core_repos, existing_expanded_core_repos]
    ).drop_duplicates(subset=['full_name'], keep='first')
expanded_core_repos.to_csv(existing_expanded_core_repos_file_path, index=False)

In [71]:
# Row count of the repo frame at this point.
len(expanded_core_repos)

1668

In [73]:
# Row count of the user frame before merging in repo languages.
len(expanded_core_users)

239

In [74]:
# Attach the finalized_language of each initial core repo to its owner; a user owning
# several such repos gets one row per repo (left join).
expanded_core_users = expanded_core_users.merge(
    initial_core_repos[['finalized_language', 'owner.login']],
    how='left',
    left_on='login',
    right_on='owner.login',
)

In [75]:
# Keep the first row for each (login, owner.login, finalized_language) combination.
expanded_core_users = expanded_core_users.drop_duplicates(
    subset=['login', 'owner.login', 'finalized_language']
)

In [76]:
# Use the detected language when present, otherwise the finalized language from the merge.
expanded_core_users['potential_language'] = np.where(
    expanded_core_users['detected_language'].isna(),
    expanded_core_users['finalized_language'],
    expanded_core_users['detected_language'],
)

In [77]:
# Keep the first row for each combination of login and the three language columns.
expanded_core_users = expanded_core_users.drop_duplicates(
    subset=['login', 'owner.login', 'detected_language', 'finalized_language', 'potential_language']
)

In [78]:
# Remove the two helper columns added by the merge.
expanded_core_users = expanded_core_users.drop(
    ['owner.login', 'finalized_language'],
    axis=1,
)

In [79]:
# Row count after the merge; a value above the pre-merge count means some login has several rows.
len(expanded_core_users)

240

In [80]:
# Logins that still have more than one row after the merge.
users_multiple_languages = expanded_core_users[expanded_core_users.duplicated(subset=['login'])].login.unique().tolist()

for login in users_multiple_languages:
    # A dedicated local name is used here: rebinding `user_df` would replace the users
    # dataset loaded at the top of the notebook with a per-login slice.
    login_rows = expanded_core_users[expanded_core_users.login == login]
    # NaN counts as a distinct value, so a login with one language plus a NaN row is
    # also collapsed onto that language.
    if len(login_rows.potential_language.unique()) > 1:
        languages = [x for x in login_rows.potential_language.unique().tolist() if str(x) != 'nan']
        # Prefer a value that already lists several languages; otherwise join them.
        combined_languages = [x for x in languages if ',' in x]
        if len(combined_languages) == 0:
            updated_languages = ', '.join(languages)
        else:
            updated_languages = combined_languages[0]
        expanded_core_users.loc[expanded_core_users.login == login, 'potential_language'] = updated_languages

In [81]:
# Keep the first row per (login, potential_language) pair.
expanded_core_users = expanded_core_users[
    ~expanded_core_users.duplicated(subset=['login', 'potential_language'])
]

In [82]:
# Persist the first-pass users. The saved file is re-read so the cell works even when
# `existing_expanded_core_users` was never defined (the file did not exist at the start
# of the run), and rows are de-duplicated on login so re-running the cell does not
# append the saved users again. Rows from this run take precedence.
if os.path.exists(existing_expanded_core_users_file_path):
    existing_expanded_core_users = pd.read_csv(existing_expanded_core_users_file_path)
    expanded_core_users = pd.concat(
        [expanded_core_users, existing_expanded_core_users]
    ).drop_duplicates(subset=['login'], keep='first')
expanded_core_users.to_csv(existing_expanded_core_users_file_path, index=False)

In [83]:
# Row count of the org frame before merging in repo languages.
len(expanded_core_orgs)

8

In [84]:
# Attach the finalized_language of each initial core repo to its owning org (left join).
expanded_core_orgs = expanded_core_orgs.merge(
    initial_core_repos[['finalized_language', 'owner.login']],
    how='left',
    left_on='login',
    right_on='owner.login',
)

In [85]:
# Collapse repeated (login, owner.login, finalized_language) rows produced by the merge.
expanded_core_orgs = expanded_core_orgs.drop_duplicates(
    subset=['login', 'owner.login', 'finalized_language']
)

# Detected language when present, otherwise the finalized language from the merge.
expanded_core_orgs['potential_language'] = np.where(
    expanded_core_orgs['detected_language'].isna(),
    expanded_core_orgs['finalized_language'],
    expanded_core_orgs['detected_language'],
)

# De-duplicate again across all language columns, then remove the merge helper columns.
expanded_core_orgs = expanded_core_orgs.drop_duplicates(
    subset=['login', 'owner.login', 'detected_language', 'finalized_language', 'potential_language']
)
expanded_core_orgs = expanded_core_orgs.drop(['owner.login', 'finalized_language'], axis=1)
len(expanded_core_orgs)

8

In [86]:
# Org logins that still have more than one row after the merge.
orgs_multiple_languages = expanded_core_orgs[expanded_core_orgs.duplicated(subset=['login'])].login.unique().tolist()

for login in orgs_multiple_languages:
    # A dedicated local name is used here: rebinding `user_df` would replace the users
    # dataset loaded at the top of the notebook with a per-login slice of orgs.
    login_rows = expanded_core_orgs[expanded_core_orgs.login == login]
    # NaN counts as a distinct value, so a login with one language plus a NaN row is
    # also collapsed onto that language.
    if len(login_rows.potential_language.unique()) > 1:
        languages = [x for x in login_rows.potential_language.unique().tolist() if str(x) != 'nan']
        # Prefer a value that already lists several languages; otherwise join them.
        combined_languages = [x for x in languages if ',' in x]
        if len(combined_languages) == 0:
            updated_languages = ', '.join(languages)
        else:
            updated_languages = combined_languages[0]
        expanded_core_orgs.loc[expanded_core_orgs.login == login, 'potential_language'] = updated_languages

In [87]:
# Keep the first row per (login, potential_language) pair and show the remaining count.
expanded_core_orgs = expanded_core_orgs[
    ~expanded_core_orgs.duplicated(subset=['login', 'potential_language'])
]
len(expanded_core_orgs)

8

In [88]:
# Persist the first-pass orgs. The saved file is re-read so the cell works even when
# `existing_expanded_core_orgs` was never defined (the file did not exist at the start
# of the run), and rows are de-duplicated on login so re-running the cell does not
# append the saved orgs again. Rows from this run take precedence.
if os.path.exists(existing_expanded_core_orgs_file_path):
    existing_expanded_core_orgs = pd.read_csv(existing_expanded_core_orgs_file_path)
    expanded_core_orgs = pd.concat(
        [expanded_core_orgs, existing_expanded_core_orgs]
    ).drop_duplicates(subset=['login'], keep='first')
expanded_core_orgs.to_csv(existing_expanded_core_orgs_file_path, index=False)