# Process First Pass Results

### Load Libraries and Datasets

In [65]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import os
import sys
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

sys.path.append("../")
from data_generation_scripts.utils import check_rate_limit, check_add_orgs, check_add_repos, check_add_users, check_for_joins_in_older_queries, check_return_error_file, check_for_entity_in_older_queries, check_if_older_file_exists
from data_generation_scripts.generate_user_repos_interactions import get_user_repo_activities
from data_generation_scripts.generate_translations import check_detect_language

Once the `ProcessDHRepos`, `ProcessDHUsers`, and `ProcessDHOrgs` notebooks have been run, we can start to expand our dataset. The first-pass expansion adds the following information to the dataset:

- owners of `initial_core_repos` will be added to `core_users` and `core_orgs`
- repos of `core_users` and `core_orgs` will be added to `core_repos`

We may apply a threshold (for example on `public_repos`) so that the expansion does not add too many users or repos.

In [47]:
# Read the user, repo and org entity tables plus the two search-query join tables.
user_df = pd.read_csv("../data/entity_files/users_dataset.csv")
repo_df = pd.read_csv(
    "../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,
)
org_df = pd.read_csv(
    "../data/entity_files/orgs_dataset.csv",
    low_memory=False,
)
search_queries_repo_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"
)
search_queries_user_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"
)

In [29]:
# Read the three "initial core" tables (users, repos, orgs) from the derived files.
initial_core_users = pd.read_csv(
    "../data/derived_files/initial_core_users.csv"
)
initial_core_repos = pd.read_csv(
    "../data/derived_files/initial_core_repos.csv"
)
initial_core_orgs = pd.read_csv(
    "../data/derived_files/initial_core_orgs.csv"
)

In [30]:
# Org-to-member join table; read later through its `org_login`, `login`, `url` and `type` columns.
org_members_df = pd.read_csv(
    "../data/join_files/org_members_join_dataset.csv"
)

### Explore Potential New Materials

In [31]:
# Search-result repo rows whose owner is absent from the user / org entity tables,
# split by the owner type recorded on the repo row.
owner_logins = search_queries_repo_join_df['owner.login']
owner_types = search_queries_repo_join_df['owner.type']

missing_users = search_queries_repo_join_df[
    ~owner_logins.isin(user_df.login) & (owner_types == 'User')
]
missing_orgs = search_queries_repo_join_df[
    ~owner_logins.isin(org_df.login) & (owner_types == 'Organization')
]
len(missing_users), len(missing_orgs)


(0, 3)

In [32]:
# Reduce the missing-user rows to the (login, type, url) columns.
missing_users = (
    missing_users[['owner.login']]
    .rename(columns={'owner.login': 'login'})
    .assign(type='User')
)
missing_users['url'] = missing_users['login'].apply(
    lambda login: f"https://api.github.com/users/{login}"
)

In [33]:
# Members of the initial core orgs who are not themselves initial core users.
belongs_to_core_org = org_members_df['org_login'].isin(initial_core_orgs.login)
is_initial_core_user = org_members_df['login'].isin(initial_core_users.login)
initial_org_members = org_members_df[belongs_to_core_org & ~is_initial_core_user]

# Of those members, the ones that have no row in user_df yet.
additional_missing_users = initial_org_members.loc[
    ~initial_org_members['login'].isin(user_df.login),
    ['login', 'url', 'type'],
]

In [34]:
# Stack the owner-derived and member-derived missing users into one frame.
missing_user_frames = [missing_users, additional_missing_users]
missing_users = pd.concat(missing_user_frames)
missing_users.shape[0]

3

In [35]:
# Reduce the missing-org rows to the (login, type, url) columns.
missing_orgs = (
    missing_orgs[['owner.login']]
    .rename(columns={'owner.login': 'login'})
    .assign(type='Organization')
)
missing_orgs['url'] = missing_orgs['login'].apply(
    lambda login: f"https://api.github.com/orgs/{login}"
)

In [36]:
# Only call the helpers when there is something to add; each call rebinds the
# corresponding entity frame to whatever the helper returns.
# NOTE(review): the two trailing booleans are passed positionally; their meaning
# is defined in data_generation_scripts.utils — confirm before changing them.
if len(missing_orgs):
    org_df = check_add_orgs(
        missing_orgs, '../data/entity_files/orgs_dataset.csv', True, False
    )
if len(missing_users):
    user_df = check_add_users(
        missing_users, '../data/entity_files/users_dataset.csv', True, False
    )

Cleaning Orgs: 100%|██████████| 3/3 [00:10<00:00,  3.62s/it]


                       file_name  file_size  \
28  repos_dataset_2022_11_04.csv   39255456   
14  users_dataset_2023_06_02.csv   25265947   

                            directory       date  
28  ../data/older_files/entity_files/ 2022-11-04  
14  ../data/older_files/entity_files/ 2023-06-02  
Number of new users: 3




3


Users:   0%|          | 0/3 [00:47<?, ?it/s]


                       file_name  file_size  \
29  repos_dataset_2022_11_04.csv   39255456   
15  users_dataset_2023_06_03.csv   33404563   

                            directory       date  
29  ../data/older_files/entity_files/ 2022-11-04  
15  ../data/older_files/entity_files/ 2023-06-03  


In [38]:
# user_df rows for the members of the initial core orgs.
is_initial_org_member = user_df['login'].isin(initial_org_members['login'])
expanded_org_members = user_df[is_initial_org_member]

In [39]:
# user_df rows that own a search-result repo and are not already initial core users.
owner_rows = user_df[user_df.login.isin(search_queries_repo_join_df['owner.login'])]
owner_rows = owner_rows[~owner_rows.login.isin(initial_core_users.login)]

# Split by account type; orgs already in the initial core set are excluded.
is_org_account = owner_rows['type'] == 'Organization'
is_user_account = owner_rows['type'] == 'User'
expanded_core_orgs = owner_rows[
    is_org_account & ~owner_rows.login.isin(initial_core_orgs.login)
]

# Expanded users = user-type owners plus the members of the initial core orgs.
expanded_core_users = pd.concat([owner_rows[is_user_account], expanded_org_members])
(
    len(expanded_core_orgs),
    len(expanded_core_users),
    len(initial_core_orgs),
    len(initial_core_users),
)

(244, 1783, 190, 736)

In [41]:
filter_columns = ['public_repos',
#  'public_gists',
#  'followers',
#  'following',
#  'star_count'
 ]


def _keep_interquartile(frame, column):
    """Return the rows of `frame` whose `column` value lies between the 25th and 75th percentile, inclusive."""
    lower = frame[column].quantile(q=.25)
    upper = frame[column].quantile(q=.75)
    return frame[frame[column].between(lower, upper)]


filtered_initial_core_users = initial_core_users.copy()
filtered_initial_core_orgs = initial_core_orgs.copy()

# Public-repo totals before filtering.
print(filtered_initial_core_users.public_repos.sum())
print(filtered_initial_core_orgs.public_repos.sum())

# Filters are applied one column after another, so each column's quantiles are
# computed on the rows that survived the previous column.
for col in filter_columns:
    filtered_initial_core_users = _keep_interquartile(filtered_initial_core_users, col)
    filtered_initial_core_orgs = _keep_interquartile(filtered_initial_core_orgs, col)

# Public-repo totals after filtering.
print(filtered_initial_core_users.public_repos.sum(), filtered_initial_core_orgs.public_repos.sum())
filtered_users = pd.concat([filtered_initial_core_users, filtered_initial_core_orgs])

8878.0
3134.0
1932.0 594.0


In [42]:
# Row counts: filtered accounts vs. expanded core users.
filtered_users.shape[0], expanded_core_users.shape[0]

(581, 1783)

In [48]:
# Rebind filtered_users to every initial core user and org (no quantile filter).
all_initial_accounts = [initial_core_users, initial_core_orgs]
filtered_users = pd.concat(all_initial_accounts)
filtered_users.shape[0]

926

In [71]:
# Net difference between each account's reported `public_repos` and the number
# of repos already held for that account in repo_df, summed over filtered_users.
# One value_counts pass replaces the per-row scan of repo_df.
held_repo_counts = repo_df['owner.login'].value_counts()
held_per_account = filtered_users['login'].map(held_repo_counts).fillna(0)

# skipna=False keeps a missing `public_repos` visible as NaN in the total, which
# is what adding row by row would also produce.
# NOTE(review): this is a *net* figure — accounts holding more repos than their
# reported count cancel out accounts that are genuinely missing repos, so the
# `> 0` check in the next cell can be skipped even when some repos are missing.
total_missing_repos = float(
    (filtered_users['public_repos'] - held_per_account).sum(skipna=False)
)

In [72]:
# Display the net count computed in the previous cell (negative means repo_df
# holds more repos for these accounts than their `public_repos` values sum to).
total_missing_repos

-773.0

In [73]:
# Fetch repos for the filtered accounts and fold any new ones into repo_df.
# Runs only when the net missing-repo count is positive.
if total_missing_repos > 0:
    from datetime import datetime

    user_repos_output_path = "../data/large_files/join_files/user_repos_join_dataset.csv"
    repos_output_path = "../data/large_files/entity_files/repos_dataset.csv"
    # Kept as an alias because both names were defined by this cell.
    repo_output_path = repos_output_path
    users_output_path = "../data/entity_files/users_dataset.csv"
    get_url_field = "repos_url"
    load_existing_files = False
    overwrite_existing_temp_files = False
    join_unique_field = 'user_login'
    filter_fields = ['user_login', 'full_name']
    retry_errors = True

    # Note: this rebinds the notebook-level user_df to the frame returned by the helper.
    user_repos_df, user_df = get_user_repo_activities(
        filtered_users,
        user_repos_output_path,
        users_output_path,
        get_url_field,
        load_existing_files,
        overwrite_existing_temp_files,
        join_unique_field,
        filter_fields,
        retry_errors,
    )

    # Fetched repos per account. groupby().size() names its columns explicitly,
    # so it does not depend on how value_counts().reset_index() labels them
    # (that labelling changed in pandas 2.0 and left no 'login' column to merge on).
    grouped_user_repos_df = (
        user_repos_df.groupby('user_login')
        .size()
        .rename('new_public_repos')
        .rename_axis('login')
        .reset_index()
    )
    merged_df = pd.merge(
        filtered_users[['login', 'public_repos']],
        grouped_user_repos_df,
        on='login',
        how='left',
    )
    # Accounts with no fetched repos get 0 rather than NaN.
    merged_df['new_public_repos'] = merged_df['new_public_repos'].fillna(0)

    # Keep only repos not yet in repo_df, restricted to the canonical repo columns.
    new_repos = user_repos_df[~user_repos_df.full_name.isin(repo_df.full_name)]
    repo_headers = pd.read_csv('../data/metadata_files/repo_headers.csv')
    new_repo_df = new_repos[repo_headers.columns]
    repo_df = pd.concat([repo_df, new_repo_df])
    repo_df = repo_df.drop_duplicates(subset=['full_name'])

    # NOTE(review): presumably archives the previous file before it is overwritten —
    # confirm against data_generation_scripts.utils.
    check_if_older_file_exists(repo_output_path)
    repo_df['repo_query_time'] = datetime.now().strftime("%Y-%m-%d")
    repo_df.to_csv(repo_output_path, index=False)
    repo_df = check_for_entity_in_older_queries(repo_output_path, repo_df, is_large=True)

In [74]:
# Repos owned by the filtered accounts that are not already initial core repos.
owned_by_filtered_account = repo_df['owner.login'].isin(filtered_users.login)
already_initial_core = repo_df.full_name.isin(initial_core_repos.full_name)
expanded_core_repos = repo_df[owned_by_filtered_account & ~already_initial_core]

In [75]:
# Row counts: expanded core repos vs. initial core repos.
expanded_core_repos.shape[0], initial_core_repos.shape[0]

(12495, 2485)

In [76]:
existing_expanded_core_orgs_file_path = "../data/derived_files/firstpass_core_orgs.csv"
existing_expanded_core_users_file_path = "../data/derived_files/firstpass_core_users.csv"
existing_expanded_core_repos_file_path = "../data/derived_files/firstpass_core_repos.csv"


def _split_off_existing(candidates, existing_path, key, label):
    """Separate rows already saved in a first-pass file from new candidates.

    Parameters
    ----------
    candidates : pd.DataFrame
        Rows proposed for the first-pass file.
    existing_path : str
        CSV path of the previously saved first-pass file.
    key : str
        Column used to decide whether a candidate is already saved.
    label : str
        Plural noun used in the progress message.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The candidates not yet saved, and the previously saved rows. When the
        file does not exist, the candidates are written to it and an empty frame
        with the candidates' columns is returned as the "existing" rows, so the
        later cells that concatenate with the existing frame always find it defined.
    """
    if os.path.exists(existing_path):
        existing = pd.read_csv(existing_path)
        candidates = candidates[~candidates[key].isin(existing[key])]
        print(f"Added {len(candidates)} {label} to existing {len(existing)} {label}")
    else:
        candidates.to_csv(existing_path, index=False)
        # Creating the file makes the later os.path.exists() checks pass, so the
        # existing frame has to exist too; keep it empty.
        existing = candidates.iloc[0:0]
    return candidates, existing


expanded_core_orgs, existing_expanded_core_orgs = _split_off_existing(
    expanded_core_orgs, existing_expanded_core_orgs_file_path, 'login', 'orgs'
)
expanded_core_users, existing_expanded_core_users = _split_off_existing(
    expanded_core_users, existing_expanded_core_users_file_path, 'login', 'users'
)
expanded_core_repos, existing_expanded_core_repos = _split_off_existing(
    expanded_core_repos, existing_expanded_core_repos_file_path, 'full_name', 'repos'
)

Added 0 orgs to existing 270 orgs
Added 223 users to existing 1532 users
Added 10827 repos to existing 1668 repos


In [81]:
# Show the first expanded repo row as a nested dict.
expanded_core_repos.head(1).to_dict()

{'id': {213164: 67361568.0},
 'node_id': {213164: 'MDEwOlJlcG9zaXRvcnk2NzM2MTU2OA=='},
 'name': {213164: 'tankbuster'},
 'full_name': {213164: 'thiippal/tankbuster'},
 'private': {213164: '0.0'},
 'html_url': {213164: 'https://github.com/thiippal/tankbuster'},
 'description': {213164: 'An image classifier trained to detect Soviet/Russian military vehicles'},
 'fork': {213164: '0.0'},
 'url': {213164: 'https://api.github.com/repos/thiippal/tankbuster'},
 'forks_url': {213164: 'https://api.github.com/repos/thiippal/tankbuster/forks'},
 'keys_url': {213164: 'https://api.github.com/repos/thiippal/tankbuster/keys{/key_id}'},
 'collaborators_url': {213164: 'https://api.github.com/repos/thiippal/tankbuster/collaborators{/collaborator}'},
 'teams_url': {213164: 'https://api.github.com/repos/thiippal/tankbuster/teams'},
 'hooks_url': {213164: 'https://api.github.com/repos/thiippal/tankbuster/hooks'},
 'issue_events_url': {213164: 'https://api.github.com/repos/thiippal/tankbuster/issues/events{/

In [77]:
# expanded_core_repos = existing_expanded_core_repos.copy()

### Evaluate Languages of New Materials

In [82]:
tqdm.pandas(desc='Detecting language')


def _fill_missing_languages(frame, text_column, is_repo):
    """Run language detection on the rows that do not have a detected language yet.

    Parameters
    ----------
    frame : pd.DataFrame
        Repos, users or orgs to process.
    text_column : str
        Free-text column whose missing values are replaced with '' first.
    is_repo : bool
        Forwarded to `check_detect_language` as its `is_repo` keyword.

    Returns
    -------
    pd.DataFrame
        Rows that already had `detected_language` followed by the newly
        processed rows, with a fresh 0..n-1 index.
    """
    frame = frame.assign(**{text_column: frame[text_column].fillna('')})
    if 'detected_language' in frame.columns:
        missing_language = frame.detected_language.isna()
        needs_language = frame[missing_language]
        has_language = frame[~missing_language]
    else:
        needs_language = frame
        has_language = pd.DataFrame()
    needs_language = needs_language.progress_apply(
        check_detect_language, axis=1, is_repo=is_repo
    )
    return pd.concat([has_language, needs_language]).reset_index(drop=True)


# Repos are detected from `description`; users and orgs from `bio`.
expanded_core_repos = _fill_missing_languages(expanded_core_repos, 'description', is_repo=True)
expanded_core_users = _fill_missing_languages(expanded_core_users, 'bio', is_repo=False)
expanded_core_orgs = _fill_missing_languages(expanded_core_orgs, 'bio', is_repo=False)

Detecting language: 100%|██████████| 10827/10827 [04:48<00:00, 37.59it/s]
Detecting language: 100%|██████████| 223/223 [00:02<00:00, 91.79it/s] 
Detecting language: 1it [00:00, 1111.07it/s]


In [87]:
# Login -> finalized_language lookup covering the initial core orgs and users.
language_lookup_columns = ['login', 'finalized_language']
initial_users = pd.concat(
    [
        initial_core_orgs[language_lookup_columns],
        initial_core_users[language_lookup_columns],
    ]
)

In [88]:
# Attach each repo owner's finalized language; repos with no matching owner keep NaN.
expanded_core_repos = expanded_core_repos.merge(
    initial_users,
    left_on='owner.login',
    right_on='login',
    how='left',
)

In [89]:
# Use the detected language when present, otherwise the owner's finalized language.
repo_detected_language = expanded_core_repos['detected_language']
repo_owner_language = expanded_core_repos['finalized_language']
expanded_core_repos['potential_language'] = np.where(
    repo_detected_language.isnull(), repo_owner_language, repo_detected_language
)


In [90]:
# Remove the helper columns brought in by the owner-language merge.
merge_helper_columns = ['login', 'finalized_language']
expanded_core_repos = expanded_core_repos.drop(columns=merge_helper_columns)

In [91]:
# Number of fully duplicated rows (first occurrence not counted).
expanded_core_repos.duplicated(keep='first').sum()

0

In [92]:
# Append the previously saved first-pass repos when that file exists, then
# write the first-pass repos file in either case.
if os.path.exists(existing_expanded_core_repos_file_path):
    expanded_core_repos = pd.concat([expanded_core_repos, existing_expanded_core_repos])
expanded_core_repos.to_csv("../data/derived_files/firstpass_core_repos.csv", index=False)

In [93]:
# Row count of the saved first-pass repos frame.
expanded_core_repos.shape[0]

12495

In [94]:
# Row count of the expanded users before the language merge.
expanded_core_users.shape[0]

223

In [95]:
# Attach the finalized language of every initial core repo each user owns
# (a user may match several repos, which produces several rows).
repo_owner_languages = initial_core_repos[['finalized_language', 'owner.login']]
expanded_core_users = expanded_core_users.merge(
    repo_owner_languages,
    left_on='login',
    right_on='owner.login',
    how='left',
)

In [96]:
# Keep the first row for each (login, owner.login, finalized_language) combination.
expanded_core_users = expanded_core_users.drop_duplicates(
    subset=['login', 'owner.login', 'finalized_language']
)

In [97]:
# Use the detected language when present, otherwise the language of the owned repo.
user_detected_language = expanded_core_users['detected_language']
user_repo_language = expanded_core_users['finalized_language']
expanded_core_users['potential_language'] = np.where(
    user_detected_language.isnull(), user_repo_language, user_detected_language
)

In [98]:
# Keep the first row for each combination of login and language columns.
expanded_core_users = expanded_core_users.drop_duplicates(
    subset=['login', 'owner.login', 'detected_language', 'finalized_language', 'potential_language']
)

In [99]:
# Remove the helper columns brought in by the repo-language merge.
user_merge_helper_columns = ['owner.login', 'finalized_language']
expanded_core_users = expanded_core_users.drop(columns=user_merge_helper_columns)

In [100]:
# Row count of the expanded users after the language merge and dedupe.
expanded_core_users.shape[0]

223

In [101]:
# Logins that appear on more than one row, i.e. that may carry more than one
# candidate language.
users_multiple_languages = expanded_core_users[expanded_core_users.duplicated(subset=['login'])].login.unique().tolist()

for login in users_multiple_languages:
    # Local name chosen so the notebook-level `user_df` (the users dataset)
    # is not overwritten by this loop.
    login_rows = expanded_core_users[expanded_core_users.login == login]
    distinct_languages = login_rows.potential_language.unique().tolist()
    if len(distinct_languages) > 1:
        # Ignore missing values when combining.
        languages = [x for x in distinct_languages if str(x) != 'nan']
        # A value that already contains a comma is taken as the combined form.
        already_combined = [x for x in languages if ',' in x]
        if len(already_combined) == 0:
            updated_languages = ', '.join(languages)
        else:
            updated_languages = already_combined[0]
        # Write the same combined value onto every row for this login.
        expanded_core_users.loc[expanded_core_users.login == login, 'potential_language'] = updated_languages

In [102]:
# Keep the first row for each (login, potential_language) pair.
repeated_user_language = expanded_core_users.duplicated(subset=['login', 'potential_language'])
expanded_core_users = expanded_core_users[~repeated_user_language]

In [103]:
# Append the previously saved first-pass users when that file exists, then write the file.
users_file_already_saved = os.path.exists(existing_expanded_core_users_file_path)
if users_file_already_saved:
    expanded_core_users = pd.concat([expanded_core_users, existing_expanded_core_users])
expanded_core_users.to_csv(
    "../data/derived_files/firstpass_core_users.csv",
    index=False,
)

In [104]:
# Row count of the expanded orgs.
expanded_core_orgs.shape[0]

0

In [84]:
# Attach the finalized language of every initial core repo each org owns
# (an org may match several repos, which produces several rows).
org_repo_languages = initial_core_repos[['finalized_language', 'owner.login']]
expanded_core_orgs = expanded_core_orgs.merge(
    org_repo_languages,
    left_on='login',
    right_on='owner.login',
    how='left',
)

In [85]:
# Keep the first row for each (login, owner.login, finalized_language) combination.
expanded_core_orgs = expanded_core_orgs.drop_duplicates(
    subset=['login', 'owner.login', 'finalized_language']
)

# Use the detected language when present, otherwise the language of the owned repo.
org_detected_language = expanded_core_orgs['detected_language']
org_repo_language = expanded_core_orgs['finalized_language']
expanded_core_orgs['potential_language'] = np.where(
    org_detected_language.isnull(), org_repo_language, org_detected_language
)

# Dedupe again now that potential_language exists, then drop the merge helper columns.
expanded_core_orgs = expanded_core_orgs.drop_duplicates(
    subset=['login', 'owner.login', 'detected_language', 'finalized_language', 'potential_language']
)
expanded_core_orgs = expanded_core_orgs.drop(columns=['owner.login', 'finalized_language'])
expanded_core_orgs.shape[0]

8

In [86]:
# Org logins that appear on more than one row, i.e. that may carry more than
# one candidate language.
orgs_multiple_languages = expanded_core_orgs[expanded_core_orgs.duplicated(subset=['login'])].login.unique().tolist()

for login in orgs_multiple_languages:
    # Local name chosen so the notebook-level `user_df` (the users dataset)
    # is not overwritten by this loop.
    login_rows = expanded_core_orgs[expanded_core_orgs.login == login]
    distinct_languages = login_rows.potential_language.unique().tolist()
    if len(distinct_languages) > 1:
        # Ignore missing values when combining.
        languages = [x for x in distinct_languages if str(x) != 'nan']
        # A value that already contains a comma is taken as the combined form.
        already_combined = [x for x in languages if ',' in x]
        if len(already_combined) == 0:
            updated_languages = ', '.join(languages)
        else:
            updated_languages = already_combined[0]
        # Write the same combined value onto every row for this login.
        expanded_core_orgs.loc[expanded_core_orgs.login == login, 'potential_language'] = updated_languages

In [87]:
# Keep the first row for each (login, potential_language) pair.
repeated_org_language = expanded_core_orgs.duplicated(subset=['login', 'potential_language'])
expanded_core_orgs = expanded_core_orgs[~repeated_org_language]
expanded_core_orgs.shape[0]

8

In [105]:
# Append the previously saved first-pass orgs when that file exists, then write the file.
orgs_file_already_saved = os.path.exists(existing_expanded_core_orgs_file_path)
if orgs_file_already_saved:
    expanded_core_orgs = pd.concat([expanded_core_orgs, existing_expanded_core_orgs])
expanded_core_orgs.to_csv(
    "../data/derived_files/firstpass_core_orgs.csv",
    index=False,
)