# Process First Pass Results

### Load Libraries and Datasets

In [36]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import os
import sys
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

sys.path.append("../")
from data_generation_scripts.utils import check_rate_limit, check_add_orgs, check_add_repos, check_add_users, check_for_joins_in_older_queries, check_return_error_file
from data_generation_scripts.generate_user_repos_interactions import get_user_repo_activities
from data_generation_scripts.generate_translations import check_detect_language

Once we have run the `ProcessDHRepos`, `ProcessDHUsers`, and `ProcessDHOrgs` notebooks, we can start to expand our dataset. Our first-pass expansion adds the following information to the dataset:

- owners of `initial_core_repos` will be added to `core_users` and `core_orgs`
- repos of `core_users` and `core_orgs` will be added to `core_repos`

We may do some thresholding to avoid too many users or repos being added.

In [2]:
# Entity tables for users, repos and orgs.
user_df = pd.read_csv(
    "../data/entity_files/users_dataset.csv"
)
repo_df = pd.read_csv(
    "../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,
)
org_df = pd.read_csv(
    "../data/entity_files/orgs_dataset.csv",
    low_memory=False,
)

# Join tables linking search queries to the repos / users they returned.
search_queries_repo_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"
)
search_queries_user_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"
)

In [3]:
# Core entity sets written by the earlier processing notebooks.
initial_core_users = pd.read_csv(
    "../data/derived_files/initial_core_users.csv"
)
initial_core_repos = pd.read_csv(
    "../data/derived_files/initial_core_repos.csv"
)
initial_core_orgs = pd.read_csv(
    "../data/derived_files/initial_core_orgs.csv"
)

### Explore Potential New Materials

In [4]:
# Accounts that own a repo returned by the search queries and are not yet core users.
owns_queried_repo = user_df['login'].isin(search_queries_repo_join_df['owner.login'])
already_core_user = user_df['login'].isin(initial_core_users['login'])
new_owner_accounts = user_df[owns_queried_repo & ~already_core_user]

# Split the new accounts by type; organizations already in the core org set are dropped.
already_core_org = new_owner_accounts['login'].isin(initial_core_orgs['login'])
expanded_core_orgs = new_owner_accounts[(new_owner_accounts['type'] == 'Organization') & ~already_core_org]
expanded_core_users = new_owner_accounts[new_owner_accounts['type'] == 'User']

(
    len(expanded_core_orgs),
    len(expanded_core_users),
    len(initial_core_orgs),
    len(initial_core_users),
)

(262, 1296, 126, 667)

In [5]:
# Columns used by the quantile filter. Only `public_repos` is active;
# `public_gists`, `followers`, `following` and `star_count` are currently disabled.
filter_columns = ['public_repos']

In [6]:
# Work on copies so the initial core frames stay untouched.
filtered_initial_core_users = initial_core_users.copy()
filtered_initial_core_orgs = initial_core_orgs.copy()

# Totals before filtering.
print(filtered_initial_core_users.public_repos.sum())
print(filtered_initial_core_orgs.public_repos.sum())

# Keep only rows between the 25th and 75th percentile (inclusive) of every
# filter column; quantiles are recomputed on the frame as it narrows.
for col in filter_columns:
    user_values = filtered_initial_core_users[col]
    filtered_initial_core_users = filtered_initial_core_users[
        user_values.between(user_values.quantile(q=.25), user_values.quantile(q=.75))
    ]
    org_values = filtered_initial_core_orgs[col]
    filtered_initial_core_orgs = filtered_initial_core_orgs[
        org_values.between(org_values.quantile(q=.25), org_values.quantile(q=.75))
    ]

# Totals after filtering.
print(filtered_initial_core_users.public_repos.sum(), filtered_initial_core_orgs.public_repos.sum())

8486.0
1860.0
1684.0 369.0


In [7]:
# Stack the filtered users and orgs into one frame (users first, then orgs).
filtered_users = pd.concat(
    [
        filtered_initial_core_users,
        filtered_initial_core_orgs,
    ]
)

In [8]:
# Number of accounts that survived the quantile filter.
filtered_users.shape[0]

410

In [9]:
# Compare the number of repos held in `repo_df` for each filtered account
# with the `public_repos` count recorded for that account.
#
# Repos per owner are counted once up front instead of rescanning the whole
# `repo_df` for every account inside the loop.
repos_per_owner = repo_df['owner.login'].value_counts()

# NOTE: accounts for which we hold *more* repos than `public_repos` contribute
# a negative amount, so this is a net difference rather than a strict count
# of missing repos.
total_missing_repos = 0
for _, row in filtered_users.iterrows():
    # Owners with no repos in `repo_df` are absent from the counts -> 0.
    existing_repo_count = int(repos_per_owner.get(row['login'], 0))
    if existing_repo_count != row['public_repos']:
        print(f"For user {row['login']} there are {existing_repo_count} repos but they have {row['public_repos']} public repos")
        total_missing_repos += row['public_repos'] - existing_repo_count

For user jtonra there are 1 repos but they have 3.0 public repos
For user cclivaz there are 0 repos but they have 3.0 public repos
For user lrskjr there are 8 repos but they have 5.0 public repos
For user DennisFriedl there are 5 repos but they have 4.0 public repos
For user nikolito there are 12 repos but they have 11.0 public repos
For user mars-aria there are 20 repos but they have 6.0 public repos
For user digital-humanities-hse there are 1 repos but they have 2.0 public repos
For user paolomonella there are 9 repos but they have 8.0 public repos
For user tommasobattisti there are 10 repos but they have 4.0 public repos
For user ChessPiece21 there are 2 repos but they have 9.0 public repos
For user EstelleEvelyn there are 0 repos but they have 8.0 public repos
For user Nick-Archaeology there are 1 repos but they have 5.0 public repos
For user yaeln there are 1 repos but they have 10.0 public repos
For user satkey there are 1 repos but they have 5.0 public repos
For user Rosmerade t

In [10]:
# if total_missing_repos > 0:
#     user_repos_output_path = "../data/large_files/join_files/user_repos_join_dataset.csv"
#     repos_output_path = "../data/large_files/entity_files/repos_dataset.csv"
#     get_url_field = "repos_url"
#     load_existing_files = False
#     overwrite_existing_temp_files = False

#     users_repos_df, repo_df = get_user_repo_activities(filtered_users,user_repos_output_path, repos_output_path, get_url_field, load_existing_files, overwrite_existing_temp_files)

In [11]:
# Repos owned by a filtered account that are not already in the initial core set.
owned_by_filtered_account = repo_df['owner.login'].isin(filtered_users['login'])
already_core_repo = repo_df['full_name'].isin(initial_core_repos['full_name'])
expanded_core_repos = repo_df[owned_by_filtered_account & ~already_core_repo]

In [13]:
# Net difference between recorded `public_repos` and repos held in `repo_df`,
# accumulated over the accounts whose counts disagree.
total_missing_repos

453.0

In [12]:
# Number of candidate repos added by the first pass.
expanded_core_repos.shape[0]

1497

In [14]:
# Persist the first-pass candidates (orgs, users, repos) without the index column.
for frame, csv_path in (
    (expanded_core_orgs, "../data/derived_files/firstpass_core_orgs.csv"),
    (expanded_core_users, "../data/derived_files/firstpass_core_users.csv"),
    (expanded_core_repos, "../data/derived_files/firstpass_core_repos.csv"),
):
    frame.to_csv(csv_path, index=False)

### Evaluate Languages of New Materials

In [15]:
# Register `progress_apply` on pandas objects with a shared progress-bar label.
tqdm.pandas(desc='Detecting language')

# Missing text is replaced with '' before detection. `assign` returns a new
# frame, which avoids attribute-style assignment onto frames that were produced
# by boolean filtering (that only passed silently because chained-assignment
# warnings are disabled in this notebook).
expanded_core_repos = (
    expanded_core_repos
    .assign(description=expanded_core_repos['description'].fillna(''))
    .progress_apply(check_detect_language, axis=1, is_repo=True)
)
expanded_core_users = (
    expanded_core_users
    .assign(bio=expanded_core_users['bio'].fillna(''))
    .progress_apply(check_detect_language, axis=1, is_repo=False)
)
expanded_core_orgs = (
    expanded_core_orgs
    .assign(bio=expanded_core_orgs['bio'].fillna(''))
    .progress_apply(check_detect_language, axis=1, is_repo=False)
)

Detecting language: 100%|██████████| 1497/1497 [00:42<00:00, 34.91it/s]
Detecting language: 100%|██████████| 1296/1296 [00:16<00:00, 77.70it/s] 
Detecting language: 100%|██████████| 262/262 [00:04<00:00, 62.62it/s]


In [18]:
# (missing, detected) language counts for orgs, then users, then repos.
tuple(
    count
    for frame in (expanded_core_orgs, expanded_core_users, expanded_core_repos)
    for count in (
        int(frame['detected_language'].isna().sum()),
        int(frame['detected_language'].notna().sum()),
    )
)

(154, 108, 876, 420, 428, 1069)

In [19]:
# Rows without a detected language, summed over orgs, users and repos.
total_missing_languages = sum(
    int(frame['detected_language'].isna().sum())
    for frame in (expanded_core_orgs, expanded_core_users, expanded_core_repos)
)
total_missing_languages

1458

In [24]:
# Rows that have non-empty text (bio / description) but still no detected
# language, for orgs, users and repos respectively.
tuple(
    int((frame['detected_language'].isna() & (frame[text_column].str.len() > 0)).sum())
    for frame, text_column in (
        (expanded_core_orgs, 'bio'),
        (expanded_core_users, 'bio'),
        (expanded_core_repos, 'description'),
    )
)

(0, 0, 0)

In [25]:
# Overwrite the first-pass files now that the language-detection columns are present.
for frame, csv_path in (
    (expanded_core_orgs, "../data/derived_files/firstpass_core_orgs.csv"),
    (expanded_core_users, "../data/derived_files/firstpass_core_users.csv"),
    (expanded_core_repos, "../data/derived_files/firstpass_core_repos.csv"),
):
    frame.to_csv(csv_path, index=False)

In [51]:
# Login -> finalized language lookup covering initial core orgs, then users.
owner_language_columns = ['login', 'finalized_language']
initial_users = pd.concat(
    [
        initial_core_orgs[owner_language_columns],
        initial_core_users[owner_language_columns],
    ]
)

In [52]:
# Attach each repo owner's finalized language (left join keeps every repo row).
expanded_core_repos = expanded_core_repos.merge(
    initial_users,
    how='left',
    left_on='owner.login',
    right_on='login',
)

In [53]:
# Prefer the detected language; fall back to the owner's finalized language.
detected_repo_language = expanded_core_repos['detected_language']
expanded_core_repos['potential_language'] = np.where(
    detected_repo_language.notnull(),
    detected_repo_language,
    expanded_core_repos['finalized_language'],
)


In [66]:
# Remove the helper columns brought in by the owner-language merge.
expanded_core_repos = expanded_core_repos.drop(
    ['login', 'finalized_language'],
    axis='columns',
)

In [67]:
# Count fully duplicated repo rows (every occurrence after the first).
expanded_core_repos.duplicated(keep='first').sum()

0

In [54]:
# Row count of the repo candidates after the owner-language merge.
expanded_core_repos.shape[0]

1497

In [141]:
# Row count of the user candidates before merging in repo languages.
expanded_core_users.shape[0]

1293

In [142]:
# Attach the finalized language of every initial core repo each user owns.
# A user matching several repos gets one row per repo.
expanded_core_users = expanded_core_users.merge(
    initial_core_repos[['finalized_language', 'owner.login']],
    how='left',
    left_on='login',
    right_on='owner.login',
)

In [143]:
# Keep one row per (user, owner, finalized language) combination.
expanded_core_users = expanded_core_users.drop_duplicates(
    subset=['login', 'owner.login', 'finalized_language']
)

In [144]:
# Prefer the detected language; fall back to the finalized language of the user's repos.
detected_user_language = expanded_core_users['detected_language']
expanded_core_users['potential_language'] = np.where(
    detected_user_language.notnull(),
    detected_user_language,
    expanded_core_users['finalized_language'],
)

In [145]:
# Drop rows that repeat the same user / owner / language combination.
expanded_core_users = expanded_core_users.drop_duplicates(
    subset=[
        'login',
        'owner.login',
        'detected_language',
        'finalized_language',
        'potential_language',
    ]
)

In [146]:
# Remove the helper columns brought in by the repo-language merge.
expanded_core_users = expanded_core_users.drop(
    ['owner.login', 'finalized_language'],
    axis='columns',
)

In [147]:
# Row count after the merge; users with several languages still have several rows.
expanded_core_users.shape[0]

1304

In [148]:
# Users that still occupy more than one row are given a single combined
# `potential_language` so the rows can be de-duplicated afterwards.
users_multiple_languages = expanded_core_users[expanded_core_users.duplicated(subset=['login'])].login.unique().tolist()
for login in users_multiple_languages:
    # Deliberately NOT named `user_df`: that name holds the users dataset
    # loaded at the top of the notebook, and rebinding it here would break
    # any re-run of the earlier cells.
    login_rows = expanded_core_users[expanded_core_users.login == login]
    languages = login_rows.potential_language.unique().tolist()
    if len(languages) <= 1:
        continue
    # Drop missing values (NaN / None) as well as the literal string 'nan'.
    languages = [x for x in languages if pd.notna(x) and str(x) != 'nan']
    if not languages:
        continue
    # A value that already contains a comma takes precedence over joining.
    # NOTE(review): any other language for this login is then discarded -
    # confirm this is intended.
    combined_candidates = [x for x in languages if ',' in x]
    if combined_candidates:
        updated_languages = combined_candidates[0]
    else:
        updated_languages = ', '.join(languages)
    expanded_core_users.loc[expanded_core_users.login == login, 'potential_language'] = updated_languages

In [150]:
# Collapse rows that share the same login and combined language, keeping the first.
expanded_core_users = expanded_core_users.drop_duplicates(
    subset=['login', 'potential_language'],
    keep='first',
)

In [151]:
# Persist the user candidates, now including `potential_language`.
expanded_core_users.to_csv(
    path_or_buf="../data/derived_files/firstpass_core_users.csv",
    index=False,
)

In [152]:
# Row count of the org candidates before merging in repo languages.
expanded_core_orgs.shape[0]

262

In [154]:
# Attach the finalized language of every initial core repo each org owns.
# An org matching several repos gets one row per repo.
expanded_core_orgs = expanded_core_orgs.merge(
    initial_core_repos[['finalized_language', 'owner.login']],
    how='left',
    left_on='login',
    right_on='owner.login',
)

In [155]:
# Keep one row per (org, owner, finalized language) combination.
expanded_core_orgs = expanded_core_orgs.drop_duplicates(
    subset=['login', 'owner.login', 'finalized_language']
)

# Prefer the detected language; fall back to the finalized language of the org's repos.
detected_org_language = expanded_core_orgs['detected_language']
expanded_core_orgs['potential_language'] = np.where(
    detected_org_language.notnull(),
    detected_org_language,
    expanded_core_orgs['finalized_language'],
)

# Drop repeated org / owner / language combinations, then the helper columns.
expanded_core_orgs = expanded_core_orgs.drop_duplicates(
    subset=[
        'login',
        'owner.login',
        'detected_language',
        'finalized_language',
        'potential_language',
    ]
)
expanded_core_orgs = expanded_core_orgs.drop(
    ['owner.login', 'finalized_language'],
    axis='columns',
)
expanded_core_orgs.shape[0]

270

In [156]:
# Orgs that still occupy more than one row are given a single combined
# `potential_language` so the rows can be de-duplicated afterwards.
orgs_multiple_languages = expanded_core_orgs[expanded_core_orgs.duplicated(subset=['login'])].login.unique().tolist()
for login in orgs_multiple_languages:
    # Deliberately NOT named `user_df`: that name holds the users dataset
    # loaded at the top of the notebook, and rebinding it here would break
    # any re-run of the earlier cells.
    org_rows = expanded_core_orgs[expanded_core_orgs.login == login]
    languages = org_rows.potential_language.unique().tolist()
    if len(languages) <= 1:
        continue
    # Drop missing values (NaN / None) as well as the literal string 'nan'.
    languages = [x for x in languages if pd.notna(x) and str(x) != 'nan']
    if not languages:
        continue
    # A value that already contains a comma takes precedence over joining.
    # NOTE(review): any other language for this login is then discarded -
    # confirm this is intended.
    combined_candidates = [x for x in languages if ',' in x]
    if combined_candidates:
        updated_languages = combined_candidates[0]
    else:
        updated_languages = ', '.join(languages)
    expanded_core_orgs.loc[expanded_core_orgs.login == login, 'potential_language'] = updated_languages

In [169]:
# Collapse rows that share the same login and combined language, keeping the first.
expanded_core_orgs = expanded_core_orgs.drop_duplicates(
    subset=['login', 'potential_language'],
    keep='first',
)
expanded_core_orgs.shape[0]

262

In [170]:
# Persist the org candidates, now including `potential_language`.
expanded_core_orgs.to_csv(
    path_or_buf="../data/derived_files/firstpass_core_orgs.csv",
    index=False,
)