# Process First Pass Results

### Load Libraries and Datasets

In [74]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("../")
from data_generation_scripts.utils import check_rate_limit, check_add_orgs, check_add_repos, check_add_users
from data_generation_scripts.generate_user_repos_interactions import get_user_repo_activities
from data_generation_scripts.generate_translations import check_detect_language

Once we have run the `ProcessDHRepos`, `ProcessDHUsers`, and `ProcessDHOrgs` notebooks, we can start to expand our dataset. In this first-pass expansion, we add the following information to our dataset:

- owners of `initial_core_repos` will be added to `core_users` and `core_orgs`
- repos of `core_users` and `core_orgs` will be added to `core_repos`

We may apply some thresholding (for example, on the number of public repos per account) so that we do not add too many users or repos.

In [2]:
# Entity tables for users, repos and orgs.
user_df = pd.read_csv("../data/entity_files/users_dataset.csv")
repo_df = pd.read_csv(
    "../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,
)
org_df = pd.read_csv(
    "../data/entity_files/orgs_dataset.csv",
    low_memory=False,
)

# Search-query join tables (file names suggest query-to-repo and
# query-to-user links -- TODO confirm against the generating notebooks).
search_queries_repo_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"
)
search_queries_user_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"
)

In [3]:
# Initial core users, repos and orgs, read in that order from the derived files.
initial_core_users, initial_core_repos, initial_core_orgs = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/derived_files/initial_core_users.csv",
        "../data/derived_files/initial_core_repos.csv",
        "../data/derived_files/initial_core_orgs.csv",
    )
)

### Explore Potential New Materials

In [6]:
# Accounts that own a repo listed in the search-query/repo join table and
# that are not already among the initial core users.
owns_searched_repo = user_df.login.isin(search_queries_repo_join_df['owner.login'])
already_core_user = user_df.login.isin(initial_core_users.login)
new_owner_accounts = user_df[owns_searched_repo & ~already_core_user]

# Split the new owners by account type; organizations that are already in the
# initial core orgs are dropped as well.
is_org = new_owner_accounts['type'] == 'Organization'
is_user = new_owner_accounts['type'] == 'User'
expanded_core_orgs = new_owner_accounts[is_org]
expanded_core_orgs = expanded_core_orgs[~expanded_core_orgs.login.isin(initial_core_orgs.login)]
expanded_core_users = new_owner_accounts[is_user]

# Displayed as the cell result: sizes of the new vs. initial sets.
len(expanded_core_orgs), len(expanded_core_users), len(initial_core_orgs), len(initial_core_users)

(262, 1296, 126, 667)

In [59]:
# Columns used to threshold accounts. Only `public_repos` is active; the other
# candidate columns are currently disabled:
#   public_gists, followers, following, star_count
filter_columns = ['public_repos']

In [63]:
def keep_interquartile_rows(accounts, columns):
    """Return a copy of ``accounts`` restricted to the interquartile range of ``columns``.

    Columns are applied one after another, so the 25% / 75% quantiles for each
    column are computed on the rows that survived the previous columns. Both
    bounds are inclusive.
    """
    kept = accounts.copy()
    for column in columns:
        lower_bound = kept[column].quantile(q=.25)
        upper_bound = kept[column].quantile(q=.75)
        kept = kept[(kept[column].ge(lower_bound)) & (kept[column].le(upper_bound))]
    return kept


# Total public repos before filtering (users, then orgs).
print(initial_core_users.public_repos.sum())
print(initial_core_orgs.public_repos.sum())

filtered_initial_core_users = keep_interquartile_rows(initial_core_users, filter_columns)
filtered_initial_core_orgs = keep_interquartile_rows(initial_core_orgs, filter_columns)

# Total public repos after filtering (users, orgs).
print(filtered_initial_core_users.public_repos.sum(), filtered_initial_core_orgs.public_repos.sum())

8486.0
1860.0
1684.0 369.0


In [64]:
# Stack the filtered users and filtered orgs into one frame of accounts
# (original row indexes from each frame are kept as-is).
filtered_users = pd.concat(
    objs=[filtered_initial_core_users, filtered_initial_core_orgs],
)

In [71]:
# Number of accounts (rows) remaining after the filter.
filtered_users.shape[0]

410

In [69]:
# Count, per owner login, how many repos are already present in `repo_df`.
# Computing this once avoids re-scanning the whole repo table for every
# account in the loop below.
known_repo_counts = repo_df['owner.login'].value_counts()

# Running total of (reported public repos - repos we already hold), summed over
# every account whose two counts disagree.
total_missing_repos = 0
for _, row in filtered_users.iterrows():
    # Logins that own no repo in `repo_df` are absent from the counts -> 0.
    existing_repo_count = int(known_repo_counts.get(row['login'], 0))
    if existing_repo_count != row['public_repos']:
        print(f"For user {row['login']} there are {existing_repo_count} repos but they have {row['public_repos']} public repos")
        total_missing_repos += row['public_repos'] - existing_repo_count

For user jamesosullivan there are 1 repos but they have 4.0 public repos
For user juantieme there are 0 repos but they have 7.0 public repos
For user lcjcine there are 1 repos but they have 4.0 public repos
For user jtonra there are 1 repos but they have 3.0 public repos
For user AlexSanford13 there are 1 repos but they have 2.0 public repos
For user cclivaz there are 0 repos but they have 3.0 public repos
For user lrskjr there are 0 repos but they have 5.0 public repos
For user Bakko000 there are 1 repos but they have 7.0 public repos
For user DennisFriedl there are 1 repos but they have 4.0 public repos
For user nikolito there are 4 repos but they have 11.0 public repos
For user mars-aria there are 4 repos but they have 6.0 public repos
For user rabeakleymann there are 0 repos but they have 2.0 public repos
For user digital-humanities-hse there are 1 repos but they have 2.0 public repos
For user paolomonella there are 3 repos but they have 8.0 public repos
For user tommasobattisti th

In [70]:
# `total_missing_repos` is a running numeric sum, not a collection, so it must
# be compared directly; calling len() on a number raises TypeError and the
# fetch below would never run.
if total_missing_repos > 0:
    user_repos_output_path = "../data/large_files/join_files/user_repos_join_dataset.csv"
    repos_output_path = "../data/large_files/entity_files/repos_dataset.csv"
    get_url_field = "repos_url"
    load_existing_files = False
    overwrite_existing_temp_files = False

    # NOTE: this rebinds `repo_df`, so every later cell works with the frame
    # returned by the helper rather than the one read from disk earlier.
    # (Presumably the helper fetches each account's repos via `repos_url` and
    # writes to the two output paths -- TODO confirm against its definition.)
    users_repos_df, repo_df = get_user_repo_activities(
        filtered_users,
        user_repos_output_path,
        repos_output_path,
        get_url_field,
        load_existing_files,
        overwrite_existing_temp_files,
    )

1552.0

In [73]:
# Repos owned by one of the filtered accounts, excluding repos that are already
# part of the initial core set (matched on `full_name`).
owned_by_filtered_account = repo_df['owner.login'].isin(filtered_users.login)
already_core_repo = repo_df.full_name.isin(initial_core_repos.full_name)
expanded_core_repos = repo_df[owned_by_filtered_account & ~already_core_repo]

In [None]:
# Persist the first-pass expansion results (orgs, users, repos -- in that
# order), without writing the DataFrame index.
firstpass_outputs = {
    "../data/derived_files/firstpass_core_orgs.csv": expanded_core_orgs,
    "../data/derived_files/firstpass_core_users.csv": expanded_core_users,
    "../data/derived_files/firstpass_core_repos.csv": expanded_core_repos,
}
for output_path, frame in firstpass_outputs.items():
    frame.to_csv(output_path, index=False)

### Evaluate Languages of New Materials