# Process DH Repos

This notebook will be run multiple times as we add new repositories to the DH repo list. It will take the data from the DH repo list and process it through making additional calls to GitHub's APIs.

### Load Libraries and Set Initial Dataset

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file, read_combine_files, check_total_pages, check_for_joins_in_older_queries, check_add_users, combined_updated_users, check_for_entity_in_older_queries
from data_generation_scripts.generate_search_data import get_initial_search_datasets
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile, get_total_commits
from data_generation_scripts.generate_repo_users_interactions import get_repos_user_actors
from data_generation_scripts.generate_repo_metadata import check_total_results

In [2]:
rates_df = check_rate_limit()

In [123]:
initial_core_repos = pd.read_csv("../data/derived_files/initial_core_repos.csv")

In [3]:
core_repos_path = "../data/derived_files/firstpass_core_repos.csv"
core_repos = pd.read_csv(core_repos_path)
len(core_repos)

12495

### Get Missing Counts

In [4]:
counts_fields = pd.read_csv('../data/metadata_files/repo_url_cols.csv')

In [5]:
counts_fields.loc[counts_fields.url_type == 'review_comments_url', 'count_type'] = 'review_count'

In [6]:
def get_counts(repo_df, url_type, count_type, overwrite_existing_temp_files = False):
    if count_type in repo_df.columns:
        needs_counts = repo_df[repo_df[count_type].isna()]
        has_counts = repo_df[repo_df[count_type].notna()]
    else:
        needs_counts = repo_df
        has_counts = pd.DataFrame()
        
    if len(has_counts) == len(repo_df):
        repo_df = has_counts
    else:
        needs_counts = check_total_results(needs_counts, url_type, overwrite_existing_temp_files)
        repo_df = pd.concat([needs_counts, has_counts])
    return repo_df


In [7]:
from IPython.display import clear_output
skip_types = ['review_comments_url', 'commits_url', 'collaborators_url']
overwrite_existing_temp_files = True
for index, row in counts_fields.iterrows():
    if (row.url_type not in skip_types):
        count_type = row.url_type.split("_")[0] + "_count"
        print(f"Getting {count_type} for {row['url_type']}")
        if (count_type not in core_repos.columns) or (core_repos[count_type].isna().any()):
            core_repos = get_counts(core_repos, row['url_type'], count_type, overwrite_existing_temp_files)
        row['count_type'] = count_type
        clear_output(wait=True)


Getting pulls_count for pulls_url


In [8]:
# core_repos.to_csv(core_repos_path, index=False)

In [9]:
# counts_fields.to_csv('../data/metadata_files/repo_url_cols.csv', index=False)

### Get Repo Contributors

In [124]:
get_url_field = 'contributors_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'login']
contributors_df, users_df = get_repos_user_actors(core_repos, '../data/join_files/repo_contributors_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
contributors_errors_df = check_return_error_file('../data/error_logs/repo_contributors_join_dataset_errors.csv')

In [134]:
subset_contributors_df = contributors_df[(contributors_df['repo_full_name'].isin(core_repos['full_name']))]

In [135]:
len(contributors_df), len(subset_contributors_df)

(120151, 114763)

In [136]:
print(f"From {len(core_repos)} repos, we found {len(subset_contributors_df)} contributors, of which {len(subset_contributors_df.login.unique())} are unique. There were {len(contributors_errors_df)} errors in getting contributors (likely user accounts that no longer exist).")

From 12495 repos, we found 114763 contributors, of which 64856 are unique. There were 6 errors in getting contributors (likely user accounts that no longer exist).


### Get Repo Starrers

In [137]:
get_url_field = 'stargazers_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login']
stargazers_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_stargazers_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
stargazers_errors_df = check_return_error_file('../data/error_logs/repo_stargazers_join_dataset_errors.csv')

In [138]:
subset_stargazers_df = stargazers_df[(stargazers_df['repo_full_name'].isin(core_repos['full_name']))]

In [139]:
len(stargazers_df), len(subset_stargazers_df)

(33097, 11263)

### Get Repo Subscribers

In [140]:
get_url_field = 'subscribers_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'login']
subscribers_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_subscribers_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
subscribers_errors_df = check_return_error_file('../data/error_logs/repo_subscribers_join_dataset_errors.csv')

In [141]:
subset_subscribers_df = subscribers_df[subscribers_df.repo_full_name.isin(core_repos.full_name)]

In [142]:
len(subscribers_df), len(subset_subscribers_df)

(47047, 25149)

In [143]:
print(f"From {len(core_repos)} repos, we found {len(subset_subscribers_df)} subset_subscribers, of which {len(subset_subscribers_df.login.unique())} are unique. There were {len(subscribers_errors_df)} errors in getting subscribers (likely user accounts that no longer exist).")

From 12495 repos, we found 25149 subset_subscribers, of which 2747 are unique. There were 11 errors in getting subscribers (likely user accounts that no longer exist).


#### Cannot get repo collaborators

Need Push Access

In [144]:
# get_url_field = 'collaborators_url'
# load_existing_data = False
# is_stargazers = False
# collaborators_df, users_df = get_repos_user_actors(repo_df, '../data/repo_collaborators_join_dataset.csv', '../data/users_dataset.csv', rates_df, get_url_field, load_existing_data, is_stargazers)
# collaborators_errors_df = check_return_error_file('../data/error_logs/repo_collaborators_join_dataset_errors.csv')

### Get Repo Forks

In [145]:
get_url_field = 'forks_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'full_name']
forks_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_forks_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
forks_errors_df = check_return_error_file('../data/error_logs/repo_forks_join_dataset_errors.csv')

In [146]:
subset_forks_df = forks_df[forks_df.repo_full_name.isin(core_repos.full_name)]

In [147]:
len(forks_df), len(subset_forks_df)

(32329, 5594)

In [148]:
print(f"From {len(core_repos)} repos, we found {len(subset_forks_df)} subset_forks, of which {len(subset_forks_df['owner.login'].unique())} are unique. There were {len(forks_errors_df)} errors in getting forks (likely user accounts that no longer exist).")

From 12495 repos, we found 5594 subset_forks, of which 4294 are unique. There were 1 errors in getting forks (likely user accounts that no longer exist).


### Get Repo Issues

In [149]:
get_url_field = 'issues_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login', 'id']
issues_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_issues_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
issues_errors_df = check_return_error_file('../data/error_logs/repo_issues_join_dataset_errors.csv')

In [150]:
subset_issues_df = issues_df[issues_df.repo_full_name.isin(core_repos.full_name)]

In [152]:
print(f"From {len(core_repos)} repos, we found {len(subset_issues_df)} issues, which come from {len(subset_issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_df['user.login'])])} unique users. There were {len(issues_errors_df)} errors in getting issues (likely repos that have no issues longer exist).")

From 12495 repos, we found 41626 issues, which come from 1336 unique repos and were created by 3062 unique users. There were 1 errors in getting issues (likely repos that have no issues longer exist).


#### Get Repo Issue Comments

In [187]:
get_url_field = 'comments_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login', 'id']
issues_comments_df, users_df = get_repos_user_actors(issues_df, '../data/large_files/join_files/issues_comments_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
issues_comments_errors_df = check_return_error_file('../data/error_logs/issues_comments_join_dataset_errors.csv')

In [188]:
issues_comments_df = issues_comments_df[issues_comments_df.repo_full_name.isin(core_repos.full_name)]

In [189]:
print(f"From {len(issues_df)} repos with issues, we found {len(issues_comments_df)} comments, which come from {len(issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_comments_df['user.login'])])} unique users. There were {len(issues_comments_errors_df)} errors in getting issues comments (likely issues comments longer that no longer exist).")

From 90552 repos with issues, we found 36515 comments, which come from 1999 unique repos and were created by 1184 unique users. There were 2505 errors in getting issues comments (likely issues comments longer that no longer exist).


### Get Repo Pull Requests

In [227]:
get_url_field = 'pulls_url'
load_existing_files = True
overwrite_existing_temp_files = False
filter_fields = ['id', 'repo_full_name', 'user.login', 'head.user.login']
join_unique_field = 'repo_full_name'
pulls_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_pulls_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
pulls_errors_df = check_return_error_file('../data/error_logs/repo_pulls_join_dataset_errors.csv')

In [231]:
pulls_df.review_count.dtype

dtype('float64')

In [212]:
pulls_df = pulls_df[pulls_df.repo_full_name.isin(core_repos.full_name)]

In [10]:
print(f"From {len(core_repos)} repos, we found {len(pulls_df)} pulls, which come from {len(pulls_df.repo_id.unique())} unique repos and were created by {len(users_df[users_df.login.isin(pulls_df['user.login'])])} unique users. There were {len(pulls_errors_df)} errors in getting pulls (likely repos that have no pulls longer exist).")

From 2485 repos, we found 15752 pulls, which come from 478 unique repos and were created by 927 unique users. There were 0 errors in getting pulls (likely repos that have no pulls longer exist).


#### Get Repo Pull Request Comments

In [17]:
url_type = "review_comments_url"
count_type = "review_count"
pulls_df = get_counts(pulls_df, url_type, count_type, overwrite_existing_temp_files=False)
pulls_df.to_csv('../data/large_files/join_files/repo_pulls_join_dataset.csv', index=False)

In [156]:
get_url_field = 'review_comments_url'
load_existing_files = True
overwrite_existing_temp_files = False
filter_fields = ['repo_full_name', 'user.login', 'url']
join_unique_field = 'repo_full_name'
pulls_comments_df, users_df = get_repos_user_actors(pulls_df, '../data/large_files/join_files/pulls_comments_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
pulls_comments_errors_df = check_return_error_file('../data/error_logs/pulls_comments_join_dataset_errors.csv')

In [23]:
pulls_comments_df = pulls_comments_df[pulls_comments_df.repo_full_name.isin(core_repos.full_name)]

In [24]:
print(f"From {len(pulls_df)} repos with pulls, we found {len(pulls_comments_df)} comments, which come from {len(pulls_df.repo_id.unique())} unique repos and were created by {len(users_df[users_df.login.isin(pulls_comments_df['user.login'])])} unique users. There were {len(pulls_comments_errors_df)} errors in getting pulls comments (likely repos that have no pulls comments longer exist).")

From 15752 repos with pulls, we found 6140 comments, which come from 478 unique repos and were created by 164 unique users. There were 16 errors in getting pulls comments (likely repos that have no pulls comments longer exist).


### Get Repo Commits

In [19]:
core_repos = get_total_commits(core_repos, '../data/large_files/entity_files/subset_repos_dataset_with_commits.csv')

In [20]:
core_repos['cleaned_total_commits'] = core_repos.total_commits.astype(int)
print(f"From {len(core_repos)} repos, we found {core_repos.cleaned_total_commits.sum()} total commits, which considering the 5000 rate limit will take {core_repos.cleaned_total_commits.sum()/5000} hours to get.")

From 2108 repos, we found 241355 total commits, which considering the 5000 rate limit will take 48.271 hours to get.


In [None]:
get_url_field = 'commits_url'
load_existing_files = False
load_existing_temp_files = True
commits_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_commits_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, load_existing_temp_files)
commits_errors_df = check_return_error_file('../data/error_logs/repo_commits_join_dataset_errors.csv')

### Total Summary

### Get and Explore Repo Specific Features

In [None]:
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
error_file_path = "../data/error_logs/repo_profile_errors.csv"
temp_repo_dir = "../data/temp/repo_profile/"
core_repos = get_repo_profile(core_repos, repo_output_path, rates_df, error_file_path, temp_repo_dir)

In [None]:
core_repos = get_repo_languages(core_repos, repo_output_path, rates_df)

In [None]:
core_repos = get_repo_labels(core_repos, repo_output_path, rates_df)

In [None]:
core_repos = get_repo_tags(core_repos, repo_output_path, rates_df)

In [225]:
repo_actors_output_path = '../data/large_files/join_files/pulls_comments_join_dataset.csv'
test = pd.read_csv(f"../data/error_logs/{repo_actors_output_path.split('/')[-1].split('.csv')[0]}_errors.csv")

In [226]:
test

Unnamed: 0,repo_full_name,error_time,error_url
0,180179382,1685518000.0,ScandinavianSection-UCLA/Nexus2
1,444790707,1685518000.0,CDRH/api
2,360156974,1685518000.0,CDRH/api
3,355247192,1685518000.0,CDRH/api
4,225865362,1685518000.0,CDRH/api
5,162486264,1685518000.0,CDRH/api
6,148198666,1685518000.0,CDRH/api
7,142989558,1685518000.0,CDRH/api
8,140403014,1685518000.0,CDRH/api
9,119517224,1685518000.0,CDRH/api
