# Process DH Repos

This notebook will be run multiple times as we add new repositories to the DH repo list. It will take the data from the DH repo list and process it through making additional calls to GitHub's APIs.

### Load Libraries and Set Initial Dataset

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file, read_combine_files, check_total_pages, check_for_joins_in_older_queries
from data_generation_scripts.generate_search_data import get_initial_search_datasets
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile, get_total_commits
from data_generation_scripts.generate_repo_users_interactions import get_repos_user_actors
from data_generation_scripts.generate_repo_metadata import check_total_results

In [2]:
rates_df = check_rate_limit()

In [3]:
core_repos_path = "../data/derived_files/firstpass_core_repos.csv"
core_repos = pd.read_csv(core_repos_path)
len(core_repos)

12495

### Get Missing Counts

In [4]:
counts_fields = pd.read_csv('../data/metadata_files/repo_url_cols.csv')

In [79]:
counts_fields.loc[counts_fields.url_type == 'review_comments_url', 'count_type'] = 'review_count'

In [80]:
def get_counts(repo_df, url_type, count_type, overwrite_existing_temp_files = False):
    if count_type in repo_df.columns:
        needs_counts = repo_df[repo_df[count_type].isna()]
        has_counts = repo_df[repo_df[count_type].notna()]
    else:
        needs_counts = repo_df
        has_counts = pd.DataFrame()
        
    if len(has_counts) == len(repo_df):
        repo_df = has_counts
    else:
        needs_counts = check_total_results(needs_counts, url_type, overwrite_existing_temp_files)
        repo_df = pd.concat([needs_counts, has_counts])
    return repo_df


In [81]:
from IPython.display import clear_output
skip_types = ['review_comments_url', 'commits_url', 'collaborators_url']
overwrite_existing_temp_files = True
for index, row in counts_fields.iterrows():
    if (row.url_type not in skip_types):
        count_type = row.url_type.split("_")[0] + "_count"
        print(f"Getting {count_type} for {row['url_type']}")
        if (count_type not in core_repos.columns) or (core_repos[count_type].isna().any()):
            core_repos = get_counts(core_repos, row['url_type'], count_type, overwrite_existing_temp_files)
        row['count_type'] = count_type
        clear_output(wait=True)


Getting pulls_count for pulls_url


In [82]:
# core_repos.to_csv(core_repos_path, index=False)

In [83]:
# counts_fields.to_csv('../data/metadata_files/repo_url_cols.csv', index=False)

### Get Repo Contributors

In [84]:
get_url_field = 'contributors_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'login']
contributors_df, users_df = get_repos_user_actors(core_repos, '../data/join_files/repo_contributors_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
contributors_errors_df = check_return_error_file('../data/error_logs/repo_contributors_join_dataset_errors.csv')

In [13]:
contributors_df = contributors_df[contributors_df['repo_full_name'].isin(core_repos['full_name'])]

In [15]:
print(f"From {len(core_repos)} repos, we found {len(contributors_df)} contributors, of which {len(contributors_df.login.unique())} are unique. There were {len(contributors_errors_df)} errors in getting contributors (likely user accounts that no longer exist).")

From 2264 repos, we found 5210 contributors, of which 3656 are unique. There were 6 errors in getting contributors (likely user accounts that no longer exist).


### Get Repo Starrers

In [16]:
get_url_field = 'stargazers_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login']
stargazers_df, users_df = get_repos_user_actors(core_repos, '../data/join_files/repo_stargazers_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
stargazers_errors_df = check_return_error_file('../data/error_logs/repo_stargazers_join_dataset_errors.csv')

In [17]:
stargazers_df = stargazers_df[stargazers_df.repo_full_name.isin(core_repos.full_name)]

In [19]:
print(f"From {len(core_repos)} repos, we found {len(stargazers_df)} stars, which were created by {len(stargazers_df['user.login'].unique())} unique users on {stargazers_df.repo_full_name.nunique()} unique repos. There were {len(stargazers_errors_df)} errors in getting stargazers (likely user accounts that no longer exist).")

From 2264 repos, we found 35225 stars, which were created by 9182 unique users on 834 unique repos. There were 2 errors in getting stargazers (likely user accounts that no longer exist).


### Get Repo Subscribers

In [24]:
get_url_field = 'subscribers_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'login']
subscribers_df, users_df = get_repos_user_actors(core_repos, '../data/join_files/repo_subscribers_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
subscribers_errors_df = check_return_error_file('../data/error_logs/repo_subscribers_join_dataset_errors.csv')

In [25]:
subscribers_df = subscribers_df[subscribers_df.repo_full_name.isin(core_repos.full_name)]

In [26]:
print(f"From {len(core_repos)} repos, we found {len(subscribers_df)} subscribers, of which {len(subscribers_df.login.unique())} are unique. There were {len(subscribers_errors_df)} errors in getting subscribers (likely user accounts that no longer exist).")

From 2264 repos, we found 9969 subscribers, of which 3271 are unique. There were 11 errors in getting subscribers (likely user accounts that no longer exist).


#### Cannot get repo collaborators

Need Push Access

In [23]:
# get_url_field = 'collaborators_url'
# load_existing_data = False
# is_stargazers = False
# collaborators_df, users_df = get_repos_user_actors(repo_df, '../data/repo_collaborators_join_dataset.csv', '../data/users_dataset.csv', rates_df, get_url_field, load_existing_data, is_stargazers)
# collaborators_errors_df = check_return_error_file('../data/error_logs/repo_collaborators_join_dataset_errors.csv')

### Get Repo Forks

In [27]:
get_url_field = 'forks_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'full_name']
forks_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_forks_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
forks_errors_df = check_return_error_file('../data/error_logs/repo_forks_join_dataset_errors.csv')

In [28]:
forks_df = forks_df[forks_df.repo_full_name.isin(core_repos.full_name)]

In [29]:
print(f"From {len(core_repos)} repos, we found {len(forks_df)} forks, of which {len(forks_df['owner.login'].unique())} are unique. There were {len(forks_errors_df)} errors in getting forks (likely user accounts that no longer exist).")

From 2264 repos, we found 6379 forks, of which 4154 are unique. There were 1 errors in getting forks (likely user accounts that no longer exist).


### Get Repo Issues

In [39]:
get_url_field = 'issues_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login', 'id']
issues_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_issues_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
issues_errors_df = check_return_error_file('../data/error_logs/repo_issues_join_dataset_errors.csv')

In [40]:
issues_df = issues_df[issues_df.repo_full_name.isin(core_repos.full_name)]

In [41]:
print(f"From {len(core_repos)} repos, we found {len(issues_df)} issues, which come from {len(issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_df['user.login'])])} unique users. There were {len(issues_errors_df)} errors in getting issues (likely repos that have no issues longer exist).")

From 2264 repos, we found 33256 issues, which come from 622 unique repos and were created by 1476 unique users. There were 1 errors in getting issues (likely repos that have no issues longer exist).


#### Get Repo Issue Comments

In [4]:
get_url_field = 'comments_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login', 'id']
issues_comments_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/issues_comments_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
issues_comments_errors_df = check_return_error_file('../data/error_logs/issues_comments_join_dataset_errors.csv')

In [45]:
issues_comments_df = issues_comments_df[issues_comments_df.repo_full_name.isin(core_repos.full_name)]

In [46]:
print(f"From {len(issues_df)} repos with issues, we found {len(issues_comments_df)} comments, which come from {len(issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_comments_df['user.login'])])} unique users. There were {len(issues_comments_errors_df)} errors in getting issues comments (likely issues comments longer that no longer exist).")

From 33256 repos with issues, we found 50678 comments, which come from 622 unique repos and were created by 1184 unique users. There were 2477 errors in getting issues comments (likely issues comments longer that no longer exist).


### Get Repo Pull Requests

In [13]:
get_url_field = 'pulls_url'
load_existing_files = True
overwrite_existing_temp_files = False
filter_fields = ['id', 'repo_full_name', 'user.login', 'head.user.login']
join_unique_field = 'repo_full_name'
pulls_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_pulls_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
pulls_errors_df = check_return_error_file('../data/error_logs/repo_pulls_join_dataset_errors.csv')

In [14]:
pulls_df = pulls_df[pulls_df.repo_full_name.isin(core_repos.full_name)]

In [10]:
print(f"From {len(core_repos)} repos, we found {len(pulls_df)} pulls, which come from {len(pulls_df.repo_id.unique())} unique repos and were created by {len(users_df[users_df.login.isin(pulls_df['user.login'])])} unique users. There were {len(pulls_errors_df)} errors in getting pulls (likely repos that have no pulls longer exist).")

From 2485 repos, we found 15752 pulls, which come from 478 unique repos and were created by 927 unique users. There were 0 errors in getting pulls (likely repos that have no pulls longer exist).


#### Get Repo Pull Request Comments

In [17]:
if 'review_count' not in pulls_df.columns:
    url_type = "review_comments_url"
    count_type = "review_count"
    pulls_df = get_counts(pulls_df, url_type, count_type, overwrite_existing_temp_files=False)
    pulls_df.to_csv('../data/large_files/join_files/repo_pulls_join_dataset.csv', index=False)

In [18]:
pulls_df.review_count.sum() / 5000

1.268

In [22]:
get_url_field = 'review_comments_url'
load_existing_files = True
overwrite_existing_temp_files = False
filter_fields = ['repo_full_name', 'user.login', 'url']
join_unique_field = 'repo_full_name'
pulls_comments_df, users_df = get_repos_user_actors(pulls_df, '../data/large_files/join_files/pulls_comments_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
pulls_comments_errors_df = check_return_error_file('../data/error_logs/pulls_comments_join_dataset_errors.csv')

In [23]:
pulls_comments_df = pulls_comments_df[pulls_comments_df.repo_full_name.isin(core_repos.full_name)]

In [24]:
print(f"From {len(pulls_df)} repos with pulls, we found {len(pulls_comments_df)} comments, which come from {len(pulls_df.repo_id.unique())} unique repos and were created by {len(users_df[users_df.login.isin(pulls_comments_df['user.login'])])} unique users. There were {len(pulls_comments_errors_df)} errors in getting pulls comments (likely repos that have no pulls comments longer exist).")

From 15752 repos with pulls, we found 6140 comments, which come from 478 unique repos and were created by 164 unique users. There were 16 errors in getting pulls comments (likely repos that have no pulls comments longer exist).


### Get Repo Commits

In [19]:
core_repos = get_total_commits(core_repos, '../data/large_files/entity_files/subset_repos_dataset_with_commits.csv')

In [20]:
core_repos['cleaned_total_commits'] = core_repos.total_commits.astype(int)
print(f"From {len(core_repos)} repos, we found {core_repos.cleaned_total_commits.sum()} total commits, which considering the 5000 rate limit will take {core_repos.cleaned_total_commits.sum()/5000} hours to get.")

From 2108 repos, we found 241355 total commits, which considering the 5000 rate limit will take 48.271 hours to get.


In [None]:
get_url_field = 'commits_url'
load_existing_files = False
load_existing_temp_files = True
commits_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_commits_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, load_existing_temp_files)
commits_errors_df = check_return_error_file('../data/error_logs/repo_commits_join_dataset_errors.csv')

### Total Summary

### Get and Explore Repo Specific Features

In [22]:
test = pd.read_csv("../data/large_files/join_files/pulls_comments_join_dataset.csv")

In [25]:
check_for_joins_in_older_queries("../data/large_files/join_files/pulls_comments_join_dataset.csv", test, )

{'url': {0: 'https://api.github.com/repos/urschrei/pyzotero/pulls/comments/95563117'},
 'pull_request_review_id': {0: 16120671.0},
 'id': {0: 95563117},
 'node_id': {0: 'MDI0OlB1bGxSZXF1ZXN0UmV2aWV3Q29tbWVudDk1NTYzMTE3'},
 'path': {0: 'doc/index.rst'},
 'position': {0: nan},
 'original_position': {0: 56},
 'commit_id': {0: '5c03e0922132f0e41b32f156c465c16d5f7aadb9'},
 'original_commit_id': {0: '8241fac6514d10790cdaad5e6237160d291fa4a6'},
 'body': {0: 'Can we clarify what the since / search parameter is? I think it should be rendered `as code`, and include a link to https://www.zotero.org/support/dev/web_api/v3/syncing'},
 'created_at': {0: '2017-01-11T11:47:11Z'},
 'updated_at': {0: '2017-01-11T12:15:19Z'},
 'html_url': {0: 'https://github.com/urschrei/pyzotero/pull/64#discussion_r95563117'},
 'pull_request_url': {0: 'https://api.github.com/repos/urschrei/pyzotero/pulls/64'},
 'author_association': {0: 'OWNER'},
 'start_line': {0: nan},
 'original_start_line': {0: nan},
 'start_side': 

In [None]:
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
error_file_path = "../data/error_logs/repo_profile_errors.csv"
temp_repo_dir = "../data/temp/repo_profile/"
core_repos = get_repo_profile(core_repos, repo_output_path, rates_df, error_file_path, temp_repo_dir)

In [None]:
core_repos = get_repo_languages(core_repos, repo_output_path, rates_df)

In [None]:
core_repos = get_repo_labels(core_repos, repo_output_path, rates_df)

In [None]:
core_repos = get_repo_tags(core_repos, repo_output_path, rates_df)