# Process DH Repos

This notebook will be run multiple times as we add new repositories to the DH repo list. It will take the data from the DH repo list and process it through making additional calls to GitHub's APIs.

### Load Libraries and Set Initial Dataset

In [29]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file, read_combine_files, check_total_pages, check_for_joins_in_older_queries
from data_generation_scripts.generate_search_data import get_initial_search_datasets
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile, get_total_commits
from data_generation_scripts.generate_repo_users_interactions import get_repos_user_actors
from data_generation_scripts.generate_repo_metadata import check_total_results

In [10]:
rates_df = check_rate_limit()

In [11]:
core_repos_path = "../data/derived_files/initial_core_repos.csv"
core_repos = pd.read_csv(core_repos_path)

### Get Missing Counts

In [12]:
counts_fields = pd.read_csv('../data/metadata_files/repo_url_cols.csv')

In [13]:
counts_fields.loc[counts_fields.url_type == 'review_comments_url', 'count_type'] = 'review_count'

In [14]:
def get_counts(repo_df, url_type, count_type, overwrite_existing_temp_files = False):
    if count_type in repo_df.columns:
        needs_counts = repo_df[repo_df[count_type].isna()]
        has_counts = repo_df[repo_df[count_type].notna()]
    else:
        needs_counts = repo_df
        has_counts = pd.DataFrame()
        
    if len(has_counts) == len(repo_df):
        repo_df = has_counts
    else:
        needs_counts = check_total_results(needs_counts, url_type, overwrite_existing_temp_files)
        repo_df = pd.concat([needs_counts, has_counts])
    return repo_df


In [15]:
from IPython.display import clear_output


In [16]:
skip_types = ['review_comments_url', 'commits_url', 'collaborators_url']
overwrite_existing_temp_files = True
for index, row in counts_fields.iterrows():
    if (row.url_type not in skip_types):
        count_type = row.url_type.split("_")[0] + "_count"
        print(f"Getting {count_type} for {row['url_type']}")
        core_repos = get_counts(core_repos, row['url_type'], count_type, overwrite_existing_temp_files)
        row['count_type'] = count_type
        clear_output(wait=True)


In [17]:
core_repos.to_csv(core_repos_path, index=False)

In [15]:
# counts_fields.to_csv('../data/metadata_files/repo_url_cols.csv', index=False)

### Get Repo Contributors

In [50]:
get_url_field = 'contributors_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'login']
contributors_df, users_df = get_repos_user_actors(core_repos, '../data/join_files/repo_contributors_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
contributors_errors_df = check_return_error_file('../data/error_logs/repo_contributors_join_dataset_errors.csv')

In [16]:
print(f"From {len(core_repos)} repos, we found {len(contributors_df)} contributors, of which {len(users_df[users_df.login.isin(contributors_df.login)])} are unique. There were {len(contributors_errors_df)} errors in getting contributors (likely user accounts that no longer exist).")

From 2326 repos, we found 4983 contributors, of which 3645 are unique. There were 6 errors in getting contributors (likely user accounts that no longer exist).


### Get Repo Starrers

In [36]:
get_url_field = 'stargazers_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique = 'repo_query'
filter_fields = ['repo_full_name', 'user.login']
stargazers_df, users_df = get_repos_user_actors(core_repos, '../data/join_files/repo_stargazers_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files)
stargazers_errors_df = check_return_error_file('../data/error_logs/repo_stargazers_join_dataset_errors.csv')

In [38]:
stargazers_df = stargazers_df[stargazers_df.repo_full_name.isin(core_repos.full_name)]

In [19]:
print(f"From {len(core_repos)} repos, we found {len(stargazers_df)} stars, which were created by {len(users_df[users_df.login.isin(stargazers_df['user.login'])])} unique users on {stargazers_df.repo_full_name.nunique()} unique repos. There were {len(stargazers_errors_df)} errors in getting stargazers (likely user accounts that no longer exist).")

From 2326 repos, we found 30351 stars, which were created by 3401 unique users on 832 unique repos. There were 2 errors in getting stargazers (likely user accounts that no longer exist).


### Get Repo Subscribers

In [20]:
get_url_field = 'subscribers_url'
load_existing_files = True
overwrite_existing_temp_files = False
subscribers_df, users_df = get_repos_user_actors(core_repos, '../data/join_files/repo_subscribers_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files)
subscribers_errors_df = check_return_error_file('../data/error_logs/repo_subscribers_join_dataset_errors.csv')

In [21]:
subscribers_df = subscribers_df[subscribers_df.repo_full_name.isin(core_repos.full_name)]

In [22]:
print(f"From {len(core_repos)} repos, we found {len(subscribers_df)} subscribers, of which {len(users_df[users_df.login.isin(subscribers_df.login)])} are unique. There were {len(subscribers_errors_df)} errors in getting subscribers (likely user accounts that no longer exist).")

From 2326 repos, we found 5559 subscribers, of which 3633 are unique. There were 11 errors in getting subscribers (likely user accounts that no longer exist).


#### Cannot get repo collaborators

Need Push Access

In [23]:
# get_url_field = 'collaborators_url'
# load_existing_data = False
# is_stargazers = False
# collaborators_df, users_df = get_repos_user_actors(repo_df, '../data/repo_collaborators_join_dataset.csv', '../data/users_dataset.csv', rates_df, get_url_field, load_existing_data, is_stargazers)
# collaborators_errors_df = check_return_error_file('../data/error_logs/repo_collaborators_join_dataset_errors.csv')

### Get Repo Forks

In [45]:
get_url_field = 'forks_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'full_name']
forks_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_forks_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields)
forks_errors_df = check_return_error_file('../data/error_logs/repo_forks_join_dataset_errors.csv')

In [25]:
forks_df = forks_df[forks_df.repo_full_name.isin(core_repos.full_name)]

In [26]:
print(f"From {len(core_repos)} repos, we found {len(forks_df)} forks, of which {len(users_df[users_df.login.isin(forks_df['owner.login'])])} are unique. There were {len(forks_errors_df)} errors in getting forks (likely user accounts that no longer exist).")

From 2326 repos, we found 6897 forks, of which 1920 are unique. There were 1 errors in getting forks (likely user accounts that no longer exist).


### Get Repo Issues

In [27]:
get_url_field = 'issues_url'
load_existing_files = True
overwrite_existing_temp_files = False
issues_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_issues_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files)
issues_errors_df = check_return_error_file('../data/error_logs/repo_issues_join_dataset_errors.csv')

In [28]:
issues_df = issues_df[issues_df.repo_full_name.isin(core_repos.full_name)]

In [29]:
print(f"From {len(core_repos)} repos, we found {len(issues_df)} issues, which come from {len(issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_df['user.login'])])} unique users. There were {len(issues_errors_df)} errors in getting issues (likely repos that have no issues longer exist).")

From 2326 repos, we found 33449 issues, which come from 624 unique repos and were created by 1437 unique users. There were 1 errors in getting issues (likely repos that have no issues longer exist).


#### Get Repo Issue Comments

In [30]:
get_url_field = 'comments_url'
load_existing_files = True
overwrite_existing_temp_files = False
issues_comments_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/issues_comments_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files)
issues_comments_errors_df = check_return_error_file('../data/error_logs/issues_comments_join_dataset_errors.csv')

In [31]:
issues_comments_df = issues_comments_df[issues_comments_df.repo_full_name.isin(core_repos.full_name)]

In [32]:
print(f"From {len(issues_df)} repos with issues, we found {len(issues_comments_df)} comments, which come from {len(issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_comments_df['user.login'])])} unique users. There were {len(issues_comments_errors_df)} errors in getting issues comments (likely issues comments longer that no longer exist).")

From 33449 repos with issues, we found 49800 comments, which come from 624 unique repos and were created by 1166 unique users. There were 2329 errors in getting issues comments (likely issues comments longer that no longer exist).


### Get Repo Pull Requests

In [33]:
get_url_field = 'pulls_url'
load_existing_files = True
overwrite_existing_temp_files = False
pulls_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_pulls_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files)
pulls_errors_df = check_return_error_file('../data/error_logs/repo_pulls_join_dataset_errors.csv')

In [34]:
pulls_df = pulls_df[pulls_df.repo_full_name.isin(core_repos.full_name)]

In [35]:
print(f"From {len(core_repos)} repos, we found {len(pulls_df)} pulls, which come from {len(pulls_df.repo_id.unique())} unique repos and were created by {len(users_df[users_df.login.isin(pulls_df['user.login'])])} unique users. There were {len(pulls_errors_df)} errors in getting pulls (likely repos that have no pulls longer exist).")

From 2326 repos, we found 14263 pulls, which come from 467 unique repos and were created by 896 unique users. There were 0 errors in getting pulls (likely repos that have no pulls longer exist).


#### Get Repo Pull Request Comments

In [36]:
df = pd.read_csv("../data/temp/review_counts.csv")
len(df)

13199

In [37]:
if len(pulls_df[~pulls_df.repo_full_name.isin(df.repo_full_name.unique())]) > 0:
    url_type = "review_comments_url"
    count_type = "review_comments_count"
    overwrite_existing_temp_files = False
    pulls_df = get_counts(
        pulls_df[~pulls_df.repo_full_name.isin(df.repo_full_name.unique())], url_type, count_type, overwrite_existing_temp_files)
    clear_output(wait=True)


In [38]:
updated_pulls_df = df.copy()

In [39]:
updated_pulls_df.review_count = updated_pulls_df.review_count.fillna(0)
updated_pulls_df.review_count = updated_pulls_df.review_count.astype(int)

In [40]:
updated_pulls_df.sort_values(by="review_count", ascending=False)[['repo_full_name', 'id', 'review_count']].head(10)

Unnamed: 0,repo_full_name,id,review_count
1040,ic-labs/django-icekit,86893913,88
796,evt-project/evt-viewer,266047588,85
3690,Edirom/Edirom-Online,566678096,61
1136,ambuda-org/ambuda,1063533193,61
6154,Princeton-CDH/geniza,836615052,60
6350,Princeton-CDH/geniza,610018752,56
6198,Princeton-CDH/geniza,813413691,47
2881,Hypertopic/Porphyry,422274540,44
473,archivesunleashed/aut,384998994,43
8626,Princeton-CDH/pemm-data,443090186,36


In [41]:
import sys
sys.path.append('..')
from data_generation_scripts.utils import *

In [42]:
df2 = pd.read_csv(
    '../data/large_files/join_files/pulls_comments_join_dataset.csv')


In [44]:
updated_pulls_df[~updated_pulls_df.repo_full_name.isin(df2.repo_full_name.unique())]


Unnamed: 0,url,id,node_id,html_url,diff_url,patch_url,issue_url,number,state,locked,...,milestone.closed_issues,milestone.state,milestone.created_at,milestone.updated_at,milestone.due_on,milestone.closed_at,repo_query_time,head.repo.has_discussions,base.repo.has_discussions,review_count
266,https://api.github.com/repos/dbamman/book-nlp/...,116231638,MDExOlB1bGxSZXF1ZXN0MTE2MjMxNjM4,https://github.com/dbamman/book-nlp/pull/5,https://github.com/dbamman/book-nlp/pull/5.diff,https://github.com/dbamman/book-nlp/pull/5.patch,https://api.github.com/repos/dbamman/book-nlp/...,5,open,False,...,,,,,,,2022-12-19,,,0
267,https://api.github.com/repos/dbamman/book-nlp/...,76401739,MDExOlB1bGxSZXF1ZXN0NzY0MDE3Mzk=,https://github.com/dbamman/book-nlp/pull/3,https://github.com/dbamman/book-nlp/pull/3.diff,https://github.com/dbamman/book-nlp/pull/3.patch,https://api.github.com/repos/dbamman/book-nlp/...,3,open,False,...,,,,,,,2022-12-19,,,0
268,https://api.github.com/repos/dbamman/book-nlp/...,70727860,MDExOlB1bGxSZXF1ZXN0NzA3Mjc4NjA=,https://github.com/dbamman/book-nlp/pull/1,https://github.com/dbamman/book-nlp/pull/1.diff,https://github.com/dbamman/book-nlp/pull/1.patch,https://api.github.com/repos/dbamman/book-nlp/...,1,open,False,...,,,,,,,2022-12-19,,,0
269,https://api.github.com/repos/ryanjgallagher/sh...,1095308648,PR_kwDOB_aIKs5BSRVo,https://github.com/ryanjgallagher/shifterator/...,https://github.com/ryanjgallagher/shifterator/...,https://github.com/ryanjgallagher/shifterator/...,https://api.github.com/repos/ryanjgallagher/sh...,37,open,False,...,,,,,,,2022-12-19,,,0
270,https://api.github.com/repos/ryanjgallagher/sh...,1017656437,PR_kwDOB_aIKs48qDR1,https://github.com/ryanjgallagher/shifterator/...,https://github.com/ryanjgallagher/shifterator/...,https://github.com/ryanjgallagher/shifterator/...,https://api.github.com/repos/ryanjgallagher/sh...,35,open,False,...,,,,,,,2022-12-19,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13194,https://api.github.com/repos/gaoqiang1112/num2...,460781712,MDExOlB1bGxSZXF1ZXN0NDYwNzgxNzEy,https://github.com/gaoqiang1112/num2Capital/pu...,https://github.com/gaoqiang1112/num2Capital/pu...,https://github.com/gaoqiang1112/num2Capital/pu...,https://api.github.com/repos/gaoqiang1112/num2...,3,closed,False,...,,,,,,,2022-12-19,False,False,0
13195,https://api.github.com/repos/gaoqiang1112/num2...,452141578,MDExOlB1bGxSZXF1ZXN0NDUyMTQxNTc4,https://github.com/gaoqiang1112/num2Capital/pu...,https://github.com/gaoqiang1112/num2Capital/pu...,https://github.com/gaoqiang1112/num2Capital/pu...,https://api.github.com/repos/gaoqiang1112/num2...,2,closed,False,...,,,,,,,2022-12-19,False,False,0
13196,https://api.github.com/repos/gaoqiang1112/num2...,399041134,MDExOlB1bGxSZXF1ZXN0Mzk5MDQxMTM0,https://github.com/gaoqiang1112/num2Capital/pu...,https://github.com/gaoqiang1112/num2Capital/pu...,https://github.com/gaoqiang1112/num2Capital/pu...,https://api.github.com/repos/gaoqiang1112/num2...,1,open,False,...,,,,,,,2022-12-19,False,False,0
13197,https://api.github.com/repos/qyn123/springboot...,349786091,MDExOlB1bGxSZXF1ZXN0MzQ5Nzg2MDkx,https://github.com/qyn123/springbootplus/pull/2,https://github.com/qyn123/springbootplus/pull/...,https://github.com/qyn123/springbootplus/pull/...,https://api.github.com/repos/qyn123/springboot...,2,open,False,...,,,,,,,2022-12-19,False,False,0


In [46]:
updated_pulls_df[updated_pulls_df.review_count > 0]

Unnamed: 0,url,id,node_id,html_url,diff_url,patch_url,issue_url,number,state,locked,...,milestone.closed_issues,milestone.state,milestone.created_at,milestone.updated_at,milestone.due_on,milestone.closed_at,repo_query_time,head.repo.has_discussions,base.repo.has_discussions,review_count
68,https://api.github.com/repos/programminghistor...,1103980856,PR_kwDOAMHoHc5BzWk4,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://api.github.com/repos/programminghistor...,2722,open,False,...,,,,,,,2022-12-19,,,4
69,https://api.github.com/repos/programminghistor...,1101880549,PR_kwDOAMHoHc5BrVzl,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://api.github.com/repos/programminghistor...,2720,closed,False,...,,,,,,,2022-12-19,,,2
76,https://api.github.com/repos/programminghistor...,1091574975,PR_kwDOAMHoHc5BEBy_,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://api.github.com/repos/programminghistor...,2708,closed,False,...,,,,,,,2022-12-19,,,4
95,https://api.github.com/repos/programminghistor...,1029448756,PR_kwDOAMHoHc49XCQ0,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://api.github.com/repos/programminghistor...,2662,closed,False,...,,,,,,,2022-12-19,,,2
96,https://api.github.com/repos/programminghistor...,1029408372,PR_kwDOAMHoHc49W4Z0,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://github.com/programminghistorian/jekyll...,https://api.github.com/repos/programminghistor...,2661,closed,False,...,,,,,,,2022-12-19,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13085,https://api.github.com/repos/mei-friend/mei-fr...,1111540513,PR_kwDOGX5bkc5CQMMh,https://github.com/mei-friend/mei-friend/pull/8,https://github.com/mei-friend/mei-friend/pull/...,https://github.com/mei-friend/mei-friend/pull/...,https://api.github.com/repos/mei-friend/mei-fr...,8,closed,False,...,,,,,,,2022-12-19,False,False,7
13119,https://api.github.com/repos/cirosantilli/linu...,1092023973,PR_kwDOA9i5S85BFval,https://github.com/cirosantilli/linux-kernel-m...,https://github.com/cirosantilli/linux-kernel-m...,https://github.com/cirosantilli/linux-kernel-m...,https://api.github.com/repos/cirosantilli/linu...,226,open,False,...,,,,,,,2022-12-19,False,False,2
13153,https://api.github.com/repos/cirosantilli/chin...,327536887,MDExOlB1bGxSZXF1ZXN0MzI3NTM2ODg3,https://github.com/cirosantilli/china-dictator...,https://github.com/cirosantilli/china-dictator...,https://github.com/cirosantilli/china-dictator...,https://api.github.com/repos/cirosantilli/chin...,72,closed,False,...,,,,,,,2022-12-19,,False,21
13154,https://api.github.com/repos/cirosantilli/chin...,294988027,MDExOlB1bGxSZXF1ZXN0Mjk0OTg4MDI3,https://github.com/cirosantilli/china-dictator...,https://github.com/cirosantilli/china-dictator...,https://github.com/cirosantilli/china-dictator...,https://api.github.com/repos/cirosantilli/chin...,60,closed,False,...,,,,,,,2022-12-19,False,False,1


In [50]:
get_url_field = 'review_comments_url'
load_existing_files = False
overwrite_existing_temp_files = False
pulls_comments_df, users_df = get_repos_user_actors(updated_pulls_df, '../data/large_files/join_files/pulls_comments_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files)
pulls_comments_errors_df = check_return_error_file('../data/error_logs/pulls_comments_join_dataset_errors.csv')

../data/temp/pulls_comments_join_dataset/


Getting Repo Actors:  40%|████      | 4933/12264 [18:17<35:10,  3.47it/s]  

response code: 403. hit rate limiting. trying to sleep...
query failed twice with code 403. Failing URL: https://api.github.com/repos/ccnmtl/footprints/pulls/1283/comments?state=all&per_page=100&page=1
rate limit reached. sleeping for 1 hour


Getting Repo Actors:  81%|████████  | 9933/12264 [1:54:20<09:45,  3.98it/s]       

response code: 403. hit rate limiting. trying to sleep...
query failed twice with code 403. Failing URL: https://api.github.com/repos/wandertext/wandertext-server/pulls/331/comments?state=all&per_page=100&page=1
rate limit reached. sleeping for 1 hour


Getting Repo Actors:  90%|█████████ | 11090/12264 [4:15:03<03:51,  5.07it/s]      

response code: 404. hit rate limiting. trying to sleep...
query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/15/comments?state=all&per_page=100&page=1


Getting Repo Actors:  90%|█████████ | 11091/12264 [4:17:04<11:50:23, 36.34s/it]

response code: 404. hit rate limiting. trying to sleep...


Getting Repo Actors:  90%|█████████ | 11092/12264 [4:19:04<20:03:35, 61.62s/it]

query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/14/comments?state=all&per_page=100&page=1
response code: 404. hit rate limiting. trying to sleep...
query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/13/comments?state=all&per_page=100&page=1


Getting Repo Actors:  90%|█████████ | 11093/12264 [4:21:05<25:48:45, 79.36s/it]

response code: 404. hit rate limiting. trying to sleep...
query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/12/comments?state=all&per_page=100&page=1


Getting Repo Actors:  90%|█████████ | 11094/12264 [4:23:06<29:48:36, 91.72s/it]

response code: 404. hit rate limiting. trying to sleep...


Getting Repo Actors:  90%|█████████ | 11095/12264 [4:25:06<32:35:45, 100.38s/it]

query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/11/comments?state=all&per_page=100&page=1
response code: 404. hit rate limiting. trying to sleep...


Getting Repo Actors:  90%|█████████ | 11096/12264 [4:27:07<34:32:10, 106.45s/it]

query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/10/comments?state=all&per_page=100&page=1
response code: 404. hit rate limiting. trying to sleep...


Getting Repo Actors:  90%|█████████ | 11097/12264 [4:29:07<35:52:59, 110.69s/it]

query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/9/comments?state=all&per_page=100&page=1
response code: 404. hit rate limiting. trying to sleep...
query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/8/comments?state=all&per_page=100&page=1


Getting Repo Actors:  90%|█████████ | 11098/12264 [4:31:08<36:51:02, 113.78s/it]

response code: 404. hit rate limiting. trying to sleep...


Getting Repo Actors:  91%|█████████ | 11099/12264 [4:33:09<37:28:42, 115.81s/it]

query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/7/comments?state=all&per_page=100&page=1
response code: 404. hit rate limiting. trying to sleep...
query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/6/comments?state=all&per_page=100&page=1


Getting Repo Actors:  91%|█████████ | 11100/12264 [4:35:12<38:06:51, 117.88s/it]

response code: 404. hit rate limiting. trying to sleep...


Getting Repo Actors:  91%|█████████ | 11101/12264 [4:42:37<69:46:54, 216.01s/it]

query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/5/comments?state=all&per_page=100&page=1
response code: 404. hit rate limiting. trying to sleep...
query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/4/comments?state=all&per_page=100&page=1


Getting Repo Actors:  91%|█████████ | 11102/12264 [4:47:00<74:20:49, 230.33s/it]

response code: 404. hit rate limiting. trying to sleep...


Getting Repo Actors:  91%|█████████ | 11103/12264 [5:05:11<157:33:26, 488.55s/it]

query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/3/comments?state=all&per_page=100&page=1
response code: 404. hit rate limiting. trying to sleep...
query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/2/comments?state=all&per_page=100&page=1


Getting Repo Actors:  91%|█████████ | 11104/12264 [5:07:12<121:52:31, 378.23s/it]

response code: 404. hit rate limiting. trying to sleep...


Getting Repo Actors:  91%|█████████ | 11105/12264 [5:09:13<96:53:05, 300.94s/it] 

query failed twice with code 404. Failing URL: https://api.github.com/repos/cysouw/kurstest/pulls/1/comments?state=all&per_page=100&page=1


Getting Repo Actors: 100%|██████████| 12264/12264 [5:13:48<00:00,  1.54s/it]    


no error file to clean


In [51]:
print(f"From {len(pulls_df)} repos with pulls, we found {len(pulls_comments_df)} comments, which come from {len(pulls_df.repo_id.unique())} unique repos and were created by {len(users_df[users_df.login.isin(pulls_comments_df['user.login'])])} unique users. There were {len(pulls_comments_errors_df)} errors in getting pulls comments (likely repos that have no pulls comments longer exist).")

From 14263 repos with pulls, we found 6052 comments, which come from 467 unique repos and were created by 157 unique users. There were 0 errors in getting pulls comments (likely repos that have no pulls comments longer exist).


### Get Repo Commits

In [19]:
core_repos = get_total_commits(core_repos, '../data/large_files/entity_files/subset_repos_dataset_with_commits.csv')

In [20]:
core_repos['cleaned_total_commits'] = core_repos.total_commits.astype(int)
print(f"From {len(core_repos)} repos, we found {core_repos.cleaned_total_commits.sum()} total commits, which considering the 5000 rate limit will take {core_repos.cleaned_total_commits.sum()/5000} hours to get.")

From 2108 repos, we found 241355 total commits, which considering the 5000 rate limit will take 48.271 hours to get.


In [None]:
get_url_field = 'commits_url'
load_existing_files = False
load_existing_temp_files = True
commits_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_commits_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, load_existing_temp_files)
commits_errors_df = check_return_error_file('../data/error_logs/repo_commits_join_dataset_errors.csv')

### Total Summary

### Get and Explore Repo Specific Features

In [22]:
test = pd.read_csv("../data/large_files/join_files/pulls_comments_join_dataset.csv")

In [25]:
check_for_joins_in_older_queries("../data/large_files/join_files/pulls_comments_join_dataset.csv", test, )

{'url': {0: 'https://api.github.com/repos/urschrei/pyzotero/pulls/comments/95563117'},
 'pull_request_review_id': {0: 16120671.0},
 'id': {0: 95563117},
 'node_id': {0: 'MDI0OlB1bGxSZXF1ZXN0UmV2aWV3Q29tbWVudDk1NTYzMTE3'},
 'path': {0: 'doc/index.rst'},
 'position': {0: nan},
 'original_position': {0: 56},
 'commit_id': {0: '5c03e0922132f0e41b32f156c465c16d5f7aadb9'},
 'original_commit_id': {0: '8241fac6514d10790cdaad5e6237160d291fa4a6'},
 'body': {0: 'Can we clarify what the since / search parameter is? I think it should be rendered `as code`, and include a link to https://www.zotero.org/support/dev/web_api/v3/syncing'},
 'created_at': {0: '2017-01-11T11:47:11Z'},
 'updated_at': {0: '2017-01-11T12:15:19Z'},
 'html_url': {0: 'https://github.com/urschrei/pyzotero/pull/64#discussion_r95563117'},
 'pull_request_url': {0: 'https://api.github.com/repos/urschrei/pyzotero/pulls/64'},
 'author_association': {0: 'OWNER'},
 'start_line': {0: nan},
 'original_start_line': {0: nan},
 'start_side': 

In [None]:
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
error_file_path = "../data/error_logs/repo_profile_errors.csv"
temp_repo_dir = "../data/temp/repo_profile/"
core_repos = get_repo_profile(core_repos, repo_output_path, rates_df, error_file_path, temp_repo_dir)

In [None]:
core_repos = get_repo_languages(core_repos, repo_output_path, rates_df)

In [None]:
core_repos = get_repo_labels(core_repos, repo_output_path, rates_df)

In [None]:
core_repos = get_repo_tags(core_repos, repo_output_path, rates_df)