# Process DH Repos

This notebook will be run multiple times as we add new repositories to the DH repo list. It will take the data from the DH repo list and process it through making additional calls to GitHub's APIs.

### Load Libraries and Set Initial Dataset

In [19]:
import pandas as pd
test = pd.read_csv("../../datasets/derived_files/grouped_cleaned_translated_dh_terms.csv")

In [64]:
test2 = pd.read_csv("../../datasets/derived_files/dh_terms_with_multiple_codes.csv")
cleaned_dh = pd.read_csv("../../datasets/derived_files/cleaned_translated_dh_terms.csv")

In [65]:
directionality_df = pd.read_csv("../../datasets/metadata_files/iso_639_choices_directionality_wikimedia.csv")

In [66]:
# Subset directionality to LTR and RTL languages
directionality_df = directionality_df[directionality_df.directionality.isin(['ltr', 'rtl'])]

# Merge the directionality data with the cleaned terms
merged_lang_terms = pd.merge(directionality_df[['code', 'directionality', 'English language name', 'local language name']], cleaned_dh, on='code', how="outer")
merged_lang_terms = merged_lang_terms[merged_lang_terms.code != "see also Test languages"]

In [32]:
subset_dh = merged_lang_terms[merged_lang_terms.term_source == 'Digital Humanities']

In [40]:
testing = subset_dh.groupby(['term_source', 'term', 'directionality']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

testing[testing.term.duplicated()]

Unnamed: 0,term_source,term,directionality,counts
9,Digital Humanities,Digital Humanities,rtl,1


In [73]:
merged_lang_terms.groupby(['term_source','term']).directionality.nunique().reset_index(name='counts').sort_values(by='counts', ascending=False)

Unnamed: 0,term_source,term,counts
537,Digital Humanities,Digital Humanities,2
563,Digital Humanities,Humanidades Dixitais,1
565,Digital Humanities,Humaniora Digital,1
566,Digital Humanities,Humanistyka cyfrowa,1
567,Digital Humanities,Humanitate Digitalak,1
...,...,...,...
289,Cultural Analytics,مەدەنىيەت ئانالىزى,1
290,Cultural Analytics,کلتوري تحلیلونه,1
291,Cultural Analytics,ކަލްޗަރަލް އެނަލިޓިކްސް,1
292,Cultural Analytics,सांस्कृतिक विश्लेषण,1


In [67]:
grouped_dh =merged_lang_terms.groupby(['term_source','term', 'directionality']).agg({'code': ','.join, 'term': 'count', 'English language name': ', '.join }).reset_index(level=0)
grouped_dh['processing'] = grouped_dh.index
# Split the 'processing' column into two columns
grouped_dh[['final_term', 'directionality']] = grouped_dh['processing'].apply(pd.Series)

# Now you can drop the 'processing' column if you want
grouped_dh = grouped_dh.drop('processing', axis=1)

In [68]:
grouped_dh = grouped_dh.reset_index(drop=True)
grouped_dh = grouped_dh.rename(columns={'term': 'count'})

In [71]:
grouped_dh[grouped_dh.directionality == 'ltr'].sort_values(by='count', ascending=False)

Unnamed: 0,term_source,code,count,English language name,final_term,directionality
4,Computational Humanities,"bs,da,de,en,fy,mg,no,ny,sm,sn,st,sv,tl,nb",14,"Bosnian, Danish, German, English, West Frisian...",Computational Humanities,ltr
115,Computational Social Science,"da,en,la,mg,mi,ny,sm,sn,st,tl",10,"Danish, English, Latin, Malagasy, Māori, Chich...",Computational Social Science,ltr
537,Digital Humanities,"en,ig,lb,mg,ny,sm,sn,st,tl,yo",10,"English, Igbo, Luxembourgish, Malagasy, Chiche...",Digital Humanities,ltr
233,Cultural Analytics,"bs,en,ky,lb,mg,ny,sn,tl",8,"Bosnian, English, Kirghiz, Luxembourgish, Mala...",Cultural Analytics,ltr
428,Digital History,"bs,en,fy,ky,la,sn,yo",7,"Bosnian, English, West Frisian, Kirghiz, Latin...",Digital History,ltr
...,...,...,...,...,...,...
302,Cultural Analytics,si,1,Sinhalese,සංස්කෘතික විශ්ලේෂණ,ltr
303,Cultural Analytics,th,1,Thai,การวิเคราะห์ทางวัฒนธรรม,ltr
304,Cultural Analytics,lo,1,Laotian,ການວິເຄາະວັດທະນະທໍາ,ltr
305,Cultural Analytics,my,1,Burmese,ယဉ်ကျေးမှုပိုင်းခြားစိတ်ဖြာမှု,ltr


In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file, read_combine_files, check_total_pages, check_for_joins_in_older_queries, check_add_users, combined_updated_users, check_for_entity_in_older_queries
from data_generation_scripts.generate_expanded_search_data import get_initial_search_datasets
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile, get_total_commits
from data_generation_scripts.generate_repo_users_interactions import get_repos_user_actors
from data_generation_scripts.generate_repo_metadata import check_total_results
from data_generation_scripts.general_utils import *

In [14]:
rates_df = check_rate_limit()
datafile_path = "../../datasets"

In [15]:
initial_core_repos = read_csv_file(f"{data_directory_path}/derived_files/initial_core_repos.csv")
core_repos = read_csv_file(f"{data_directory_path}/derived_files/firstpass_core_repos.csv")
len(initial_core_repos), len(core_repos)

### Get Missing Counts

In [5]:
counts_fields = read_csv_file(f'{data_directory_path}/metadata_files/repo_url_cols.csv')

In [6]:
counts_fields.loc[counts_fields.url_type == 'review_comments_url', 'count_type'] = 'review_count'

In [7]:
def get_counts(repo_df, url_type, count_type, overwrite_existing_temp_files = False):
    if count_type in repo_df.columns:
        needs_counts = repo_df[repo_df[count_type].isna()]
        has_counts = repo_df[repo_df[count_type].notna()]
    else:
        needs_counts = repo_df
        has_counts = pd.DataFrame()
        
    if len(has_counts) == len(repo_df):
        repo_df = has_counts
    else:
        needs_counts = check_total_results(needs_counts, url_type, overwrite_existing_temp_files)
        repo_df = pd.concat([needs_counts, has_counts])
    return repo_df


In [8]:
from IPython.display import clear_output
skip_types = ['review_comments_url', 'commits_url', 'collaborators_url']
overwrite_existing_temp_files = True
for index, row in counts_fields.iterrows():
    if (row.url_type not in skip_types):
        count_type = row.url_type.split("_")[0] + "_count"
        print(f"Getting {count_type} for {row['url_type']}")
        if (count_type not in core_repos.columns) or (core_repos[count_type].isna().any()):
            core_repos = get_counts(core_repos, row['url_type'], count_type, overwrite_existing_temp_files)
        row['count_type'] = count_type
        clear_output(wait=True)


Getting pulls_count for pulls_url


In [9]:
# core_repos.to_csv(core_repos_path, index=False)

In [10]:
# counts_fields.to_csv('../data/metadata_files/repo_url_cols.csv', index=False)

### Get Repo Contributors

In [13]:
get_url_field = 'contributors_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'login']
retry_errors = False
contributors_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_contributors_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields, retry_errors)
contributors_errors_df = check_return_error_file('../data/error_logs/repo_contributors_join_dataset_errors.csv')

In [15]:
subset_contributors_df = contributors_df[(contributors_df['repo_full_name'].isin(core_repos['full_name']))]

In [16]:
len(contributors_df), len(subset_contributors_df)

(120151, 114763)

In [17]:
print(f"From {len(core_repos)} repos, we found {len(subset_contributors_df)} contributors, of which {len(subset_contributors_df.login.unique())} are unique. There were {len(contributors_errors_df)} errors in getting contributors (likely user accounts that no longer exist).")

From 12495 repos, we found 114763 contributors, of which 64856 are unique. There were 6 errors in getting contributors (likely user accounts that no longer exist).


### Get Repo Starrers

In [18]:
get_url_field = 'stargazers_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login']
retry_errors = False
stargazers_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_stargazers_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields, retry_errors)
stargazers_errors_df = check_return_error_file('../data/error_logs/repo_stargazers_join_dataset_errors.csv')

In [19]:
subset_stargazers_df = stargazers_df[(stargazers_df['repo_full_name'].isin(core_repos['full_name']))]

In [20]:
len(stargazers_df), len(subset_stargazers_df)

(33097, 11263)

In [23]:
print(f"From {len(core_repos)} repos, we found {len(subset_stargazers_df)} stargazers, of which {len(subset_stargazers_df['user.login'].unique())} are unique. There were {len(stargazers_errors_df)} errors in getting stargazers (likely user accounts that no longer exist).")

From 12495 repos, we found 11263 stargazers, of which 7792 are unique. There were 2 errors in getting stargazers (likely user accounts that no longer exist).


### Get Repo Subscribers

In [24]:
get_url_field = 'subscribers_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'login']
retry_errors = False
subscribers_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_subscribers_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields, retry_errors)
subscribers_errors_df = check_return_error_file('../data/error_logs/repo_subscribers_join_dataset_errors.csv')

In [25]:
subset_subscribers_df = subscribers_df[subscribers_df.repo_full_name.isin(core_repos.full_name)]

In [26]:
len(subscribers_df), len(subset_subscribers_df)

(47047, 25149)

In [27]:
print(f"From {len(core_repos)} repos, we found {len(subset_subscribers_df)} subset_subscribers, of which {len(subset_subscribers_df.login.unique())} are unique. There were {len(subscribers_errors_df)} errors in getting subscribers (likely user accounts that no longer exist).")

From 12495 repos, we found 25149 subset_subscribers, of which 2747 are unique. There were 11 errors in getting subscribers (likely user accounts that no longer exist).


#### Cannot get repo collaborators

Need Push Access

In [28]:
# get_url_field = 'collaborators_url'
# load_existing_data = False
# is_stargazers = False
# collaborators_df, users_df = get_repos_user_actors(repo_df, '../data/repo_collaborators_join_dataset.csv', '../data/users_dataset.csv', rates_df, get_url_field, load_existing_data, is_stargazers)
# collaborators_errors_df = check_return_error_file('../data/error_logs/repo_collaborators_join_dataset_errors.csv')

### Get Repo Forks

In [29]:
get_url_field = 'forks_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'full_name']
retry_errors = False
forks_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_forks_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields, retry_errors)
forks_errors_df = check_return_error_file('../data/error_logs/repo_forks_join_dataset_errors.csv')

In [30]:
subset_forks_df = forks_df[forks_df.repo_full_name.isin(core_repos.full_name)]

In [31]:
len(forks_df), len(subset_forks_df)

(32329, 5594)

In [33]:
print(f"From {len(core_repos)} repos, we found {len(subset_forks_df)} subset_forks, of which {len(subset_forks_df['owner.login'].unique())} are unique. There were {len(forks_errors_df)} errors in getting forks (likely user accounts that no longer exist).")

From 12495 repos, we found 5594 subset_forks, of which 4294 are unique. There were 1 errors in getting forks (likely user accounts that no longer exist).


### Get Repo Issues

In [34]:
get_url_field = 'issues_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login', 'id']
retry_errors = False
issues_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_issues_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields, retry_errors)
issues_errors_df = check_return_error_file('../data/error_logs/repo_issues_join_dataset_errors.csv')

In [36]:
subset_issues_df = issues_df[issues_df.repo_full_name.isin(core_repos.full_name)]

In [37]:
print(f"From {len(core_repos)} repos, we found {len(subset_issues_df)} issues, which come from {len(subset_issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_df['user.login'])])} unique users. There were {len(issues_errors_df)} errors in getting issues (likely repos that have no issues longer exist).")

From 12495 repos, we found 41626 issues, which come from 1336 unique repos and were created by 3096 unique users. There were 1 errors in getting issues (likely repos that have no issues longer exist).


#### Get Repo Issue Comments

In [38]:
get_url_field = 'comments_url'
load_existing_files = True
overwrite_existing_temp_files = False
join_unique_field = 'repo_full_name'
filter_fields = ['repo_full_name', 'user.login', 'id']
retry_errors = False
issues_comments_df, users_df = get_repos_user_actors(issues_df, '../data/large_files/join_files/issues_comments_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields, retry_errors)
issues_comments_errors_df = check_return_error_file('../data/error_logs/issues_comments_join_dataset_errors.csv')

In [39]:
issues_comments_df = issues_comments_df[issues_comments_df.repo_full_name.isin(core_repos.full_name)]

In [40]:
print(f"From {len(issues_df)} repos with issues, we found {len(issues_comments_df)} comments, which come from {len(issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_comments_df['user.login'])])} unique users. There were {len(issues_comments_errors_df)} errors in getting issues comments (likely issues comments longer that no longer exist).")

From 90552 repos with issues, we found 36515 comments, which come from 1999 unique repos and were created by 1205 unique users. There were 2505 errors in getting issues comments (likely issues comments longer that no longer exist).


### Get Repo Pull Requests

In [41]:
get_url_field = 'pulls_url'
load_existing_files = True
overwrite_existing_temp_files = False
filter_fields = ['id', 'repo_full_name', 'user.login', 'head.user.login']
join_unique_field = 'repo_full_name'
retry_errors = False
pulls_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_pulls_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields, retry_errors)
pulls_errors_df = check_return_error_file('../data/error_logs/repo_pulls_join_dataset_errors.csv')

In [42]:
pulls_df = pulls_df[pulls_df.repo_full_name.isin(core_repos.full_name)]

In [43]:
print(f"From {len(core_repos)} repos, we found {len(pulls_df)} pulls, which come from {len(pulls_df.repo_id.unique())} unique repos and were created by {len(users_df[users_df.login.isin(pulls_df['user.login'])])} unique users. There were {len(pulls_errors_df)} errors in getting pulls (likely repos that have no pulls longer exist).")

From 12495 repos, we found 14321 pulls, which come from 599 unique repos and were created by 805 unique users. There were 0 errors in getting pulls (likely repos that have no pulls longer exist).


#### Get Repo Pull Request Comments

In [17]:
url_type = "review_comments_url"
count_type = "review_count"
pulls_df = get_counts(pulls_df, url_type, count_type, overwrite_existing_temp_files=False)
pulls_df.to_csv('../data/large_files/join_files/repo_pulls_join_dataset.csv', index=False)

In [44]:
get_url_field = 'review_comments_url'
load_existing_files = True
overwrite_existing_temp_files = False
filter_fields = ['repo_full_name', 'user.login', 'url']
join_unique_field = 'repo_full_name'
retry_errors = False
pulls_comments_df, users_df = get_repos_user_actors(pulls_df, '../data/large_files/join_files/pulls_comments_join_dataset.csv', '../data/large_files/entity_files/users_dataset.csv', get_url_field, load_existing_files, overwrite_existing_temp_files, join_unique_field, filter_fields, retry_errors)
pulls_comments_errors_df = check_return_error_file('../data/error_logs/pulls_comments_join_dataset_errors.csv')

In [45]:
pulls_comments_df = pulls_comments_df[pulls_comments_df.repo_full_name.isin(core_repos.full_name)]

In [46]:
print(f"From {len(pulls_df)} repos with pulls, we found {len(pulls_comments_df)} comments, which come from {len(pulls_df.repo_id.unique())} unique repos and were created by {len(users_df[users_df.login.isin(pulls_comments_df['user.login'])])} unique users. There were {len(pulls_comments_errors_df)} errors in getting pulls comments (likely repos that have no pulls comments longer exist).")

From 14321 repos with pulls, we found 6938 comments, which come from 599 unique repos and were created by 189 unique users. There were 16 errors in getting pulls comments (likely repos that have no pulls comments longer exist).


: 

### Get Repo Commits

In [19]:
core_repos = get_total_commits(core_repos, '../data/large_files/entity_files/subset_repos_dataset_with_commits.csv')

In [20]:
core_repos['cleaned_total_commits'] = core_repos.total_commits.astype(int)
print(f"From {len(core_repos)} repos, we found {core_repos.cleaned_total_commits.sum()} total commits, which considering the 5000 rate limit will take {core_repos.cleaned_total_commits.sum()/5000} hours to get.")

From 2108 repos, we found 241355 total commits, which considering the 5000 rate limit will take 48.271 hours to get.


In [None]:
get_url_field = 'commits_url'
load_existing_files = False
load_existing_temp_files = True
commits_df, users_df = get_repos_user_actors(core_repos, '../data/large_files/join_files/repo_commits_join_dataset.csv', '../data/entity_files/users_dataset.csv', get_url_field, load_existing_files, load_existing_temp_files)
commits_errors_df = check_return_error_file('../data/error_logs/repo_commits_join_dataset_errors.csv')

### Total Summary

### Get and Explore Repo Specific Features

In [20]:
initial_core_repos = pd.read_csv("../data/derived_files/initial_core_repos.csv")
firstpass_core_repos = pd.read_csv("../data/derived_files/firstpass_core_repos.csv")
finalpass_core_repos = pd.read_csv("../data/derived_files/finalpass_core_repos.csv")

In [21]:
core_repos = pd.concat([initial_core_repos, firstpass_core_repos, finalpass_core_repos])
core_repos.duplicated(subset=['full_name']).sum()

0

In [22]:
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
error_file_path = "../data/error_logs/repo_profile_errors.csv"
temp_repo_dir = "../data/temp/repo_profile/"
core_repos = get_repo_profile(core_repos, repo_output_path, rates_df, error_file_path, temp_repo_dir)

KeyboardInterrupt: 

: 

In [None]:
core_repos = get_repo_languages(core_repos, repo_output_path, rates_df)

In [None]:
core_repos = get_repo_labels(core_repos, repo_output_path, rates_df)

In [None]:
core_repos = get_repo_tags(core_repos, repo_output_path, rates_df)