# Get and Load All Repo and User Data

In [25]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None
import os
import sys

# Put the parent of the current working directory on the import path so the
# data_generation_scripts package imported below can be resolved
# (presumably the repository root -- confirm against the project layout).
sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file
from data_generation_scripts.generate_search_data import get_initial_repo_df, combine_search_df
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile
from data_generation_scripts.generate_commits_data import get_repos_commits
from data_generation_scripts.generate_repo_user_interactions import get_repos_user_actors

In [26]:
# Result of check_rate_limit(); it is handed to every data-collection call
# below (presumably the current API rate-limit state -- confirm in utils).
rates_df = check_rate_limit()

### Get Initial Search Repos

In [27]:
# Flag forwarded to get_initial_repo_df (presumably: reuse files already on
# disk instead of querying again -- confirm in generate_search_data).
load_existing_data = True

# Locations passed to the search step, relative to this notebook.
repo_output_path = '../data/entity_files/repos_dataset.csv'
join_output_path = "../data/join_files/search_queries_join_dataset.csv"
initial_output_path = '../data/repo_data/'

# Returns the repo table plus the query/repo join table.
repo_df, search_queries_repo_df = get_initial_repo_df(
    repo_output_path,
    join_output_path,
    initial_output_path,
    rates_df,
    load_existing_data,
)


In [28]:
# Summary of the search step: number of distinct values in the 'query' column,
# number of rows in the query/repo join table, and number of rows in repo_df.
print(f"From {len(search_queries_repo_df['query'].unique())} unique queries with results, we found {len(search_queries_repo_df)} repos, of which {len(repo_df)} are unique.")

From 41 unique queries with results, we found 2191 repos, of which 2105 are unique.


### Get Repo Contributors

In [29]:
# Collect contributors: the repo column named by get_url_field is the one
# handed to get_repos_user_actors; none of the special-case flags apply here.
get_url_field = 'contributors_url'
load_existing_data = True
is_stargazers = is_forks = is_issues = False

contributors_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/join_files/repo_contributors_join_dataset.csv',
    '../data/entity_files/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
    is_forks,
    is_issues,
)

# Load whatever the matching error log holds for this step.
contributors_errors_df = check_return_error_file(
    '../data/error_logs/repo_contributors_join_dataset_errors.csv'
)

In [30]:
# Summary of the contributors step: rows in the join table, rows of users_df
# whose login appears in that table, and rows in the error log.
# NOTE(review): the "unique" figure is a row count of users_df, so it is only
# a distinct-user count if users_df holds one row per login -- confirm.
print(f"From {len(repo_df)} repos, we found {len(contributors_df)} contributors, of which {len(users_df[users_df.login.isin(contributors_df.login)])} are unique. There were {len(contributors_errors_df)} errors in getting contributors (likely user accounts that no longer exist).")

From 2105 repos, we found 6745 contributors, of which 3455 are unique. There were 0 errors in getting contributors (likely user accounts that no longer exist).


### Get Repo Starrers

In [32]:
# Collect stargazers: same call as the other user-actor steps, with the
# stargazer flag switched on and the fork/issue flags off.
get_url_field = 'stargazers_url'
load_existing_data = True
is_stargazers = True
is_forks = is_issues = False

stargazers_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/join_files/repo_stargazers_join_dataset.csv',
    '../data/entity_files/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
    is_forks,
    is_issues,
)

# Load whatever the matching error log holds for this step.
stargazers_errors_df = check_return_error_file(
    '../data/error_logs/repo_stargazers_join_dataset_errors.csv'
)

In [33]:
# Summary of the stargazers step: rows in the join table, rows of users_df
# whose login appears in that table, and rows in the error log.
print(f"From {len(repo_df)} repos, we found {len(stargazers_df)} stargazers, of which {len(users_df[users_df.login.isin(stargazers_df.login)])} are unique. There were {len(stargazers_errors_df)} errors in getting stargazers (likely user accounts that no longer exist).")

From 2105 repos, we found 7564 stargazers, of which 3170 are unique. There were 12 errors in getting stargazers (likely user accounts that no longer exist).


### Get Repo Watchers

In [34]:
# Collect watchers via the repos' 'subscribers_url' column; none of the
# special-case flags apply here.
get_url_field = 'subscribers_url'
load_existing_data = True
is_stargazers = is_forks = is_issues = False

subscribers_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/join_files/repo_subscribers_join_dataset.csv',
    '../data/entity_files/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
    is_forks,
    is_issues,
)

# Load whatever the matching error log holds for this step.
subscribers_errors_df = check_return_error_file(
    '../data/error_logs/repo_subscribers_join_dataset_errors.csv'
)

In [35]:
# Summary of the watchers step: rows in the join table, rows of users_df
# whose login appears in that table, and rows in the error log.
print(f"From {len(repo_df)} repos, we found {len(subscribers_df)} subscribers, of which {len(users_df[users_df.login.isin(subscribers_df.login)])} are unique. There were {len(subscribers_errors_df)} errors in getting subscribers (likely user accounts that no longer exist).")

From 2105 repos, we found 2685 subscribers, of which 1770 are unique. There were 0 errors in getting subscribers (likely user accounts that no longer exist).


#### Cannot get repo collaborators

Listing a repository's collaborators requires push access to that repository, so this step is skipped.

In [36]:
# Disabled on purpose: this step is not run in this notebook.
# NOTE(review): the call below is out of date relative to the live cells --
# it passes no is_forks / is_issues arguments and its paths lack the
# join_files/ and entity_files/ sub-directories used elsewhere. Update it
# before re-enabling.
# get_url_field = 'collaborators_url'
# load_existing_data = False
# is_stargazers = False
# collaborators_df, users_df = get_repos_user_actors(repo_df, '../data/repo_collaborators_join_dataset.csv', '../data/users_dataset.csv', rates_df, get_url_field, load_existing_data, is_stargazers)
# collaborators_errors_df = check_return_error_file('../data/error_logs/repo_collaborators_join_dataset_errors.csv')

### Get Repo Forks

In [37]:
# Collect forks: same call as the other user-actor steps, with only the
# fork flag switched on.
get_url_field = 'forks_url'
load_existing_data = True
is_forks = True
is_stargazers = is_issues = False

forks_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/join_files/repo_forks_join_dataset.csv',
    '../data/entity_files/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
    is_forks,
    is_issues,
)

# Load whatever the matching error log holds for this step.
forks_errors_df = check_return_error_file(
    '../data/error_logs/repo_forks_join_dataset_errors.csv'
)

In [38]:
# Summary of the forks step: rows in the join table, rows of users_df whose
# login appears in that table, and rows in the error log.
print(f"From {len(repo_df)} repos, we found {len(forks_df)} forks, of which {len(users_df[users_df.login.isin(forks_df.login)])} are unique. There were {len(forks_errors_df)} errors in getting forks (likely user accounts that no longer exist).")

From 2105 repos, we found 2512 forks, of which 1759 are unique. There were 0 errors in getting forks (likely user accounts that no longer exist).


### Get Repo Issues

In [42]:
# Collect issues: same call as the other user-actor steps, with only the
# issue flag switched on. Unlike the other join tables, this one is written
# under large_datasets/ rather than join_files/.
get_url_field = 'issues_url'
load_existing_data = True
is_issues = True
is_stargazers = is_forks = False

issues_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/large_datasets/repo_issues_join_dataset.csv',
    '../data/entity_files/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
    is_forks,
    is_issues,
)

# Load whatever the matching error log holds for this step.
issues_errors_df = check_return_error_file(
    '../data/error_logs/repo_issues_join_dataset_errors.csv'
)

In [43]:
# Summary of the issues step: rows in issues_df, distinct repository_url
# values among them, rows of users_df whose login appears in the issues'
# 'user.login' column, and rows in the error log.
print(f"From {len(repo_df)} repos, we found {len(issues_df)} issues, which come from {len(issues_df.repository_url.unique())} unique repos and were created by {len(users_df[users_df.login.isin(issues_df['user.login'])])} unique users. There were {len(issues_errors_df)} errors in getting issues (likely repos that have no issues or no longer exist).")

From 2105 repos, we found 22513 issues, which come from 211 unique repos and were created by 824 unique users. There were 1790 errors in getting issues (likely repos that have no issues longer exist).


### Total Summary

In [44]:
# Overall tally: rows in repo_df and rows in users_df as returned by the last
# get_repos_user_actors call above.
print(f"From {len(repo_df)} repos, we found {len(users_df)} users that are unique.")

From 2105 repos, we found 7552 users that are unique.


### Get and Explore Repo Specific Features

In [45]:
# Paths handed to get_repo_profile: the repos CSV, the error log for this
# step, and a temp directory (presumably for intermediate results -- confirm
# in generate_repo_metadata).
temp_repo_dir = "../data/temp/repo_profile/"
error_file_path = "../data/error_logs/repo_profile_errors.csv"
repo_output_path = "../data/entity_files/repos_dataset.csv"

# repo_df is replaced by the frame the profile step returns.
repo_df = get_repo_profile(
    repo_df,
    repo_output_path,
    rates_df,
    error_file_path,
    temp_repo_dir,
)

In [16]:
# Replace repo_df with the frame returned by get_repo_languages;
# repo_output_path is the repos CSV path set in an earlier cell.
repo_df = get_repo_languages(repo_df, repo_output_path, rates_df)

Getting Languages: 1it [00:00, 129.62it/s]


In [17]:
# Replace repo_df with the frame returned by get_repo_labels;
# repo_output_path is the repos CSV path set in an earlier cell.
repo_df = get_repo_labels(repo_df, repo_output_path, rates_df)

Getting Labels: 100%|██████████| 2105/2105 [07:30<00:00,  4.67it/s]


In [18]:
# Replace repo_df with the frame returned by get_repo_tags;
# repo_output_path is the repos CSV path set in an earlier cell.
repo_df = get_repo_tags(repo_df, repo_output_path, rates_df)

Getting Tags: 100%|██████████| 2105/2105 [07:41<00:00,  4.56it/s]


In [None]:
# Disabled: commit collection is not run in this notebook. Note that, unlike
# every other step, its output path is under ../private_data/.
# commits_df = get_repos_commits(repo_df, '../private_data/search_tagged_dh_repos_commits.csv', rates_df)