In [1]:
import pandas as pd
import sys
import altair as alt
alt.data_transformers.disable_max_rows()
import os
sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, get_core_users_repos
from data_generation_scripts.generate_user_repos_interactions import get_user_repo_activities
from data_generation_scripts.generate_user_users_interactions import get_user_users_activities
from data_generation_scripts.generate_user_orgs_interactions import get_user_org_activities
from data_generation_scripts.generate_user_metadata import get_counts

In [2]:
# Call the project helper check_rate_limit and keep whatever it returns for inspection.
# NOTE(review): presumably a DataFrame of the current GitHub API rate limits — confirm
# in data_generation_scripts.utils.
rates_df = check_rate_limit()

### Get Initial Users

In [3]:
# Path of the CSV holding the initial core users; kept in its own variable so the
# location is visible next to the read that uses it.
core_users_path = "../data/derived_files/initial_core_users.csv"

# Load the whole file into the frame that every fetch cell below takes as its first argument.
core_users = pd.read_csv(filepath_or_buffer=core_users_path)

### Get Missing Counts

In [4]:
# Mapping from each count column on the core-users table (col_name) to the field
# of a user record that holds the matching API URL (col_url). The mapping is
# cached on disk as a small CSV and rebuilt from the literal below when missing.
user_url_cols_path = "../data/metadata_files/user_url_cols.csv"

if os.path.exists(user_url_cols_path):
    # Read only the two mapping columns: the cached file can carry a stray
    # "Unnamed: 0" column (presumably an index saved by an earlier run), which
    # would otherwise make this branch produce a different frame than the
    # rebuild branch below.
    cols_df = pd.read_csv(user_url_cols_path, usecols=['col_name', 'col_url'])
else:
    # Every value is a `*_url` field so that the rebuilt mapping agrees with the
    # cached one and with the `get_url_field` values used by the fetch cells.
    cols_dict = {
        'followers': 'followers_url',
        'following': 'following_url',
        'public_repos': 'repos_url',
        'public_gists': 'gists_url',
        'star_count': 'starred_url',
        'subscription_count': 'subscriptions_url',
        'organization_count': 'organizations_url',
    }
    cols_df = pd.DataFrame(list(cols_dict.items()), columns=['col_name', 'col_url'])
    # Create the target folder first; to_csv does not create missing directories.
    os.makedirs(os.path.dirname(user_url_cols_path), exist_ok=True)
    cols_df.to_csv(user_url_cols_path, index=False)

# Derive the helper lists after the branch so they exist whether the mapping was
# loaded from the cache or rebuilt (previously they were only set on a rebuild).
cols = cols_df.col_name.tolist()
reverse_cols = cols[::-1]

In [6]:
# Show the col_name -> col_url mapping as this cell's output for a visual check.
cols_df

Unnamed: 0,col_name,col_url
0,followers,followers_url
1,following,following_url
2,public_repos,repos_url
3,public_gists,gists_url
4,star_count,starred_url
5,subscription_count,subscriptions_url
6,organization_count,organizations_url


In [5]:
# NOTE(review): this whole cell is disabled. As written it would walk cols_df and,
# for every count column that is absent from core_users or still has NaN values,
# print which column would be fetched (or report an issue when col_url has no 'url').
# The call that would actually fill the column (check_total_results) is itself
# commented out and is not among this notebook's imports — restore or delete.
# for index, row in cols_df.iterrows():
#     if (row['col_name'] not in core_users.columns) or (core_users[core_users[row.col_name].isna()].shape[0] > 0):
#         if 'url' in row.col_url:
#             print(f'Getting {row.col_name} for core users')
#             print(row.col_name, row.col_url)
#             # core_users = check_total_results(core_users, row.col_name, row.col_url)
#         else:
#             print(f'Issues with {row.col_name} for core users')

In [6]:
# NOTE(review): disabled save — would write core_users to derived_files/core_users.csv
# (a different file from the initial_core_users.csv it is loaded from).
# core_users.to_csv("../data/derived_files/core_users.csv", index=False)

### Get User Followers

In [7]:
# Run get_user_users_activities over core_users for the `followers_url` field.
# NOTE(review): this is the only fetch cell whose users_output_path lives under
# "../data/large_files/entity_files/"; the other fetch cells in this notebook use
# "../data/entity_files/users_dataset.csv" — confirm which location is intended.

# Which URL field to follow, and the fields handed to the helper for joining/filtering.
get_url_field = "followers_url"
join_unique_field = 'user_login'
filter_fields = ['user_login', 'login']

# Output locations for the join table and the users entity table.
user_users_output_path = "../data/large_files/join_files/user_followers_join_dataset.csv"
users_output_path = "../data/large_files/entity_files/users_dataset.csv"

# Flags passed straight through to the helper.
load_existing_files = True
overwrite_existing_temp_files = False

users_followers_df, user_df = get_user_users_activities(
    core_users,
    user_users_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
)

### Get User Following

In [18]:
# Run get_user_users_activities over core_users for the `following_url` field.

# Which URL field to follow, and the fields handed to the helper for joining/filtering.
get_url_field = "following_url"
join_unique_field = 'user_login'
filter_fields = ['user_login', 'login']

# Output locations for the join table and the users entity table.
user_users_output_path = "../data/large_files/join_files/user_following_join_dataset.csv"
users_output_path = "../data/entity_files/users_dataset.csv"

# Flags passed straight through to the helper.
load_existing_files = True
overwrite_existing_temp_files = False

users_following_df, user_df = get_user_users_activities(
    core_users,
    user_users_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
)

### Get User Stars

In [20]:
# Run get_user_repo_activities over core_users for the `starred_url` field.
# NOTE(review): the join path is stored in user_users_output_path even though this
# is a user-to-repo join; the name is kept because it is a notebook-level variable.

# Which URL field to follow, and the fields handed to the helper for joining/filtering.
get_url_field = "starred_url"
join_unique_field = 'user_login'
filter_fields = ['user_login', 'full_name']

# Output locations for the join table and the users entity table.
user_users_output_path = "../data/large_files/join_files/user_starred_join_dataset.csv"
users_output_path = "../data/entity_files/users_dataset.csv"

# Flags passed straight through to the helper.
load_existing_files = True
overwrite_existing_temp_files = False

users_starred_df, user_df = get_user_repo_activities(
    core_users,
    user_users_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
)

### Get User Repos

In [5]:
# Run get_user_repo_activities over core_users for the `repos_url` field.
# This is the only fetch cell that also passes retry_errors (as the last positional argument).

# Which URL field to follow, and the fields handed to the helper for joining/filtering.
get_url_field = "repos_url"
join_unique_field = 'user_login'
filter_fields = ['user_login', 'full_name']

# Output locations for the join table and the users entity table.
user_repos_output_path = "../data/large_files/join_files/user_repos_join_dataset.csv"
users_output_path = "../data/entity_files/users_dataset.csv"

# Flags passed straight through to the helper.
load_existing_files = True
overwrite_existing_temp_files = False
retry_errors = False

users_repos_df, user_df = get_user_repo_activities(
    core_users,
    user_repos_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
    retry_errors,
)

In [4]:
# Re-read just the first 100 rows of the saved user-repos join file for a quick look.
# NOTE(review): this rebinds users_repos_df, the same name the repos fetch cell
# assigns, so after this cell the variable holds only the 100-row sample.
users_repos_df = pd.read_csv(
    "../data/large_files/join_files/user_repos_join_dataset.csv",
    nrows=100,
)

In [6]:
# Dump the first row as a {column: {index: value}} dict to see every field of one record.
users_repos_df.head(1).to_dict()

{'id': {0: 124940812.0},
 'node_id': {0: 'MDEwOlJlcG9zaXRvcnkxMjQ5NDA4MTI='},
 'name': {0: 'c4l18-keynote-statement'},
 'full_name': {0: 'Purdom/c4l18-keynote-statement'},
 'private': {0: False},
 'html_url': {0: 'https://github.com/Purdom/c4l18-keynote-statement'},
 'description': {0: 'Code4Lib Community Statement in Support of Chris Bourg'},
 'fork': {0: True},
 'url': {0: 'https://api.github.com/repos/Purdom/c4l18-keynote-statement'},
 'forks_url': {0: 'https://api.github.com/repos/Purdom/c4l18-keynote-statement/forks'},
 'keys_url': {0: 'https://api.github.com/repos/Purdom/c4l18-keynote-statement/keys{/key_id}'},
 'collaborators_url': {0: 'https://api.github.com/repos/Purdom/c4l18-keynote-statement/collaborators{/collaborator}'},
 'teams_url': {0: 'https://api.github.com/repos/Purdom/c4l18-keynote-statement/teams'},
 'hooks_url': {0: 'https://api.github.com/repos/Purdom/c4l18-keynote-statement/hooks'},
 'issue_events_url': {0: 'https://api.github.com/repos/Purdom/c4l18-keynote-stat

### Get User Subscriptions

In [24]:
# Run get_user_repo_activities over core_users for the `subscriptions_url` field.

# Which URL field to follow, and the fields handed to the helper for joining/filtering.
get_url_field = "subscriptions_url"
join_unique_field = 'user_login'
filter_fields = ['user_login', 'full_name']

# Output locations for the join table and the users entity table.
user_subscriptions_output_path = "../data/large_files/join_files/user_subscriptions_join_dataset.csv"
users_output_path = "../data/entity_files/users_dataset.csv"

# Flags passed straight through to the helper.
load_existing_files = True
overwrite_existing_temp_files = False

users_subscriptions_df, user_df = get_user_repo_activities(
    core_users,
    user_subscriptions_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
)

### Get User Organizations

In [26]:
# Run get_user_org_activities over core_users for the `organizations_url` field.

# Which URL field to follow, and the fields handed to the helper for joining/filtering.
get_url_field = "organizations_url"
join_unique_field = 'user_login'
filter_fields = ['user_login', 'login']

# Output locations for the join table and the users entity table.
user_organizations_output_path = "../data/large_files/join_files/user_organizations_join_dataset.csv"
users_output_path = "../data/entity_files/users_dataset.csv"

# Flags passed straight through to the helper.
load_existing_files = True
overwrite_existing_temp_files = False

users_organizations_df, user_df = get_user_org_activities(
    core_users,
    user_organizations_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
)