In [1]:
import pandas as pd
import sys

sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file, read_combine_files
from data_generation_scripts.generate_repo_metadata import get_repo_owners
from data_generation_scripts.generate_user_repos_interactions import get_user_repo_activities
from data_generation_scripts.generate_user_users_interactions import get_user_users_activities

In [2]:
# Call the project's rate-limit helper and keep its result for inspection.
# NOTE(review): presumably this reports the remaining GitHub API quota — confirm
# against data_generation_scripts.utils.check_rate_limit.
rates_df = check_rate_limit()

### Get Initial Users

In [3]:
# Load the previously generated tables this notebook starts from:
# the user entities, the repo entities, and the search-query join table.
user_df = pd.read_csv(
    "../data/entity_files/users_dataset.csv",
)
repo_df = pd.read_csv(
    "../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,  # disable chunked parsing so column dtypes are inferred consistently
)
search_queries_join_df = pd.read_csv(
    "../data/join_files/search_queries_join_dataset.csv",
)

In [4]:
# Restrict the repo table to ids that also occur in the search-query join table.
subset_repo_df = repo_df.loc[
    repo_df["id"].isin(search_queries_join_df["id"].unique())
]
# Users whose login matches the owner login of one of those repos.
original_owners = user_df.loc[
    user_df["login"].isin(subset_repo_df["owner.login"])
]

### Get User Followers

In [6]:
# Settings handed to get_user_users_activities for the "followers_url" field
# of the original repo owners.
get_url_field = "followers_url"
user_users_output_path = "../data/join_files/user_followers_join_dataset.csv"
users_output_path = "../data/entity_files/users_dataset.csv"
# NOTE(review): presumably these control reuse of saved results and of
# temporary files — confirm against the helper's implementation.
load_existing_files, overwrite_existing_temp_files = True, False

# The helper returns two values; the second one rebinds user_df.
users_followers_df, user_df = get_user_users_activities(
    original_owners,
    user_users_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
)

### Get User Following

In [7]:
# Settings handed to get_user_users_activities for the "following_url" field
# of the original repo owners.
get_url_field = "following_url"
user_users_output_path = "../data/join_files/user_following_join_dataset.csv"
users_output_path = "../data/entity_files/users_dataset.csv"
# NOTE(review): presumably these control reuse of saved results and of
# temporary files — confirm against the helper's implementation.
load_existing_files, overwrite_existing_temp_files = True, False

# The helper returns two values; the second one rebinds user_df.
users_following_df, user_df = get_user_users_activities(
    original_owners,
    user_users_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
)

### Get User Stars

In [11]:
# Settings handed to get_user_repo_activities for the "starred_url" field of
# the original repo owners. The variable is still called
# user_users_output_path even though it is passed to the user/repo helper here.
get_url_field = "starred_url"
user_users_output_path = "../data/large_files/user_starred_join_dataset.csv"
users_output_path = "../data/entity_files/users_dataset.csv"
# NOTE(review): presumably these control reuse of saved results and of
# temporary files — confirm against the helper's implementation.
load_existing_files, overwrite_existing_temp_files = True, False

# The helper returns two values; the second one rebinds user_df.
users_starred_df, user_df = get_user_repo_activities(
    original_owners,
    user_users_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
)

### Get Contributors

In [12]:
# Load the saved repo/contributor join table; its "login" column is used in the
# next cell to pick contributor rows out of user_df.
contributors_df = pd.read_csv("../data/join_files/repo_contributors_join_dataset.csv")

In [15]:
# Users that appear as contributors but are not already among the original
# repo owners, selected with a single combined boolean mask.
original_contributors = user_df.loc[
    user_df["login"].isin(contributors_df["login"])
    & ~user_df["login"].isin(original_owners["login"])
]

In [16]:
# Total public repos across the selected contributors, divided by 5000.
# NOTE(review): 5000 is presumably GitHub's hourly request limit for an
# authenticated client, making this a rough hours-of-quota estimate — confirm.
# In file order this runs before excluded_users is filtered out below.
original_contributors.public_repos.sum() / 5000

30.0796

In [21]:
# Load the list of users to leave out; only its "login" column is used below.
excluded_users = pd.read_csv('../data/metadata_files/excluded_users.csv')

In [22]:
# Drop every contributor whose login is listed in excluded_users.
original_contributors = original_contributors.loc[
    ~original_contributors["login"].isin(excluded_users["login"])
]

In [18]:
import altair as alt

In [24]:
# Bar chart of how many rows of original_contributors fall into each binned
# public_repos range (x: binned public_repos, y: row count).
alt.Chart(original_contributors).mark_bar().encode(
    x=alt.X('public_repos:Q', bin=True),
    y='count()',
)

In [25]:
# Display the ten contributors with the highest public_repos values.
original_contributors.sort_values(by='public_repos', ascending=False).head(10)

Unnamed: 0,login,id,node_id,avatar_url,url,html_url,followers_url,following_url,gists_url,starred_url,...,public_repos,public_gists,followers,following,created_at,updated_at,starred_at,gravatar_id,contributions,pushed_at
1677,bryant1410,3905501,MDQ6VXNlcjM5MDU1MDE=,https://avatars.githubusercontent.com/u/390550...,https://api.github.com/users/bryant1410,https://github.com/bryant1410,https://api.github.com/users/bryant1410/followers,https://api.github.com/users/bryant1410/follow...,https://api.github.com/users/bryant1410/gists{...,https://api.github.com/users/bryant1410/starre...,...,4053.0,15.0,438.0,253.0,2013-03-19T03:12:38Z,2022-10-13T05:37:30Z,,,,
109,kant,32717,MDQ6VXNlcjMyNzE3,https://avatars.githubusercontent.com/u/32717?v=4,https://api.github.com/users/kant,https://github.com/kant,https://api.github.com/users/kant/followers,https://api.github.com/users/kant/following{/o...,https://api.github.com/users/kant/gists{/gist_id},https://api.github.com/users/kant/starred{/own...,...,4033.0,12.0,187.0,7.0,2008-11-05T02:06:00Z,2022-09-30T16:58:41Z,,,,
1823,ocefpaf,950575,MDQ6VXNlcjk1MDU3NQ==,https://avatars.githubusercontent.com/u/950575...,https://api.github.com/users/ocefpaf,https://github.com/ocefpaf,https://api.github.com/users/ocefpaf/followers,https://api.github.com/users/ocefpaf/following...,https://api.github.com/users/ocefpaf/gists{/gi...,https://api.github.com/users/ocefpaf/starred{/...,...,1325.0,154.0,543.0,4.0,2011-07-31T23:10:26Z,2022-10-11T15:33:46Z,,,,
1868,marwahaha,2541209,MDQ6VXNlcjI1NDEyMDk=,https://avatars.githubusercontent.com/u/254120...,https://api.github.com/users/marwahaha,https://github.com/marwahaha,https://api.github.com/users/marwahaha/followers,https://api.github.com/users/marwahaha/followi...,https://api.github.com/users/marwahaha/gists{/...,https://api.github.com/users/marwahaha/starred...,...,1194.0,21.0,139.0,8.0,2012-10-12T01:30:18Z,2022-10-06T05:39:49Z,,,,
713,purcell,5636,MDQ6VXNlcjU2MzY=,https://avatars.githubusercontent.com/u/5636?v=4,https://api.github.com/users/purcell,https://github.com/purcell,https://api.github.com/users/purcell/followers,https://api.github.com/users/purcell/following...,https://api.github.com/users/purcell/gists{/gi...,https://api.github.com/users/purcell/starred{/...,...,1041.0,66.0,2857.0,58.0,2008-04-07T18:50:12Z,2022-09-02T19:26:29Z,,,,
2057,prayagverma,829526,MDQ6VXNlcjgyOTUyNg==,https://avatars.githubusercontent.com/u/829526...,https://api.github.com/users/prayagverma,https://github.com/prayagverma,https://api.github.com/users/prayagverma/follo...,https://api.github.com/users/prayagverma/follo...,https://api.github.com/users/prayagverma/gists...,https://api.github.com/users/prayagverma/starr...,...,822.0,13.0,109.0,139.0,2011-06-04T15:56:17Z,2022-10-14T04:45:11Z,,,,
641,willingc,2680980,MDQ6VXNlcjI2ODA5ODA=,https://avatars.githubusercontent.com/u/268098...,https://api.github.com/users/willingc,https://github.com/willingc,https://api.github.com/users/willingc/followers,https://api.github.com/users/willingc/followin...,https://api.github.com/users/willingc/gists{/g...,https://api.github.com/users/willingc/starred{...,...,802.0,81.0,1124.0,300.0,2012-10-30T05:07:14Z,2022-10-11T03:19:03Z,,,,
1400,maxogden,39759,MDQ6VXNlcjM5NzU5,https://avatars.githubusercontent.com/u/39759?v=4,https://api.github.com/users/maxogden,https://github.com/maxogden,https://api.github.com/users/maxogden/followers,https://api.github.com/users/maxogden/followin...,https://api.github.com/users/maxogden/gists{/g...,https://api.github.com/users/maxogden/starred{...,...,752.0,519.0,6517.0,818.0,2008-12-11T06:52:00Z,2022-05-19T01:12:39Z,,,,
1359,dandv,33569,MDQ6VXNlcjMzNTY5,https://avatars.githubusercontent.com/u/33569?v=4,https://api.github.com/users/dandv,https://github.com/dandv,https://api.github.com/users/dandv/followers,https://api.github.com/users/dandv/following{/...,https://api.github.com/users/dandv/gists{/gist...,https://api.github.com/users/dandv/starred{/ow...,...,749.0,22.0,569.0,22.0,2008-11-10T03:40:31Z,2022-09-29T18:53:51Z,,,,
1525,tlevine,10280,MDQ6VXNlcjEwMjgw,https://avatars.githubusercontent.com/u/10280?v=4,https://api.github.com/users/tlevine,https://github.com/tlevine,https://api.github.com/users/tlevine/followers,https://api.github.com/users/tlevine/following...,https://api.github.com/users/tlevine/gists{/gi...,https://api.github.com/users/tlevine/starred{/...,...,728.0,24.0,232.0,83.0,2008-05-14T16:10:54Z,2021-06-25T15:14:56Z,,,,
