In [2]:
import pandas as pd
import sys
import os

sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit
from data_generation_scripts.generate_org_users_interactions import get_org_users_activities
from data_generation_scripts.generate_org_metadata import get_counts
from data_generation_scripts.generate_org_repos_interactions import get_org_repo_activities

In [3]:
# Call the project's rate-limit helper and keep whatever it returns for
# inspection before the API-heavy cells below.
# NOTE(review): presumably this queries the GitHub API and returns a DataFrame —
# confirm against data_generation_scripts.utils.check_rate_limit.
rates_df = check_rate_limit()

### Get Initial Orgs

In [4]:
# Read the initial list of core organizations from the derived-files folder.
initial_core_orgs_path = "../data/derived_files/initial_core_orgs.csv"
initial_core_orgs = pd.read_csv(initial_core_orgs_path)

In [5]:
# Path of the first-pass core organizations file. It is kept in a variable
# because the counts cell further down writes the updated frame back to it.
core_orgs_path = "../data/derived_files/firstpass_core_orgs.csv"
# Working table of core organizations; later cells add columns to it.
core_orgs = pd.read_csv(core_orgs_path)

In [6]:
# Stack the initial and first-pass organization tables row-wise, discarding
# both original indexes in favour of a fresh one.
combined_core_orgs = pd.concat(
    objs=(initial_core_orgs, core_orgs),
    ignore_index=True,
)

In [7]:
# Summary statistics of public_repos, restricted to organizations with fewer
# than 1001 public repositories (rows with a missing count are excluded by the
# comparison).
combined_core_orgs.loc[
    combined_core_orgs["public_repos"].lt(1001), "public_repos"
].describe()

count    459.000000
mean      27.283224
std       59.144025
min        0.000000
25%        3.000000
50%        8.000000
75%       26.500000
max      669.000000
Name: public_repos, dtype: float64

In [314]:
# Load the users entity dataset. Note that `user_df` is reassigned further down
# by the calls that collect organization members/followers.
user_df = pd.read_csv("../data/large_files/entity_files/users_dataset.csv")

In [315]:
# Load the organizations entity dataset.
# NOTE(review): `orgs_df` is not referenced by the other cells visible here —
# confirm it is still needed.
orgs_df = pd.read_csv("../data/entity_files/orgs_dataset.csv")

### Get Missing Counts

In [328]:
# Mapping from count-column name (e.g. 'followers') to the URL column used to
# obtain that count (e.g. 'followers_url'). The mapping is cached as a CSV and
# rebuilt from the literal dictionary below when the cache file is missing.
user_url_cols_path = "../data/metadata_files/user_url_cols.csv"

if os.path.exists(user_url_cols_path):
    cols_df = pd.read_csv(user_url_cols_path)
else:
    cols_dict = {
        'followers': 'followers_url',
        'following': 'following_url',
        'public_repos': 'repos_url',
        'public_gists': 'gists_url',
        'star_count': 'starred_url',
        'subscription_count': 'subscriptions_url',
        'organization_count': 'organizations_url',
    }
    cols_df = pd.DataFrame(list(cols_dict.items()), columns=['col_name', 'col_url'])
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the metadata folder exists before writing the cache.
    os.makedirs(os.path.dirname(user_url_cols_path), exist_ok=True)
    cols_df.to_csv(user_url_cols_path, index=False)

# Derive the helper lists on BOTH paths. Previously they were only assigned in
# the `else` branch, so they were undefined whenever the cache file existed.
cols = cols_df.col_name.tolist()
reverse_cols = cols[::-1]

In [329]:
# Build each organization's public-members endpoint from its API url.
# Only the first "/users/" path segment is swapped for "/orgs/": a blanket
# replace of the substring "users" would also rewrite organization logins that
# happen to contain it (e.g. ".../users/powerusers" -> ".../orgs/powerorgs").
core_orgs["members_url"] = (
    core_orgs["url"].str.replace("/users/", "/orgs/", n=1, regex=False)
    + "/public_members"
)

# For every count column in the mapping, fetch the counts when the column is
# absent from core_orgs or still has missing values, and persist the updated
# frame after each fetch.
for col in cols_df.itertuples(index=False):
    column_missing = col.col_name not in core_orgs.columns
    # `or` short-circuits, so the column is only indexed when it exists.
    if not (column_missing or core_orgs[col.col_name].isna().any()):
        continue
    if 'url' in col.col_url:
        print(f'Getting {col.col_name} for core orgs')
        core_orgs = get_counts(core_orgs, col.col_url, col.col_name, overwrite_existing_temp_files=False)
        core_orgs.to_csv(core_orgs_path, index=False)
    else:
        # The mapping entry does not name a URL column, so nothing can be fetched.
        print(f'Issues with {col.col_name} for core orgs')
                

### Get Organization Members

In [342]:
# --- Settings for collecting the public members of each core organization ---

# Output files: the org<->member join table and the users entity dataset.
org_members_output_path = "../data/join_files/org_members_join_dataset.csv"
users_output_path = "../data/large_files/entity_files/users_dataset.csv"

# Which endpoint to follow and how the resulting rows are keyed.
# NOTE(review): presumably `get_url_field` names the core_orgs column holding
# the URL to query — confirm against get_org_users_activities.
get_url_field = "members_url"
join_unique_field = "org_login"
filter_fields = ["org_login", "login"]

# Behaviour flags passed straight through to the helper.
load_existing_files = True
overwrite_existing_temp_files = False
retry_error = False

org_members_df, user_df = get_org_users_activities(
    core_orgs,
    org_members_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
    retry_error,
)

### Get Organization Followers

In [None]:
# --- Settings for collecting the followers of each core organization ---

# Output files: the org<->follower join table and the users entity dataset.
org_followers_output_path = "../data/join_files/org_followers_join_dataset.csv"
# Use the same users dataset location as the cell that loads `user_df` and the
# organization-members cell; this cell previously pointed at
# "../data/entity_files/users_dataset.csv", a different file.
# NOTE(review): confirm the large_files location is the canonical one.
users_output_path = "../data/large_files/entity_files/users_dataset.csv"

# Which endpoint to follow and how the resulting rows are keyed.
get_url_field = "followers_url"
join_unique_field = "org_login"
filter_fields = ["org_login", "login"]

# Behaviour flags passed straight through to the helper.
load_existing_files = True
overwrite_existing_temp_files = False
retry_error = False

org_followers_df, user_df = get_org_users_activities(
    core_orgs,
    org_followers_output_path,
    users_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
    retry_error,
)

### Get Organization Repos

In [32]:
# --- Settings for collecting the repositories of each core organization ---

# Output files: the org<->repo join table and the repo entity dataset.
org_repos_output_path = "../data/join_files/org_repos_join_dataset.csv"
repo_output_path = "../data/large_files/entity_files/repo_dataset.csv"

# Which endpoint to follow and how the resulting rows are keyed.
get_url_field = "repos_url"
join_unique_field = "org_login"
filter_fields = ["org_login", "full_name"]

# Behaviour flags passed straight through to the helper. Unlike the user
# cells, errors are retried here.
load_existing_files = True
overwrite_existing_temp_files = False
retry_error = True

# The second return value is bound to `repo_df` rather than `user_df`, so the
# users frame held in `user_df` is no longer overwritten by this cell.
# NOTE(review): presumably the second value is the repo entity frame written to
# `repo_output_path` — confirm against get_org_repo_activities.
org_repos_df, repo_df = get_org_repo_activities(
    core_orgs,
    org_repos_output_path,
    repo_output_path,
    get_url_field,
    load_existing_files,
    overwrite_existing_temp_files,
    join_unique_field,
    filter_fields,
    retry_error,
)