#### Load Libraries and Data

In [2]:
# Standard library
import os
import sys
import warnings

# Third-party
import altair as alt
import pandas as pd

# Notebook-wide configuration (applies to every cell that runs afterwards).
pd.options.mode.chained_assignment = None  # turn off pandas' chained-assignment warning
alt.data_transformers.disable_max_rows()  # remove Altair's row limit on chart data
warnings.filterwarnings('ignore')  # suppress all Python warnings

In [5]:
# Original search: entity files.
user_df = pd.read_csv(
    "../data/entity_files/users_dataset.csv"
)
repo_df = pd.read_csv(
    "../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,  # parse the file in one pass rather than in chunks
)

# Original search: join files.
search_queries_repo_join_df = pd.read_csv(
    "../data/join_files/search_queries_repo_join_dataset.csv"
)
search_queries_user_join_df = pd.read_csv(
    "../data/join_files/search_queries_user_join_dataset.csv"
)
org_members_df = pd.read_csv(
    "../data/join_files/org_members_dataset.csv"
)

In [6]:
# Expanded search: entity files (same layout as ../data, rooted at ../expanded_search).
expanded_user_df = pd.read_csv(
    "../expanded_search/data/entity_files/users_dataset.csv"
)
expanded_repo_df = pd.read_csv(
    "../expanded_search/data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,  # parse the file in one pass rather than in chunks
)

# Expanded search: join files.
expanded_search_queries_repo_join_df = pd.read_csv(
    "../expanded_search/data/join_files/search_queries_repo_join_dataset.csv"
)
expanded_search_queries_user_join_df = pd.read_csv(
    "../expanded_search/data/join_files/search_queries_user_join_dataset.csv"
)

#### Subset Data

In [8]:
# Keep only the rows whose `id` also appears in the matching search-query join table.
subset_user_df = user_df.loc[
    user_df["id"].isin(search_queries_user_join_df["id"])
]
subset_repo_df = repo_df.loc[
    repo_df["id"].isin(search_queries_repo_join_df["id"])
]

In [9]:
# Same `id` filter as for the original search, applied to the expanded-search tables.
subset_expanded_user_df = expanded_user_df.loc[
    expanded_user_df["id"].isin(expanded_search_queries_user_join_df["id"])
]
subset_expanded_repo_df = expanded_repo_df.loc[
    expanded_repo_df["id"].isin(expanded_search_queries_repo_join_df["id"])
]

In [11]:
# Row counts of the four subsets, reported in a single sentence pair.
n_original_users = len(subset_user_df)
n_original_repos = len(subset_repo_df)
n_expanded_users = len(subset_expanded_user_df)
n_expanded_repos = len(subset_expanded_repo_df)

print(
    f"From our original search queries, we found {n_original_users} users "
    f"and {n_original_repos} repositories. "
    f"From our expanded search queries, we found {n_expanded_users} users "
    f"and {n_expanded_repos} repositories."
)

From our original search queries, we found 846 users and 2120 repositories. From our expanded search queries, we found 1361 users and 3292 repositories.


In [12]:
# Previously derived list of core users; later cells compare against its `login` column.
core_users = pd.read_csv(
    "../data/derived_files/core_users.csv"
)

In [18]:
# Users in each subset whose login is also listed in core_users.
original_core_users = subset_user_df.loc[
    subset_user_df["login"].isin(core_users["login"])
]
expanded_core_users = subset_expanded_user_df.loc[
    subset_expanded_user_df["login"].isin(core_users["login"])
]

# Share of each subset that is core, as a whole-number percentage.
original_core_pct = round(len(original_core_users) / len(subset_user_df) * 100)
expanded_core_pct = round(len(expanded_core_users) / len(subset_expanded_user_df) * 100)

print(
    f"Of these original users, {original_core_pct}% are core users. "
    f"Of these expanded users, {expanded_core_pct}% are core users."
)

Of these original users, 100% are core users. Of these expanded users, 64% are core users.


In [21]:
# Expanded-search users whose login is NOT present in core_users.
missing_users = subset_expanded_user_df.loc[
    ~subset_expanded_user_df["login"].isin(core_users["login"])
]

In [24]:
# Rows of missing_users whose bio contains the substring 'humanities';
# na=False treats a missing bio as "no match".
# NOTE(review): the match is case-sensitive (pandas default), so bios that only
# say "Humanities" are excluded — confirm this is intended.
bio_mentions_humanities = missing_users["bio"].str.contains('humanities', na=False)
missing_users.loc[bio_mentions_humanities]

Unnamed: 0,login,id,node_id,avatar_url,url,html_url,followers_url,following_url,gists_url,starred_url,...,public_repos,public_gists,followers,following,created_at,updated_at,user_query_time,gravatar_id,score,search_query_time
1168,dacb,6485498,MDQ6VXNlcjY0ODU0OTg=,https://avatars.githubusercontent.com/u/648549...,https://api.github.com/users/dacb,https://github.com/dacb,https://api.github.com/users/dacb/followers,https://api.github.com/users/dacb/following{/o...,https://api.github.com/users/dacb/gists{/gist_id},https://api.github.com/users/dacb/starred{/own...,...,67.0,0.0,66.0,18.0,2014-01-23T20:37:09Z,2022-10-23T17:27:22Z,2022-11-22,,,
2084,ajdapretnar,12524972,MDQ6VXNlcjEyNTI0OTcy,https://avatars.githubusercontent.com/u/125249...,https://api.github.com/users/ajdapretnar,https://github.com/ajdapretnar,https://api.github.com/users/ajdapretnar/follo...,https://api.github.com/users/ajdapretnar/follo...,https://api.github.com/users/ajdapretnar/gists...,https://api.github.com/users/ajdapretnar/starr...,...,52.0,3.0,34.0,1.0,2015-05-20T07:20:10Z,2022-10-30T11:29:24Z,2022-11-22,,,
4867,jorisvanzundert,420879,MDQ6VXNlcjQyMDg3OQ==,https://avatars.githubusercontent.com/u/420879...,https://api.github.com/users/jorisvanzundert,https://github.com/jorisvanzundert,https://api.github.com/users/jorisvanzundert/f...,https://api.github.com/users/jorisvanzundert/f...,https://api.github.com/users/jorisvanzundert/g...,https://api.github.com/users/jorisvanzundert/s...,...,25.0,0.0,21.0,3.0,2010-09-29T14:26:14Z,2022-10-17T19:58:08Z,2022-11-22,,,
5269,sharris-umass,36164237,MDQ6VXNlcjM2MTY0MjM3,https://avatars.githubusercontent.com/u/361642...,https://api.github.com/users/sharris-umass,https://github.com/sharris-umass,https://api.github.com/users/sharris-umass/fol...,https://api.github.com/users/sharris-umass/fol...,https://api.github.com/users/sharris-umass/gis...,https://api.github.com/users/sharris-umass/sta...,...,4.0,1.0,1.0,5.0,2018-02-05T15:32:51Z,2022-11-02T22:08:23Z,2022-11-22,,,
7232,unr-ndad,64289132,MDEyOk9yZ2FuaXphdGlvbjY0Mjg5MTMy,https://avatars.githubusercontent.com/u/642891...,https://api.github.com/users/unr-ndad,https://github.com/unr-ndad,https://api.github.com/users/unr-ndad/followers,https://api.github.com/users/unr-ndad/followin...,https://api.github.com/users/unr-ndad/gists{/g...,https://api.github.com/users/unr-ndad/starred{...,...,3.0,0.0,0.0,0.0,2020-04-24T22:44:27Z,2020-04-28T05:49:37Z,2022-11-22,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8697,Baenz-Ledin,97942506,U_kgDOBdZ76g,https://avatars.githubusercontent.com/u/979425...,https://api.github.com/users/Baenz-Ledin,https://github.com/Baenz-Ledin,https://api.github.com/users/Baenz-Ledin/follo...,https://api.github.com/users/Baenz-Ledin/follo...,https://api.github.com/users/Baenz-Ledin/gists...,https://api.github.com/users/Baenz-Ledin/starr...,...,2.0,0.0,0.0,0.0,2022-01-18T08:06:26Z,2022-06-30T08:05:26Z,2022-11-22,,,
8698,hafzh21,114286736,U_kgDOBs_gkA,https://avatars.githubusercontent.com/u/114286...,https://api.github.com/users/hafzh21,https://github.com/hafzh21,https://api.github.com/users/hafzh21/followers,https://api.github.com/users/hafzh21/following...,https://api.github.com/users/hafzh21/gists{/gi...,https://api.github.com/users/hafzh21/starred{/...,...,0.0,0.0,0.0,0.0,2022-09-24T09:35:12Z,2022-09-24T10:11:22Z,2022-11-22,,,
8699,polydo,112898334,U_kgDOBrqxHg,https://avatars.githubusercontent.com/u/112898...,https://api.github.com/users/polydo,https://github.com/polydo,https://api.github.com/users/polydo/followers,https://api.github.com/users/polydo/following{...,https://api.github.com/users/polydo/gists{/gis...,https://api.github.com/users/polydo/starred{/o...,...,0.0,0.0,0.0,2.0,2022-09-05T18:05:50Z,2022-09-05T18:35:10Z,2022-11-22,,,
8700,jl8kii,111535834,U_kgDOBqXm2g,https://avatars.githubusercontent.com/u/111535...,https://api.github.com/users/jl8kii,https://github.com/jl8kii,https://api.github.com/users/jl8kii/followers,https://api.github.com/users/jl8kii/following{...,https://api.github.com/users/jl8kii/gists{/gis...,https://api.github.com/users/jl8kii/starred{/o...,...,1.0,0.0,0.0,0.0,2022-08-18T20:00:47Z,2022-08-18T21:41:24Z,2022-11-22,,,
