# Process Initial Results

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("../")
from data_generation_scripts.utils import check_rate_limit, check_add_orgs, check_add_repos, check_add_users, check_for_joins_in_older_queries, read_combine_files
from data_generation_scripts.generate_expanded_search_data import get_initial_search_datasets
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile, get_total_commits
from data_generation_scripts.generate_repo_users_interactions import get_repos_user_actors
from data_generation_scripts.generate_repo_metadata import check_total_results

Once you've run `generate_expanded_search_data.py` and then `check_clean_search_results.py` you'll have a series of files in the `data/` directory that contain the results of your search. This notebook will help you process those results into a single file that can be used for analysis.

Example of how to run `generate_expanded_search_data.py`:

```python3
rates_df = check_rate_limit()
initial_repo_output_path = "../data/repo_data/"
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
repo_join_output_path = "../data/large_files/join_files/search_queries_repo_join_dataset.csv"

initial_user_output_path = "../data/user_data/"
user_output_path = "../data/entity_files/users_dataset.csv"
user_join_output_path = "../data/join_files/search_queries_user_join_dataset.csv"
load_existing_data = False
overwrite_existing_temp_files = False
org_output_path = "../data/entity_files/orgs_dataset.csv"

get_initial_search_datasets(rates_df, initial_repo_output_path,  repo_output_path, repo_join_output_path, initial_user_output_path, user_output_path, user_join_output_path, org_output_path, overwrite_existing_temp_files, load_existing_data)
```

And then just run `check_clean_search_results.py` 

### Create Initial Core Results

In [54]:
user_df = pd.read_csv("../data/entity_files/users_dataset.csv")
repo_df = pd.read_csv("../data/large_files/entity_files/repos_dataset.csv", low_memory=False)
org_df = pd.read_csv("../data/entity_files/orgs_dataset.csv", low_memory=False)
search_queries_repo_join_df = pd.read_csv("../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv")
search_queries_user_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv")

Initial core datasets will be comprised of the following:

- `core_repos`: A list of all repos that were returned by the search query
- `core_users`: A list of all users that were returned by the search query
- `core_orgs`: A list of all orgs that were returned by the search query

In [121]:
join_file_path = "../data/join_files/search_queries_user_join_dataset.csv"
join_type = join_file_path.split("/")[-1].split("_dataset")[0]

older_join_file_path = join_file_path.replace("data/", "data/older_files/")
older_join_file_dir = os.path.dirname(older_join_file_path) + "/"

older_join_df = read_combine_files(dir_path=older_join_file_dir, check_all_dirs=True, file_path_contains=join_type, large_files=False) 

entity_type = "" if "search" in join_file_path else "repo" if "repo" in join_file_path else "user"

In [122]:
subset_terms = ['Digital Humanities']

In [123]:
older_join_df = older_join_df[older_join_df.search_term_source.isin(subset_terms)]

In [124]:
join_unique_field = "search_query"
older_join_df = older_join_df[older_join_df[join_unique_field].notna()]

In [59]:
len(older_join_df)

6993

In [60]:
df = search_queries_user_join_df.copy()
older_df = older_join_df.copy()
subset_fields = ["login", "cleaned_search_query"]

In [61]:
import numpy as np

In [63]:
test = pd.concat([search_queries_user_join_df, older_join_df])
len(test)

7790

In [64]:
test['cleaned_search_query_time'] = None
test.loc[test.cleaned_search_query_time.isna(), 'cleaned_search_query_time'] = "2022-10-10"
test.loc[test.search_query_time.notna(), 'cleaned_search_query_time'] = test.loc[test.search_query_time.notna(), 'search_query_time']
test['cleaned_search_query_time'] = pd.to_datetime(test['cleaned_search_query_time'], errors='coerce')

In [65]:
test['cleaned_search_query'] = test['search_query'].str.replace('%22', '"').str.replace('"', '').str.replace('%3A', ':').str.split('&page').str[0]

In [66]:
test2 = test.sort_values(by=['cleaned_search_query_time'], ascending=False).drop_duplicates(subset=subset_fields, keep='first')

In [92]:
df = search_queries_user_join_df.copy()
older_df = test2.copy()
len(df), len(older_df)

(797, 930)

In [99]:
subset_older_df = older_df[['login', 'cleaned_search_query']].reset_index(drop=True)
subset_older_df = subset_older_df[subset_older_df.login.notna()]

In [101]:
subset_df = df[['login', 'cleaned_search_query']].reset_index(drop=True)

In [102]:
subset_df['type_of_join'] = "new"
subset_older_df['type_of_join'] = "old"

In [105]:
merged_df = pd.merge(subset_df, subset_older_df, on=['login', 'cleaned_search_query'], how='outer', indicator=True)

In [119]:
missing_values = merged_df[merged_df._merge == 'right_only']

double_check = missing_values[subset_fields]
combined_condition = np.ones(len(older_df), dtype=bool)
for field in subset_fields:
    combined_condition = combined_condition & older_df[field].isin(double_check[field])
older_df['double_check'] = np.where(combined_condition, 1, 0)
final_missing_values = older_df[(older_df.double_check == 1) & (older_df[subset_fields[0]].isin(double_check[subset_fields[0]]))]

In [120]:
final_missing_values

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,hooks_url,issues_url,members_url,public_members_url,description,is_verified,has_organization_projects,has_repository_projects,double_check,cleaned_search_query_time
585,dhtaxonomy,6716560.0,MDEyOk9yZ2FuaXphdGlvbjY3MTY1NjA=,https://avatars.githubusercontent.com/u/671656...,,https://api.github.com/users/dhtaxonomy,https://github.com/dhtaxonomy,https://api.github.com/users/dhtaxonomy/followers,https://api.github.com/users/dhtaxonomy/follow...,https://api.github.com/users/dhtaxonomy/gists{...,...,,,,,,,,,1,2023-03-19
593,CDH-SC,10634990.0,MDEyOk9yZ2FuaXphdGlvbjEwNjM0OTkw,https://avatars.githubusercontent.com/u/106349...,,https://api.github.com/users/CDH-SC,https://github.com/CDH-SC,https://api.github.com/users/CDH-SC/followers,https://api.github.com/users/CDH-SC/following{...,https://api.github.com/users/CDH-SC/gists{/gis...,...,,,,,,,,,1,2023-03-19
599,adholibdh,17990648.0,MDEyOk9yZ2FuaXphdGlvbjE3OTkwNjQ4,https://avatars.githubusercontent.com/u/179906...,,https://api.github.com/users/adholibdh,https://github.com/adholibdh,https://api.github.com/users/adholibdh/followers,https://api.github.com/users/adholibdh/followi...,https://api.github.com/users/adholibdh/gists{/...,...,,,,,,,,,1,2023-03-19
568,dhh21,83238279.0,MDEyOk9yZ2FuaXphdGlvbjgzMjM4Mjc5,https://avatars.githubusercontent.com/u/832382...,,https://api.github.com/users/dhh21,https://github.com/dhh21,https://api.github.com/users/dhh21/followers,https://api.github.com/users/dhh21/following{/...,https://api.github.com/users/dhh21/gists{/gist...,...,,,,,,,,,1,2023-03-19
572,idrhku,17508677.0,MDEyOk9yZ2FuaXphdGlvbjE3NTA4Njc3,https://avatars.githubusercontent.com/u/175086...,,https://api.github.com/users/idrhku,https://github.com/idrhku,https://api.github.com/users/idrhku/followers,https://api.github.com/users/idrhku/following{...,https://api.github.com/users/idrhku/gists{/gis...,...,,,,,,,,,1,2023-03-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1153,XiaoyanYangAlice,121414040.0,U_kgDOBzyhmA,https://avatars.githubusercontent.com/u/121414...,,https://api.github.com/users/XiaoyanYangAlice,https://github.com/XiaoyanYangAlice,https://api.github.com/users/XiaoyanYangAlice/...,https://api.github.com/users/XiaoyanYangAlice/...,https://api.github.com/users/XiaoyanYangAlice/...,...,,,,,,,,,1,2023-03-19
3785,ngonthier,23408564.0,MDQ6VXNlcjIzNDA4NTY0,https://avatars.githubusercontent.com/u/234085...,,https://api.github.com/users/ngonthier,https://github.com/ngonthier,https://api.github.com/users/ngonthier/followers,https://api.github.com/users/ngonthier/followi...,https://api.github.com/users/ngonthier/gists{/...,...,,,,,,,,,1,2022-11-19
3786,yueyue4359,88612363.0,MDQ6VXNlcjg4NjEyMzYz,https://avatars.githubusercontent.com/u/886123...,,https://api.github.com/users/yueyue4359,https://github.com/yueyue4359,https://api.github.com/users/yueyue4359/followers,https://api.github.com/users/yueyue4359/follow...,https://api.github.com/users/yueyue4359/gists{...,...,,,,,,,,,1,2022-11-19
3787,hvm-uu,83591395.0,MDQ6VXNlcjgzNTkxMzk1,https://avatars.githubusercontent.com/u/835913...,,https://api.github.com/users/hvm-uu,https://github.com/hvm-uu,https://api.github.com/users/hvm-uu/followers,https://api.github.com/users/hvm-uu/following{...,https://api.github.com/users/hvm-uu/gists{/gis...,...,,,,,,,,,1,2022-11-12


In [106]:
merged_df._merge.value_counts()

both          796
right_only    133
left_only       1
Name: _merge, dtype: int64

In [89]:
concat_df = pd.concat([df[['login', 'cleaned_search_query', 'type_of_join']], older_df[['login', 'cleaned_search_query', 'type_of_join']]])

In [91]:
concat_df

Unnamed: 0,login,cleaned_search_query,type_of_join
0,Humanites-Numeriques-PSL,https://api.github.com/search/users?q=Humanité...,new
1,hdcaicyt,https://api.github.com/search/users?q=Humanida...,new
2,humboldtdigital,https://api.github.com/search/users?q=Humanida...,new
3,Juanaloga,https://api.github.com/search/users?q=Humanida...,new
4,DCSCyHD,https://api.github.com/search/users?q=Humanida...,new
...,...,...,...
3785,ngonthier,https://api.github.com/search/users?q=Digital+...,old
3786,yueyue4359,https://api.github.com/search/users?q=Digital+...,old
3787,hvm-uu,https://api.github.com/search/users?q=Digital+...,old
3788,2enyoasamoah,https://api.github.com/search/users?q=Digital+...,old


In [90]:
pd.pivot(concat_df, index='login', columns='cleaned_search_query', values='type_of_join')

ValueError: Index contains duplicate entries, cannot reshape

In [77]:
newer_counts = df.groupby(subset_fields).size().reset_index(name='new_counts')
older_counts = older_df.groupby(subset_fields).size().reset_index(name='older_counts')
merged_counts = pd.merge(newer_counts, older_counts, on=subset_fields, how='left')
missing_values = merged_counts[(merged_counts.new_counts < merged_counts.older_counts) | (merged_counts.older_counts.isna())]
missing_join = pd.merge(older_df, missing_values[subset_fields], on=subset_fields, how='inner')
missing_join = missing_join.drop_duplicates(subset=subset_fields)
print(len(missing_join))
# double_check = missing_join[subset_fields]
# combined_condition = np.ones(len(df), dtype=bool)
# for field in subset_fields:
#     combined_condition = combined_condition & df[field].isin(double_check[field])
# df['double_check'] = np.where(combined_condition, 1, 0)
# final_missing_values = df[(df.double_check == 0) & (df[subset_fields[0]].isin(double_check[subset_fields[0]]))]
# print(len(final_missing_values))

794


In [81]:
len(newer_counts), len(older_counts)

(796, 927)

In [82]:
merged_counts[(merged_counts.older_counts != merged_counts.new_counts) ]

Unnamed: 0,login,cleaned_search_query,new_counts,older_counts
0,1r3n3,https://api.github.com/search/users?q=Digital+...,1,2
1,1ucyP,https://api.github.com/search/users?q=Digital+...,1,2
2,5colldh,https://api.github.com/search/users?q=Digital+...,1,2
3,ABC-DH,https://api.github.com/search/users?q=Digital+...,1,2
4,ADHO,https://api.github.com/search/users?q=Digital+...,1,2
...,...,...,...,...
791,yrochat,https://api.github.com/search/users?q=Digital+...,1,2
792,yukiyuqichen,https://api.github.com/search/users?q=Digital+...,1,2
793,zimgraz,https://api.github.com/search/users?q=Digital+...,1,2
794,zkmacdon,https://api.github.com/search/users?q=Digital+...,1,2


In [68]:
test3 = pd.concat([test2[['login', 'cleaned_search_query']], search_queries_user_join_df[['login', 'cleaned_search_query']]])

In [76]:
test2[(~test2.login.isin(search_queries_user_join_df.login))].cleaned_search_query.value_counts()

https://api.github.com/search/users?q=Digital+Humanities&per_page=100       123
https://api.github.com/search/users?q=Humanidades+digitales&per_page=100      7
https://api.github.com/search/users?q=Humanidades+Digitais&per_page=100       2
https://api.github.com/search/users?q=Digitaalhumanitaaria&per_page=100       1
Name: cleaned_search_query, dtype: int64

In [52]:
search_queries_user_join_df[search_queries_user_join_df.login == "hdcaicyt"].cleaned_search_query.values[0]

'https://api.github.com/search/users?q=Humanidades+digitales&per_page=100'

In [49]:
merged_test[merged_test.type_search.isna()].cleaned_search_query.unique()

array(['https://api.github.com/search/users?q=Humanidades+digitales&per_page=100',
       'https://api.github.com/search/users?q=Informatica+umanistica&per_page=100',
       'https://api.github.com/search/users?q=Digital+Humanities&per_page=100'],
      dtype=object)

In [53]:
test2.groupby(subset_fields).size().reset_index(name='counts').sort_values(by=['counts'], ascending=False)

Unnamed: 0,login,cleaned_search_query,counts
0,1r3n3,https://api.github.com/search/users?q=Digital+...,1
624,johlei,https://api.github.com/search/users?q=Digital+...,1
612,jessgrimmer,https://api.github.com/search/users?q=Digital+...,1
613,jessprof,https://api.github.com/search/users?q=Humanida...,1
614,jeyrena1,https://api.github.com/search/users?q=Digital+...,1
...,...,...,...
314,aergithub,https://api.github.com/search/users?q=Digital+...,1
315,agnesecam,https://api.github.com/search/users?q=Informat...,1
316,agustinjaramillo,https://api.github.com/search/users?q=Digital+...,1
317,aiucd,https://api.github.com/search/users?q=Informat...,1


#### Check if all items exist in entity files

In [4]:
missing_repos = search_queries_repo_join_df[~search_queries_repo_join_df.full_name.isin(repo_df.full_name)]
missing_users = search_queries_user_join_df[(~search_queries_user_join_df.login.isin(user_df.login)) & (search_queries_user_join_df['type'] == 'User')]
missing_orgs = search_queries_user_join_df[(~search_queries_user_join_df.login.isin(user_df.login)) & (search_queries_user_join_df['type'] == 'Organization')]

len(missing_repos), len(missing_users), len(missing_orgs)

(23, 0, 0)

In [6]:
if len(missing_repos) > 0:
    repo_df = check_add_repos(missing_repos, '../data/large_files/entity_files/repos_dataset.csv', True)
if len(missing_orgs) > 0:
    org_df = check_add_orgs(missing_orgs, '../data/entity_files/orgs_dataset.csv', True, False)
if len(missing_users) > 0:
    user_df = check_add_users(missing_users, '../data/entity_files/users_dataset.csv', True, False)

In [None]:
core_repos = pd.merge(repo_df, search_queries_repo_join_df[['full_name', 'finalized_language', 'keep_resource']], on='full_name', how='inner')
core_repos = core_repos.drop_duplicates(subset=['full_name'])
core_users = pd.merge(user_df, search_queries_user_join_df[['login', 'finalized_language', 'keep_resource']], on='login', how='inner')
core_users = core_users.drop_duplicates(subset=['login'])
core_orgs = core_users[core_users['type'] == 'Organization']
core_users = core_users[core_users['type'] == 'User']

len(core_repos), len(core_users), len(core_orgs)

(2264, 667, 126)

In [11]:
core_users.to_csv("../data/derived_files/initial_core_users.csv", index=False)
core_orgs.to_csv("../data/derived_files/initial_core_orgs.csv", index=False)
core_repos.to_csv("../data/derived_files/initial_core_repos.csv", index=False)