# Process Initial Results

In [27]:
import pandas as pd
import urllib.parse



# Load the 2023-03-18 snapshot of the search-query/repo join table for ad-hoc inspection.
df = pd.read_csv("../../datasets/older_files/large_files/join_files/search_queries_repo_join_dataset_2023_03_18.csv", low_memory=False)
# Disabled experiment: extract the `q` parameter from each URL in the `search_query` column.
# df['extracted_search_term'] = df['search_query'].apply(lambda url: urllib.parse.parse_qs(urllib.parse.urlparse(url).query).get('q', [None])[0])

# Disabled follow-up: replace '+' with ' ' and decode percent-encoded quotes.
# NOTE(review): only the first call below is `.str.replace`; the two chained `.replace` calls are
# `Series.replace`, which (without regex=True) matches whole cell values, not substrings — confirm before re-enabling.
# df['extracted_search_term'] = df['extracted_search_term'].str.replace('+', ' ').replace('%22', '"').replace('%27', "'")

In [23]:
test = pd.read_csv("../../datasets/derived_files/grouped_translated_others_terms.csv")

In [24]:
test

Unnamed: 0,term_source,code,term,English language name,final_term
0,Computational Humanities,"bs,da,de,en,fy,mg,no,ny,sm,sn,st,sv,tl,nb",14,"Bosnian, Danish, German, English, West Frisian...",Computational Humanities
1,Cultural Analytics,"bs,en,ky,lb,mg,ny,sn,tl",8,"Bosnian, English, Kirghiz, Luxembourgish, Mala...",Cultural Analytics
2,Humanities,"en,fy,ku,mg,sm,sn,tl",7,"English, West Frisian, Kurdish (Kurmanji), Mal...",Humanities
3,Digital History,"bs,en,fy,ky,la,sn,yo",7,"Bosnian, English, West Frisian, Kirghiz, Latin...",Digital History
4,Digital Cultural Heritage,"en,ny,sn,st,tl,yo",6,"English, Chichewa, Shona, Southern Sotho, Taga...",Digital Cultural Heritage
...,...,...,...,...,...
638,Digital Cultural Heritage,sk,1,Slovak,Digitálne kultúrne dedičstvo
639,Digital Cultural Heritage,cs,1,Czech,Digitální kulturní dědictví
640,Digital Cultural Heritage,lv,1,Latvian,Digitālais kultūras mantojums
641,Digital Cultural Heritage,tr,1,Turkish,Dijital Kültürel Miras


In [12]:
df.search_term_source.value_counts()

Humanities                   4636
Digital Humanities           2440
Cultural Analytics            323
Digital History               316
Public History                125
Computational Humanities       88
Digital Cultural Heritage      37
Name: search_term_source, dtype: int64

In [22]:
df[(df.search_term_source == "Digital Humanities") & (df.search_query.str.contains("Humanities"))].search_query.unique()

array(['https://api.github.com/search/repositories?q="I-Digital+Humanities"&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=Digitale+Humanities&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=Digital+Humanities+created%3A2009-01-01..2009-12-31+sort:created&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=%22Digital+Humanities%22+created%3A2009-01-01..2009-12-31+sort:created&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=Digital+Humanities+created%3A2010-01-01..2010-12-31+sort:created&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=Digital+Humanities+created%3A2011-01-01..2011-12-31+sort:created&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=Digital+Humanities+created%3A2012-01-01..2012-12-31+sort:created&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=Digital+Humanities+created%3A2013-01-01..2013-12

In [16]:
df[df.search_term != df.search_term_source][['search_term', 'search_term_source', 'search_query']][0:1].to_dict()

{'search_term': {0: 'Humanidades'},
 'search_term_source': {0: 'Humanities'},
 'search_query': {0: 'https://api.github.com/search/repositories?q="Humanidades"&per_page=100&page=1'}}

In [18]:
df[df.search_term != df.search_term_source].search_query.unique()

array(['https://api.github.com/search/repositories?q="Humanidades"&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=%22Humanidades%22&per_page=100&page=2',
       'https://api.github.com/search/repositories?q=%22Humanidades%22&per_page=100&page=3',
       'https://api.github.com/search/repositories?q="Humaniora"&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=Humanistyka&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=인문학&per_page=100&page=1',
       'https://api.github.com/search/repositories?q="Sastra"&per_page=100&page=1',
       'https://api.github.com/search/repositories?q=%22Sastra%22&per_page=100&page=2',
       'https://api.github.com/search/repositories?q=%22Sastra%22&per_page=100&page=3',
       'https://api.github.com/search/repositories?q=%22Sastra%22&per_page=100&page=4',
       'https://api.github.com/search/repositories?q="Ubumuntu"&per_page=100&page=1',
       'https://api.github.com/search/

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("../")
from data_generation_scripts.utils import check_rate_limit, check_add_orgs, check_add_repos, check_add_users, check_for_joins_in_older_queries, read_combine_files, get_new_repos, get_response_data

Once you've run `generate_expanded_search_data.py` and then `check_clean_search_results.py`, you'll have a series of files in the `data/` directory that contain the results of your search. This notebook will help you process those results into a single file that can be used for analysis.

Example of how to run `generate_expanded_search_data.py`:

```python3
rates_df = check_rate_limit()
initial_repo_output_path = "../data/repo_data/"
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
repo_join_output_path = "../data/large_files/join_files/search_queries_repo_join_dataset.csv"

initial_user_output_path = "../data/user_data/"
user_output_path = "../data/entity_files/users_dataset.csv"
user_join_output_path = "../data/join_files/search_queries_user_join_dataset.csv"
load_existing_data = False
overwrite_existing_temp_files = False
org_output_path = "../data/entity_files/orgs_dataset.csv"

get_initial_search_datasets(rates_df, initial_repo_output_path,  repo_output_path, repo_join_output_path, initial_user_output_path, user_output_path, user_join_output_path, org_output_path, overwrite_existing_temp_files, load_existing_data)
```

And then just run `check_clean_search_results.py` 

### Create Initial Core Results

In [2]:
# Entity tables (users, repos, orgs) produced by the data-generation scripts.
user_df = pd.read_csv("../data/entity_files/users_dataset.csv")
repo_df = pd.read_csv("../data/large_files/entity_files/repos_dataset.csv", low_memory=False)
org_df = pd.read_csv("../data/entity_files/orgs_dataset.csv", low_memory=False)
# Join tables linking search queries to repos/users, read from the derived "subset_dh" files.
search_queries_repo_join_df = pd.read_csv("../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv")
search_queries_user_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv")

#### Fix results

In [34]:
# Paths: raw search output ("initial_*") versus the derived, manually reviewed subset files.
initial_repo_output_path = "../data/repo_data/"
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
initial_repo_join_output_path = "../data/large_files/join_files/search_queries_repo_join_dataset.csv"
repo_join_output_path = "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"

initial_user_output_path = "../data/user_data/"
user_output_path = "../data/entity_files/users_dataset.csv"
org_output_path = "../data/entity_files/orgs_dataset.csv"
initial_user_join_output_path = "../data/join_files/search_queries_user_join_dataset.csv"
user_join_output_path = "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"


# Disabled: one-off generation of the "initial_*_subset_dh" CSVs that are read back below.
# NOTE(review): `verify_results_exist` and `subset_terms` are not defined in any cell above this one.
# search_queries_repo_df, search_queries_user_df = verify_results_exist(initial_repo_join_output_path, repo_join_output_path, initial_user_join_output_path, user_join_output_path, subset_terms)

# search_queries_repo_df.to_csv("../data/derived_files/initial_search_queries_repo_join_subset_dh_dataset.csv", index=False)
# search_queries_user_df.to_csv("../data/derived_files/initial_search_queries_user_join_subset_dh_dataset.csv", index=False)

# Freshly generated subset (to be checked) ...
search_queries_repo_df = pd.read_csv("../data/derived_files/initial_search_queries_repo_join_subset_dh_dataset.csv")
search_queries_user_df = pd.read_csv("../data/derived_files/initial_search_queries_user_join_subset_dh_dataset.csv")

# ... and the subset files that already exist on disk.
existing_search_queries_repo_df = pd.read_csv(repo_join_output_path)
existing_search_queries_user_df = pd.read_csv(user_join_output_path)

# A row that has a finalized language but is flagged keep_resource == False is flipped to True.
search_queries_repo_df.loc[(search_queries_repo_df.finalized_language.notna()) & (search_queries_repo_df.keep_resource == False), 'keep_resource'] = True
search_queries_user_df.loc[(search_queries_user_df.finalized_language.notna()) & (search_queries_user_df.keep_resource == False), 'keep_resource'] = True

# needs_checking = search_queries_repo_df[(search_queries_repo_df['keep_resource'] == True) & (search_queries_repo_df['finalized_language'].isna())]

# First count: repos in the fresh subset with at least one row lacking a finalized language.
needs_checking_repos = search_queries_repo_df[(search_queries_repo_df['finalized_language'].isna())].full_name.unique().tolist()
# Write None into the missing cells of the two language columns.
# NOTE(review): confirm this has an observable effect; the cells are already null before the assignment.
search_queries_repo_df.loc[search_queries_repo_df.detected_language.isna(), 'detected_language'] = None
search_queries_repo_df.loc[search_queries_repo_df.natural_language.isna(), 'natural_language'] = None
search_queries_repo_df = search_queries_repo_df.reset_index(drop=True)
print(len(needs_checking_repos))
# Second count: `needs_checking_repos` is REBOUND here to a different list — repos in the existing
# file that were not in the first list, still lack a finalized language, and are marked keep_resource == True.
needs_checking_repos = existing_search_queries_repo_df[(~existing_search_queries_repo_df.full_name.isin(needs_checking_repos)) & (existing_search_queries_repo_df.finalized_language.isna()) & (existing_search_queries_repo_df.keep_resource == True)].full_name.unique().tolist()
print(len(needs_checking_repos))


266
0


In [27]:
# NOTE(review): absolute path into one user's home directory — this cell cannot run on another machine.
# The file name matches the derived subset file under ../data/derived_files/; confirm whether that copy should be read instead.
test = pd.read_csv("/Users/zleblanc/updated_search_queries_repo_join_subset_dh_dataset.csv")

In [42]:
# Repo rows that still need a manual language check: no finalized language yet,
# and the keep flag is either unset or explicitly True.
needs_checking = search_queries_repo_df.loc[
    search_queries_repo_df["finalized_language"].isna()
    & (
        search_queries_repo_df["keep_resource"].isna()
        | search_queries_repo_df["keep_resource"].eq(True)
    )
]
len(needs_checking)

274

In [37]:
existing_search_queries_repo_df[(existing_search_queries_repo_df.full_name.isin(needs_checking.full_name)) & (existing_search_queries_repo_df.finalized_language.isna())]

Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,search_type,search_query_time,detected_language,detected_language_confidence,finalized_language,cleaned_search_query,keep_resource,query,updated_natural_language,double_check


In [36]:
needs_checking

Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,search_type,search_query_time,detected_language,detected_language_confidence,finalized_language,cleaned_search_query,keep_resource,query,updated_natural_language,double_check
2514,485948971.0,R_kgDOHPb-Kw,Assignment-of-Introduction-to-Digital-Humaniti...,Ashley0916/Assignment-of-Introduction-to-Digit...,0.0,https://github.com/Ashley0916/Assignment-of-In...,,0.0,https://api.github.com/repos/Ashley0916/Assign...,https://api.github.com/repos/Ashley0916/Assign...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,,,,
2515,493731705.0,R_kgDOHW2_eQ,2022-REEES-Think-Tank-Digital-Humanities-Resou...,grunewas/2022-REEES-Think-Tank-Digital-Humanit...,0.0,https://github.com/grunewas/2022-REEES-Think-T...,,0.0,https://api.github.com/repos/grunewas/2022-REE...,https://api.github.com/repos/grunewas/2022-REE...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,,,,
2516,528456909.0,R_kgDOH3-czQ,Writing-for-the-Digital-Humanities,cbharveyDH/Writing-for-the-Digital-Humanities,0.0,https://github.com/cbharveyDH/Writing-for-the-...,,0.0,https://api.github.com/repos/cbharveyDH/Writin...,https://api.github.com/repos/cbharveyDH/Writin...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,,,,
2517,453171035.0,R_kgDOGwLXWw,libraries-archives-and-the-digital-humanities,isabelgalina/libraries-archives-and-the-digita...,0.0,https://github.com/isabelgalina/libraries-arch...,,0.0,https://api.github.com/repos/isabelgalina/libr...,https://api.github.com/repos/isabelgalina/libr...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,,,,
2518,528166260.0,R_kgDOH3stdA,Using-Web-Scraping-in-Digital-Humanities,zeehan786/Using-Web-Scraping-in-Digital-Humani...,0.0,https://github.com/zeehan786/Using-Web-Scrapin...,,0.0,https://api.github.com/repos/zeehan786/Using-W...,https://api.github.com/repos/zeehan786/Using-W...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2823,459751619.0,R_kgDOG2dAww,recits-numeriques,digitalArtHistory/recits-numeriques,0.0,https://github.com/digitalArtHistory/recits-nu...,,0.0,https://api.github.com/repos/digitalArtHistory...,https://api.github.com/repos/digitalArtHistory...,...,tagged,2022-12-12,,,,https://api.github.com/search/repositories?q=t...,,,,
2824,184833645.0,MDEwOlJlcG9zaXRvcnkxODQ4MzM2NDU=,handle,shepdl/handle,0.0,https://github.com/shepdl/handle,,0.0,https://api.github.com/repos/shepdl/handle,https://api.github.com/repos/shepdl/handle/forks,...,tagged,2022-12-12,,,,https://api.github.com/search/repositories?q=t...,,,,
2825,136824100.0,MDEwOlJlcG9zaXRvcnkxMzY4MjQxMDA=,CloudReady,Pascal-KOTTE/CloudReady,0.0,https://github.com/Pascal-KOTTE/CloudReady,,0.0,https://api.github.com/repos/Pascal-KOTTE/Clou...,https://api.github.com/repos/Pascal-KOTTE/Clou...,...,tagged,2022-12-12,,,,https://api.github.com/search/repositories?q=t...,,,,
2826,116059695.0,MDEwOlJlcG9zaXRvcnkxMTYwNTk2OTU=,la-gregueria-virtual,tzembo/la-gregueria-virtual,0.0,https://github.com/tzembo/la-gregueria-virtual,,0.0,https://api.github.com/repos/tzembo/la-greguer...,https://api.github.com/repos/tzembo/la-greguer...,...,tagged,2022-12-12,,,,https://api.github.com/search/repositories?q=t...,,,,


In [15]:
# First count: logins in the fresh user subset with at least one row lacking a finalized language.
needs_checking_users = search_queries_user_df[(search_queries_user_df['finalized_language'].isna())].login.unique().tolist()
# Write None into the missing cells of the two language columns.
# NOTE(review): confirm this has an observable effect; the cells are already null before the assignment.
search_queries_user_df.loc[search_queries_user_df.detected_language.isna(), 'detected_language'] = None
search_queries_user_df.loc[search_queries_user_df.natural_language.isna(), 'natural_language'] = None
search_queries_user_df = search_queries_user_df.reset_index(drop=True)
print(len(needs_checking_users))
# Second count: `needs_checking_users` is REBOUND to logins in the existing file that lack a finalized
# language and are marked keep_resource == True.
# NOTE(review): unlike the repo version of this check, there is no `~...isin(needs_checking_users)` exclusion here — confirm which is intended.
needs_checking_users = existing_search_queries_user_df[(existing_search_queries_user_df.finalized_language.isna()) & (existing_search_queries_user_df.keep_resource == True)].login.unique().tolist()
print(len(needs_checking_users))

133
0


In [20]:
existing_search_queries_user_df[(existing_search_queries_user_df.login.isin(needs_checking_users))]

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,cleaned_search_query,hooks_url,issues_url,members_url,public_members_url,description,is_verified,has_organization_projects,has_repository_projects,double_check


In [33]:
search_queries_user_df[(search_queries_user_df.login.isin(existing_search_queries_user_df.login)) & (search_queries_user_df.finalized_language.isna())]

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,cleaned_search_query,hooks_url,issues_url,members_url,public_members_url,description,is_verified,has_organization_projects,has_repository_projects,double_check
796,dhh18,38721525.0,MDEyOk9yZ2FuaXphdGlvbjM4NzIxNTI1,https://avatars.githubusercontent.com/u/387215...,,https://api.github.com/users/dhh18,https://github.com/dhh18,https://api.github.com/users/dhh18/followers,https://api.github.com/users/dhh18/following{/...,https://api.github.com/users/dhh18/gists{/gist...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,
797,RusBilot,125308156.0,U_kgDOB3gM_A,https://avatars.githubusercontent.com/u/125308...,,https://api.github.com/users/RusBilot,https://github.com/RusBilot,https://api.github.com/users/RusBilot/followers,https://api.github.com/users/RusBilot/followin...,https://api.github.com/users/RusBilot/gists{/g...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,
798,hridigital,23073585.0,MDEyOk9yZ2FuaXphdGlvbjIzMDczNTg1,https://avatars.githubusercontent.com/u/230735...,,https://api.github.com/users/hridigital,https://github.com/hridigital,https://api.github.com/users/hridigital/followers,https://api.github.com/users/hridigital/follow...,https://api.github.com/users/hridigital/gists{...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,
799,Digital-Humanities-Creative-Lab,80300081.0,MDEyOk9yZ2FuaXphdGlvbjgwMzAwMDgx,https://avatars.githubusercontent.com/u/803000...,,https://api.github.com/users/Digital-Humanitie...,https://github.com/Digital-Humanities-Creative...,https://api.github.com/users/Digital-Humanitie...,https://api.github.com/users/Digital-Humanitie...,https://api.github.com/users/Digital-Humanitie...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,
800,DigitalHumanitiesLabs,4134853.0,MDQ6VXNlcjQxMzQ4NTM=,https://avatars.githubusercontent.com/u/413485...,,https://api.github.com/users/DigitalHumanities...,https://github.com/DigitalHumanitiesLabs,https://api.github.com/users/DigitalHumanities...,https://api.github.com/users/DigitalHumanities...,https://api.github.com/users/DigitalHumanities...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
924,CDH-DevTeam,17449038.0,MDEyOk9yZ2FuaXphdGlvbjE3NDQ5MDM4,https://avatars.githubusercontent.com/u/174490...,,https://api.github.com/users/CDH-DevTeam,https://github.com/CDH-DevTeam,https://api.github.com/users/CDH-DevTeam/follo...,https://api.github.com/users/CDH-DevTeam/follo...,https://api.github.com/users/CDH-DevTeam/gists...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,
925,ngonthier,23408564.0,MDQ6VXNlcjIzNDA4NTY0,https://avatars.githubusercontent.com/u/234085...,,https://api.github.com/users/ngonthier,https://github.com/ngonthier,https://api.github.com/users/ngonthier/followers,https://api.github.com/users/ngonthier/followi...,https://api.github.com/users/ngonthier/gists{/...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,
926,yueyue4359,88612363.0,MDQ6VXNlcjg4NjEyMzYz,https://avatars.githubusercontent.com/u/886123...,,https://api.github.com/users/yueyue4359,https://github.com/yueyue4359,https://api.github.com/users/yueyue4359/followers,https://api.github.com/users/yueyue4359/follow...,https://api.github.com/users/yueyue4359/gists{...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,
927,2enyoasamoah,106137379.0,U_kgDOBlOHIw,https://avatars.githubusercontent.com/u/106137...,,https://api.github.com/users/2enyoasamoah,https://github.com/2enyoasamoah,https://api.github.com/users/2enyoasamoah/foll...,https://api.github.com/users/2enyoasamoah/foll...,https://api.github.com/users/2enyoasamoah/gist...,...,https://api.github.com/search/users?q=Digital+...,,,,,,,,,


The initial core datasets will consist of the following:

- `core_repos`: A list of all repos that were returned by the search query
- `core_users`: A list of all users that were returned by the search query
- `core_orgs`: A list of all orgs that were returned by the search query

#### Eventually Delete

In [10]:
# NOTE(review): these two names are rebound to the RAW join files here, whereas the "Fix results" cell
# binds the same names to the derived subset files — the value in effect depends on which cell ran last.
repo_join_output_path = "../data/large_files/join_files/search_queries_repo_join_dataset.csv"
user_join_output_path = "../data/join_files/search_queries_user_join_dataset.csv"
# Arguments passed to check_for_joins_in_older_queries further down.
join_unique_field = 'search_query'
repo_filter_fields = ['full_name', 'cleaned_search_query']
user_filter_fields = ['login', 'cleaned_search_query']

In [11]:
# Raw ("initial") join files versus the derived subset ("existing") files, for repos and users.
initial_repo_join_output_path = "../data/large_files/join_files/search_queries_repo_join_dataset.csv"
existing_repo_join_output_path = "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"

initial_user_join_output_path = "../data/join_files/search_queries_user_join_dataset.csv"
existing_user_join_output_path = "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"

In [12]:
subset_terms = ['Digital Humanities']

In [13]:
# Work on copies so the frames loaded from the derived CSVs stay untouched.
# NOTE(review): this rebinds search_queries_repo_df / search_queries_user_df, which the "Fix results" cell also assigns.
search_queries_repo_df = search_queries_repo_join_df.copy()
search_queries_user_df = search_queries_user_join_df.copy()

In [14]:
# Normalise each query URL: decode %22 to a double quote and then drop every double quote,
# decode %3A to ':', and cut the URL at '&page' so all pages of one query share a key.
search_queries_user_df['cleaned_search_query'] = (
    search_queries_user_df['search_query']
    .str.replace('%22', '"')
    .str.replace('"', '')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str[0]
)
search_queries_repo_df['cleaned_search_query'] = (
    search_queries_repo_df['search_query']
    .str.replace('%22', '"')
    .str.replace('"', '')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str[0]
)

# Hand both frames to the project helper together with the older join files and the filter settings.
updated_search_queries_repo_df = check_for_joins_in_older_queries(
    repo_join_output_path,
    search_queries_repo_df,
    join_unique_field,
    repo_filter_fields,
    subset_terms,
)
updated_search_queries_user_df = check_for_joins_in_older_queries(
    user_join_output_path,
    search_queries_user_df,
    join_unique_field,
    user_filter_fields,
    subset_terms,
)

In [16]:
# Read the raw join files (paths set in the preceding configuration cell).
initial_search_queries_repo_df = pd.read_csv(initial_repo_join_output_path)
initial_search_queries_user_df  = pd.read_csv(initial_user_join_output_path)

In [17]:
# Same URL normalisation as for the derived frames: strip quotes (raw or %22-encoded),
# decode %3A, and drop everything from '&page' onward.
initial_search_queries_user_df['cleaned_search_query'] = (
    initial_search_queries_user_df['search_query']
    .str.replace('%22', '"')
    .str.replace('"', '')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str[0]
)
initial_search_queries_repo_df['cleaned_search_query'] = (
    initial_search_queries_repo_df['search_query']
    .str.replace('%22', '"')
    .str.replace('"', '')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str[0]
)

# Keep only rows whose source term is one of `subset_terms`.
initial_search_queries_repo_df = initial_search_queries_repo_df.loc[
    lambda frame: frame['search_term_source'].isin(subset_terms)
]
initial_search_queries_user_df = initial_search_queries_user_df.loc[
    lambda frame: frame['search_term_source'].isin(subset_terms)
]

# Stack the helper's output on top of the filtered raw rows (original indexes are kept).
search_queries_repo_df = pd.concat(
    [updated_search_queries_repo_df, initial_search_queries_repo_df]
)
search_queries_user_df = pd.concat(
    [updated_search_queries_user_df, initial_search_queries_user_df]
)

In [21]:
def fill_missing_language_data(rows: pd.DataFrame, is_repo: bool) -> pd.DataFrame:
    """Spread one language annotation across all rows of a single repo or user.

    Meant to be called once per group (grouped by ``full_name`` for repos or
    ``login`` for users). For each of ``detected_language``,
    ``detected_language_confidence``, ``finalized_language`` and
    ``keep_resource``, the first distinct non-null value in the group is
    written to every row; for the confidence, the maximum is used when several
    distinct values exist. A column with no non-null value is set to ``None``.

    Diagnostics are printed when the group has no finalized language at all,
    and when ``detected_language``, ``finalized_language`` or ``keep_resource``
    hold more than one distinct value (the first one wins).

    :param rows: The search queries data for one entity
    :type rows: pandas.DataFrame
    :param is_repo: Whether the search queries data is for repos or users
    :type is_repo: bool
    :return: A copy of ``rows`` with the four columns filled in; ``rows``
        itself is not modified
    :rtype: pandas.DataFrame
    """
    entity_type = 'Repo' if is_repo else 'User'
    field = 'full_name' if is_repo else 'login'

    def _entity_id():
        # Evaluated lazily: only needed when a diagnostic is actually printed.
        return rows.loc[rows[field].notna(), field].unique()[0]

    # Collect the distinct non-null candidates from the untouched input first.
    detected_language = rows.loc[rows['detected_language'].notna(), 'detected_language'].unique()

    known_confidence = rows.loc[rows['detected_language_confidence'].notna(), 'detected_language_confidence']
    detected_language_confidence = known_confidence.unique()
    if len(detected_language_confidence) > 1:
        # Several confidences are not treated as a conflict: the highest one is kept.
        detected_language_confidence = [known_confidence.max()]

    finalized_language = rows.loc[rows['finalized_language'].notna(), 'finalized_language'].unique()
    keep_resource = rows.loc[rows['keep_resource'].notna(), 'keep_resource'].unique()

    if len(finalized_language) == 0:
        print(f"No finalized language {len(rows)}, {rows.finalized_language.unique()}, {entity_type} {_entity_id()}")
    elif len(finalized_language) > 1:
        # Show the competing values; the first one is the one that gets written.
        print(finalized_language)

    # Write into a copy: groupby.apply callbacks should not mutate the group they receive.
    filled = rows.copy()
    filled['detected_language'] = detected_language[0] if len(detected_language) > 0 else None
    filled['detected_language_confidence'] = (
        detected_language_confidence[0] if len(detected_language_confidence) > 0 else None
    )
    filled['finalized_language'] = finalized_language[0] if len(finalized_language) > 0 else None
    filled['keep_resource'] = keep_resource[0] if len(keep_resource) > 0 else None

    # Confidence is absent from this check on purpose: it has already been reduced to one value above.
    if len(detected_language) > 1 or len(finalized_language) > 1 or len(keep_resource) > 1:
        print(f"{entity_type} {_entity_id()}: Detected: {len(detected_language)}, Confidence: {detected_language_confidence}, Finalized: {len(finalized_language)}, Keep: {len(keep_resource)}")

    return filled

In [35]:
from tqdm import tqdm
from IPython.display import clear_output

In [36]:
# Enable `progress_apply` on pandas objects, with the given progress-bar label.
tqdm.pandas(desc="Fill missing language data")
# Run fill_missing_language_data once per repo (grouped by full_name) ...
cleaned_search_queries_repo_df = search_queries_repo_df.groupby(['full_name']).progress_apply(fill_missing_language_data, is_repo=True)
# Clear the cell output between the two runs; wait=True defers clearing until new output arrives.
clear_output(wait=True)
# ... and once per user (grouped by login).
cleaned_search_queries_user_df = search_queries_user_df.groupby(['login']).progress_apply(fill_missing_language_data, is_repo=False)
clear_output(wait=True)

Fill missing language data:  13%|█▎        | 119/926 [00:00<00:01, 614.90it/s]

No finalized language 4, [nan], User 2enyoasamoah
No finalized language 6, [nan], User ANAVDUTT
No finalized language 16, [nan], User BCDH
No finalized language 16, [nan], User BYU-ODH
No finalized language 6, [nan], User BeatriceVaienti
No finalized language 10, [nan], User CDH-DevTeam
No finalized language 16, [nan], User CDH-SC
No finalized language 16, [nan], User CNMATDH
No finalized language 16, [nan], User CVCEeu-dh
No finalized language 6, [nan], User ChloeNewman
No finalized language 16, [nan], User DH-Cologne
No finalized language 6, [nan], User DHCodeReview
No finalized language 16, [nan], User DHLUW
No finalized language 16, [nan], User DHSIG
No finalized language 16, [nan], User DIGI-VUB
No finalized language 6, [nan], User DaKuschel
No finalized language 12, [nan], User Digitaalhumanitaaria
No finalized language 16, [nan], User Digital-Humanities-Centre
No finalized language 16, [nan], User Digital-Humanities-Creative-Lab
No finalized language 16, [nan], User DigitalHuman

Fill missing language data:  32%|███▏      | 292/926 [00:00<00:00, 775.76it/s]

No finalized language 6, [nan], User FrancescoDiCursi
No finalized language 16, [nan], User GhentCDH
No finalized language 16, [nan], User GroningenDH
No finalized language 6, [nan], User GusRiva
No finalized language 18, [nan], User HumanidadesDigitales
No finalized language 6, [nan], User JajwalyaRK
No finalized language 6, [nan], User JillBriggeman
No finalized language 6, [nan], User Kalo9603
No finalized language 16, [nan], User KeystoneDH
No finalized language 16, [nan], User King-s-Digital-Humanities
No finalized language 16, [nan], User LoyolaChicagoDigitalHumanities
No finalized language 16, [nan], User M-L-D-H
No finalized language 16, [nan], User Maynooth-Center-for-Digital-Humanities
No finalized language 6, [nan], User MescoCoder
No finalized language 16, [nan], User NYUADDH
No finalized language 6, [nan], User Princeton-CDH
No finalized language 16, [nan], User Python-Tutorials-for-Digital-Humanities
No finalized language 6, [nan], User RemoGrillo
No finalized language 6,

Fill missing language data:  49%|████▊     | 450/926 [00:00<00:00, 774.23it/s]

No finalized language 6, [nan], User alice13510
No finalized language 6, [nan], User andreaspataro
No finalized language 6, [nan], User angstigone
No finalized language 16, [nan], User bcdhbonn
No finalized language 6, [nan], User casglur
No finalized language 16, [nan], User comp-int-hum
No finalized language 6, [nan], User danieltepavac
No finalized language 16, [nan], User dh-trier
No finalized language 16, [nan], User dhc-barnard
No finalized language 16, [nan], User dhc-uob
No finalized language 16, [nan], User dhcbalamand
No finalized language 16, [nan], User dhdc
No finalized language 16, [nan], User dhh16
No finalized language 16, [nan], User dhh17
No finalized language 16, [nan], User dhh18
No finalized language 16, [nan], User dhh19
No finalized language 16, [nan], User dhh21
No finalized language 16, [nan], User dhh22
No finalized language 16, [nan], User dhhse
No finalized language 6, [nan], User dhinfra-at
No finalized language 16, [nan], User dhlab-epfl
No finalized langu

Fill missing language data:  67%|██████▋   | 621/926 [00:00<00:00, 819.56it/s]

No finalized language 6, [nan], User egibso10
No finalized language 6, [nan], User elizastuglik
No finalized language 6, [nan], User eugestumm
No finalized language 6, [nan], User exploratoriohd
No finalized language 6, [nan], User gdmeo
No finalized language 16, [nan], User go-dh
No finalized language 6, [nan], User gu-gridh
No finalized language 6, [nan], User hermann-bahr
No finalized language 16, [nan], User hridigital
No finalized language 6, [nan], User httpschiara
No finalized language 4, [nan], User hvm-uu
No finalized language 16, [nan], User idhmc-tamu
No finalized language 12, [nan], User idrhku
No finalized language 6, [nan], User imlabormitlea-code
No finalized language 6, [nan], User iserenko
No finalized language 6, [nan], User joshuavachon25
No finalized language 6, [nan], User juanfuc
No finalized language 6, [nan], User justinwigard
No finalized language 6, [nan], User karljazz74
No finalized language 6, [nan], User kbadly1
No finalized language 6, [nan], User kfitz
N

Fill missing language data:  87%|████████▋ | 804/926 [00:01<00:00, 865.36it/s]

No finalized language 18, [nan], User linhd-dev
No finalized language 6, [nan], User lisateichmann
No finalized language 4, [nan], User lunaparrafos
No finalized language 6, [nan], User lyang02
No finalized language 6, [nan], User maehr
No finalized language 18, [nan], User maestriahd
No finalized language 6, [nan], User manny-rocha
No finalized language 6, [nan], User martasoricetti
No finalized language 6, [nan], User mathewjordan
No finalized language 16, [nan], User matrix-msu
No finalized language 6, [nan], User michaelgfalk
No finalized language 2, [nan], User ngonthier
No finalized language 6, [nan], User oengin15
No finalized language 6, [nan], User olgagolgan
No finalized language 6, [nan], User pbd84
No finalized language 16, [nan], User princetoncdh
No finalized language 18, [nan], User redcolhd
No finalized language 6, [nan], User skotheim9
No finalized language 6, [nan], User t-lini
No finalized language 16, [nan], User ucdh


Fill missing language data:  97%|█████████▋| 900/926 [00:01<00:00, 890.66it/s]

No finalized language 16, [nan], User usf-dh
No finalized language 6, [nan], User valrighe
No finalized language 16, [nan], User villaitatti
No finalized language 16, [nan], User wludh
No finalized language 2, [nan], User yueyue4359


Fill missing language data: 100%|██████████| 926/926 [00:01<00:00, 665.86it/s]


In [37]:
# Recompute the normalised query key on the grouped results: remove double quotes
# (raw or %22-encoded), decode %3A to ':', and truncate at '&page'.
cleaned_search_queries_user_df['cleaned_search_query'] = (
    cleaned_search_queries_user_df['search_query']
    .str.replace('%22', '"')
    .str.replace('"', '')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str[0]
)
cleaned_search_queries_repo_df['cleaned_search_query'] = (
    cleaned_search_queries_repo_df['search_query']
    .str.replace('%22', '"')
    .str.replace('"', '')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str[0]
)

In [38]:
# Repos: missing query times get the placeholder date "2022-10-10", the column is parsed to
# datetimes (unparseable values become NaT), and only the most recent row per
# (full_name, cleaned_search_query) pair is kept.
cleaned_search_queries_repo_df = (
    cleaned_search_queries_repo_df
    .assign(
        search_query_time=lambda frame: pd.to_datetime(
            frame['search_query_time'].fillna("2022-10-10"), errors='coerce'
        )
    )
    .sort_values(by=['search_query_time'], ascending=False)
    .drop_duplicates(subset=['full_name', 'cleaned_search_query'], keep='first')
)

# Users: same treatment, keyed on (login, cleaned_search_query).
cleaned_search_queries_user_df = (
    cleaned_search_queries_user_df
    .assign(
        search_query_time=lambda frame: pd.to_datetime(
            frame['search_query_time'].fillna("2022-10-10"), errors='coerce'
        )
    )
    .sort_values(by=['search_query_time'], ascending=False)
    .drop_duplicates(subset=['login', 'cleaned_search_query'], keep='first')
)

In [45]:
def _restore_digital_humanities_queries(search_df: pd.DataFrame, join_field: str) -> pd.DataFrame:
    """Overwrite ``cleaned_search_query`` for entities hit by the bare ``q="Humanities"`` query.

    :param search_df: Search queries data; modified in place
    :type search_df: pandas.DataFrame
    :param join_field: Column identifying the entity ('full_name' for repos, 'login' for users)
    :type join_field: str
    :return: The same frame, after any replacement"""
    is_digital_humanities = search_df.search_term_source == "Digital Humanities"
    # na=False: rows without a cleaned query simply do not match.
    has_bare_query = search_df.cleaned_search_query.str.contains('q="Humanities"', na=False)
    rows_to_fix = search_df[has_bare_query & is_digital_humanities]
    if len(rows_to_fix) == 0:
        return search_df

    # Every row of an affected entity is rewritten, not only the matching row.
    affected = search_df[join_field].isin(rows_to_fix[join_field])
    # When an entity has several "Digital Humanities" rows, the last one wins.
    replacement_by_key = (
        search_df.loc[affected & is_digital_humanities, [join_field, 'search_query']]
        .set_index(join_field)['search_query']
        .to_dict()
    )
    search_df.loc[affected, 'cleaned_search_query'] = search_df.loc[affected, join_field].map(replacement_by_key)
    return search_df


def fix_results(search_queries_repo_df: pd.DataFrame, search_queries_user_df: pd.DataFrame) -> tuple:
    """Fix the results of the search queries to ensure that the results are correct.

    For rows whose ``search_term_source`` is "Digital Humanities" but whose
    ``cleaned_search_query`` contains ``q="Humanities"``, the cleaned query of every
    row for that entity is replaced by the raw ``search_query`` of its
    "Digital Humanities" row. Both frames are modified in place.

    The user frame is keyed on ``login`` throughout; the previous version filtered it
    on ``full_name``, which is the repo key.

    :param search_queries_repo_df: The search queries data for repos
    :type search_queries_repo_df: pandas.DataFrame
    :param search_queries_user_df: The search queries data for users
    :type search_queries_user_df: pandas.DataFrame
    :return: The fixed repo and user search queries data, in that order
    :rtype: tuple of (pandas.DataFrame, pandas.DataFrame)"""
    search_queries_repo_df = _restore_digital_humanities_queries(search_queries_repo_df, 'full_name')
    search_queries_user_df = _restore_digital_humanities_queries(search_queries_user_df, 'login')
    return search_queries_repo_df, search_queries_user_df

In [46]:
# Run fix_results on both frames and rebind the names to the returned pair.
# NOTE(review): the preceding cells build `cleaned_search_queries_*_df`; confirm that
# these un-prefixed frames are the intended inputs.
search_queries_repo_df, search_queries_user_df = fix_results(search_queries_repo_df, search_queries_user_df)

In [49]:
# Sanity check: number of rows still duplicated on (join key, cleaned query), shown as (users, repos).
cleaned_search_queries_user_df.duplicated(subset=['login', 'cleaned_search_query']).sum(), cleaned_search_queries_repo_df.duplicated(subset=['full_name', 'cleaned_search_query']).sum()

(0, 0)

In [51]:
# Number of rows with no finalized_language, shown as (users, repos).
cleaned_search_queries_user_df.finalized_language.isna().sum(), cleaned_search_queries_repo_df.finalized_language.isna().sum()

(133, 314)

In [50]:
from data_generation_scripts.generate_translations import check_detect_language

In [52]:
def get_languages(search_df: pd.DataFrame, search_type: str) -> pd.DataFrame:
    """Get the languages for the search queries data.

    Missing values in the free-text column (``description`` for repos, ``bio``
    otherwise) are replaced with empty strings, then ``check_detect_language`` is
    applied to every row with a progress bar.

    ``is_repo`` now follows ``search_type``; it used to be hard-coded to ``True``,
    so user rows were processed as repos even though only ``bio`` had been prepared.
    NOTE(review): this assumes ``check_detect_language`` uses ``is_repo`` to choose
    between ``description`` and ``bio`` -- confirm against its definition.

    :param search_df: The search queries data for repos or users
    :type search_df: pandas.DataFrame
    :param search_type: The type of search queries data; treated as repo data when it contains 'repo'
    :type search_type: str
    :return: The search queries data with the languages added"""
    # Registers DataFrame.progress_apply; `tqdm` must already be imported in the notebook.
    tqdm.pandas(desc='Detecting language')
    is_repo = 'repo' in search_type
    if is_repo:
        search_df.description = search_df.description.fillna('')
    else:
        search_df.bio = search_df.bio.fillna('')
    search_df = search_df.progress_apply(check_detect_language, axis=1, is_repo=is_repo)
    return search_df

def clean_languages(search_df: pd.DataFrame, join_field: str) -> pd.DataFrame:
    """Fill ``finalized_language`` from ``detected_language`` using a fixed sequence of rules.

    The rules run in order and most of them only touch rows whose
    ``finalized_language`` is still missing at that point, so the order matters.
    The frame is modified in place and also returned.

    :param search_df: The search queries data; needs ``detected_language``,
        ``natural_language``, ``finalized_language`` and ``detected_language_confidence``,
        plus ``description``/``size`` (for 'full_name') or ``bio`` (for 'login')
    :type search_df: pandas.DataFrame
    :param join_field: 'full_name' for repo data or 'login' for user data; selects the extra rules at the end
    :type join_field: str
    :return: The search queries data with the languages cleaned"""
    # Language codes accepted as-is when detected.
    # NOTE(review): presumably codes the detector returns for English text -- confirm.
    english_langs = 'en, ny, ha, ig, lb, mg, sm, sn, st, tl, yo'
    english_langs = english_langs.split(', ')
    # Rule 1: detected language is in the list above -> finalize it (index-aligned assignment).
    search_df.loc[(search_df.detected_language.isin(
        english_langs)) & (search_df.finalized_language.isna()), 'finalized_language'] = search_df.detected_language
    # Rule 2: natural_language and detected_language agree exactly -> finalize it.
    search_df.loc[(search_df.natural_language == search_df.detected_language) & (search_df.finalized_language.isna()),
                  'finalized_language'] = search_df.detected_language
    # Rule 3: natural_language is 'zh' and the detected code contains 'zh'.
    # The guard looks only at unfinalized rows, but the assignment below has no
    # isna() filter, so it rewrites every matching row once the guard passes.
    needs_language = search_df[(search_df.detected_language.str.contains('zh', na=False)) & (search_df.natural_language == 'zh') & (search_df.finalized_language.isna())]
    if len(needs_language) > 0:
        search_df.loc[(search_df.detected_language.str.contains('zh', na=False)) & (search_df.natural_language == 'zh'), 'finalized_language'] = search_df.loc[(search_df.detected_language.str.contains('zh', na=False)) & (search_df.natural_language == 'zh'), 'detected_language']
    # Rule 4: both language columns contain 'fr' -> 'fr'. Same guard/assignment
    # asymmetry as rule 3; unlike rule 3, str.contains is called without na=False.
    needs_language =  search_df[(search_df.natural_language.str.contains('fr')) & (search_df.detected_language.str.contains('fr')) & (search_df.finalized_language.isna())]
    if len(needs_language) > 0:
        search_df.loc[(search_df.natural_language.str.contains('fr')) & (search_df.detected_language.str.contains('fr')), 'finalized_language'] = 'fr'
    # Rule 5: natural_language is exactly 'xh, zu' -> take the detected language.
    # The two sides use different masks (finalized missing vs. detected present);
    # the assignment aligns on the index.
    needs_language = search_df[(search_df.natural_language == 'xh, zu') & (search_df.finalized_language.isna())]
    if len(needs_language) > 0:
        search_df.loc[(search_df.natural_language == 'xh, zu') & (search_df.finalized_language.isna()), 'finalized_language'] = search_df.loc[(search_df.natural_language == 'xh, zu') & (search_df.detected_language.notna()), 'detected_language']
    # The remaining `= None` assignments only target rows whose finalized_language
    # is already missing, so they leave those rows without a language.
    search_df.loc[(search_df.finalized_language.isna()) & (
        search_df.detected_language_confidence < 0.5), 'finalized_language'] = None
    if join_field == 'full_name':

        # Repo rows: descriptions shorter than 30 characters, or no description and no detection.
        search_df.loc[(search_df.finalized_language.isna()) & (
        search_df.description.str.len() < 30), 'finalized_language'] = None
        search_df.loc[(search_df.detected_language.isna()) & (
            search_df.description.isna()) & (search_df.finalized_language.isna()), 'finalized_language'] = None
        # Flag repos with no detection, no description, size below 1 and no finalized language.
        search_df.loc[(search_df.detected_language.isna()) & (
            search_df.description.isna()) & (search_df['size'] < 1) & (search_df.finalized_language.isna()), 'keep_resource'] = False
    if join_field == 'login':
        # User rows: same checks against `bio`; no keep_resource flag is set here.
        search_df.loc[(search_df.finalized_language.isna()) & (
            search_df.bio.str.len() < 30), 'finalized_language'] = None
        search_df.loc[(search_df.detected_language.isna()) & (
            search_df.bio.isna() & (search_df.finalized_language.isna())), 'finalized_language'] = None
    return search_df

def clean_search_queries_data(search_df: pd.DataFrame, join_field: str, search_type: str) -> pd.DataFrame:
    """Clean the search queries data and try to determine as much as possible the exact language using automated language detection and natural language processing.

    Steps: drop duplicate (join_field, cleaned_search_query) rows; make sure
    ``keep_resource`` and ``finalized_language`` exist, converting the literal
    string 'None' in those columns to a real missing value; then, when no
    ``detected_language`` column exists yet, run language detection and cleaning.

    The 'None' replacement now targets only the column concerned. It used to be
    ``search_df.loc[mask] = None``, which blanked every column of the matching rows.

    :param search_df: The search queries data
    :type search_df: pandas.DataFrame
    :param join_field: The field to join the search queries data to the repo data
    :type join_field: str
    :param search_type: The type of search queries data
    :type search_type: str
    :return: The cleaned search queries data
    :rtype: pandas.DataFrame"""

    # drop_duplicates returns a new frame, so the caller's frame is not modified below.
    search_df = search_df.drop_duplicates(
        subset=[join_field, 'cleaned_search_query'])

    if 'keep_resource' not in search_df.columns:
        search_df['keep_resource'] = True
    else:
        search_df.loc[search_df.keep_resource == 'None', 'keep_resource'] = None

    if 'finalized_language' not in search_df.columns:
        search_df['finalized_language'] = None
    else:
        search_df.loc[search_df.finalized_language == 'None', 'finalized_language'] = None

    if 'detected_language' not in search_df.columns:
        search_df = get_languages(search_df, search_type)
        search_df = clean_languages(search_df, join_field)
    else:
        # Detection already ran once: only report how many rows lack both language
        # columns and how many have both. The incremental re-detection is disabled.
        subset_search_df = search_df[(search_df.detected_language.isna()) & (search_df.finalized_language.isna())]
        existing_search_df = search_df[(search_df.detected_language.notna()) & (search_df.finalized_language.notna())]
        print(len(subset_search_df), len(existing_search_df))
        # if len(subset_search_df) > 0:
        #     subset_search_df = get_languages(subset_search_df, search_type)

        # search_df = pd.concat([existing_search_df, subset_search_df])
        # search_df = clean_languages(search_df, join_field)
    return search_df

In [53]:
# Clean both frames: repos are keyed on 'full_name', users on 'login'.
search_queries_repo_df = clean_search_queries_data(search_queries_repo_df, 'full_name', 'repo')
search_queries_user_df = clean_search_queries_data(search_queries_user_df, 'login', 'user')

Detecting language:  14%|█▎        | 554/4080 [00:14<01:36, 36.48it/s]

In [61]:
import numpy as np

In [63]:
# Stack the current user join frame on the older one. No ignore_index is passed,
# so the original index labels are kept. Then show the combined row count.
test = pd.concat([search_queries_user_join_df, older_join_df])
len(test)

7790

In [64]:
# Start every row at the placeholder date, overwrite it with the recorded query time
# where one exists, then parse the column to datetimes (unparseable values become NaT).
test['cleaned_search_query_time'] = "2022-10-10"
has_query_time = test['search_query_time'].notna()
test.loc[has_query_time, 'cleaned_search_query_time'] = test.loc[has_query_time, 'search_query_time']
test['cleaned_search_query_time'] = pd.to_datetime(
    test['cleaned_search_query_time'], errors='coerce'
)

In [65]:
# Normalize the query URL: '%22' becomes a double quote, all double quotes are then
# removed, '%3A' becomes ':', and anything from '&page' onwards is dropped.
test['cleaned_search_query'] = (
    test['search_query']
    .str.replace('%22', '"')
    .str.replace('"', '')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str[0]
)

In [66]:
# Keep the most recent row for each combination of `subset_fields`.
newest_first = test.sort_values(by=['cleaned_search_query_time'], ascending=False)
test2 = newest_first.drop_duplicates(subset=subset_fields, keep='first')

In [92]:
# Work on copies: `df` is the current user join data, `older_df` the deduplicated
# old+new combination. Note that `df` rebinds the name used for the CSV loaded at the top.
df = search_queries_user_join_df.copy()
older_df = test2.copy()
# Row counts, shown as (new, older).
len(df), len(older_df)

(797, 930)

In [99]:
# Keep only the join keys of the older data and drop rows without a login.
# The explicit copy makes this an independent frame, so the later
# `subset_older_df['type_of_join'] = ...` assignment does not write into a slice.
subset_older_df = older_df[['login', 'cleaned_search_query']].reset_index(drop=True)
subset_older_df = subset_older_df[subset_older_df.login.notna()].copy()

In [101]:
# Join keys of the current data, with a fresh 0..n-1 index.
subset_df = df[['login', 'cleaned_search_query']].reset_index(drop=True)

In [102]:
# Tag each side so its origin is still visible after the merge.
subset_df['type_of_join'] = "new"
subset_older_df['type_of_join'] = "old"

In [105]:
# Outer-merge on the key pair; indicator=True adds a `_merge` column saying whether
# each pair came from the new side only, the old side only, or both.
merged_df = pd.merge(subset_df, subset_older_df, on=['login', 'cleaned_search_query'], how='outer', indicator=True)

In [119]:
# Key pairs that exist only in the older data.
missing_values = merged_df[merged_df._merge == 'right_only']

double_check = missing_values[subset_fields]
# Build a row mask over older_df: start all-True and AND in one isin() test per field.
# NOTE(review): each field is tested on its own, so a row is flagged when every one of
# its values appears somewhere in `double_check`, even if that exact combination does
# not -- confirm this is acceptable.
combined_condition = np.ones(len(older_df), dtype=bool)
for field in subset_fields:
    combined_condition = combined_condition & older_df[field].isin(double_check[field])
# Store the mask on older_df as a 0/1 column (this mutates older_df).
older_df['double_check'] = np.where(combined_condition, 1, 0)
# The second condition repeats the first field's isin() test already folded into the mask.
final_missing_values = older_df[(older_df.double_check == 1) & (older_df[subset_fields[0]].isin(double_check[subset_fields[0]]))]

In [120]:
# Inspect the older rows flagged as missing from the new data.
final_missing_values

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,hooks_url,issues_url,members_url,public_members_url,description,is_verified,has_organization_projects,has_repository_projects,double_check,cleaned_search_query_time
585,dhtaxonomy,6716560.0,MDEyOk9yZ2FuaXphdGlvbjY3MTY1NjA=,https://avatars.githubusercontent.com/u/671656...,,https://api.github.com/users/dhtaxonomy,https://github.com/dhtaxonomy,https://api.github.com/users/dhtaxonomy/followers,https://api.github.com/users/dhtaxonomy/follow...,https://api.github.com/users/dhtaxonomy/gists{...,...,,,,,,,,,1,2023-03-19
593,CDH-SC,10634990.0,MDEyOk9yZ2FuaXphdGlvbjEwNjM0OTkw,https://avatars.githubusercontent.com/u/106349...,,https://api.github.com/users/CDH-SC,https://github.com/CDH-SC,https://api.github.com/users/CDH-SC/followers,https://api.github.com/users/CDH-SC/following{...,https://api.github.com/users/CDH-SC/gists{/gis...,...,,,,,,,,,1,2023-03-19
599,adholibdh,17990648.0,MDEyOk9yZ2FuaXphdGlvbjE3OTkwNjQ4,https://avatars.githubusercontent.com/u/179906...,,https://api.github.com/users/adholibdh,https://github.com/adholibdh,https://api.github.com/users/adholibdh/followers,https://api.github.com/users/adholibdh/followi...,https://api.github.com/users/adholibdh/gists{/...,...,,,,,,,,,1,2023-03-19
568,dhh21,83238279.0,MDEyOk9yZ2FuaXphdGlvbjgzMjM4Mjc5,https://avatars.githubusercontent.com/u/832382...,,https://api.github.com/users/dhh21,https://github.com/dhh21,https://api.github.com/users/dhh21/followers,https://api.github.com/users/dhh21/following{/...,https://api.github.com/users/dhh21/gists{/gist...,...,,,,,,,,,1,2023-03-19
572,idrhku,17508677.0,MDEyOk9yZ2FuaXphdGlvbjE3NTA4Njc3,https://avatars.githubusercontent.com/u/175086...,,https://api.github.com/users/idrhku,https://github.com/idrhku,https://api.github.com/users/idrhku/followers,https://api.github.com/users/idrhku/following{...,https://api.github.com/users/idrhku/gists{/gis...,...,,,,,,,,,1,2023-03-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1153,XiaoyanYangAlice,121414040.0,U_kgDOBzyhmA,https://avatars.githubusercontent.com/u/121414...,,https://api.github.com/users/XiaoyanYangAlice,https://github.com/XiaoyanYangAlice,https://api.github.com/users/XiaoyanYangAlice/...,https://api.github.com/users/XiaoyanYangAlice/...,https://api.github.com/users/XiaoyanYangAlice/...,...,,,,,,,,,1,2023-03-19
3785,ngonthier,23408564.0,MDQ6VXNlcjIzNDA4NTY0,https://avatars.githubusercontent.com/u/234085...,,https://api.github.com/users/ngonthier,https://github.com/ngonthier,https://api.github.com/users/ngonthier/followers,https://api.github.com/users/ngonthier/followi...,https://api.github.com/users/ngonthier/gists{/...,...,,,,,,,,,1,2022-11-19
3786,yueyue4359,88612363.0,MDQ6VXNlcjg4NjEyMzYz,https://avatars.githubusercontent.com/u/886123...,,https://api.github.com/users/yueyue4359,https://github.com/yueyue4359,https://api.github.com/users/yueyue4359/followers,https://api.github.com/users/yueyue4359/follow...,https://api.github.com/users/yueyue4359/gists{...,...,,,,,,,,,1,2022-11-19
3787,hvm-uu,83591395.0,MDQ6VXNlcjgzNTkxMzk1,https://avatars.githubusercontent.com/u/835913...,,https://api.github.com/users/hvm-uu,https://github.com/hvm-uu,https://api.github.com/users/hvm-uu/followers,https://api.github.com/users/hvm-uu/following{...,https://api.github.com/users/hvm-uu/gists{/gis...,...,,,,,,,,,1,2022-11-12


In [106]:
# How many key pairs are in both frames, only in the older one, or only in the new one.
merged_df._merge.value_counts()

both          796
right_only    133
left_only       1
Name: _merge, dtype: int64

In [77]:
# Rows per key combination in the new and in the older data.
newer_counts = df.groupby(subset_fields).size().reset_index(name='new_counts')
older_counts = older_df.groupby(subset_fields).size().reset_index(name='older_counts')
# Left merge from the new side: combinations present only in the older data do not
# appear here, and `older_counts` is NaN for combinations present only in the new data.
merged_counts = pd.merge(newer_counts, older_counts, on=subset_fields, how='left')
# Flag combinations with fewer rows in the new data, or with no older count at all.
missing_values = merged_counts[(merged_counts.new_counts < merged_counts.older_counts) | (merged_counts.older_counts.isna())]
# Pull the matching older rows, one per combination, and report how many there are.
missing_join = pd.merge(older_df, missing_values[subset_fields], on=subset_fields, how='inner')
missing_join = missing_join.drop_duplicates(subset=subset_fields)
print(len(missing_join))
# Disabled per-field double check against `df`, kept for reference:
# double_check = missing_join[subset_fields]
# combined_condition = np.ones(len(df), dtype=bool)
# for field in subset_fields:
#     combined_condition = combined_condition & df[field].isin(double_check[field])
# df['double_check'] = np.where(combined_condition, 1, 0)
# final_missing_values = df[(df.double_check == 0) & (df[subset_fields[0]].isin(double_check[subset_fields[0]]))]
# print(len(final_missing_values))

794


In [81]:
# Number of distinct key combinations, shown as (new, older).
len(newer_counts), len(older_counts)

(796, 927)

In [82]:
# Key combinations whose row count differs between the older and the new data.
merged_counts[(merged_counts.older_counts != merged_counts.new_counts) ]

Unnamed: 0,login,cleaned_search_query,new_counts,older_counts
0,1r3n3,https://api.github.com/search/users?q=Digital+...,1,2
1,1ucyP,https://api.github.com/search/users?q=Digital+...,1,2
2,5colldh,https://api.github.com/search/users?q=Digital+...,1,2
3,ABC-DH,https://api.github.com/search/users?q=Digital+...,1,2
4,ADHO,https://api.github.com/search/users?q=Digital+...,1,2
...,...,...,...,...
791,yrochat,https://api.github.com/search/users?q=Digital+...,1,2
792,yukiyuqichen,https://api.github.com/search/users?q=Digital+...,1,2
793,zimgraz,https://api.github.com/search/users?q=Digital+...,1,2
794,zkmacdon,https://api.github.com/search/users?q=Digital+...,1,2


In [68]:
# Stack the key columns of the deduplicated combination and of the current user join data.
test3 = pd.concat([test2[['login', 'cleaned_search_query']], search_queries_user_join_df[['login', 'cleaned_search_query']]])

In [76]:
# For logins that are absent from the current user join data, count rows per cleaned query.
test2[(~test2.login.isin(search_queries_user_join_df.login))].cleaned_search_query.value_counts()

https://api.github.com/search/users?q=Digital+Humanities&per_page=100       123
https://api.github.com/search/users?q=Humanidades+digitales&per_page=100      7
https://api.github.com/search/users?q=Humanidades+Digitais&per_page=100       2
https://api.github.com/search/users?q=Digitaalhumanitaaria&per_page=100       1
Name: cleaned_search_query, dtype: int64

In [52]:
# Spot check: first cleaned query recorded for one login (raises IndexError if the login is absent).
search_queries_user_join_df[search_queries_user_join_df.login == "hdcaicyt"].cleaned_search_query.values[0]

'https://api.github.com/search/users?q=Humanidades+digitales&per_page=100'

In [49]:
# Distinct cleaned queries among rows of `merged_test` that have no `type_search` value.
# NOTE(review): `merged_test` is not defined in the cells shown here -- confirm it still exists on a fresh run.
merged_test[merged_test.type_search.isna()].cleaned_search_query.unique()

array(['https://api.github.com/search/users?q=Humanidades+digitales&per_page=100',
       'https://api.github.com/search/users?q=Informatica+umanistica&per_page=100',
       'https://api.github.com/search/users?q=Digital+Humanities&per_page=100'],
      dtype=object)

In [53]:
# Rows per key combination in the deduplicated data, largest first.
test2.groupby(subset_fields).size().reset_index(name='counts').sort_values(by=['counts'], ascending=False)

Unnamed: 0,login,cleaned_search_query,counts
0,1r3n3,https://api.github.com/search/users?q=Digital+...,1
624,johlei,https://api.github.com/search/users?q=Digital+...,1
612,jessgrimmer,https://api.github.com/search/users?q=Digital+...,1
613,jessprof,https://api.github.com/search/users?q=Humanida...,1
614,jeyrena1,https://api.github.com/search/users?q=Digital+...,1
...,...,...,...
314,aergithub,https://api.github.com/search/users?q=Digital+...,1
315,agnesecam,https://api.github.com/search/users?q=Informat...,1
316,agustinjaramillo,https://api.github.com/search/users?q=Digital+...,1
317,aiucd,https://api.github.com/search/users?q=Informat...,1


#### Check if all items exist in entity files

In [3]:
# Repos from the search join that are not yet in the repo entity frame.
repo_is_known = search_queries_repo_join_df.full_name.isin(repo_df.full_name)
missing_repos = search_queries_repo_join_df[~repo_is_known]

# Accounts from the search join that are not in the user entity frame, split by account type.
login_is_unknown = ~search_queries_user_join_df.login.isin(user_df.login)
account_type = search_queries_user_join_df['type']
missing_users = search_queries_user_join_df[login_is_unknown & (account_type == 'User')]
missing_orgs = search_queries_user_join_df[login_is_unknown & (account_type == 'Organization')]

len(missing_repos), len(missing_users), len(missing_orgs)

(0, 0, 0)

In [4]:
# Hand any missing entities to the project's check_add_* helpers and rebind the
# entity frames to what they return.
# NOTE(review): these paths start at '../data/', while the CSV loaded at the top of the
# notebook lives under '../../datasets/' -- confirm the working directory.
if len(missing_repos) > 0:
    repo_df = check_add_repos(missing_repos, '../data/large_files/entity_files/repos_dataset.csv', True)
if len(missing_orgs) > 0:
    org_df = check_add_orgs(missing_orgs, '../data/entity_files/orgs_dataset.csv', True, False)
if len(missing_users) > 0:
    user_df = check_add_users(missing_users, '../data/entity_files/users_dataset.csv', True, False)

In [5]:
# Columns carried over from the search join data onto the entity frames.
language_cols = ['finalized_language', 'keep_resource']

# Repos found by a search query, one row per full_name.
repo_search_info = search_queries_repo_join_df[['full_name'] + language_cols]
core_repos = repo_df.merge(repo_search_info, on='full_name', how='inner')
core_repos = core_repos.drop_duplicates(subset=['full_name'])

# Accounts found by a search query, one row per login, then split by account type.
account_search_info = search_queries_user_join_df[['login'] + language_cols]
core_accounts = user_df.merge(account_search_info, on='login', how='inner')
core_accounts = core_accounts.drop_duplicates(subset=['login'])
core_orgs = core_accounts[core_accounts['type'] == 'Organization']
core_users = core_accounts[core_accounts['type'] == 'User']

len(core_repos), len(core_users), len(core_orgs)

(2485, 736, 190)

In [None]:
# Locations of previously saved core datasets. Only core_users_path is used in this
# cell; the repo and org paths are assigned but not read here.
core_users_path = "../data/derived_files/core_users_dataset.csv"
core_repos_path = "../data/derived_files/core_repos_dataset.csv"
core_orgs_path = "../data/derived_files/core_orgs_dataset.csv"

# `os` must already be imported in the notebook.
if os.path.exists(core_users_path):
    existing_core_users = pd.read_csv(core_users_path)

    # Columns present in the saved file but absent from the freshly built frame
    # are brought over by login.
    missing_cols = [col for col in existing_core_users.columns if col not in core_users.columns]
    if len(missing_cols) > 0:
        missing_cols = missing_cols + ['login']
        added_cols = existing_core_users[missing_cols]
        core_users = pd.merge(core_users, added_cols, on='login', how='left')
        core_users = core_users.drop_duplicates(subset=['login'])

    # Only when the new frame has more rows: keep the saved rows and append the
    # logins that are not in the saved file.
    # NOTE(review): this compares row counts only, so a different set of logins of
    # equal or smaller size leaves core_users as built above -- confirm that is intended.
    if len(core_users) > len(existing_core_users):
        updated_core_users = core_users[~core_users.login.isin(existing_core_users.login)]
        core_users = pd.concat([existing_core_users, updated_core_users])

In [132]:
# Write the three core frames to CSV without the index column.
initial_outputs = (
    (core_users, "../data/derived_files/initial_core_users.csv"),
    (core_orgs, "../data/derived_files/initial_core_orgs.csv"),
    (core_repos, "../data/derived_files/initial_core_repos.csv"),
)
for core_frame, output_path in initial_outputs:
    core_frame.to_csv(output_path, index=False)