In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("../../")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file, read_combine_files, check_total_pages, check_for_joins_in_older_queries
from data_generation_scripts.generate_search_data import get_initial_search_datasets
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile, get_total_commits
from data_generation_scripts.generate_repo_users_interactions import get_repos_user_actors
from data_generation_scripts.generate_repo_metadata import check_total_results

In [5]:
# Values of `search_term_source` that the query tables are filtered on.
subset_terms = ["Digital Humanities"]
# console = Console()

# Repository-side locations.
initial_repo_output_path = "../../data/repo_data/"
repo_output_path = "../../data/large_files/entity_files/repos_dataset.csv"
initial_repo_join_output_path = "../../data/large_files/join_files/search_queries_repo_join_dataset.csv"
existing_search_queries_repo_file_path = "../../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"

# User/organisation-side locations.
# NOTE(review): the repo entity/join paths above live under data/large_files/
# while the user/org ones below do not — confirm the difference is intentional.
initial_user_output_path = "../../data/user_data/"
user_output_path = "../../data/entity_files/users_dataset.csv"
org_output_path = "../../data/entity_files/orgs_dataset.csv"
initial_user_join_output_path = "../../data/join_files/search_queries_user_join_dataset.csv"
existing_search_queries_user_file_path = "../../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"

In [6]:
# Arguments handed to check_for_joins_in_older_queries for the repo and user tables.
# NOTE(review): these are bare file names with no directory, unlike the other
# paths in this notebook — presumably the helper resolves them; verify.
repo_join_output_path = "search_queries_repo_join_dataset.csv"
user_join_output_path = "search_queries_user_join_dataset.csv"
# Column names passed to the helper as the join key and the filter columns.
join_unique_field = 'search_query'
filter_fields = ['id', 'cleaned_search_query']

In [8]:
def _strip_query_markup(search_query):
    """Normalise a Series of raw search-query URLs.

    Removes URL-encoded ('%22') and literal double quotes, decodes '%3A' to
    ':' and drops everything from the first '&page' onwards.  All replacements
    are literal (``regex=False``) so the result does not depend on the pandas
    version's default for ``str.replace``.
    """
    return (
        search_query
        .str.replace('%22', '', regex=False)
        .str.replace('"', '', regex=False)
        .str.replace('%3A', ':', regex=False)
        .str.split('&page')
        .str[0]
    )


existing_search_queries_user_df = pd.read_csv(existing_search_queries_user_file_path)
existing_search_queries_repo_df = pd.read_csv(existing_search_queries_repo_file_path)

# Keep only the selected search terms.  `.copy()` makes each filtered frame
# independent, so the column added below is written to a real frame rather
# than to a slice (the corresponding pandas warning is disabled globally).
existing_search_queries_user_df = existing_search_queries_user_df[
    existing_search_queries_user_df.search_term_source.isin(subset_terms)
].copy()
existing_search_queries_repo_df = existing_search_queries_repo_df[
    existing_search_queries_repo_df.search_term_source.isin(subset_terms)
].copy()

# Same cleaning for both tables.
# NOTE(review): quotes are stripped here, whereas the cleaning applied to the
# initial join files keeps them — confirm the two are meant to differ.
existing_search_queries_user_df['cleaned_search_query'] = _strip_query_markup(
    existing_search_queries_user_df['search_query'])
existing_search_queries_repo_df['cleaned_search_query'] = _strip_query_markup(
    existing_search_queries_repo_df['search_query'])

updated_search_queries_repo_df = check_for_joins_in_older_queries(repo_join_output_path, existing_search_queries_repo_df, join_unique_field, filter_fields)
updated_search_queries_user_df = check_for_joins_in_older_queries(user_join_output_path, existing_search_queries_user_df, join_unique_field, filter_fields)
# updated_search_queries_repo_df = updated_search_queries_repo_df.drop_duplicates(subset=['id', 'cleaned_search_query'])
# updated_search_queries_user_df = updated_search_queries_user_df.drop_duplicates(subset=['id', 'cleaned_search_query'])

In [10]:
# Row counts before and after the join check: repos first, then users.
tuple(
    len(frame)
    for frame in (
        existing_search_queries_repo_df,
        updated_search_queries_repo_df,
        existing_search_queries_user_df,
        updated_search_queries_user_df,
    )
)

(2649, 2649, 932, 932)

In [9]:
# Existing rows without a finalized language: (repos, users).
(
    int(existing_search_queries_repo_df['finalized_language'].isna().sum()),
    int(existing_search_queries_user_df['finalized_language'].isna().sum()),
)

(74, 0)

In [12]:
# Updated rows without a finalized language: (repos, users).
(
    int(updated_search_queries_repo_df['finalized_language'].isna().sum()),
    int(updated_search_queries_user_df['finalized_language'].isna().sum()),
)

(74, 0)

In [11]:
# Distribution of finalized languages across the updated repo rows.
updated_search_queries_repo_df['finalized_language'].value_counts()

en         1984
zh-CN       125
it          104
fr           71
es           69
de           56
pt           36
ru           29
vi           16
zh-CH        12
he            8
zh-TW         8
da            7
ko            7
en, de        6
cs            5
sv            4
ja            3
nl            3
no            2
lb            2
zh            2
de, en        2
bg-Latn       1
pl            1
fil           1
en, he        1
fi            1
en, nl        1
id            1
la            1
th            1
en, ru        1
hu            1
it, en        1
en, it        1
et            1
Name: finalized_language, dtype: int64

In [25]:
def _load_initial_search_queries(csv_path):
    """Read one join CSV, add `cleaned_search_query`, keep the `subset_terms` rows.

    The cleaned query decodes '%22' to a double quote and '%3A' to ':' and
    drops everything from the first '&page' onwards.
    """
    frame = pd.read_csv(csv_path)
    decoded = frame['search_query'].str.replace('%22', '"').str.replace('%3A', ':')
    frame['cleaned_search_query'] = decoded.str.split('&page').str[0]
    wanted = frame.search_term_source.isin(subset_terms)
    return frame[wanted]


initial_search_queries_repo_df = _load_initial_search_queries(initial_repo_join_output_path)
initial_search_queries_user_df = _load_initial_search_queries(initial_user_join_output_path)

# Stack the updated rows on top of the freshly loaded initial rows.
search_queries_repo_df = pd.concat(
    [updated_search_queries_repo_df, initial_search_queries_repo_df])
search_queries_user_df = pd.concat(
    [updated_search_queries_user_df, initial_search_queries_user_df])

In [15]:
# Repo rows without a finalized language: (combined table, updated table only).
(
    int(search_queries_repo_df['finalized_language'].isna().sum()),
    int(updated_search_queries_repo_df['finalized_language'].isna().sum()),
)

(2514, 74)

In [16]:
# Names of the language-annotation and search bookkeeping columns.
# NOTE(review): `cols` is not referenced by any other cell visible here —
# confirm it is still needed.
cols = [
    'natural_language',
    'search_type',
    'search_query_time',
    'detected_language',
    'detected_language_confidence',
    'finalized_language',
    'keep_resource',
    'query',
    'updated_natural_language',
    'double_check',
    'cleaned_search_query',
]

In [17]:
# Distribution of the raw `natural_language` values in the combined repo table.
search_queries_repo_df['natural_language'].value_counts()

en, ny, ha, ig, lb, mg, sm, sn, st, tl, yo    4459
zh                                             153
it                                             133
es                                             113
fr, ln                                          91
pt                                              33
vi                                              32
en                                              23
xh, zu                                          12
de                                               8
da                                               7
fy                                               6
id                                               6
sv                                               3
et                                               2
fr                                               2
ru                                               2
fi                                               2
ko                                               2
Name: natural_language, dtype: 

In [18]:
from tqdm import tqdm

In [27]:
def fill_missing_language_data(rows, is_repo):
    """Copy the known language annotations onto every row of one entity.

    Written for ``DataFrame.groupby(...).apply``: ``rows`` holds all search
    results of a single repo or user.  For each of ``detected_language``,
    ``finalized_language`` and ``keep_resource`` the first non-null value in
    the group is written to every row; ``detected_language_confidence`` is set
    to the largest non-null value.  A column with no non-null value is set to
    ``None``.

    Two diagnostics are printed: one when the group has no finalized language
    at all, and one when a group carries more than one distinct detected
    language, finalized language or keep flag (the first value still wins).
    Differing confidences are resolved with the maximum and are not reported.

    Parameters
    ----------
    rows : pandas.DataFrame
        Group to fill.  Must contain the four annotation columns plus the
        identifier column (``full_name`` when ``is_repo`` else ``login``).
    is_repo : bool
        Selects the identifier column and the label used in the messages.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``rows``; the input frame is left untouched.
    """
    entity_type = 'Repo' if is_repo else 'User'
    field = 'full_name' if is_repo else 'login'

    # pandas does not support mutating the group handed to groupby.apply,
    # so every write below goes to a private copy.
    rows = rows.copy()

    def first_or_none(values):
        return values[0] if len(values) > 0 else None

    # Collect the candidates for every column before any column is overwritten.
    detected_language = rows['detected_language'].dropna().unique()
    confidence_values = rows['detected_language_confidence'].dropna()
    finalized_language = rows['finalized_language'].dropna().unique()
    keep_resource = rows['keep_resource'].dropna().unique()
    unique_id = first_or_none(rows[field].dropna().unique())

    if len(finalized_language) == 0:
        print(f"No finalized language {len(rows)}, {rows['finalized_language'].unique()}, {entity_type} {unique_id}")

    confidence = confidence_values.max() if len(confidence_values) > 0 else None

    rows['detected_language'] = first_or_none(detected_language)
    rows['detected_language_confidence'] = confidence
    rows['finalized_language'] = first_or_none(finalized_language)
    rows['keep_resource'] = first_or_none(keep_resource)

    if len(finalized_language) > 1:
        # Show the competing values so the arbitrary "first wins" choice can be audited.
        print(list(finalized_language))
    if len(detected_language) > 1 or len(finalized_language) > 1 or len(keep_resource) > 1:
        print(f"{entity_type} {unique_id}: Detected: {len(detected_language)}, Confidence: {confidence}, Finalized: {len(finalized_language)}, Keep: {len(keep_resource)}")

    return rows

In [28]:
# Register `progress_apply` on pandas objects with a shared progress-bar label.
tqdm.pandas(desc="Fill missing language data")

# Fill the language columns per repo (grouped by full_name) ...
cleaned_search_queries_repo_df = (
    search_queries_repo_df
    .groupby(['full_name'])
    .progress_apply(fill_missing_language_data, is_repo=True)
)
# ... and per user (grouped by login).
cleaned_search_queries_user_df = (
    search_queries_user_df
    .groupby(['login'])
    .progress_apply(fill_missing_language_data, is_repo=False)
)

Fill missing language data:   6%|▋         | 158/2498 [00:00<00:02, 801.78it/s]

No finalized language 3, [nan], Repo AlenaRehder/Digital-Humanities
No finalized language 3, [nan], Repo GiacomoZanon/Esame-Informatica-Umanistica
No finalized language 3, [nan], Repo Ginasanchez1/digital-humanities


Fill missing language data:  18%|█▊        | 456/2498 [00:00<00:02, 947.60it/s]

No finalized language 3, [nan], Repo Jennysarah1995/Digital-Humanities
No finalized language 3, [nan], Repo John-M-Walls/Digital-Humanities
No finalized language 3, [nan], Repo Jukka-Sarasti/Digital-Humanities
No finalized language 3, [nan], Repo OverlandBaggles/Digital-Humanities
No finalized language 3, [nan], Repo PiaWstmr/Digital-Humanities-Versionierung
No finalized language 3, [nan], Repo PlayDHCU5000/Oblique-Generosity-Play-Digital-Humanities-


Fill missing language data:  30%|███       | 753/2498 [00:00<00:01, 961.95it/s]

No finalized language 3, [nan], Repo RumeysaSara/DigitalHumanitiesTraining
No finalized language 2, [nan], Repo StagerLee/To-Add-Digital-Humanities-LAB
No finalized language 3, [nan], Repo Tapasi92/digital-humanities
No finalized language 3, [nan], Repo acmk19/DigitalHumanitiesAMK
No finalized language 3, [nan], Repo ada-FU/digital-humanities
No finalized language 3, [nan], Repo allanchoubc/Digital-Humanities


Fill missing language data:  38%|███▊      | 944/2498 [00:01<00:01, 916.29it/s]

No finalized language 3, [nan], Repo arnobaer/schweickhardt-data
No finalized language 3, [nan], Repo bgammans/Digital-Humanities
No finalized language 3, [nan], Repo biancabook/DigitalHumanities
No finalized language 3, [nan], Repo bishin/Digital-Humanities
No finalized language 3, [nan], Repo chiaraaileen/Digital_Humanities_Project2020


Fill missing language data:  50%|████▉     | 1244/2498 [00:01<00:01, 967.53it/s]

No finalized language 3, [nan], Repo derekjarvis15/DigitalHumanities-Fa19
No finalized language 3, [nan], Repo digihumanities/Digital-Humanities


Fill missing language data:  58%|█████▊    | 1437/2498 [00:01<00:01, 890.30it/s]

No finalized language 3, [nan], Repo hanazaply/Group11-DigitalHumanities
No finalized language 3, [nan], Repo jeanlammie/DigitalHumanities
No finalized language 3, [nan], Repo jessicastimel/Digital-Humanities


Fill missing language data:  73%|███████▎  | 1836/2498 [00:01<00:00, 966.48it/s]

No finalized language 3, [nan], Repo juliahoribe/Digital-Humanities-
No finalized language 3, [nan], Repo juliamendros/digital-humanities
No finalized language 3, [nan], Repo kristofhlavacka/digital-humanities
No finalized language 3, [nan], Repo mcait9/Digital-Humanities-Glossing


Fill missing language data:  82%|████████▏ | 2037/2498 [00:02<00:00, 964.51it/s]

No finalized language 3, [nan], Repo mnluke/Like-Digital-Humanities
No finalized language 3, [nan], Repo muhamadrofiq/Digital-Humanities
No finalized language 3, [nan], Repo nikolettarok/Digital_Humanities-1


Fill missing language data:  94%|█████████▍| 2346/2498 [00:02<00:00, 997.23it/s]

No finalized language 3, [nan], Repo sgsinclair/digital-humanities
No finalized language 3, [nan], Repo srand294/Digital-Humanities
No finalized language 3, [nan], Repo tsinasky/digital-humanities
No finalized language 3, [nan], Repo vica5699/digital-humanities


Fill missing language data: 100%|██████████| 2498/2498 [00:03<00:00, 664.05it/s] 
Fill missing language data: 100%|██████████| 926/926 [00:01<00:00, 695.35it/s]


In [29]:
# One line per entity type: rows in the initial table, rows in the combined table.
for initial_frame, combined_frame in (
    (initial_search_queries_repo_df, search_queries_repo_df),
    (initial_search_queries_user_df, search_queries_user_df),
):
    print(len(initial_frame), len(combined_frame))

2440 5089
1710 2642


In [30]:
# Repo rows that still have no finalized language after the per-repo fill.
len(cleaned_search_queries_repo_df.loc[cleaned_search_queries_repo_df['finalized_language'].isna()])

107

In [31]:
def _keep_latest_per_query(frame):
    """Return `frame` reduced to the most recent row per (id, cleaned_search_query).

    Missing `search_query_time` values are first set to "2022-10-10", the
    column is parsed to datetimes (unparseable values become NaT), and after
    sorting newest-first only the first row of each pair is kept.
    """
    frame = frame.copy()
    no_timestamp = frame['search_query_time'].isna()
    frame.loc[no_timestamp, 'search_query_time'] = "2022-10-10"
    frame['search_query_time'] = pd.to_datetime(frame['search_query_time'], errors='coerce')
    newest_first = frame.sort_values(by=['search_query_time'], ascending=False)
    return newest_first.drop_duplicates(subset=['id', 'cleaned_search_query'], keep='first')


cleaned_search_queries_repo_df = _keep_latest_per_query(cleaned_search_queries_repo_df)
cleaned_search_queries_user_df = _keep_latest_per_query(cleaned_search_queries_user_df)

In [32]:
print(len(cleaned_search_queries_repo_df), len(cleaned_search_queries_user_df))

2982 1767


In [33]:
# Find rows of the "Digital Humanities" term whose cleaned query contains the
# literal text q="Humanities"; for the repos/users they belong to, the cleaned
# query is overwritten below with a raw `search_query` value.
fix_repo_queries = cleaned_search_queries_repo_df[(cleaned_search_queries_repo_df.cleaned_search_query.str.contains('q="Humanities"')) & (cleaned_search_queries_repo_df.search_term_source == "Digital Humanities")]
fix_user_queries = cleaned_search_queries_user_df[(cleaned_search_queries_user_df.cleaned_search_query.str.contains('q="Humanities"')) & (cleaned_search_queries_user_df.search_term_source == "Digital Humanities")]
# Only the repo count is reported; the user count is not printed.
print(len(fix_repo_queries))
if len(fix_repo_queries) > 0:
    # Lookup source: every "Digital Humanities" row of an affected repo.
    replace_repo_queries = cleaned_search_queries_repo_df[(cleaned_search_queries_repo_df.full_name.isin(fix_repo_queries.full_name)) & (cleaned_search_queries_repo_df.search_term_source == "Digital Humanities")][['full_name', 'search_query']]
    # The target mask covers all rows of an affected repo, whatever their
    # search_term_source; each gets the value mapped from its full_name.
    # NOTE(review): when a full_name occurs several times in the lookup the
    # dict holds a single value per key (presumably the last) — confirm.
    cleaned_search_queries_repo_df.loc[cleaned_search_queries_repo_df.full_name.isin(fix_repo_queries.full_name), 'cleaned_search_query'] = cleaned_search_queries_repo_df.loc[cleaned_search_queries_repo_df.full_name.isin(fix_repo_queries.full_name), 'full_name'].map(replace_repo_queries.set_index('full_name').to_dict()['search_query'])
    
if len(fix_user_queries) > 0:
    # Same repair for users, keyed on login.
    replace_user_queries = cleaned_search_queries_user_df[(cleaned_search_queries_user_df.login.isin(fix_user_queries.login)) & (cleaned_search_queries_user_df.search_term_source == "Digital Humanities")][['login', 'search_query']]
    cleaned_search_queries_user_df.loc[cleaned_search_queries_user_df.login.isin(fix_user_queries.login), 'cleaned_search_query'] = cleaned_search_queries_user_df.loc[cleaned_search_queries_user_df.login.isin(fix_user_queries.login), 'login'].map(replace_user_queries.set_index('login').to_dict()['search_query'])

0


In [34]:
# Identifier column and entity label for each table: users first, then repos.
join_user_field, search_user_type = 'login', 'user'
join_repo_field, search_repo_type = 'full_name', 'repo'

In [36]:
def _normalise_none_marker(frame, column, default):
    """Make sure `column` exists in `frame` and holds no literal 'None' strings.

    A missing column is created and filled with `default`.  Otherwise cells
    equal to the string 'None' are replaced by a real null.  `frame` is
    modified in place and also returned.
    """
    if column not in frame.columns:
        frame[column] = default
    else:
        # The column label is essential: `frame.loc[mask] = None` without it
        # blanks every column of the matching rows, not just this one.
        frame.loc[frame[column] == 'None', column] = None
    return frame


# One row per (entity, cleaned query).  `.copy()` gives independent frames so
# the column writes below do not target a slice of the cleaned tables.
search_repo_df = cleaned_search_queries_repo_df.drop_duplicates(
    subset=[join_repo_field, 'cleaned_search_query']).copy()
search_user_df = cleaned_search_queries_user_df.drop_duplicates(
    subset=[join_user_field, 'cleaned_search_query']).copy()

for frame in (search_repo_df, search_user_df):
    _normalise_none_marker(frame, 'keep_resource', True)
    _normalise_none_marker(frame, 'finalized_language', None)

# Rows with neither a detected nor a finalized language ...
subset_search_repo_df = search_repo_df[(search_repo_df.detected_language.isna()) & (search_repo_df.finalized_language.isna())]
subset_search_user_df = search_user_df[(search_user_df.detected_language.isna()) & (search_user_df.finalized_language.isna())]

# ... and rows that have both.
# NOTE(review): rows with exactly one of the two values fall into neither
# frame — confirm they are meant to be left out.
existing_search_repo_df = search_repo_df[(search_repo_df.detected_language.notna()) & (search_repo_df.finalized_language.notna())]
existing_search_user_df = search_user_df[(search_user_df.detected_language.notna()) & (search_user_df.finalized_language.notna())]

print(len(subset_search_repo_df), len(existing_search_repo_df))
print(len(subset_search_user_df), len(existing_search_user_df))

# subset_search_repo_df.loc[(subset_search_repo_df.id.isin(existing_search_repo_df.id.unique()), 'detected_language')] = existing_search_repo_df.set_index('id').to_dict()['detected_language']
# subset_search_user_df.loc[(subset_search_user_df.id.isin(existing_search_user_df.id.unique()), 'detected_language')] = existing_search_user_df.set_index('id').to_dict()['detected_language']

# subset_search_repo_df.loc[(subset_search_repo_df.id.isin(existing_search_repo_df.id.unique()), 'detected_language_confidence')] = existing_search_repo_df.set_index('id').to_dict()['detected_language_confidence']
# subset_search_user_df.loc[(subset_search_user_df.id.isin(existing_search_user_df.id.unique()), 'detected_language_confidence')] = existing_search_user_df.set_index('id').to_dict()['detected_language_confidence']

# subset_search_repo_df.loc[(subset_search_repo_df.id.isin(existing_search_repo_df.id.unique()), 'finalized_language')] = existing_search_repo_df.set_index('id').to_dict()['finalized_language']
# subset_search_user_df.loc[(subset_search_user_df.id.isin(existing_search_user_df.id.unique()), 'finalized_language')] = existing_search_user_df.set_index('id').to_dict()['finalized_language']

# print(len(subset_search_repo_df), len(existing_search_repo_df))
# print(len(subset_search_user_df), len(existing_search_user_df))


38 2645
1 1513


In [38]:
# Inspect the repo rows selected for having neither a detected nor a finalized language.
subset_search_repo_df

Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,search_type,search_query_time,detected_language,detected_language_confidence,finalized_language,cleaned_search_query,keep_resource,query,updated_natural_language,double_check
5487,234679428.0,MDEwOlJlcG9zaXRvcnkyMzQ2Nzk0Mjg=,digital-humanities,vica5699/digital-humanities,0.0,https://github.com/vica5699/digital-humanities,,0.0,https://api.github.com/repos/vica5699/digital-...,https://api.github.com/repos/vica5699/digital-...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5494,245225767.0,MDEwOlJlcG9zaXRvcnkyNDUyMjU3Njc=,Digital-Humanities,srand294/Digital-Humanities,0.0,https://github.com/srand294/Digital-Humanities,,0.0,https://api.github.com/repos/srand294/Digital-...,https://api.github.com/repos/srand294/Digital-...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5258,339725593.0,MDEwOlJlcG9zaXRvcnkzMzk3MjU1OTM=,digital-humanities,kristofhlavacka/digital-humanities,0.0,https://github.com/kristofhlavacka/digital-hum...,,0.0,https://api.github.com/repos/kristofhlavacka/d...,https://api.github.com/repos/kristofhlavacka/d...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5261,436578763.0,R_kgDOGgWpyw,Digital-Humanities,jessicastimel/Digital-Humanities,0.0,https://github.com/jessicastimel/Digital-Human...,,0.0,https://api.github.com/repos/jessicastimel/Dig...,https://api.github.com/repos/jessicastimel/Dig...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5266,429894035.0,R_kgDOGZ-pkw,digital-humanities,Ginasanchez1/digital-humanities,0.0,https://github.com/Ginasanchez1/digital-humani...,,0.0,https://api.github.com/repos/Ginasanchez1/digi...,https://api.github.com/repos/Ginasanchez1/digi...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5265,397700199.0,MDEwOlJlcG9zaXRvcnkzOTc3MDAxOTk=,Digital-Humanities,digihumanities/Digital-Humanities,0.0,https://github.com/digihumanities/Digital-Huma...,,0.0,https://api.github.com/repos/digihumanities/Di...,https://api.github.com/repos/digihumanities/Di...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5262,423425205.0,R_kgDOGTz0tQ,Digital-Humanities,AlenaRehder/Digital-Humanities,0.0,https://github.com/AlenaRehder/Digital-Humanities,,0.0,https://api.github.com/repos/AlenaRehder/Digit...,https://api.github.com/repos/AlenaRehder/Digit...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5241,423459974.0,R_kgDOGT18hg,DigitalHumanitiesTraining,RumeysaSara/DigitalHumanitiesTraining,0.0,https://github.com/RumeysaSara/DigitalHumaniti...,,0.0,https://api.github.com/repos/RumeysaSara/Digit...,https://api.github.com/repos/RumeysaSara/Digit...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5307,375665107.0,MDEwOlJlcG9zaXRvcnkzNzU2NjUxMDc=,Digital_Humanities-1,nikolettarok/Digital_Humanities-1,0.0,https://github.com/nikolettarok/Digital_Humani...,,0.0,https://api.github.com/repos/nikolettarok/Digi...,https://api.github.com/repos/nikolettarok/Digi...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,
5732,580109807.0,R_kgDOIpPF7w,Group11-DigitalHumanities,hanazaply/Group11-DigitalHumanities,0.0,https://github.com/hanazaply/Group11-DigitalHu...,,0.0,https://api.github.com/repos/hanazaply/Group11...,https://api.github.com/repos/hanazaply/Group11...,...,searched,2023-03-30,,,,https://api.github.com/search/repositories?q=D...,False,,,


In [48]:
# Rebuild the full tables from the annotated and un-annotated parts.
# NOTE(review): the output recorded under this cell does not add up to the
# subset/existing sizes printed by the earlier split cell, and the cell
# counters are out of order — re-run from a fresh kernel before relying on it.
search_repo_df = pd.concat((existing_search_repo_df, subset_search_repo_df))
search_user_df = pd.concat((existing_search_user_df, subset_search_user_df))
print(len(search_repo_df), len(search_user_df), sep="\n")

# Re-select the rows that lack both a detected and a finalized language.
repo_unannotated = search_repo_df['detected_language'].isna() & search_repo_df['finalized_language'].isna()
user_unannotated = search_user_df['detected_language'].isna() & search_user_df['finalized_language'].isna()
subset_search_repo_df = search_repo_df[repo_unannotated]
subset_search_user_df = search_user_df[user_unannotated]
print(len(subset_search_repo_df), len(subset_search_user_df), sep="\n")

2937
1706
2599
1706
