# Exploring Distribution and Results from Search Queries

## Load Data and Libraries

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import altair as alt
alt.renderers.enable('mimetype')
import os
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("..")



In [2]:
# Read the derived DH-subset join tables for repositories and users, in that order.
search_queries_repo_df, search_queries_user_df = map(
    pd.read_csv,
    (
        "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv",
        "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv",
    ),
)


## Explore All Search Queries

In [3]:
search_queries_repo_df.search_query_time.value_counts()

2023-03-19    1525
2022-12-12     783
2022-12-18     275
2023-03-30     207
2023-03-16      27
2022-10-10       7
2022-11-02       2
2022-11-27       1
2022-11-01       1
Name: search_query_time, dtype: int64

In [None]:
search_queries_user_df.search_term_source.value_counts()


In [None]:
search_queries_repo_df.search_term_source.value_counts()

In [4]:
subset_terms =["Digital Humanities"]

In [16]:
# Restrict both join tables to rows whose term source is listed in subset_terms.
subset_search_queries_repo_df = search_queries_repo_df.loc[
    lambda frame: frame["search_term_source"].isin(subset_terms)
]
subset_search_queries_user_df = search_queries_user_df.loc[
    lambda frame: frame["search_term_source"].isin(subset_terms)
]

### Clean Errors and Subset DH Queries

In [24]:
# Keep only the rows whose search_term_source is exactly "Digital Humanities".
new_dh_repos = search_queries_repo_df.loc[
    search_queries_repo_df["search_term_source"].eq("Digital Humanities")
]
new_dh_users = search_queries_user_df.loc[
    search_queries_user_df["search_term_source"].eq("Digital Humanities")
]

In [25]:
# Re-load the saved DH subset tables from disk, replacing the in-memory subsets.
subset_search_queries_repo_df, subset_search_queries_user_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv',
        '../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv',
    )
)

In [60]:
# Import in this cell so it runs on a fresh kernel: elsewhere in this notebook
# read_combine_files is only imported in a later cell, after this first use.
from data_generation_scripts.utils import read_combine_files

# Load the older repo and user join datasets found under ../data/ whose file path
# contains the given dataset name.
# NOTE(review): the exact effect of check_all_dirs / large_files is defined inside
# read_combine_files — confirm there.
older_search_queries_repo_df = read_combine_files(
    dir_path="../data/",
    check_all_dirs=True,
    file_path_contains="search_queries_repo_join_dataset",
    large_files=False,
)
older_search_queries_user_df = read_combine_files(
    dir_path="../data/",
    check_all_dirs=True,
    file_path_contains="search_queries_user_join_dataset",
    large_files=False,
)

In [61]:
# Narrow the older join tables to the "Digital Humanities" term source.
older_search_queries_repo_df = older_search_queries_repo_df.loc[
    lambda frame: frame["search_term_source"].eq("Digital Humanities")
]
older_search_queries_user_df = older_search_queries_user_df.loc[
    lambda frame: frame["search_term_source"].eq("Digital Humanities")
]

In [62]:
len(older_search_queries_repo_df), len(search_queries_repo_df), len(subset_search_queries_repo_df)

(17940, 7929, 2649)

In [63]:
# Append the current "Digital Humanities" rows to the older join tables.
older_search_queries_repo_df = pd.concat(
    [
        older_search_queries_repo_df,
        search_queries_repo_df.loc[search_queries_repo_df["search_term_source"].eq("Digital Humanities")],
    ]
)
older_search_queries_user_df = pd.concat(
    [
        older_search_queries_user_df,
        search_queries_user_df.loc[search_queries_user_df["search_term_source"].eq("Digital Humanities")],
    ]
)

In [64]:
len(older_search_queries_repo_df), len(older_search_queries_user_df)

(20380, 10413)

In [65]:
join_unique_field = 'search_query'
filter_field = 'id'

# Discard rows that have no search query recorded.
older_search_queries_user_df = older_search_queries_user_df.dropna(subset=[join_unique_field])
older_search_queries_repo_df = older_search_queries_repo_df.dropna(subset=[join_unique_field])

# Rows whose id does not occur in the current DH subset tables.
missing_user_join = older_search_queries_user_df.loc[
    ~older_search_queries_user_df[filter_field].isin(subset_search_queries_user_df[filter_field])
]
missing_repo_join = older_search_queries_repo_df.loc[
    ~older_search_queries_repo_df[filter_field].isin(subset_search_queries_repo_df[filter_field])
]

In [66]:
len(missing_user_join), len(missing_repo_join)

(0, 12)

In [67]:
time_field = 'search_query_time'
cleaned_field = 'cleaned_search_query_time'


def keep_earliest_query_rows(join_df):
    """Return join_df with one row per (id, cleaned search query), keeping the earliest.

    Rows without a recorded query time are dated '2022-10-10' before sorting. The
    original cell wrote that fallback and then overwrote the whole column with the
    parsed time_field, so the fallback never took effect and those rows sorted last
    as NaT. Filling before parsing restores the intended ordering.

    The two helper columns are dropped again before returning, and the input frame
    is not modified.
    """
    join_df = join_df.copy()
    join_df[cleaned_field] = pd.to_datetime(join_df[time_field].fillna('2022-10-10'), errors='coerce')
    # Undo the URL encoding of quotes/colons and strip the pagination suffix so the
    # same query fetched on different pages compares equal.
    join_df['cleaned_search_query'] = (
        join_df['search_query'].str.replace('%22', '"').str.replace('%3A', ':').str.split('&page').str[0]
    )
    return (
        join_df.sort_values(by=[cleaned_field])
        .drop_duplicates(subset=['id', 'cleaned_search_query'], keep='first')
        .drop(columns=[cleaned_field, 'cleaned_search_query'])
    )


missing_user_join = keep_earliest_query_rows(missing_user_join)
missing_repo_join = keep_earliest_query_rows(missing_repo_join)

In [68]:
# Stack the subset table on top of the recovered rows, then keep the first
# occurrence of every (id, search query) pair.
join_user_files_df = pd.concat([subset_search_queries_user_df, missing_user_join]).drop_duplicates(
    subset=['id', join_unique_field]
)
join_repo_files_df = pd.concat([subset_search_queries_repo_df, missing_repo_join]).drop_duplicates(
    subset=['id', join_unique_field]
)

In [69]:
len(join_repo_files_df), len(join_user_files_df)

(2608, 928)

In [59]:
join_user_files_df[join_user_files_df.login == "mromanello"][['cleaned_search_query', 'detected_language', 'detected_language_confidence', 'search_query', 'search_query_time', 'search_term_source']]

Unnamed: 0,cleaned_search_query,detected_language,detected_language_confidence,search_query,search_query_time,search_term_source
2191,,,,"https://api.github.com/search/users?q=""Humanit...",2023-03-30,Humanities
3321,,,,https://api.github.com/search/users?q=Computat...,2023-03-30,Computational Humanities
3369,,,,https://api.github.com/search/users?q=Humaniti...,2023-03-30,Computational Humanities
3416,,,,"https://api.github.com/search/users?q=""Computa...",2023-03-30,Computational Humanities


In [7]:
# DH rows whose repo name / user login is absent from the subset tables.
missing_repos = new_dh_repos.loc[
    ~new_dh_repos["full_name"].isin(subset_search_queries_repo_df["full_name"])
]
missing_users = new_dh_users.loc[
    ~new_dh_users["login"].isin(subset_search_queries_user_df["login"])
]

In [8]:
# Number of missing repos and users, followed by how many of each already appear in repo_df / user_df.
# NOTE(review): repo_df and user_df are not defined in any cell visible in this notebook —
# confirm where they are loaded before relying on a fresh-kernel run.
len(missing_repos), len(missing_users), len(missing_repos[missing_repos.full_name.isin(repo_df.full_name.unique())]), len(missing_users[missing_users.login.isin(user_df.login.unique())])

(0, 0, 0, 0)

###### Initial Fix for Errors

In [10]:
# Rows whose cleaned query contains the literal text q="Humanities".
# regex=False matches the pattern as plain text; na=False keeps rows with a missing
# cleaned_search_query out of the mask instead of yielding NaN entries, which cannot
# be used for boolean indexing.
fix_queries = subset_search_queries_repo_df[
    subset_search_queries_repo_df.cleaned_search_query.str.contains('q="Humanities"', regex=False, na=False)
]

In [11]:
# For the repos flagged in fix_queries, pull the query recorded under the
# "Digital Humanities" term source.
replace_queries = search_queries_repo_df.loc[
    search_queries_repo_df["full_name"].isin(fix_queries["full_name"])
    & search_queries_repo_df["search_term_source"].eq("Digital Humanities"),
    ['full_name', 'search_query'],
]

In [12]:
# Overwrite cleaned_search_query for the flagged repos with the query looked up by full_name.
rows_to_fix = subset_search_queries_repo_df["full_name"].isin(fix_queries["full_name"])
query_by_full_name = replace_queries.set_index('full_name')['search_query'].to_dict()
subset_search_queries_repo_df.loc[rows_to_fix, 'cleaned_search_query'] = (
    subset_search_queries_repo_df.loc[rows_to_fix, 'full_name'].map(query_by_full_name)
)

In [65]:
# Drop rows whose finalized_language holds this exact non-language string
# (presumably text that leaked into the column — verify against the raw data).
subset_search_queries_repo_df = subset_search_queries_repo_df[subset_search_queries_repo_df.finalized_language != 'Gallo 便告知美国大使馆工作人员在广场上并未目击到群众遭遇枪击 在这期间只有听到零星的枪声 251']
# Drop rows recorded under the search term 'I-Digital Humanities'.
subset_search_queries_repo_df = subset_search_queries_repo_df[subset_search_queries_repo_df.search_term != 'I-Digital Humanities']

In [14]:
# Treat the literal string 'None' as a missing value.
subset_search_queries_repo_df.loc[
    subset_search_queries_repo_df["finalized_language"].eq('None'), 'finalized_language'
] = None

# Kept resources that still lack a finalized language fall back to the detected one.
needs_language = (
    subset_search_queries_repo_df["keep_resource"].eq(True)
    & subset_search_queries_repo_df["finalized_language"].isna()
)
subset_search_queries_repo_df.loc[needs_language, 'finalized_language'] = (
    subset_search_queries_repo_df.loc[needs_language, 'detected_language']
)

In [15]:
subset_search_queries_repo_df['updated_natural_language'] = None

# na=False: a missing natural_language counts as "no match" instead of putting NaN in
# the mask, which breaks both `~` and boolean indexing. regex=False: the patterns are
# plain substrings, not regular expressions.
lists_several = subset_search_queries_repo_df.natural_language.str.contains(',', regex=False, na=False)
mentions_fr = subset_search_queries_repo_df.natural_language.str.contains('fr', regex=False, na=False)
mentions_en = subset_search_queries_repo_df.natural_language.str.contains('en', regex=False, na=False)

# Single-language values are copied over unchanged.
subset_search_queries_repo_df.loc[~lists_several, 'updated_natural_language'] = subset_search_queries_repo_df.natural_language

# Comma-separated values collapse to one code; 'en' is assigned last, so it wins when
# both 'fr' and 'en' are present (same precedence as the original cell).
subset_search_queries_repo_df.loc[lists_several & mentions_fr, 'updated_natural_language'] = 'fr'
subset_search_queries_repo_df.loc[lists_several & mentions_en, 'updated_natural_language'] = 'en'

In [71]:
# Where finalized_language lists several codes, use the first one listed.
multi_language_rows = subset_search_queries_repo_df["finalized_language"].str.contains(',', na=False)
first_listed_language = subset_search_queries_repo_df["finalized_language"].str.split(',').str[0]
subset_search_queries_repo_df.loc[multi_language_rows, 'updated_natural_language'] = first_listed_language

In [13]:
subset_search_queries_user_df.finalized_language.value_counts()

en        785
it         35
es         26
pt         17
en, de     10
de          8
en, it      7
fr          7
en, es      7
en, fi      5
it, en      4
en, ru      3
de, en      3
zh          1
he, en      1
ru, en      1
None        1
zh-TW       1
pl          1
en, nl      1
ko          1
no          1
en, el      1
et          1
en, sr      1
ru          1
en, fr      1
fr, en      1
Name: finalized_language, dtype: int64

In [11]:
subset_search_queries_repo_df[(subset_search_queries_repo_df.finalized_language.str.contains(',', na=False))].updated_natural_language.value_counts()

en    6
de    1
fy    1
Name: updated_natural_language, dtype: int64

In [9]:
subset_search_queries_repo_df.updated_natural_language.value_counts()

en    2114
zh     142
it      62
es      57
fr      34
pt      18
vi      16
de       4
sv       3
fy       3
id       3
et       1
fi       1
ru       1
Name: updated_natural_language, dtype: int64

In [16]:
len(subset_search_queries_repo_df), len(new_dh_repos), len(subset_search_queries_user_df), len(new_dh_users)

(2424, 2440, 862, 1710)

In [17]:
# Subset rows whose repo name / user login does not occur among the new DH rows.
missing_repos = subset_search_queries_repo_df.loc[
    ~subset_search_queries_repo_df["full_name"].isin(new_dh_repos["full_name"])
]
missing_users = subset_search_queries_user_df.loc[
    ~subset_search_queries_user_df["login"].isin(new_dh_users["login"])
]

In [18]:
len(missing_repos), len(missing_users)

(172, 0)

##### Check Older Files

In [102]:
import sys
sys.path.append("..")

from data_generation_scripts.utils import read_combine_files

In [41]:

# Load the older user and repo join files from ../data/older_files/join_files/.
# NOTE(review): these calls pass two positional arguments, while another cell in this
# notebook calls read_combine_files with dir_path / check_all_dirs / file_path_contains /
# large_files keywords — confirm the second positional argument is the file-name filter.
older_user_search_queries = read_combine_files("../data/older_files/join_files/", "search_queries_user_join_dataset")
older_repo_search_queries = read_combine_files("../data/older_files/join_files/", "search_queries_repo_join_dataset")

In [42]:
# Current tables stacked on top of their older counterparts (users first, then repos).
all_user_search_queries, all_repo_search_queries = (
    pd.concat(frames)
    for frames in (
        [search_queries_user_df, older_user_search_queries],
        [search_queries_repo_df, older_repo_search_queries],
    )
)
len(all_user_search_queries), len(all_repo_search_queries)

(33193, 58073)

In [12]:
join_unique_field = "search_query"

In [16]:
join_repo_file_path = "../data/join_files/search_queries_repo_join_dataset.csv"

In [18]:
older_join_file_path = join_repo_file_path.replace('data/', 'data/older_files/')
older_join_file_path

'../data/older_files/join_files/search_queries_repo_join_dataset.csv'

In [21]:
large_join_file_path = join_repo_file_path.replace('data/', 'data/large_files/')
large_join_file_path = large_join_file_path.replace('data/', 'data/older_files/')

In [23]:
older_join_file_dir = os.path.dirname(older_join_file_path) + '/'

In [22]:
large_join_file_path

'../data/older_files/large_files/join_files/search_queries_repo_join_dataset.csv'

In [None]:
test = read_combine_files("../data", "search_queries_repo_join_dataset")

In [43]:

# Locations of the user/repo join files and of the user output directory.
join_user_file_path = "../data/join_files/search_queries_user_join_dataset.csv"
join_repo_file_path = "../data/join_files/search_queries_repo_join_dataset.csv"
initial_user_output_path = "../data/user_data/"

# Join "type": the file name with everything from '_dataset' onward removed.
join_user_type = os.path.basename(join_user_file_path).partition('_dataset')[0]
join_repo_type = os.path.basename(join_repo_file_path).partition('_dataset')[0]

# Counterpart of each join file under data/older_files/ ...
older_join_user_file_path = 'data/older_files/'.join(join_user_file_path.split('data/'))
older_join_repo_file_path = 'data/older_files/'.join(join_repo_file_path.split('data/'))

# ... and the directory holding it, written with a trailing slash.
older_join_user_file_dir = f"{os.path.dirname(older_join_user_file_path)}/"
older_join_repo_file_dir = f"{os.path.dirname(older_join_repo_file_path)}/"

In [44]:
# Keep only rows that carry a value in the join_unique_field column.
older_join_user_df = all_user_search_queries.dropna(subset=[join_unique_field])
older_join_repo_df = all_repo_search_queries.dropna(subset=[join_unique_field])
len(older_join_user_df), len(older_join_repo_df)

(33193, 51570)

In [46]:
# "Digital Humanities" rows whose id is not present in the subset tables.
missing_user_join = older_join_user_df.loc[
    ~older_join_user_df["id"].isin(subset_search_queries_user_df["id"])
    & older_join_user_df["search_term_source"].eq('Digital Humanities')
]
missing_repo_join = older_join_repo_df.loc[
    ~older_join_repo_df["id"].isin(subset_search_queries_repo_df["id"])
    & older_join_repo_df["search_term_source"].eq('Digital Humanities')
]
len(missing_user_join), len(missing_repo_join)

(238, 482)

In [14]:
# Copy rather than alias: later cells add columns to missing_*_join in place, and with
# a plain assignment those writes would also land in missing_users / missing_repos.
missing_user_join = missing_users.copy()
missing_repo_join = missing_repos.copy()
len(missing_user_join), len(missing_repo_join)

(4, 8)

In [15]:
time_field = 'search_query_time'
cleaned_field = 'cleaned_search_query_time'

# Rows without a recorded query time are dated '2022-10-10' BEFORE parsing. The original
# cell wrote that fallback and then overwrote the column with the parsed time_field, so
# the fallback was lost and those rows sorted last as NaT.
# .assign builds a new frame, so the frames these names previously pointed to are untouched.
missing_user_join = (
    missing_user_join
    .assign(**{
        cleaned_field: pd.to_datetime(missing_user_join[time_field].fillna('2022-10-10'), errors='coerce'),
        # Undo URL encoding of quotes/colons and strip the pagination suffix.
        'cleaned_search_query': missing_user_join['search_query'].str.replace('%22', '"').str.replace('%3A', ':').str.split('&page').str[0],
    })
    .sort_values(by=[cleaned_field])
    # Keep the earliest row per (id, cleaned query); the helper columns are dropped afterwards.
    .drop_duplicates(subset=['id', 'cleaned_search_query'], keep='first')
    .drop(columns=[cleaned_field, 'cleaned_search_query'])
)

missing_repo_join = (
    missing_repo_join
    .assign(**{
        cleaned_field: pd.to_datetime(missing_repo_join[time_field].fillna('2022-10-10'), errors='coerce'),
        'cleaned_search_query': missing_repo_join['search_query'].str.replace('%22', '"').str.replace('%3A', ':').str.split('&page').str[0],
    })
    .sort_values(by=[cleaned_field])
    .drop_duplicates(subset=['id', 'cleaned_search_query'], keep='first')
    .drop(columns=[cleaned_field, 'cleaned_search_query'])
)

In [16]:
len(missing_user_join), len(missing_repo_join)

(4, 8)

In [17]:
# Combine the subset tables with the recovered rows and de-duplicate on (id, search query).
dedupe_keys = ['id', join_unique_field]

join_user_files_df = (
    pd.concat([subset_search_queries_user_df, missing_user_join])
    .drop_duplicates(subset=dedupe_keys)
)
join_repo_files_df = (
    pd.concat([subset_search_queries_repo_df, missing_repo_join])
    .drop_duplicates(subset=dedupe_keys)
)

In [22]:
# Users in the combined join table that are neither in user_df nor in the excluded-users file.
# NOTE(review): user_df and repo_df are not defined in any cell visible in this notebook —
# confirm where they are loaded.
excluded_users = pd.read_csv('../data/metadata_files/excluded_users.csv')
missing_users = join_user_files_df[(~join_user_files_df.login.isin(user_df.login.unique())) & (~join_user_files_df.login.isin(excluded_users.login.unique()))]

# Repos in the combined join table that are neither in repo_df nor in the potential-errors log.
errored_repos = pd.read_csv('../data/error_logs/potential_repos_errors.csv')
missing_repos = join_repo_files_df[(~join_repo_files_df.full_name.isin(repo_df.full_name.unique())) & (~join_repo_files_df.full_name.isin(errored_repos.full_name.unique()))]
len(missing_users), len(missing_repos)

(0, 8)

In [13]:
# Only runs when at least one user is missing.
if len(missing_users) > 0:
    import sys
    sys.path.append("..")
    from data_generation_scripts.utils import check_add_users
    # Build the GitHub API user URL from each login.
    missing_users["url"] = missing_users.login.apply(
        lambda x: f"https://api.github.com/users/{x}")
    # NOTE(review): check_add_users is project code; presumably it fetches these users and
    # updates the file at users_output_path — confirm in data_generation_scripts.utils.
    check_add_users(missing_users, users_output_path="../data/entity_files/users_dataset.csv",
                    return_df=False, overwrite_existing_temp_files=False)

Number of new users: 0


In [15]:
# Only runs when at least one repo is missing.
if len(missing_repos) > 0:
    import sys
    sys.path.append("..")
    from data_generation_scripts.utils import check_add_repos
    # NOTE(review): repo_output_path is not assigned in any cell visible in this notebook —
    # confirm its value before running on a fresh kernel.
    check_add_repos(missing_repos, repo_output_path, return_df=False)

Number of repos: 344846 1684605091.435793
Number of new repos: 0 1684605091.4506822
Number of repos: 344846 1684605092.72048 inner else statement
Number of repos: 344846, checking if older file exists 1684605092.7206862
Repo file updated 1684605112.282505


In [18]:
# Normalise each query URL: restore encoded quotes and colons, then drop the
# pagination suffix that starts at '&page'.
join_user_files_df['cleaned_search_query'] = (
    join_user_files_df['search_query']
    .str.replace('%22', '"')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str.get(0)
)

join_repo_files_df['cleaned_search_query'] = (
    join_repo_files_df['search_query']
    .str.replace('%22', '"')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str.get(0)
)

In [19]:
len(join_user_files_df), len(subset_search_queries_user_df), len(join_repo_files_df), len(subset_search_queries_repo_df)

(928, 986, 2604, 2966)

In [26]:
join_repo_files_df[(join_repo_files_df.full_name.isin(missing_repos.full_name.unique()))][['full_name', 'detected_language','description']]

Unnamed: 0,full_name,detected_language,description
5203,KUBDatalab/Digital-humaniora-og-programmering-...,,Python
5294,AydanPirani/Digital-Humanities-Project,,Source code for digital humanities project con...
5794,siwariraqi/Digital-Humanities---Women-role-in-...,,
5858,WayerLiu/fdh_jerusalem.github.io,,Project Jerusalem: locating the colonies and n...
5878,gu-gridh/dh-master-programming,,A repository for the programming course at the...
5891,tx-student-dh/capitol_project,,This is a digital humanities project analyzing...
5906,quadrismegistus/intertxt,,"Code, data, and models to examine cultural pro..."
7796,realeroberto/sorrento,,Folk poetry from Sorrento Peninsula.


In [27]:
# Rows with non-empty text (bio / description) but no detected language yet.
missing_user_languages = join_user_files_df.loc[
    join_user_files_df['detected_language'].isna() & join_user_files_df["bio"].str.len().gt(0)
]
missing_repo_languages = join_repo_files_df.loc[
    join_repo_files_df['detected_language'].isna() & join_repo_files_df["description"].str.len().gt(0)
]
len(missing_user_languages), len(missing_repo_languages)

(0, 24)

In [28]:
# Run language detection for users that have a bio but no detected language.
if len(missing_user_languages)> 0:
    import sys
    sys.path.append("..")
    from data_generation_scripts.generate_translations import check_detect_language
    from tqdm import tqdm
    tqdm.pandas(desc='Detecting language')
    missing_user_languages.bio = missing_user_languages.bio.fillna('')
    # Write the processed rows back by index label.
    # NOTE(review): join_user_files_df comes from pd.concat without ignore_index, so index
    # labels may repeat; a label-based write-back assumes they are unique — confirm.
    join_user_files_df.loc[missing_user_languages.index] = missing_user_languages.progress_apply(check_detect_language, axis=1, is_repo=False)


# Same procedure for repos, using the description text (is_repo=True).
if len(missing_repo_languages)> 0:
    import sys
    sys.path.append("..")
    from data_generation_scripts.generate_translations import check_detect_language
    from tqdm import tqdm
    tqdm.pandas(desc='Detecting language')
    missing_repo_languages.description = missing_repo_languages.description.fillna('')
    # NOTE(review): same unique-index assumption as for users above in this cell.
    join_repo_files_df.loc[missing_repo_languages.index] = missing_repo_languages.progress_apply(check_detect_language, axis=1, is_repo=True)


Detecting language: 100%|██████████| 24/24 [00:01<00:00, 15.33it/s]


In [29]:
# Output locations for the updated user and repo join tables.
updated_user_join_path = "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"
updated_repo_join_path = "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"
# NOTE(review): check_if_older_file_exists is not imported in any cell visible in this
# notebook; presumably it archives an existing file before it is overwritten — confirm
# its source module and behaviour.
check_if_older_file_exists(updated_user_join_path)
check_if_older_file_exists(updated_repo_join_path)

In [30]:
# Write the combined join tables to the updated_*_join_path locations, without the index.
# NOTE(review): if those variables still hold the derived_files paths assigned earlier,
# this overwrites the same CSVs the notebook reads in its data-loading cell, so a
# re-run starts from the rewritten files — confirm that is intended.
join_repo_files_df.to_csv(updated_repo_join_path, index=False)
join_user_files_df.to_csv(updated_user_join_path, index=False)

In [109]:
# Normalise each query URL: restore encoded quotes and colons, then drop the
# pagination suffix that starts at '&page'.
subset_search_queries_repo_df['cleaned_search_query'] = (
    subset_search_queries_repo_df["search_query"]
    .str.replace('%22', '"')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str.get(0)
)
subset_search_queries_user_df['cleaned_search_query'] = (
    subset_search_queries_user_df["search_query"]
    .str.replace('%22', '"')
    .str.replace('%3A', ':')
    .str.split('&page')
    .str.get(0)
)


In [115]:
subset_search_queries_repo_df[['search_type', 'search_term', 'search_term_source', 'full_name', 'updated_natural_language']]

Unnamed: 0,search_type,search_term,search_term_source,full_name,updated_natural_language
0,searched,Digitaalhumanitaaria,Digital Humanities,Digitaalhumanitaaria/Digitaalhumanitaaria.gith...,et
1,searched,Digitaaliset ihmistieteet,Digital Humanities,TurkuNLP/Digi_menetelmat,fi
2,searched,Digital humaniora,Digital Humanities,waahlstrand/metoder-inom-digital-humaniora,sv
3,searched,Digital humaniora,Digital Humanities,KUBDatalab/Digital-humaniora-og-programmering-1-3,sv
4,searched,Digital humaniora,Digital Humanities,KUBDatalab/tekst-repraesenteret-som-tal,sv
...,...,...,...,...,...
2474,searched,Digital Humanities,Digital Humanities,rebelsky/fundhum,en
2475,searched,Digital Humanities,Digital Humanities,truongr2468/Jekyll-and-Hyde,en
2476,tagged,Digital Humanities,Digital Humanities,pacian/Digital-Humanities-Toolkit,en
2477,tagged,Digital Humanities,Digital Humanities,Pyrrhulla/scc_hub,en


In [116]:
# Every "unique" figure uses nunique(), which ignores missing values. The original
# computed the repo query count with len(.unique()), which counts NaN as one extra
# query, while the user query count already used nunique().
repo_query_count = subset_search_queries_repo_df['cleaned_search_query'].nunique()
user_query_count = subset_search_queries_user_df['cleaned_search_query'].nunique()
print(
    f"From {repo_query_count} unique queries, we found {len(subset_search_queries_repo_df)} repos, of which {subset_search_queries_repo_df.id.nunique()} are unique. Similarly for users we had {user_query_count} unique queries, and we found {len(subset_search_queries_user_df)} total users, of which {subset_search_queries_user_df.id.nunique()} are unique.")


From 37 unique queries, we found 2424 repos, of which 2322 are unique. Similarly for users we had 7 unique queries, and we found 862 total users, of which 860 are unique.


### Explore distribution of search results
How often do users tag their repositories as DH, and how often do they leave them untagged?

*It is also worth noting that we do not search for the abbreviation "DH", even though it is likely to be a common term.*

In [173]:
# Load the grouped DH terms and align their column names with the search-query tables.
cleaned_dh_terms = pd.read_csv(
    '../data/derived_files/grouped_cleaned_translated_dh_terms.csv', encoding='utf-8-sig'
).rename(
    columns={
        'language_code': 'natural_language',
        'term': 'search_term',
        'term_source': 'search_term_source',
    }
)

In [174]:
# Load the ISO 639 choices and rename their columns to match the terms table.
lang_options = pd.read_csv('../data/metadata_files/iso_639_choices.csv').rename(
    columns={'language': 'natural_language', 'name': 'language_name'}
)

In [175]:
merged_lang = pd.merge(cleaned_dh_terms, lang_options, on='natural_language', how='left')

In [176]:
translated_terms = pd.read_csv('../data/derived_files/translated_dh_terms.csv', encoding='utf-8-sig')

In [177]:
grouped_terms = pd.read_csv('../data/derived_files/grouped_cleaned_translated_dh_terms.csv', encoding='utf-8-sig')

In [178]:
language_df = pd.read_csv("../data/metadata_files/iso_639_choices_directionality_wikimedia.csv")

In [186]:
language_df[language_df.code == 'nl']

Unnamed: 0,code,English language name,directionality,local language name,local or English Wikipedia article,comment
168,nl,Dutch,ltr,Nederlands,nl:Nederlands,


In [225]:
# Hand-written display names keyed by language code, in the same two-level shape that
# create_language_dict returns. create_language_dict reads this table when a code has
# no row in language_df.
# NOTE(review): create_language_dict looks up only the first code of a comma-separated
# value, so the combined keys below ('en, nl', 'en, de', ...) are not reached by that
# function as written — confirm whether they are used elsewhere.
existing_lang_dict ={ 
    'English language name': {
        'zh-TW': 'Chinese (Taiwan)',
        'zh-CN': 'Chinese (Mainland China)', 
        'en, nl': 'English and Dutch', 
        'en, de': 'English and German', 
        'en, he': 'English and Hebrew', 
        'bg-Latn': 'Bulgarian (Latin)',
        'fil': 'Filipino',
        'zh-CH': 'Chinese', 
        'en, ru': 'English and Russian',
    },
    'local language name': {
        'zh-TW': '中文（台灣）',
        'zh-CN': '中文（中国）',
        'en, nl': 'English & Nederlands (Dutch)',
        'en, de': 'English & Deutsch (German)',
        'en, he': 'English & עברית (Hebrew)',
        'bg-Latn': 'Български (Bulgarian)',
        'fil': 'Filipino',
        'zh-CH': '中文',
        'en, ru': 'English & Русский (Russian)',
    }
}

In [226]:
def create_language_dict(languages, language_df, fallback_names=None):
    """Build English and local display names for single or two-part language values.

    Parameters
    ----------
    languages : iterable of str
        Language values such as ``'zh-TW'`` or ``'en, nl'``. A value is split on
        commas and only its first two codes are used.
    language_df : pandas.DataFrame
        Lookup table with ``code``, ``English language name`` and
        ``local language name`` columns.
    fallback_names : dict, optional
        Two-level mapping ``{'English language name': {code: name},
        'local language name': {code: name}}`` consulted for codes that have no row
        in ``language_df``. When omitted, the notebook-level ``existing_lang_dict``
        is used.

    Returns
    -------
    dict
        ``{'English language name': {value: name}, 'local language name': {value: name}}``
        keyed by the original, unsplit language value.

    Raises
    ------
    KeyError
        If a code is found neither in ``language_df`` nor in the fallback mapping.
    """

    def describe(code):
        # One lookup path for both codes. The original applied the fallback only to the
        # first code, so a second code missing from language_df raised IndexError.
        matches = language_df[language_df.code == code]
        if len(matches) > 0:
            english_name = matches['English language name'].values[0]
            local_name = matches['local language name'].values[0]
        else:
            # Resolved lazily so the notebook-level table is only needed when used.
            names = existing_lang_dict if fallback_names is None else fallback_names
            english_name = names['English language name'][code]
            local_name = names['local language name'][code]
        # Show the English name in parentheses when the local name differs from it.
        if local_name != english_name:
            local_name = f"{local_name} ({english_name})"
        return english_name, local_name

    lang_dict = {'English language name': {}, 'local language name': {}}
    for lang in languages:
        codes = [part.strip() for part in lang.split(',')]
        first_english_lang, first_local_lang = describe(codes[0])
        if len(codes) > 1:
            second_english_lang, second_local_lang = describe(codes[1])
            lang_dict['English language name'][lang] = f"{first_english_lang} and {second_english_lang}"
            lang_dict['local language name'][lang] = f"{first_local_lang} & {second_local_lang}"
        else:
            lang_dict['English language name'][lang] = first_english_lang
            lang_dict['local language name'][lang] = first_local_lang
    return lang_dict

In [227]:
# Distinct finalized_language values that contain a comma or a hyphen; these are
# passed to create_language_dict to obtain display names.
repo_languages = (
    subset_search_queries_repo_df["finalized_language"]
    .loc[lambda codes: codes.str.contains(',', na=False) | codes.str.contains('-', na=False)]
    .unique()
    .tolist()
)
updated_repo_language_dict = create_language_dict(repo_languages, language_df)

user_languages = (
    subset_search_queries_user_df["finalized_language"]
    .loc[lambda codes: codes.str.contains(',', na=False) | codes.str.contains('-', na=False)]
    .unique()
    .tolist()
)
updated_user_language_dict = create_language_dict(user_languages, language_df)

In [228]:
# Attach language names and directionality by matching finalized_language to the
# language table's code column (left join, so unmatched rows are kept).
language_name_columns = language_df[['code', 'English language name', 'directionality', 'local language name']]

merged_repo_df = subset_search_queries_repo_df.merge(
    language_name_columns, left_on='finalized_language', right_on='code', how='left'
)

merged_user_df = subset_search_queries_user_df.merge(
    language_name_columns, left_on='finalized_language', right_on='code', how='left'
)

In [229]:
# subset_search_queries_repo_df[subset_search_queries_repo_df.updated_natural_language != subset_search_queries_repo_df.finalized_language]

In [230]:
# For rows that found no match in the language table (code is missing), take both
# display names from the dictionaries built by create_language_dict.
for merged_df, language_names in (
    (merged_repo_df, updated_repo_language_dict),
    (merged_user_df, updated_user_language_dict),
):
    unmatched = merged_df["code"].isna()
    for name_column in ('English language name', 'local language name'):
        merged_df.loc[unmatched, name_column] = merged_df["finalized_language"].map(language_names[name_column])

In [231]:
all_potential_languages = translated_terms[(translated_terms.translated_term.notna()) & (translated_terms.term_source == 'Digital Humanities')]

In [232]:
# Count rows per language / search type / search term combination.
language_group_keys = ['English language name', 'local language name', 'search_type', 'finalized_language', 'search_term']

grouped_repo_languages = (
    merged_repo_df.groupby(language_group_keys).size().reset_index(name='repo_counts')
)

grouped_user_languages = (
    merged_user_df.groupby(language_group_keys).size().reset_index(name='user_counts')
)

In [233]:
# German rows whose term differs from the translated term take the translated term.
german_mismatch = (
    all_potential_languages["term"].notna()
    & all_potential_languages["term"].ne(all_potential_languages["translated_term"])
    & all_potential_languages["language"].eq('de')
)
all_potential_languages.loc[german_mismatch, 'term'] = all_potential_languages["translated_term"]

# Rows with no term at all also take the translated term.
all_potential_languages.loc[all_potential_languages["term"].isna(), 'term'] = all_potential_languages["translated_term"]

# Rename to the column names used by the search-query tables.
all_potential_languages = all_potential_languages.rename(
    columns={
        'term': 'search_term',
        'language': 'finalized_language',
        'term_source': 'search_term_source',
    }
)

In [234]:
# One row per distinct (language, language name, term) ...
cleaned_langs = all_potential_languages[['finalized_language', 'language_name', 'search_term']].drop_duplicates()

# ... duplicated once for each search type ...
cleaned_langs_one = cleaned_langs.assign(search_type='tagged')
cleaned_langs_two = cleaned_langs.assign(search_type='searched')

# ... then stacked and joined to the language table for names and directionality.
cleaned_langs = pd.concat([cleaned_langs_one, cleaned_langs_two]).merge(
    language_df[['code', 'English language name', 'directionality', 'local language name']],
    left_on='finalized_language',
    right_on='code',
    how='left',
)

In [235]:
# Outer-join the observed counts with every expected language/term/search-type
# combination, so combinations with no results still get a row.
language_term_keys = ['search_term', 'finalized_language', 'search_type', 'English language name', 'local language name']
expected_language_terms = cleaned_langs[['finalized_language', 'search_term', 'search_type', 'English language name', 'local language name']]

merged_repo_langs = grouped_repo_languages.merge(expected_language_terms, on=language_term_keys, how='outer')

merged_user_langs = grouped_user_languages.merge(expected_language_terms, on=language_term_keys, how='outer')

In [236]:
def update_missing_columns(rows):
    """Fill missing language names within one group of rows.

    For each of 'English language name' and 'local language name': if the
    column has any missing value, the whole column is set to the first
    non-missing value found in that column. When the column has no value at
    all, the first non-missing value of the optional 'language_name' column is
    used instead. If no candidate exists (or 'language_name' is absent), the
    column is left untouched rather than raising.

    Parameters
    ----------
    rows : pandas.DataFrame
        Group of rows (as passed by ``groupby(...).apply``) containing the two
        language-name columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``rows`` object with the language-name columns filled.
    """
    for column in ('English language name', 'local language name'):
        if not rows[column].isna().any():
            continue
        candidates = rows[column].dropna().unique()
        # The fallback column is not guaranteed to be present in the frame,
        # so check for it instead of relying on attribute access.
        if len(candidates) < 1 and 'language_name' in rows.columns:
            candidates = rows['language_name'].dropna().unique()
        # Nothing to fill from: keep the missing values instead of failing on [0].
        if len(candidates) > 0:
            rows[column] = candidates[0]
    return rows

In [237]:
# Run update_missing_columns once per 'finalized_language' group, separately for
# the repo and the user tables, to fill missing language names within each group.
finalized_repo_languages = merged_repo_langs.groupby(['finalized_language']).apply(update_missing_columns)
finalized_user_languages = merged_user_langs.groupby(['finalized_language']).apply(update_missing_columns)

In [238]:
# Fill values that are still missing in the merged tables from the group-filled
# versions, then treat a missing count as zero.
#
# The count columns are assigned back explicitly: calling
# `frame.column.fillna(0, inplace=True)` is a chained in-place update that can
# operate on a temporary Series and leave the frame unchanged when pandas
# Copy-on-Write is active.
merged_repo_langs = merged_repo_langs.combine_first(finalized_repo_languages)
merged_repo_langs['repo_counts'] = merged_repo_langs['repo_counts'].fillna(0)

merged_user_langs = merged_user_langs.combine_first(finalized_user_languages)
merged_user_langs['user_counts'] = merged_user_langs['user_counts'].fillna(0)

In [239]:
# merged_repo_langs = merged_repo_langs.groupby(['finalized_language', 'search_type', 'search_term']).apply(update_columns)
# merged_repo_langs.repo_counts.fillna(0, inplace=True)
# merged_repo_langs['final_term'] = merged_repo_langs.search_term + ' (' + merged_repo_langs.search_type + ')'

# merged_user_langs = merged_user_langs.groupby(['finalized_language', 'search_term']).apply(update_columns)
# merged_user_langs.user_counts.fillna(0, inplace=True)
# merged_user_langs['final_term'] = merged_user_langs.search_term + ' (' + merged_user_langs.search_type + ')'

In [240]:
# Summarise how many language codes, languages with a term, and distinct terms we have.
dh_translated_terms = translated_terms[translated_terms.term_source == 'Digital Humanities']
initial_language_count = dh_translated_terms.language.nunique()
languages_with_term_count = all_potential_languages.finalized_language.nunique()
unique_term_count = all_potential_languages.search_term.nunique()

print(f"From our initial {initial_language_count} language codes from Wikimedia, we have found {languages_with_term_count} languages with a term for DH, which results in {unique_term_count} unique terms.")


From our initial 185 language codes from Wikimedia, we have found 123 languages with a term for DH, which results in 105 unique terms.


In [241]:
# Exclude the "I-Digital Humanities" term from both result tables.
excluded_search_term = "I-Digital Humanities"
merged_repo_langs = merged_repo_langs.loc[merged_repo_langs['search_term'].ne(excluded_search_term)]
merged_user_langs = merged_user_langs.loc[merged_user_langs['search_term'].ne(excluded_search_term)]

In [242]:
# Build the chart label for every repo row, in the form
#   "<local language name> (<English language name>): <search_term>".
# The assignments run in order, so a later rule overwrites an earlier one on
# rows that match both masks.
merged_repo_langs['graph_label'] = None

repo_english_name = merged_repo_langs['English language name']
repo_local_name = merged_repo_langs['local language name']
repo_search_term = merged_repo_langs['search_term']
repo_names_differ = repo_english_name != repo_local_name
# Raw string: '\(' in a plain string literal is an invalid escape sequence.
repo_local_has_paren = repo_local_name.str.contains(r'\(', na=False)

# Rows with language code 'nb' get an explicit " (Norwegian)" suffix.
merged_repo_langs.loc[merged_repo_langs.finalized_language == 'nb', 'graph_label'] = repo_local_name + ' (Norwegian)' + ': ' + repo_search_term
# Identical local and English names: show the name only once.
merged_repo_langs.loc[repo_english_name == repo_local_name, 'graph_label'] = repo_local_name + ': ' + repo_search_term
# Different names and no parenthesis in the local name: append the English name.
merged_repo_langs.loc[repo_names_differ & ~repo_local_has_paren, 'graph_label'] = repo_local_name + ' (' + repo_english_name + ')' + ': ' + repo_search_term
# Different names and still no label: use the local name alone.
merged_repo_langs.loc[repo_names_differ & merged_repo_langs.graph_label.isna(), 'graph_label'] = repo_local_name + ': ' + repo_search_term


In [247]:
# Build the chart label for every user row, in the form
#   "<local language name> (<English language name>): <search_term>".
# The assignments run in order, so a later rule overwrites an earlier one on
# rows that match both masks.
merged_user_langs['graph_label'] = None

user_english_name = merged_user_langs['English language name']
user_local_name = merged_user_langs['local language name']
user_search_term = merged_user_langs['search_term']
user_names_differ = user_english_name != user_local_name
# Raw string: '\(' in a plain string literal is an invalid escape sequence.
user_local_has_paren = user_local_name.str.contains(r'\(', na=False)

# Rows with language code 'nb' get an explicit " (Norwegian)" suffix.
merged_user_langs.loc[merged_user_langs.finalized_language == 'nb', 'graph_label'] = user_local_name + ' (Norwegian)' + ': ' + user_search_term
# Identical local and English names: show the name only once.
merged_user_langs.loc[user_english_name == user_local_name, 'graph_label'] = user_local_name + ': ' + user_search_term
# Different names and no parenthesis in the local name: append the English name.
merged_user_langs.loc[user_names_differ & ~user_local_has_paren, 'graph_label'] = user_local_name + ' (' + user_english_name + ')' + ': ' + user_search_term
# Different names and still no label: use the local name alone.
merged_user_langs.loc[user_names_differ & merged_user_langs.graph_label.isna(), 'graph_label'] = user_local_name + ': ' + user_search_term




```bash
git filter-repo --invert-paths --path 'data_generation_scripts/search_queries_repo_join_dataset.csv' --use-base-name
```

In [264]:
# The language part of a graph label is everything before the first ':'.
for labelled_frame in (merged_repo_langs, merged_user_langs):
    labelled_frame['cleaned_search_language'] = labelled_frame.graph_label.str.split(':').str.get(0)

In [265]:
# Label combining the term and how it was matched, e.g. "<search_term> (<search_type>)".
for term_frame in (merged_repo_langs, merged_user_langs):
    term_frame['search_label'] = term_frame['search_term'] + ' (' + term_frame['search_type'] + ')'

In [272]:
# Legend-bound selection: bars of the selected language(s) stay opaque while the
# others are dimmed. `selection_point` is the Altair 5 replacement for the
# deprecated `selection_multi`, and matches the `add_params` calls used below.
selection = alt.selection_point(fields=['cleaned_search_language'], bind='legend')

# Repository counts per search term for 'searched' results, excluding English.
search_repo_noeng_chart = alt.Chart(merged_repo_langs[(merged_repo_langs.cleaned_search_language != 'English') & (merged_repo_langs.search_type == 'searched')]).mark_bar().encode(
    y=alt.Y('search_term:N', sort='-x'),
    x=alt.X('repo_counts:Q', title='Number of Repositories'),
    color=alt.Color('cleaned_search_language:N', legend=alt.Legend(title="Language", columns=4, symbolLimit=0), sort='-x', scale=alt.Scale(scheme='category20b')),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    tooltip=['search_term', 'repo_counts', 'cleaned_search_language']
).add_params(selection).properties(
    width=400,
    height=1400
)

# Same chart for all 'searched' results, English included.
search_repo_eng_chart = alt.Chart(merged_repo_langs[(merged_repo_langs.search_type == 'searched')]).mark_bar().encode(
    y=alt.Y('search_term:N', sort='-x'),
    x=alt.X('repo_counts:Q', title='Number of Repositories'),
    color=alt.Color('cleaned_search_language:N', legend=alt.Legend(title="Language", columns=3, symbolLimit=0), sort='-x', scale=alt.Scale(scheme='category20b')),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    tooltip=['search_term', 'repo_counts', 'cleaned_search_language']
).add_params(selection).properties(
    width=400,
    height=1400,
    title='The Frequency of Repositories Returned from GitHub Search API for Digital Humanities Terms'
)

# Show both charts side by side.
# NOTE(review): the combined title "Test" looks like a placeholder - confirm the intended title.
alt.hconcat(search_repo_eng_chart, search_repo_noeng_chart).configure_axis(labelLimit=1000).properties(title="Test")


<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [243]:
def _repo_count_bar_chart(frame):
    """Bar chart with graph_label on y and summed repo_counts on x, coloured by search_type."""
    return (
        alt.Chart(frame)
        .mark_bar()
        .encode(
            y=alt.Y('graph_label', sort='-x', title=''),
            x=alt.X('sum(repo_counts)', title='Number of Repositories'),
            color=alt.Color('search_type', scale=alt.Scale(range=['#4c78a8', '#f58518'])),
            # tooltip=['English language name', 'search_term', 'search_type', 'repo_counts']
        )
        .properties(width=500, height=2000)
        .configure_axis(labelLimit=1000)
    )


# Chart restricted to rows whose English language name is not 'English'.
non_eng = _repo_count_bar_chart(merged_repo_langs[merged_repo_langs['English language name'] != 'English'])

# Chart over every row, English included.
eng = _repo_count_bar_chart(merged_repo_langs)


In [244]:
# Display the chart that excludes rows whose English language name is 'English'.
non_eng

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [248]:
# Display the chart built from every row of merged_repo_langs.
eng

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


#### Explore cooccurrence of terms

In [18]:
# Number of rows in combined_queries per repository, most frequent first.
grouped_repos = (
    combined_queries
    .groupby('full_name')
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)

# Repositories that occur more than once.
grouped_list = grouped_repos.loc[grouped_repos['counts'] > 1, 'full_name'].tolist()

# For those repositories, gather the list of cleaned queries that returned them.
in_repeated_repos = combined_queries['full_name'].isin(grouped_list)
freq_queries = (
    combined_queries[in_repeated_repos]
    .groupby('full_name')['cleaned_query']
    .apply(list)
    .reset_index()
)

In [19]:
# One row per (repository, query) occurrence.
exploded_df = freq_queries.explode('cleaned_query')

# Occurrence count for every (repository, query) pair.
grouped_exploded = (
    exploded_df
    .groupby(['full_name', 'cleaned_query'])
    .size()
    .reset_index(name='val')
)

# Repository x query count matrix; pairs that never occur become 0.
pivoted_df = (
    grouped_exploded
    .pivot(index='full_name', columns='cleaned_query', values='val')
    .fillna(0)
    .astype(int)
)

# Query x query matrix obtained by multiplying the transposed matrix with itself.
final_df = pivoted_df.T @ pivoted_df

In [20]:
# Move the query index into a regular column and drop the columns-axis name.
reset_final = final_df.reset_index().rename_axis(None, axis=1)

# Every column after the first ('cleaned_query') holds one query's values.
cols = reset_final.columns[1:]

# Long format: one row per (cleaned_query, variable) pair with its value.
melted_df = reset_final.melt(id_vars=['cleaned_query'], value_vars=cols)

In [21]:
# Heatmap over every query pair.
top_results = (
    alt.Chart(melted_df)
    .mark_rect()
    .encode(
        x=alt.X('cleaned_query'),
        y=alt.Y('variable'),
        color=alt.Color('value'),
    )
)

# Same heatmap restricted to pairs with a value below 10.
low_value_pairs = melted_df[melted_df.value < 10]
rest_results = (
    alt.Chart(low_value_pairs)
    .mark_rect()
    .encode(
        x=alt.X('cleaned_query'),
        y=alt.Y('variable'),
        color=alt.Color('value'),
    )
)

# Side by side, each chart with its own colour scale.
alt.hconcat(top_results, rest_results).resolve_scale(color='independent')

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html
