# Notebook for Generating Search Queries

### Load Libraries and Data

In [3]:
import json
import os
import pandas as pd
import codecs
from tqdm import tqdm
import apikey
import warnings
warnings.filterwarnings('ignore')
import time

In [2]:
from google.cloud import translate_v2 as translate
from google.oauth2 import service_account

# Load the stored path of the Google service-account key and swap its
# '/Volumes/Samsung_T5/' prefix for '/Users/zleblanc/'.
# NOTE(review): this prefix rewrite is machine-specific — confirm before
# running on another machine.
key_path = apikey.load("GOOGLE_TRANSLATE_CREDENTIALS").replace(
    '/Volumes/Samsung_T5/', '/Users/zleblanc/')

# Build credentials from the key file, then the Translate (v2) client that
# later cells use through `translate_client`.
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
translate_client = translate.Client(credentials=credentials)



In [3]:
# Load the existing "Digital humanities" terms from JSON and reshape them into
# one row per entry with the columns `language` and `term`.
# A context manager is used so the file handle is closed deterministically;
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
# NOTE(review): assumes the JSON is a flat {language code: term} mapping — confirm.
with open('../data/metadata_files/en.Digital humanities.json', 'r',
          encoding='utf-8-sig') as terms_file:
    dh_terms = json.load(terms_file)

# A single-row frame melted to long form: the JSON keys become the first column.
dh_df = pd.DataFrame([dh_terms]).melt()
dh_df.columns = ['language', 'term']


In [4]:
# Read the ISO 639 language list and rename its `name` column to
# `language_name`.
iso_languages = (
    pd.read_csv("../data/metadata_files/iso_639_choices.csv")
    .rename(columns={'name': 'language_name'})
)


In [5]:
# Outer-join the existing terms with the ISO language table on `language`
# (rows without a match on either side are kept), then tag every row with
# the source term it belongs to.
merged_dh = dh_df.merge(iso_languages, on='language', how='outer').assign(
    term_source='Digital Humanities')


In [6]:
# English source terms; each is paired with every ISO language further down.
target_terms = [
    "Humanities",
    "Public History",
    "Digital History",
    "Digital Cultural Heritage",
    "Cultural Analytics",
    "Computational Humanities",
]

In [7]:
# One copy of the ISO language table per target term, each tagged with that
# term in `term_source`; the Digital Humanities frame is appended last.
languages_dfs = [
    iso_languages.assign(term_source=term) for term in target_terms
]
languages_dfs.append(merged_dh)

# Stack everything into one frame with a fresh 0..n-1 index.
final_df = pd.concat(languages_dfs).reset_index(drop=True)
final_df

Unnamed: 0,language,language_name,term_source,term
0,ab,Abkhaz,Humanities,
1,aa,Afar,Humanities,
2,af,Afrikaans,Humanities,
3,ak,Akan,Humanities,
4,sq,Albanian,Humanities,
...,...,...,...,...
1290,xh,Xhosa,Digital Humanities,
1291,yi,Yiddish,Digital Humanities,
1292,yo,Yoruba,Digital Humanities,
1293,za,"Zhuang, Chuang",Digital Humanities,


In [4]:
# Load the cleaned translated terms file.
# NOTE(review): this reads from '../../datasets/', whereas other cells in this
# notebook read and write under '../data/' — confirm the intended data root.
cleaned_df = pd.read_csv('../../datasets/derived_files/cleaned_translated_dh_terms.csv')

In [5]:
# Load the translated terms file.
# NOTE(review): the translation cell further down caches the same file name
# under '../data/derived_files/' — confirm which location is current.
translated_df = pd.read_csv("../../datasets/derived_files/translated_dh_terms.csv")

In [16]:
# Load the language-code / directionality table.
# NOTE(review): the scraping cell further down caches the same file name under
# '../data/metadata_files/' — confirm which location is current.
directionality_df = pd.read_csv("../../datasets/metadata_files/iso_639_choices_directionality_wikimedia.csv")

In [18]:
# Preview the translated terms table loaded from translated_dh_terms.csv.
translated_df

Unnamed: 0,language,language_name,term_source,term,translated_term
0,ab,Abkhaz,Humanities,,
1,aa,Afar,Humanities,,
2,af,Afrikaans,Humanities,,Geesteswetenskappe
3,ak,Akan,Humanities,,Nnipa ho adesua
4,sq,Albanian,Humanities,,shkencat humane
...,...,...,...,...,...
1290,xh,Xhosa,Digital Humanities,,I-Digital Humanities
1291,yi,Yiddish,Digital Humanities,,דיגיטאַל הומאַניטיעס
1292,yo,Yoruba,Digital Humanities,,Digital Humanities
1293,za,"Zhuang, Chuang",Digital Humanities,,


In [17]:
# Preview the language-code / directionality table.
directionality_df

Unnamed: 0,code,English language name,directionality,local language name,local or English Wikipedia article,comment
0,aa,Afar,ltr,Afar,en:Afar language,
1,ab,Abkhazian,ltr,Аҧсуа,en:Abkhazian language,
2,af,Afrikaans,ltr,Afrikaans,en:Afrikaans language,
3,ak,Akan,ltr,Akana,en:Akan language,
4,als,Alemannic,ltr,Alemannisch,en:Alemannic language,en:ISO 639-3: gsw (als is en:Tosk Albanian)
...,...,...,...,...,...,...
267,closed-zh-tw,Traditional Chinese,ltr,‪中文(台灣)‬,en:Chinese language,closed
268,nb,Norwegian Bokmål,ltr,Norsk (bokmål),en:Bokmål,redirects to no
269,zh-tw,Traditional Chinese,ltr,‪中文(台灣)‬,en:Chinese language,redirects to zh
270,tokipona,tokipona,ltr,tokipona,en:Toki Pona,moved to http://tokipona.wikia.com/


In [10]:
# Show the distinct source terms and the columns present in translated_df.
translated_df.term_source.unique(), translated_df.columns

(array(['Humanities', 'Public History', 'Digital History',
        'Digital Cultural Heritage', 'Cultural Analytics',
        'Computational Humanities', 'Digital Humanities'], dtype=object),
 Index(['language', 'language_name', 'term_source', 'term', 'translated_term'], dtype='object'))

In [19]:
# Preview the cleaned terms table.
cleaned_df

Unnamed: 0,language,language_name,term_source,term,translated_term
0,af,Afrikaans,Humanities,Geesteswetenskappe,Geesteswetenskappe
1,ak,Akan,Humanities,Nnipa ho adesua,Nnipa ho adesua
2,sq,Albanian,Humanities,shkencat humane,shkencat humane
3,am,Amharic,Humanities,ሰብአዊነት,ሰብአዊነት
4,ar,Arabic,Humanities,إنسانيات,العلوم الإنسانية
...,...,...,...,...,...
855,fy,Western Frisian,Digital Humanities,Digitale Humanities,Digitale Humanities
856,xh,Xhosa,Digital Humanities,I-Digital Humanities,I-Digital Humanities
857,yi,Yiddish,Digital Humanities,דיגיטאַל הומאַניטיעס,דיגיטאַל הומאַניטיעס
858,yo,Yoruba,Digital Humanities,Digital Humanities,Digital Humanities


In [12]:
# Rows where the translation differs from the existing term.
# NaN != NaN evaluates to True, so rows where both values are missing are
# listed here as well.
translated_df[translated_df.translated_term != translated_df.term]

Unnamed: 0,language,language_name,term_source,term,translated_term
0,ab,Abkhaz,Humanities,,
1,aa,Afar,Humanities,,
2,af,Afrikaans,Humanities,,Geesteswetenskappe
3,ak,Akan,Humanities,,Nnipa ho adesua
4,sq,Albanian,Humanities,,shkencat humane
...,...,...,...,...,...
1290,xh,Xhosa,Digital Humanities,,I-Digital Humanities
1291,yi,Yiddish,Digital Humanities,,דיגיטאַל הומאַניטיעס
1292,yo,Yoruba,Digital Humanities,,Digital Humanities
1293,za,"Zhuang, Chuang",Digital Humanities,,


In [9]:
# Rows of the cleaned table where the kept term differs from the translation.
cleaned_df[cleaned_df.translated_term != cleaned_df.term]

Unnamed: 0,language,language_name,term_source,term,translated_term
4,ar,Arabic,Humanities,إنسانيات,العلوم الإنسانية
31,fr,French,Humanities,Humanités,Sciences humaines
47,it,Italian,Humanities,umanistica,Studi umanistici
84,ru,Russian,Humanities,гуманитарные,Гуманитарные науки
739,la,Latin,Digital Humanities,Humanitates digitales,Digital Humanities
740,ta,Tamil,Digital Humanities,எண்ணிம மனிதவியல்,டிஜிட்டல் மனிதநேயம்
745,it,Italian,Digital Humanities,Informatica umanistica,Scienze umanistiche digitali
747,cy,Welsh,Digital Humanities,Dyniaethau digidol,Dyniaethau Digidol
748,ar,Arabic,Digital Humanities,إنسانيات رقمية,العلوم الإنسانية الرقمية
751,fi,Finnish,Digital Humanities,Digitaaliset ihmistieteet,Digitaaliset humanistiset tieteet


In [11]:
# Translate every (term_source, language) row with the Translate client and
# cache the result on disk, so the slow request loop only runs when no cached
# file exists.
output_path = "../data/derived_files/translated_dh_terms.csv"
if os.path.exists(output_path):
    # Cached run: reuse the translations written by a previous execution.
    final_df = pd.read_csv(output_path)
else:
    # Create the column up front so that it exists even if every request
    # fails; later cells filter on `final_df.translated_term`.
    if 'translated_term' not in final_df.columns:
        final_df['translated_term'] = None

    for index, row in tqdm(final_df.iterrows(), total=final_df.shape[0], desc="Translating"):
        # Fixed pause before each request (presumably to throttle API calls).
        time.sleep(2)
        try:
            text_result = translate_client.translate(
                row.term_source, target_language=row.language)
            final_df.loc[index, 'translated_term'] = text_result['translatedText']
        except Exception as exc:
            # Best effort: a failed language is reported and skipped.
            # `Exception` (not a bare except) keeps Ctrl-C able to stop the
            # loop, and the message now says what actually went wrong.
            print(f"Error on {row.language} ({row.term_source}): {exc}")
            continue

    final_df.to_csv(output_path, index=False)

In [13]:
# Discard the rows that have no translation.
subset_translated_df = translated_df.dropna(subset=['translated_term'])

In [15]:
# Translated rows that also carry an existing term, for side-by-side comparison.
subset_translated_df[subset_translated_df.term.notna()]

Unnamed: 0,language,language_name,term_source,term,translated_term
1110,ru,Russian,Digital Humanities,Цифровые гуманитарные науки,Цифровые гуманитарные науки
1111,fr,French,Digital Humanities,Humanités numériques,Humanités numériques
1112,la,Latin,Digital Humanities,Humanitates digitales,Digital Humanities
1113,ta,Tamil,Digital Humanities,எண்ணிம மனிதவியல்,டிஜிட்டல் மனிதநேயம்
1114,ca,Catalan; Valencian,Digital Humanities,Humanitats digitals,Humanitats digitals
1115,en,English,Digital Humanities,Digital Humanities,Digital Humanities
1116,de,German,Digital Humanities,Digital Humanities,Digitale Geisteswissenschaften
1117,ko,Korean,Digital Humanities,디지털 인문학,디지털 인문학
1118,it,Italian,Digital Humanities,Informatica umanistica,Scienze umanistiche digitali
1119,sh,Serbo-Crotian,Digital Humanities,Дигиталне хуманистичке науке,Дигиталне хуманистичке науке


In [12]:
# Keep only the rows that received a translation.
# An explicit copy is taken because later cells assign into `cleaned_dh`;
# writing into a filtered slice of `final_df` is ambiguous and triggers
# pandas' SettingWithCopyWarning.
cleaned_dh = final_df[final_df.translated_term.notna()].copy()

In [13]:
# Inspect the German rows across all source terms.
cleaned_dh[cleaned_dh.language == 'de']

Unnamed: 0,language,language_name,term_source,term,translated_term
51,de,German,Humanities,,Geisteswissenschaften
236,de,German,Public History,,Öffentliche Geschichte
421,de,German,Digital History,,Digitale Geschichte
606,de,German,Digital Cultural Heritage,,Digitales Kulturerbe
791,de,German,Cultural Analytics,,Kulturanalyse
976,de,German,Computational Humanities,,Computational Humanities
1116,de,German,Digital Humanities,Digital Humanities,Digitale Geisteswissenschaften


In [14]:
# Use the translation as the final term wherever no existing term is present,
# and for every German row regardless (there the translation replaces the
# existing term).
use_translation = cleaned_dh.term.isna() | (cleaned_dh.language == 'de')
cleaned_dh.loc[use_translation, 'term'] = cleaned_dh.translated_term


In [15]:
import requests
from bs4 import BeautifulSoup
import lxml

In [30]:
# Load Wikimedia's table of language codes (which includes a `directionality`
# column) from the local cache, or scrape it once and cache it.
from io import StringIO

output_path = "../data/metadata_files/iso_639_choices_directionality_wikimedia.csv"
if os.path.exists(output_path):
    df = pd.read_csv(output_path)
else:
    url = "https://meta.wikimedia.org/wiki/Template:List_of_language_names_ordered_by_code"
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing and caching an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # The language list is taken from the first table on the page.
    table = soup.find_all('table')[0]
    # read_html expects a path or file-like object; passing literal HTML is
    # deprecated, so the markup is wrapped in StringIO.
    df = pd.read_html(StringIO(str(table)))[0]
    # Write to the same path the cache check above reads from.
    df.to_csv(output_path, index=False)

In [38]:
# Keep only the rows whose directionality is a real value ('ltr' or 'rtl').
df = df.loc[df['directionality'].isin(['ltr', 'rtl'])]

In [42]:
# Sanity check: rows of subset_dh whose code appears in df, plus rows of df
# whose code does not appear in subset_dh.
# NOTE(review): `subset_dh` is only assigned in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run.
len(subset_dh[subset_dh.code.isin(df.code.unique())]) + len(df[~df.code.isin(subset_dh.code.unique())])

269

: 

In [40]:
# Wikimedia language codes that have no row in subset_dh.
# NOTE(review): `subset_dh` is only assigned in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run.
df[~df.code.isin(subset_dh.code.unique())]

Unnamed: 0,code,English language name,directionality,local language name,local or English Wikipedia article,comment
0,aa,Afar,ltr,Afar,en:Afar language,
1,ab,Abkhazian,ltr,Аҧсуа,en:Abkhazian language,
4,als,Alemannic,ltr,Alemannisch,en:Alemannic language,en:ISO 639-3: gsw (als is en:Tosk Albanian)
6,an,Aragonese,ltr,Aragonés,en:Aragonese language,
7,ang,Angal,ltr,Angal Heneng,en:Angal language,
...,...,...,...,...,...,...
262,zh-min-nan,Minnan,ltr,Bân-lâm-gú,en:Min Nan language,ISO 639-3: nan
263,zh-yue,Cantonese,ltr,粵語 / 粤语,zh-yue:粵語,ISO 639-3: yue
267,closed-zh-tw,Traditional Chinese,ltr,‪中文(台灣)‬,en:Chinese language,closed
269,zh-tw,Traditional Chinese,ltr,‪中文(台灣)‬,en:Chinese language,redirects to zh


In [17]:
import html

# Decode HTML entities in both term columns, e.g. "Al&#39;adun Dijital"
# becomes "Al'adun Dijital".
for column in ('term', 'translated_term'):
    cleaned_dh[column] = cleaned_dh[column].apply(html.unescape)

In [18]:
# Rename `language` to `code` so cleaned_dh shares its key column name with
# the directionality table it is merged with next.
cleaned_dh = cleaned_dh.rename(columns={'language': 'code'})

In [19]:
# Outer-join the directionality metadata with the terms on `code`, so that
# codes present on only one side are kept with missing values on the other.
directionality_columns = ['code', 'directionality', 'English language name', 'local language name']
merged_lang_terms = df[directionality_columns].merge(cleaned_dh, on='code', how="outer")

In [20]:
# Drop the row whose `code` is the literal text "see also Test languages"
# rather than a language code.
merged_lang_terms = merged_lang_terms[merged_lang_terms.code != "see also Test languages"]

In [21]:
# Preview the merged language metadata and terms.
merged_lang_terms

Unnamed: 0,code,directionality,English language name,local language name,language_name,term_source,term,translated_term
0,aa,ltr,Afar,Afar,,,,
1,ab,ltr,Abkhazian,Аҧсуа,,,,
2,af,ltr,Afrikaans,Afrikaans,Afrikaans,Humanities,Geesteswetenskappe,Geesteswetenskappe
3,af,ltr,Afrikaans,Afrikaans,Afrikaans,Public History,Openbare Geskiedenis,Openbare Geskiedenis
4,af,ltr,Afrikaans,Afrikaans,Afrikaans,Digital History,Digitale geskiedenis,Digitale geskiedenis
...,...,...,...,...,...,...,...,...
1003,nb,ltr,Norwegian Bokmål,Norsk (bokmål),Norwegian Bokmål,Cultural Analytics,Kulturanalyse,Kulturanalyse
1004,nb,ltr,Norwegian Bokmål,Norsk (bokmål),Norwegian Bokmål,Computational Humanities,Computational Humanities,Computational Humanities
1005,nb,ltr,Norwegian Bokmål,Norsk (bokmål),Norwegian Bokmål,Digital Humanities,Digital humaniora,Digital humaniora
1006,zh-tw,ltr,Traditional Chinese,‪中文(台灣)‬,,,,


In [22]:

# Count the distinct languages (by English name) with and without a term
# across all source terms.
languages_with_terms = merged_lang_terms.loc[
    merged_lang_terms.term.notna(), 'English language name'].nunique()
languages_missing_terms = merged_lang_terms.loc[
    merged_lang_terms.term.isna(), 'English language name'].nunique()
print(f"Our data now contains info for {languages_with_terms} but we also are missing terms for the following number of languages {languages_missing_terms}")

Our data now contains info for 123 but we also are missing terms for the following number of languages 147


In [23]:
# Restrict to the 'Digital Humanities' source term and repeat the coverage
# report on that subset.
is_digital_humanities = merged_lang_terms.term_source == 'Digital Humanities'
subset_dh = merged_lang_terms[is_digital_humanities]

dh_with_terms = subset_dh.loc[subset_dh.term.notna(), 'English language name'].nunique()
dh_missing_terms = subset_dh.loc[subset_dh.term.isna(), 'English language name'].nunique()
print(f"Our data now contains info for {dh_with_terms} but we also are missing terms for the following number of languages {dh_missing_terms}")

Our data now contains info for 123 but we also are missing terms for the following number of languages 0


In [32]:
# Preview the Digital Humanities rows.
subset_dh

Unnamed: 0,code,directionality,English language name,local language name,language_name,term_source,term,translated_term
8,af,ltr,Afrikaans,Afrikaans,Afrikaans,Digital Humanities,Digitale Geesteswetenskappe,Digitale Geesteswetenskappe
15,ak,ltr,Akan,Akana,Akan,Digital Humanities,Digitals Nnipa Ho Adesua,Digitals Nnipa Ho Adesua
23,am,ltr,Amharic,አማርኛ,Amharic,Digital Humanities,ዲጂታል ሰብአዊነት,ዲጂታል ሰብአዊነት
33,ar,rtl,Arabic,العربية,Arabic,Digital Humanities,إنسانيات رقمية,العلوم الإنسانية الرقمية
42,as,ltr,Assamese,অসমীয়া,Assamese,Digital Humanities,ডিজিটেল মানৱীয় বিজ্ঞান,ডিজিটেল মানৱীয় বিজ্ঞান
...,...,...,...,...,...,...,...,...
970,yi,rtl,Yiddish,ייִדיש,Yiddish,Digital Humanities,דיגיטאַל הומאַניטיעס,דיגיטאַל הומאַניטיעס
977,yo,ltr,Yoruba,Yorùbá,Yoruba,Digital Humanities,Digital Humanities,Digital Humanities
985,zh,ltr,Chinese,中文,Chinese,Digital Humanities,数字人文,数字人文
995,zu,ltr,Zulu,isiZulu,Zulu,Digital Humanities,I-Digital Humanities,I-Digital Humanities


In [55]:
# ISO languages whose code has no Digital Humanities row in subset_dh.
iso_languages[~iso_languages.language.isin(subset_dh.code.unique())]

Unnamed: 0,language,language_name
0,ab,Abkhaz
1,aa,Afar
7,an,Aragonese
10,av,Avaric
11,ae,Avestan
...,...,...
173,ve,Venda
175,vo,Volapük
176,wa,Walloon
178,wo,Wolof


In [24]:
# Preview the grouping by (term_source, term): codes and English names are
# joined into one string per term, and `term` holds the row count per group.
# The same aggregation is stored as `grouped_dh_terms` in the next code cell.
subset_dh.groupby(['term_source','term']).agg({'code': ','.join, 'term': 'count', 'English language name': ', '.join }).reset_index(level=0)

Unnamed: 0_level_0,term_source,code,term,English language name
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aadamaha Dijital ah,Digital Humanities,so,1,Somalia
Binadamu wa Kidijitali,Digital Humanities,sw,1,Swahili
Ciferecaj Homaroj,Digital Humanities,eo,1,Esperanto
Daonnachdan didseatach,Digital Humanities,gd,1,Scottish Gaelic
Daonnachtaí Digiteacha,Digital Humanities,ga,1,Irish
...,...,...,...,...
ዲጂታል ሰብኣዊ ስነፍልጠት,Digital Humanities,ti,1,Tigrinya
មនុស្សធម៌ឌីជីថល,Digital Humanities,km,1,Cambodian
デジタル・ヒューマニティーズ,Digital Humanities,ja,1,Japanese
数字人文,Digital Humanities,zh,1,Chinese


In [25]:
# Group the Digital Humanities rows by their final term, so that languages
# sharing an identical term collapse into a single row. Per group: the codes
# and English names are joined into one string each, and `term` becomes the
# number of rows in the group.
# (The unused `merged_lang_terms[...]` expression that opened this cell was
# removed: its result was neither displayed nor assigned.)
grouped_dh_terms = (
    subset_dh
    .groupby(['term_source', 'term'])
    .agg({'code': ','.join, 'term': 'count', 'English language name': ', '.join})
    .reset_index(level=0)
)

# After the reset the index holds the term text; keep it as a regular column
# because the `term` column now contains the count.
grouped_dh_terms['final_term'] = grouped_dh_terms.index

# Drop the term index and list the most widely shared terms first.
grouped_dh_terms = grouped_dh_terms.reset_index(level=0, drop=True).sort_values(by='term', ascending=False)

In [29]:
# Number of distinct final terms.
len(grouped_dh_terms)

105

In [80]:
# Export the terms shared by more than one language code: `code` is a
# comma-joined string, so a comma means several codes map to the same term.
shared_term_rows = grouped_dh_terms[grouped_dh_terms.code.str.contains(',')]
shared_term_rows[['English language name', 'final_term']].to_csv(
    '../data/derived_files/dh_terms_with_multiple_codes.csv', index=False)

In [None]:
# Load Google Libraries
from google.cloud import translate_v2 as translate
from google.oauth2 import service_account

# Load API Key
key_path = apikey.load("GOOGLE_TRANSLATE_CREDENTIALS")

# Create Translate Client
credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
translate_client = translate.Client(
    credentials=credentials)

# Translate Text
# Each row of `df` is expected to provide `term_source` (the text to
# translate) and `language` (the target language code).
# NOTE(review): elsewhere in this notebook `df` is the Wikimedia directionality
# table, which has neither column — confirm which frame this cell should use.
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Translating"):
    # Fixed pause before each request (presumably to throttle API calls).
    time.sleep(2)
    try:
        dh_term = row.term_source
        target_language = row.language
        text_result = translate_client.translate(
            dh_term, target_language=target_language)
        translated_text = text_result['translatedText']
        df.loc[index, 'translated_term'] = translated_text
    except Exception as exc:
        # Best effort: report the failure and move on. `Exception` (not a bare
        # except) keeps Ctrl-C able to stop the loop, and `row.get` keeps the
        # handler itself from raising when the `language` column is absent.
        print(f"Error on {row.get('language')}: {exc}")
        continue