In [1]:
import json
import os
import pandas as pd
import codecs
from tqdm import tqdm
import apikey

In [3]:
from google.cloud import translate_v2 as translate
from google.oauth2 import service_account

# Look up the value stored under GOOGLE_TRANSLATE_CREDENTIALS; it is used below as
# the path of a service-account key file.
key_path = apikey.load("GOOGLE_TRANSLATE_CREDENTIALS")

# Credentials built from that key file, limited to the cloud-platform scope.
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Translation client shared by the cells that follow.
translate_client = translate.Client(credentials=credentials)


In [4]:
# Load the "Digital humanities" JSON object and reshape it into one row per key.
# The file is opened in a `with` block so the handle is always closed, and
# 'utf-8-sig' strips a leading BOM if one is present.
with open('../data/metadata_files/en.Digital humanities.json',
          'r', encoding='utf-8-sig') as dh_terms_file:
    dh_terms = json.load(dh_terms_file)

# A single-row frame melted to long form: the former column labels end up in
# `language`, their values in `term`.
# NOTE(review): presumably the JSON keys are language codes - confirm against the file.
dh_df = pd.DataFrame([dh_terms]).melt()
dh_df.columns = ['language', 'term']


In [5]:
# Read the ISO 639 language list and expose its `name` column as `language_name`.
iso_languages = (
    pd.read_csv("../data/metadata_files/iso_639_choices.csv")
    .rename(columns={'name': 'language_name'})
)


In [6]:
# Outer-join the existing terms with the ISO language list on `language`, so rows
# present in only one of the two frames are kept, then tag every row with the
# English source term.
merged_dh = (
    dh_df
    .merge(iso_languages, on='language', how='outer')
    .assign(term_source='Digital Humanities')
)


In [7]:
# English source terms that get one row per language in addition to
# "Digital Humanities".
target_terms = [
    "Humanities",
    "Public History",
    "Digital History",
    "Digital Cultural Heritage",
    "Cultural Analytics",
    "Computational Humanities",
]

In [8]:
# One copy of the ISO language table per target term, each tagged with the
# English term it stands for in `term_source`.
languages_dfs = []
for term in target_terms:
    humanities_df = iso_languages.assign(term_source=term)
    languages_dfs.append(humanities_df)


In [9]:
# Stack the per-term language tables together with the "Digital Humanities" table.
# The combined list is built fresh here rather than appended to `languages_dfs`
# in place, so re-running this cell does not add another duplicate block of
# `merged_dh` rows to the result.
final_df = pd.concat([*languages_dfs, merged_dh], ignore_index=True)
final_df

Unnamed: 0,language,language_name,term_source,term
0,ab,Abkhaz,Humanities,
1,aa,Afar,Humanities,
2,af,Afrikaans,Humanities,
3,ak,Akan,Humanities,
4,sq,Albanian,Humanities,
...,...,...,...,...
1290,xh,Xhosa,Digital Humanities,
1291,yi,Yiddish,Digital Humanities,
1292,yo,Yoruba,Digital Humanities,
1293,za,"Zhuang, Chuang",Digital Humanities,


In [10]:
# Read cleaned_translated_dh_terms.csv from the derived-files folder.
cleaned_df = pd.read_csv(
    filepath_or_buffer='../data/derived_files/cleaned_translated_dh_terms.csv',
)

In [12]:
# Show the rows whose `translated_term` differs from `term`.
cleaned_df.loc[cleaned_df['translated_term'].ne(cleaned_df['term'])]

Unnamed: 0,language,language_name,term_source,term,translated_term
4,ar,Arabic,Humanities,إنسانيات,العلوم الإنسانية
31,fr,French,Humanities,Humanités,Sciences humaines
47,it,Italian,Humanities,umanistica,Studi umanistici
84,ru,Russian,Humanities,гуманитарные,Гуманитарные науки
739,la,Latin,Digital Humanities,Humanitates digitales,Digital Humanities
740,ta,Tamil,Digital Humanities,எண்ணிம மனிதவியல்,டிஜிட்டல் மனிதநேயம்
745,it,Italian,Digital Humanities,Informatica umanistica,Scienze umanistiche digitali
747,cy,Welsh,Digital Humanities,Dyniaethau digidol,Dyniaethau Digidol
748,ar,Arabic,Digital Humanities,إنسانيات رقمية,العلوم الإنسانية الرقمية
751,fi,Finnish,Digital Humanities,Digitaaliset ihmistieteet,Digitaaliset humanistiset tieteet


In [9]:
import time

In [180]:
# Translate every (term_source, language) row with the Google Translate client and
# store the result in `final_df['translated_term']`.
#
# The run stays best-effort: a failing row is skipped and the loop continues. The
# failure is recorded in `translation_errors` instead of being discarded, and only
# `Exception` is caught, so Ctrl-C (KeyboardInterrupt) and SystemExit still stop
# the loop.
translation_errors = []
for index, row in tqdm(final_df.iterrows(), total=final_df.shape[0], desc="Translating"):
    # Fixed pause before each request - presumably to stay under API rate limits.
    time.sleep(2)
    try:
        dh_term = row.term_source
        target_language = row.language
        text_result = translate_client.translate(
            dh_term, target_language=target_language)
        translated_text = text_result['translatedText']
        final_df.loc[index, 'translated_term'] = translated_text
    except Exception as error:
        translation_errors.append({
            'index': index,
            'language': row.language,
            'term_source': row.term_source,
            'error': repr(error),
        })
        continue

# A single summary line keeps the output short while making failures visible.
if translation_errors:
    print(f"{len(translation_errors)} of {final_df.shape[0]} translations failed; "
          "inspect `translation_errors` for details.")
    


Translating: 100%|██████████| 1295/1295 [1:03:19<00:00,  2.93s/it] 


In [182]:
# Write the translated table to disk without the index column.
final_df.to_csv(
    path_or_buf="../data/derived_files/translated_dh_terms.csv",
    index=False,
)

In [183]:
import warnings
warnings.filterwarnings('ignore')

In [193]:
# Keep only the rows that received a translation. `.copy()` makes this an
# independent frame, so the column assignments made to `cleaned_dh` in later cells
# do not trigger SettingWithCopyWarning and do not depend on whether pandas
# handed back a view of `final_df`.
cleaned_dh = final_df[final_df['translated_term'].notna()].copy()

In [194]:
# Inspect the rows whose language code is 'de'.
cleaned_dh.loc[cleaned_dh['language'].eq('de')]

Unnamed: 0,language,language_name,term_source,term,translated_term
51,de,German,Humanities,,Geisteswissenschaften
236,de,German,Public History,,Öffentliche Geschichte
421,de,German,Digital History,,Digitale Geschichte
606,de,German,Digital Cultural Heritage,,Digitales Kulturerbe
791,de,German,Cultural Analytics,,Kulturanalyse
976,de,German,Computational Humanities,,Computational Humanities
1116,de,German,Digital Humanities,Digital Humanities,Digitale Geisteswissenschaften


In [195]:
# Step 1: for 'de' rows that already have a term, replace it with the translation.
cleaned_dh.loc[
    cleaned_dh['term'].notna() & cleaned_dh['language'].eq('de'),
    'term',
] = cleaned_dh['translated_term']

# Step 2: wherever a term is still missing, fall back to the translation.
# The right-hand Series is aligned on the index, so each row gets its own value.
cleaned_dh.loc[cleaned_dh['term'].isna(), 'term'] = cleaned_dh['translated_term']


In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Fetch the Wikimedia template page that lists language names by code.
url = "https://meta.wikimedia.org/wiki/Template:List_of_language_names_ordered_by_code"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()


In [4]:
# Parse the fetched page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [5]:
import lxml

In [8]:
from io import StringIO

# Take the first <table> on the page and let pandas parse it into a DataFrame.
table = soup.find_all('table')[0]
# pandas 2.1+ deprecates passing literal HTML to read_html; wrapping the markup in
# StringIO hands it over as a file-like object, which every pandas version accepts.
df = pd.read_html(StringIO(str(table)))[0]


In [11]:
# Save the scraped language table without the index column.
df.to_csv(
    path_or_buf="../data/metadata_files/iso_639_choices_directionality_wikimedia.csv",
    index=False,
)

In [20]:
# Left-join the scraped language columns with the cleaned terms on `code`, keeping
# every scraped language even when it has no term.
# NOTE(review): `cleaned_dh_terms` is not defined in any cell visible here - confirm
# where it comes from before relying on a fresh-kernel run.
df[['code', 'directionality', 'English language name', 'local language name']].merge(
    cleaned_dh_terms, on='code', how="left")

Unnamed: 0,code,directionality,English language name,local language name,term,counts,term_source
0,aa,ltr,Afar,Afar,,,
1,ab,ltr,Abkhazian,Аҧсуа,,,
2,af,ltr,Afrikaans,Afrikaans,Digitale Geesteswetenskappe,1.0,Digital Humanities
3,af,ltr,Afrikaans,Afrikaans,Digitale geskiedenis,1.0,Digital History
4,af,ltr,Afrikaans,Afrikaans,Digitale kulturele erfenis,1.0,Digital Cultural Heritage
...,...,...,...,...,...,...,...
849,closed-zh-tw,ltr,Traditional Chinese,‪中文(台灣)‬,,,
850,nb,ltr,Norwegian Bokmål,Norsk (bokmål),,,
851,zh-tw,ltr,Traditional Chinese,‪中文(台灣)‬,,,
852,tokipona,ltr,tokipona,tokipona,,,


In [56]:
import html
# html.unescape("Al&#39;adun Dijital")


In [395]:
# Decode HTML entities (e.g. "&#39;") in both text columns.
# `na_action='ignore'` passes missing values through untouched; html.unescape
# would otherwise raise TypeError on a NaN. The function is passed directly, so
# no lambda wrapper is needed.
for column in ('term', 'translated_term'):
    cleaned_dh[column] = cleaned_dh[column].map(html.unescape, na_action='ignore')

In [307]:
# Load the repo-level and user-level search query join datasets.
search_queries_repo_df = pd.read_csv(
    filepath_or_buffer="../data/join_files/search_queries_repo_join_dataset.csv",
)
search_queries_user_df = pd.read_csv(
    filepath_or_buffer="../data/join_files/search_queries_user_join_dataset.csv",
)

In [308]:
# Row counts of the repo and user frames, space-separated on one line.
print(f"{len(search_queries_repo_df)} {len(search_queries_user_df)}")

8839 2688
