In [1]:
import json
import os
import pandas as pd
import codecs
from tqdm import tqdm
import apikey

In [2]:
from google.cloud import translate_v2 as translate
from google.oauth2 import service_account

# Look up the stored value for the Google Translate credentials; it is used
# below as the service-account file path (presumably a local JSON key file --
# confirm against the apikey store).
key_path = apikey.load("GOOGLE_TRANSLATE_CREDENTIALS")

# Build service-account credentials from that file with the cloud-platform scope.
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Client used for every translation request later in the notebook.
translate_client = translate.Client(credentials=credentials)


In [3]:
# Load the "Digital humanities" JSON mapping and reshape it to one row per entry.
# The file is opened in a `with` block so the handle is closed deterministically
# (the previous codecs.open(...) call was never closed); 'utf-8-sig' strips a
# leading byte-order mark if one is present.
with open('../data/metadata_files/en.Digital humanities.json',
          encoding='utf-8-sig') as dh_file:
    dh_terms = json.load(dh_file)

# A one-row frame melted to long form: each JSON key becomes `language` and
# its value becomes `term`.
dh_df = pd.DataFrame([dh_terms]).melt()
dh_df.columns = ['language', 'term']


In [4]:
# Read the ISO 639 language list and rename its `name` column to
# `language_name`.
iso_languages = (
    pd.read_csv("../data/metadata_files/iso_639_choices.csv")
    .rename(columns={'name': 'language_name'})
)


In [5]:
# Outer-join the known Digital Humanities terms with the ISO language list so
# languages present in only one of the two tables are kept, then tag every row
# with the English source term.
merged_dh = (
    dh_df
    .merge(iso_languages, how='outer', on='language')
    .assign(term_source='Digital Humanities')
)


In [6]:
# Additional English source terms; each one is paired with every ISO language
# further down in the notebook.
target_terms = [
    "Humanities",
    "Public History",
    "Digital History",
    "Digital Cultural Heritage",
    "Cultural Analytics",
    "Computational Humanities",
]

In [7]:
# One copy of the ISO language table per target term, each labelled with the
# term it belongs to via a `term_source` column.
languages_dfs = [
    iso_languages.assign(term_source=source_term)
    for source_term in target_terms
]


In [8]:
# Add the Digital Humanities rows to the per-term frames and stack everything
# into a single table with a fresh 0..n-1 index.
languages_dfs.append(merged_dh)
final_df = pd.concat(languages_dfs, ignore_index=True)
final_df

Unnamed: 0,language,language_name,term_source,term
0,ab,Abkhaz,Humanities,
1,aa,Afar,Humanities,
2,af,Afrikaans,Humanities,
3,ak,Akan,Humanities,
4,sq,Albanian,Humanities,
...,...,...,...,...
1290,xh,Xhosa,Digital Humanities,
1291,yi,Yiddish,Digital Humanities,
1292,yo,Yoruba,Digital Humanities,
1293,za,"Zhuang, Chuang",Digital Humanities,


In [9]:
# Load the cleaned translations file from the derived-data folder.
cleaned_df = pd.read_csv(
    filepath_or_buffer='../data/derived_files/cleaned_translated_dh_terms.csv',
)

In [10]:
# Show the rows whose `translated_term` differs from `term`.
cleaned_df.loc[cleaned_df['translated_term'].ne(cleaned_df['term'])]

Unnamed: 0,language,language_name,term_source,term,translated_term
4,ar,Arabic,Humanities,إنسانيات,العلوم الإنسانية
31,fr,French,Humanities,Humanités,Sciences humaines
47,it,Italian,Humanities,umanistica,Studi umanistici
84,ru,Russian,Humanities,гуманитарные,Гуманитарные науки
739,la,Latin,Digital Humanities,Humanitates digitales,Digital Humanities
740,ta,Tamil,Digital Humanities,எண்ணிம மனிதவியல்,டிஜிட்டல் மனிதநேயம்
745,it,Italian,Digital Humanities,Informatica umanistica,Scienze umanistiche digitali
747,cy,Welsh,Digital Humanities,Dyniaethau digidol,Dyniaethau Digidol
748,ar,Arabic,Digital Humanities,إنسانيات رقمية,العلوم الإنسانية الرقمية
751,fi,Finnish,Digital Humanities,Digitaaliset ihmistieteet,Digitaaliset humanistiset tieteet


In [11]:
import time

In [12]:
# Translate every (term_source, language) row with the Google Translate
# client. Results are cached on disk, so the API is only called when the
# cache file does not exist yet.
output_path = "../data/derived_files/translated_dh_terms.csv"
if os.path.exists(output_path):
    # Cached run: reuse the previously saved translations.
    final_df = pd.read_csv(output_path)
else:
    # Create the result column up front so it exists even if every request
    # fails; later cells read `final_df.translated_term` unconditionally.
    if 'translated_term' not in final_df.columns:
        final_df['translated_term'] = pd.Series(index=final_df.index, dtype='object')

    failed_rows = []
    for index, row in tqdm(final_df.iterrows(), total=final_df.shape[0], desc="Translating"):
        # Fixed pause before each request (throttles calls to the API).
        time.sleep(2)
        try:
            dh_term = row.term_source
            target_language = row.language
            text_result = translate_client.translate(
                dh_term, target_language=target_language)
            final_df.loc[index, 'translated_term'] = text_result['translatedText']
        except Exception as error:
            # Best effort: a failed row is skipped rather than aborting the
            # run. `Exception` (not a bare except) keeps Ctrl-C / kernel
            # interrupt working, and failures are recorded instead of being
            # dropped silently.
            failed_rows.append((row.get('language'), row.get('term_source'), repr(error)))
            continue

    if failed_rows:
        print(f"{len(failed_rows)} of {final_df.shape[0]} translations failed; first failures: {failed_rows[:5]}")

    final_df.to_csv(output_path, index=False)

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
# Keep only the rows that received a translation. The explicit .copy() makes
# cleaned_dh an independent frame, so the column assignments in later cells
# modify it directly rather than a slice derived from final_df (which is what
# triggers pandas' SettingWithCopyWarning).
cleaned_dh = final_df[final_df.translated_term.notna()].copy()

In [15]:
# Inspect the German rows.
cleaned_dh.loc[cleaned_dh['language'].eq('de')]

Unnamed: 0,language,language_name,term_source,term,translated_term
51,de,German,Humanities,,Geisteswissenschaften
236,de,German,Public History,,Öffentliche Geschichte
421,de,German,Digital History,,Digitale Geschichte
606,de,German,Digital Cultural Heritage,,Digitales Kulturerbe
791,de,German,Cultural Analytics,,Kulturanalyse
976,de,German,Computational Humanities,,Computational Humanities
1116,de,German,Digital Humanities,Digital Humanities,Digitale Geisteswissenschaften


In [16]:
# German rows that already carry a term are overwritten with the machine
# translation.
has_term = cleaned_dh['term'].notna()
is_german = cleaned_dh['language'] == 'de'
cleaned_dh.loc[has_term & is_german, 'term'] = cleaned_dh['translated_term']

# Every row still lacking a term falls back to its translated term.
cleaned_dh.loc[cleaned_dh['term'].isna(), 'term'] = cleaned_dh['translated_term']


In [17]:
import requests
from bs4 import BeautifulSoup
import lxml

In [22]:
from io import StringIO

# Load the Wikimedia "language names ordered by code" table. The first run
# scrapes the page and caches the first table as CSV; later runs read the cache.
output_path = "../data/metadata_files/iso_639_choices_directionality_wikimedia.csv"
if os.path.exists(output_path):
    df = pd.read_csv(output_path)
else:
    url = "https://meta.wikimedia.org/wiki/Template:List_of_language_names_ordered_by_code"
    # A timeout keeps the cell from hanging indefinitely, and raise_for_status
    # stops an HTTP error page from being parsed as if it were the table.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find_all('table')[0]
    # read_html expects a file-like object; passing literal HTML strings is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(table)))[0]
    # Write to the same path that the existence check above looks at.
    df.to_csv(output_path, index=False)

In [41]:
import html
# Decode HTML entities left in the text columns
# (e.g. "Al&#39;adun Dijital" becomes "Al'adun Dijital").
for text_column in ('term', 'translated_term'):
    cleaned_dh[text_column] = cleaned_dh[text_column].map(html.unescape)

In [42]:
# Rename `language` to `code`, the key column used by the merge with the
# Wikimedia language table.
cleaned_dh = cleaned_dh.rename({'language': 'code'}, axis='columns')

In [47]:
# Outer-join the Wikimedia language metadata with the translated terms on the
# language code, keeping languages that appear on only one side.
merged_lang_terms = df[
    ['code', 'directionality', 'English language name', 'local language name']
].merge(cleaned_dh, how="outer", on='code')

In [52]:

# Count distinct English language names with and without a term after the merge.
has_term = merged_lang_terms['term'].notna()
languages_with_term = merged_lang_terms.loc[has_term, 'English language name'].nunique()
languages_without_term = merged_lang_terms.loc[~has_term, 'English language name'].nunique()
print(f"Our data now contains info for {languages_with_term} but we also are missing terms for the following number of languages {languages_without_term}")

Our data now contains info for 123 but we also are missing terms for the following number of languages 148


In [54]:
# Group identical terms to see which ones are shared across languages:
# `code` becomes the comma-joined language codes and `term` the number of rows
# per term. (A leading filter expression whose result was discarded has been
# removed; it had no effect on the output.)
term_groups = cleaned_dh.groupby(['term']).agg({'code': ','.join, 'term': 'count'})

# Keep the term text as a regular column before the index is discarded.
term_groups['final_term'] = term_groups.index

# Most widely shared terms first.
grouped_dh_terms = term_groups.reset_index(drop=True).sort_values(by='term', ascending=False)

In [56]:
# A comma in `code` means the term is shared by at least two language codes.
grouped_dh_terms.loc[grouped_dh_terms['code'].str.contains(',')]

Unnamed: 0,code,term,final_term
42,"bs,ny,da,en,de,mg,nb,no,sm,sn,st,sv,tl,fy",14,Computational Humanities
74,"en,ny,ha,ig,lb,mg,sm,sn,st,tl,yo",11,Digital Humanities
45,"bs,ny,en,ky,lb,mg,sn,tl",8,Cultural Analytics
212,"en,ku,mg,sm,sn,tl,fy",7,Humanities
72,"bs,en,ky,la,sn,fy,yo",7,Digital History
69,"ny,en,sn,st,tl,yo",6,Digital Cultural Heritage
12,"id,jv,ms,su",4,Analisis Budaya
590,"hi,mr,ne,sa",4,डिजिटल इतिहास
191,"da,nb,no,sv",4,Humaniora
263,"da,de,nb,no",4,Kulturanalyse
