In [128]:
# Use svg graphics, display inline
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import glob
import re
import copy
import sys
import re

# Basic scientific computing imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Named hex colors used when plotting
FB_BLUE = '#1877F2'
FB_HERITAGE_BLUE = '#3B5998'
FB_LIGHT_BLUE = '#8B9DC3'
FB_VERY_LIGHT_BLUE = '#dfE3EE'
SOFT_RED = '#C23F38'
SOFT_GREEN = '#56B000'
NEUTRAL_GREY = '#A9A9A9'
# 15-color palette, three colors per row
GGPLOT_PALETTE = [
    '#F8766F', '#E58700', '#C99800',
    '#A3A500', '#6BB100', '#00BA38',
    '#00BF7D', '#00C0AF', '#00BCD8',
    '#00B0F6', '#619CFF', '#B983FF',
    '#E76BF3', '#FD61D1', '#FF67A4',
]

# pandas display: floats with three decimals, show up to 999 rows
pd.set_option('display.float_format', lambda value: '%.3f' % value)
pd.set_option('display.max_rows', 999)

# matplotlib: apply the default style first, then override the figure size
plt.style.use('default')
plt.rcParams['figure.figsize'] = (10, 6)

# numpy: no scientific notation when printing, fixed seed for the random draws below
np.set_printoptions(suppress=True)
np.random.seed(42)

from wikipedia import wikipedia
from nltk import tokenize
import requests
from bs4 import BeautifulSoup
import json
import html
import calendar
from string import ascii_lowercase

pd.set_option('max_colwidth', 300)

#import spacy library
import spacy
#load core english library
nlp = spacy.load("en_core_web_sm")

from google.cloud import translate_v2 as translate
import os

# Tell the Google Cloud client which service-account key file to use.
# setdefault() keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment, so the notebook can run on another machine
# without editing this hard-coded local path; the path is only the fallback.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/gkj/NLLB/analyses/gcloud_authentication/fair-translations-1-a47dc4b68cca.json'
)
print(sys.version)

3.7.11 (default, Jul 27 2021, 07:03:16) 
[Clang 10.0.0 ]


## Analysis Ideas

- Metrics
    - Segments where we're better than Google
    - Segments where we're good and bad, overall


- Properties to group by and check for score discrepancies
    - Does the sentence contain dates?
    - PCA or clustering with embeddings
    - Topics/Categories from the API (rejoin with dataframe for `wiki_paragraph.ipynb`)
    - Named entities (have to find a way to recognize them. Might be doable with the API)
    
```python
page = wikipedia.WikipediaPage(pageid=5043734)
page.categories
```

In [76]:
# Read the tab-separated source sentences and preview the first rows
df_source = pd.read_table('df_ids_sents.tsv')
df_source.head()

Unnamed: 0,sentence_id,sentence
0,5043734_1,"Semantic Web: The website DBpedia, begun in 2007, extracts data from the infoboxes and category declarations of the English-language Wikipedia."
1,5043734_2,"Wikimedia has created the Wikidata project with a similar objective of storing the basic facts from each page of Wikipedia and the other WMF wikis and make it available in a queriable semantic format, RDF."
2,5043734_3,"As of April 2021, it has 93,337,731 items."
3,5043734_4,"Obtaining the full contents of Wikipedia for reuse presents challenges, since direct cloning via a web crawler is discouraged."
4,5043734_5,"Wikipedia publishes ""dumps"" of its contents, but these are text-only; as of 2007 there was no dump available of Wikipedia's images."


Character length per sentence distribution

In [118]:
# Two-letter language code -> language name, in the order the languages are processed
code_to_language = dict([
    ('zh', 'mandarin'),
    ('ig', 'igbo'),
    ('zu', 'zulu'),
    ('is', 'icelandic'),
    ('ha', 'hausa'),
    ('oc', 'occitan'),
    ('lg', 'luganda'),
])

# Reverse lookup: language name -> two-letter code
language_to_code = dict(zip(code_to_language.values(), code_to_language.keys()))

# The two-letter codes, in insertion order
languages = [*code_to_language]
languages

['zh', 'ig', 'zu', 'is', 'ha', 'oc', 'lg']

In [39]:
# Translate every source sentence into each target language with the Google
# Translate client, sending the sentences in slices of `batch_size`.
translate_client = translate.Client()
source_segments = df_source.sentence.tolist()
batch_size = 100

translations = {}

for language in languages:

    if language not in ('oc', 'lg'):
        segments = []
        for start in range(0, len(source_segments), batch_size):
            batch = source_segments[start:start + batch_size]
            batch_translated = translate_client.translate(
                values=batch,
                target_language=language,
                source_language='en'
            )
            segments.extend(batch_translated)
    else:
        # google doesn't support translation for these languages
        # just append an empty field for each of the segments
        segments = [{'translatedText': ''}] * len(source_segments)

    print('completed', language)

    translations[language] = segments

completed zh
completed ig
completed zu
completed is
completed ha
completed oc
completed lg


In [47]:
# Work on an independent copy of the source frame and add one 'google_<code>'
# column per language containing the HTML-unescaped translated text
df_google_translations = df_source.copy(deep=True)
for language in languages:
    unescaped = [html.unescape(segment['translatedText']) for segment in translations[language]]
    df_google_translations[f'google_{language}'] = unescaped

In [65]:
def doc_contains_month(doc):
    '''Return True if `doc` mentions a month name as a stand-alone word.

    Month names come from `calendar.month_name` (January..December under the
    default English locale). Matching is case-sensitive, and a name only counts
    when it is not glued to other letters, so words that merely contain a month
    name ("Mayor", "Marching", "Augustus") are not reported. A month directly
    followed by digits or punctuation ("May2002", "April,") still counts.

    Parameters
    ----------
    doc : str
        Text to search.

    Returns
    -------
    bool
        True if at least one month name is found, otherwise False.
    '''
    # index 0 of calendar.month_name is an empty placeholder, so use 1..12
    months = [calendar.month_name[x] for x in range(1, 13)]

    # [^\W\d_] matches any letter; the lookarounds reject a month name that has
    # a letter immediately before or after it
    alternatives = '|'.join(re.escape(month) for month in months)
    pattern = r'(?<![^\W\d_])(?:' + alternatives + r')(?![^\W\d_])'

    return re.search(pattern, doc) is not None

In [68]:
# Flag each source sentence that mentions a month name, then display the frame
df_google_translations['contains_month'] = df_google_translations['sentence'].apply(doc_contains_month)
df_google_translations

Unnamed: 0,sentence_id,sentence,google_zh,google_ig,google_zu,google_is,google_ha,google_oc,google_lg,contains_date,contains_month
0,5043734_1,"Semantic Web: The website DBpedia, begun in 2007, extracts data from the infoboxes and category declarations of the English-language Wikipedia.",语义网：DBpedia 网站成立于 2007 年，从英文维基百科的信息框和类别声明中提取数据。,"Weebụ Semantic: Weebụsaịtị DBpedia, malitere na 2007, na-ewepụta data sitere na igbe ozi na nkwupụta udi nke Wikipedia asụsụ bekee.","Iwebhu ye-Semantic: Iwebhusayithi i-DBpedia, eyaqala ngo-2007, ikhipha idatha emabhokisini emininingwane kanye nezimemezelo zesigaba se-Wikipedia yolimi lwesiNgisi.","Merkingarvefur: Vefsíðan DBpedia, sem hófst árið 2007, dregur gögn úr upplýsingakössum og flokkayfirlýsingum á ensku Wikipedia.","Yanar Gizon Semantic: Gidan yanar gizon DBpedia, wanda aka fara a cikin 2007, yana fitar da bayanai daga akwatunan info da bayanin nau'in Wikipedia na Turanci.",,,False,False
1,5043734_2,"Wikimedia has created the Wikidata project with a similar objective of storing the basic facts from each page of Wikipedia and the other WMF wikis and make it available in a queriable semantic format, RDF.",维基媒体创建了 Wikidata 项目，其目标类似，即存储来自维基百科和其他 WMF wiki 的每个页面的基本事实，并使其以可查询的语义格式 RDF 可用。,"Wikimedia ewepụtala ọrụ Wikidata na ebumnobi yiri ya nke ịchekwa eziokwu ndị sitere na ibe ọ bụla nke Wikipedia na wiki WMF ndị ọzọ wee mee ka ọ dị n'ụdị ntụgharị uche dị mma, RDF.","I-Wikimedia idale iphrojekthi ye-Wikidata ngenjongo efanayo yokugcina amaqiniso ayisisekelo ekhasini ngalinye le-Wikipedia nakwamanye ama-wiki e-WMF futhi iyenze itholakale ngefomethi ye-semantic engabazekayo, i-RDF.","Wikimedia hefur búið til Wikidata verkefnið með svipuðu markmiði að geyma grunnstaðreyndir frá hverri síðu á Wikipedia og öðrum WMF wikis og gera þær aðgengilegar á spurningafræðilegu merkingarformi, RDF.","Wikimedia ta ƙirƙiri aikin Wikidata tare da manufa iri ɗaya na adana ainihin bayanai daga kowane shafi na Wikipedia da sauran wikis na WMF da kuma sanya shi cikin sigar ma'ana mai mahimmanci, RDF.",,,False,False
2,5043734_3,"As of April 2021, it has 93,337,731 items.","截至 2021 年 4 月，它有 93,337,731 个项目。","Ruo Eprel 2021, o nwere ihe 93,337,731.","Kusukela ngo-Ephreli 2021, inezinto ezingama-93,337,731.",Frá og með apríl 2021 hefur það 93.337.731 hluti.,"Tun daga Afrilu 2021, tana da abubuwa 93,337,731.",,,True,True
3,5043734_4,"Obtaining the full contents of Wikipedia for reuse presents challenges, since direct cloning via a web crawler is discouraged.",获取维基百科的全部内容以供重用是一项挑战，因为不鼓励通过网络爬虫直接克隆。,"Inweta ọdịnaya zuru ezu nke Wikipedia maka ijikwa ya na-eweta ihe ịma aka, ebe ọ bụ na a na-akụda mmụọ ịchịkọta ozugbo site na crawler webụ.","Ukuthola okuqukethwe okugcwele kwe-Wikipedia ukuze kuphinde kusetshenziswe kuletha izinselele, njengoba ukuhlanganisa okuqondile ngesiseshi sewebhu akukhuthazwa.","Það felur í sér áskoranir að fá allt innihald Wikipedia til endurnotkunar, þar sem beinni einræktun í gegnum vefskrið er óráðlegt.","Samun cikakken abin da ke cikin Wikipedia don sake amfani da shi yana gabatar da ƙalubale, tun da an hana yin amfani da haɗin yanar gizo kai tsaye ta hanyar rarrafe.",,,False,False
4,5043734_5,"Wikipedia publishes ""dumps"" of its contents, but these are text-only; as of 2007 there was no dump available of Wikipedia's images.",维基百科发布其内容的“转储”，但这些都是纯文本的；截至 2007 年，没有可用的 Wikipedia 图像转储。,"Wikipedia na-ebipụta ""mkpofu"" nke ọdịnaya ya, mana ndị a bụ naanị ederede; Dị ka n'afọ 2007 ọ nweghị ihe mkpofu dị na foto Wikipedia.","I-Wikipedia ishicilela ""ukulahlwa"" kokuqukethwe kwayo, kodwa lokhu kungombhalo kuphela; kusukela ngo-2007 kwakungekho ukulahlwa okutholakala kwezithombe ze-Wikipedia.","Wikipedia birtir „sorphaugar“ af innihaldi þess, en þetta er eingöngu texti; frá og með 2007 var engin sorphaugur tiltækur af myndum Wikipedia.","Wikipedia yana buga ""juji"" na abubuwan da ke cikinsa, amma waɗannan su ne rubutu kawai; har zuwa 2007 babu wani juji da aka samu na hotunan Wikipedia.",,,False,False
5,5043734_6,Wikimedia Enterprise is a for-profit solution to this.,Wikimedia Enterprise 是一个以营利为目的的解决方案。,Ụlọ ọrụ Wikimedia bụ ihe ngwọta maka nke a.,I-Wikimedia Enterprise iyisixazululo senzuzo kulokhu.,Wikimedia Enterprise er hagnaðarlausn á þessu.,Wikimedia Enterprise mafita ce ta riba ga wannan.,,,False,False
6,21951847_1,"Billed simply as Hannah, Waddingham placed ""Our Kind of Love"" in the UK Singles Chart in October 2000, where it peaked at No. 41.",2000 年 10 月，Waddingham 将《Our Kind of Love》列入英国单曲榜，最高排名第 41 位。,"N'ịbụ onye a na-akwụ ụgwọ dị ka Hannah, Waddingham debere ""Ụdị Ịhụnanya Anyị"" na UK Singles Chart na October 2000, ebe ọ ruru na 41.","Ikhokhiswe nje ngokuthi u-Hannah, i-Waddingham yabeka ""Uhlobo Lwethu Lothando"" e-UK Singles Chart ngo-Okthoba 2000, lapho yafinyelela inani eliphakeme kuNombolo 41.","Waddingham, einfaldlega sem Hannah, setti „Our Kind of Love“ á breska smáskífulistanum í október 2000, þar sem hann náði hámarki í 41. sæti.","An biya shi kawai kamar yadda Hannah, Waddingham ya sanya ""Irin Ƙaunarmu"" a cikin Jadawalin Singles na Burtaniya a cikin Oktoba 2000, inda ya kai kololuwa a lamba 41.",,,True,True
7,21951847_2,"She later sang the role of Starbird on the soundtrack recording of Space Family Robinson, released by Pop! Records in May 2002 to coincide with the stage production (also featuring Waddingham as Starbird) which ran for three weeks at London's Pleasance Theatre in May 2002.",后来她在 Pop! 发行的 Space Family Robinson 的配乐录音中演唱了 Starbird 角色。 2002 年 5 月录制的唱片与 2002 年 5 月在伦敦普莱森斯剧院进行了三周的舞台制作（也以 Waddingham 饰演 Starbird 为主角）相吻合。,"O mechara bụrụ ọrụ Starbird na ndekọ ụda nke Space Family Robinson, nke Pop! Ihe ndekọ na Mee 2002 iji kwekọọ na mmepụta ogbo (na-egosipụtakwa Waddingham dị ka Starbird) nke gbara izu atọ na Ụlọ ihe nkiri Pleasance London na Mee 2002.","Kamuva wacula indima ka-Starbird engomeni eqoshiwe ye-Space Family Robinson, ekhishwe yi-Pop! Amarekhodi ngoMeyi 2002 ukuze ahambisane nokukhiqizwa kwesiteji (okuhlanganisa no-Waddingham njengo-Starbird) owathatha amasonto amathathu e-London's Pleasance Theatre ngoMeyi 2002.","Hún söng síðar hlutverk Starbird á hljóðrásarupptöku Space Family Robinson, gefin út af Pop! Upptökur í maí 2002 til að falla saman við sviðsframsetninguna (einnig með Waddingham sem Starbird) sem stóð í þrjár vikur í Pleasance Theatre í London í maí 2002.","Daga baya ta rera rawar Starbird akan rikodin sauti na Space Family Robinson, wanda Pop! Rubuce-rubucen a cikin Mayu 2002 don daidaitawa tare da samar da matakin (wanda kuma ke nuna Waddingham a matsayin Starbird) wanda ya yi aiki na tsawon makonni uku a gidan wasan kwaikwayo na Pleasance na Lon...",,,True,True
8,62387071_1,"On February 19, 2020, it was announced that principal photography would start in April 2020.",2020年2月19日，宣布主体摄影将于2020年4月开始。,"Na Febụwarị 19, 2020, ekwuputara na isi foto ga-amalite n'April 2020.","NgoFebhuwari 19, 2020, kwamenyezelwa ukuthi ukuthwebula izithombe okuyinhloko kuzoqala ngo-Ephreli 2020.",Þann 19. febrúar 2020 var tilkynnt að helstu myndatökur myndu hefjast í apríl 2020.,"A ranar 19 ga Fabrairu, 2020, an ba da sanarwar cewa babban ɗaukar hoto zai fara a Afrilu 2020.",,,True,True
9,62387071_2,"However, it was delayed due to the COVID-19 pandemic.",但是，由于 COVID-19 大流行，它被推迟了。,"Agbanyeghị, ọ na-egbu oge n'ihi ọrịa COVID-19.","Kodwa-ke, ibambezelekile ngenxa yobhubhane lwe-COVID-19.",Hins vegar var frestað vegna COVID-19 heimsfaraldursins.,"Koyaya, an jinkirta shi saboda cutar ta COVID-19.",,,False,False


Merge in Facebook translations

In [122]:
# Separate copy of the Google frame that the Facebook columns are merged into
df_all = df_google_translations.copy(deep=True)

In [123]:
# Glob pattern for the Facebook translation files (one .txt per language,
# one translated sentence per line).
path = '/Users/gkj/NLLB/analyses/eval_pg_level/fb_translations/*.txt'

# glob returns files in arbitrary order; sorting makes the order in which the
# fb_* columns are added to df_all reproducible between runs
for file_path in sorted(glob.glob(path)):
    # The language name is the second-to-last '_'-separated token. Parse only
    # the file name so underscores in the directory names cannot interfere.
    language = os.path.basename(file_path).split('_')[-2]
    if language == 'human':
        # files tagged 'human' are not merged here
        continue
    language_code = language_to_code[language]

    # The translations contain non-ASCII text; read them as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.read().strip().split('\n')

    # Lines must align row-for-row with df_all; fail with a readable message
    # instead of pandas' generic length-mismatch error.
    if len(sentences) != len(df_all):
        raise ValueError(
            f'{file_path}: expected {len(df_all)} lines, found {len(sentences)}'
        )

    df_all[f'fb_{language_code}'] = sentences

In [125]:
# df_all.to_csv('./df_translations_wide.tsv', sep='\t', index=False)

## Formatting and Shuffling
---

Per Cynthia Gao, each human evaluation tsv file should have the following column schema:

|fbid|sentence1_lang|sentence2_lang|sentence1|sentence2|
|------|-----|-----|-----|-----|
|1234|en|es|Hello, friend|Hola, amigo!|

Each of the 7 languages should have its own file with all of the samples for that language. For 5 of the languages, we have to evaluate both Facebook and Google translations.

To tell the two systems apart, I'm appending a randomly chosen letter to the end of each `sentence_id` to form the `fbid`. Facebook samples get an odd-numbered letter of the alphabet (a, c, e, g, ...); Google samples get an even-numbered letter (b, d, f, h, ...).

In [137]:
# Split the alphabet by position: 1st, 3rd, 5th, ... letters tag Facebook rows,
# 2nd, 4th, 6th, ... letters tag Google rows
fb_letters = list(ascii_lowercase[0::2])
google_letters = list(ascii_lowercase[1::2])

fb_letters, google_letters

(['a', 'c', 'e', 'g', 'i', 'k', 'm', 'o', 'q', 's', 'u', 'w', 'y'],
 ['b', 'd', 'f', 'h', 'j', 'l', 'n', 'p', 'r', 't', 'v', 'x', 'z'])

In [187]:
# Columns of df_all that hold machine translations, named '<system>_<two-letter code>'
translation_cols = [
    # Google
    'google_zh',
    'google_ig',
    'google_zu',
    'google_is',
    'google_ha',
    # Facebook
    'fb_lg',
    'fb_oc',
    'fb_ha',
    'fb_zu',
    'fb_zh',
    'fb_is',
    'fb_ig',
]

# Two-letter code -> language code written into the evaluation files
two_letter_to_bcp = dict([
    ('ha', 'hau'),
    ('ig', 'ibo'),
    ('is', 'isl'),
    ('zh', 'zho_Hans'),
    ('zu', 'zul'),
    ('lg', 'lug'),
    ('oc', 'oci'),
])

# Reverse lookup, used by the validation step
bcp_to_two_letter = dict(zip(two_letter_to_bcp.values(), two_letter_to_bcp.keys()))

In [219]:
# Build one evaluation frame per language in the vendor schema
# (fbid, sentence1_lang, sentence2_lang, sentence1, sentence2).
# Each translation system contributes one row per source sentence; the fbid is
# the sentence_id plus a random letter that encodes the system.
language_dfs = {}

for language in languages:

    # Match on the exact language suffix rather than a substring of the column name
    translation_sets = [x for x in translation_cols if x.split('_')[-1] == language]

    frames = []
    for language_column in translation_sets:

        bcp_code = two_letter_to_bcp[language]
        is_google = language_column.startswith('google_')
        letters = google_letters if is_google else fb_letters

        # One random suffix letter per row of df_all. The previous version sized
        # this with len(df_temp), which is not defined yet on a fresh kernel.
        chars_to_append = np.random.choice(letters, len(df_all))

        df_temp = pd.DataFrame({
            'fbid': [sentence_id + c for sentence_id, c in zip(df_all.sentence_id, chars_to_append)],
            'sentence1_lang': 'eng',
            'sentence2_lang': bcp_code,
            'sentence1': df_all.sentence.values,
            'sentence2': df_all[language_column].values
        })
        frames.append(df_temp)

    if frames:
        # Concatenate once, then shuffle so Google and Facebook rows are interleaved.
        # The original row index is kept, as before.
        df_out = pd.concat(frames).sample(frac=1)
        language_dfs[language] = df_out

In [248]:
# Show which languages have an evaluation frame
language_dfs.keys()

dict_keys(['zh', 'ig', 'zu', 'is', 'ha', 'oc', 'lg'])

## File naming conventions

from Cynthia:

```
<date>_<vendor>_eval_<project label>_<lang code>.tsv

Project label: ADHOC_NLLB-WIKI-HE_H2_2021
Vendor: MORAVIA
```

In [221]:
# Assemble each language's output path following
# <date>_<vendor>_eval_<project label>_<lang code>.tsv
date = 20211027
vendor = 'MORAVIA'
project_label = 'ADHOC_NLLB-WIKI-HE_H2_2021'

for lang, df in language_dfs.items():
    lang_code = two_letter_to_bcp[lang]
    file_name = f'{date}_{vendor}_eval_{project_label}_{lang_code}.tsv'
    file_path = f'~/NLLB/analyses/eval_pg_level/moravia_dataframes/{file_name}'

    # the write itself is left disabled
#     df.to_csv(file_path, sep='\t', index=False)

## Sanity check / validation

In [246]:
def check_id_sentence_platform(translation_object):
    '''Assert that an exported row carries the translation its fbid points to.

    The fbid is a sentence_id followed by one letter. A letter found in
    `google_letters` means the row is compared against the 'google_<lang>'
    column of `df_all`; any other letter means the 'fb_<lang>' column.

    translation_object: a row exposing `fbid`, `sentence2_lang` and
        `sentence2` attributes (e.g. from DataFrame.itertuples()).

    Raises AssertionError if `sentence2` differs from the value stored in
    `df_all` for that sentence_id.
    '''
    fbid = translation_object.fbid
    sentence_id, id_char = fbid[:-1], fbid[-1]
    language = bcp_to_two_letter[translation_object.sentence2_lang]

    # first row of df_all with this sentence_id
    matching_row = df_all.loc[df_all.sentence_id == sentence_id].iloc[0]

    # the suffix letter selects which system's column to compare against
    column = f'google_{language}' if id_char in google_letters else f'fb_{language}'
    assert matching_row[column] == translation_object.sentence2

In [247]:
# Run the id -> translation check on every exported row, counting rows as we go
row_count = 0
for language, frame in language_dfs.items():
    print(language)
    for test_row in frame.itertuples():
        check_id_sentence_platform(test_row)
        row_count += 1

print(f'checked {row_count} rows—all good!')

zh
ig
zu
is
ha
oc
lg
checked 6132 rows—all good!


## Upload the whole thing to Hive

In [224]:
# Stack the per-language frames into one frame, in dictionary order
df_out_all_langs = pd.concat([language_dfs[lang] for lang in language_dfs])

In [229]:
# 1 when the fbid's last letter is one of the Google letters, otherwise 0
df_out_all_langs['is_google'] = [int(fbid[-1] in google_letters) for fbid in df_out_all_langs.fbid]

In [230]:
# Display the combined frame, including the new is_google column
df_out_all_langs

Unnamed: 0,fbid,sentence1_lang,sentence2_lang,sentence1,sentence2,is_google
65,68573407_5m,eng,zho_Hans,The utility company had announced that they were planning on little to no power outages.,"公用事业公司宣布,他们计划几乎不会停电.",0
123,14533_4i,eng,zho_Hans,"The foundation of a typical Indian meal is a cereal cooked in plain fashion, and complemented with flavourful savoury dishes.","典型的印度餐的基础是以普通方式烹饪的谷物,并辅以美味的咸味菜肴.",0
456,57809627_2k,eng,zho_Hans,He is best known for his starring roles in the television series Prison Playbook (2017–2018) and Squid Game (2021).,"他最出名的是他在电视连续剧""监狱剧本""(2017'Äì2018)和""章鱼游戏""(2021)中的主演角色.",0
147,271181_5f,eng,zho_Hans,"The musical aired on August 2006 in Portland, Oregon and Hood River, Oregon, and it was presented on stage at Houston's Alley Theatre at the end of July 2007 under direction of Les R. Wood.",这部音乐剧于 2006 年 8 月在俄勒冈州波特兰市和俄勒冈州胡德河市播出，并于 2007 年 7 月末在莱斯 R.伍德的指导下在休斯顿小巷剧院上演。,1
224,4429395_3v,eng,zho_Hans,"Hughes was on a $50,000 cash bond and was charged with first-degree home invasion and malicious destruction of property.","休斯持有 50,000 美元的现金保证金，并被控一级入室盗窃和恶意破坏财产罪。",1
...,...,...,...,...,...,...
494,33790364_3e,eng,lug,He belongs to the Dalit Sikh community.,Ali mu kitundu kya Dalit Sikh.,0
336,153557_7e,eng,lug,"The amount that Collins was seeking was halved, and Satterfield and Davis (who originally brought the suit forward in California) would not have to repay any of it.","Ssente Collins ze yali anoonya zaakendeezebwa ekitundu, era Satterfield ne Davis (abaasooka okuleeta omusango mu California) tebaalina kusasula.",0
401,3848862_5m,eng,lug,I can go 100 percent out there and never have to worry about getting tired.,Nsobola okugenda mu bitundu 100 ku buli kikumi ne sirina kweraliikirira kukoowa.,0
303,35507_1k,eng,lug,A number of tools exist that crawl through a website to find pages that return 404 status codes.,Ebikozesebwa ebiwerako biriwo nga biyita mu mukutu gwa yintaneeti okuzuula empapula eziddamu ennamba z'embeera 404.,0


In [240]:
# df_out_all_langs.to_csv('./df_all_langs_pre_eval.tsv', sep='\t', index=False)