This notebook translates English terms in word lists into various big languages (French, German, etc.).

Steps:

1. Pool all words from various word lists together
2. Run the words through Google translate and save the translations to one file per language
3. Examine those files manually
4. Go through word lists and apply translations

In [1]:
!pip install requests google-cloud-translate tqdm



In [49]:
import os
import sys
import pickle
import json
import pandas as pd
import random
import numpy as np
from glob import glob
from tqdm import tqdm
import random
import time
from google.cloud import translate_v3

In [3]:
wordlist_paths = glob('dicts/*.txt')
wordlist_paths = [p for p in wordlist_paths 
                  if 'whitelist' not in p.lower() and 'blacklist' not in p.lower()
                  and 'allLocDict' not in p and 'sw_dict' not in p]

In [4]:
files = []
for p in wordlist_paths:
    with open(p) as f:
        files.append(f.readlines())

In [5]:
[(len(lines), path) for lines, path in zip(files, wordlist_paths)]

[(120, 'dicts/typesDict.txt'),
 (1705, 'dicts/subjectsDict.txt'),
 (343, 'dicts/subjectModifiersDict.txt'),
 (24, 'dicts/companyTypes.txt'),
 (23, 'dicts/commonSubjectsDict.txt'),
 (9, 'dicts/orgModifiersDict.txt'),
 (79, 'dicts/wordEndingsDict.txt'),
 (15521, 'dicts/subjectsDict-new.txt'),
 (30, 'dicts/univKeywords.txt'),
 (9, 'dicts/companySuffixes.txt'),
 (5, 'dicts/connectorsDict.txt'),
 (543, 'dicts/companyNames.txt')]

In [6]:
phrases = list(set(line.strip() for s in files for line in s))
phrases = [p for p in phrases if p] # remove empty string

In [7]:
len(phrases)

17800

## Eye-balling some words

In [8]:
np.random.choice(phrases, size=20)

array(['Choc', 'Surgeon', 'Weld', 'Neuromicrobiology', 'Biotechn',
       'Adjudic', 'Opthamology', 'first-fuel-software', 'ANFBEREIT-TECH',
       'gastroenterology peidiatraiceach', 'turn',
       'Community Health Sciences', 'Nuerosci', 'Pneumonology',
       'Carcinog', 'neuroscienze', 'Ultim', 'diMatematica',
       'Oftalmologiga', 'TRUBOPPROVODN'], dtype='<U57')

In [9]:
!grep Verhaltensmgiger dicts/*.txt

dicts/subjectsDict-new.txt:Verhaltensmgiger


In [10]:
!grep KELLOQ dicts/*.txt

dicts/subjectsDict-new.txt:KELLOQ


In [11]:
!grep Gemol dicts/*.txt

dicts/subjectsDict-new.txt:Gemol


# Setting up and checking Google Translate API

In [63]:
assert os.path.exists("../google-api-key.json"), "You might want to point to your Google API key and provide a project ID"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "../google-api-key.json"
project = 'projects/4995602461'

In [14]:
target_languages = ["fr", "es", "pt", "ca", 'nl', 'de', 'fi', 'da', 'no', 'tr', 'hu', 'pl', 'el']

In [16]:
client = translate_v3.TranslationServiceClient()
supported_languages = client.get_supported_languages(project)

In [17]:
supported_languages = set(lang.language_code for lang in supported_languages.languages)
assert all(l in supported_languages for l in target_languages)

# Translate all English phrases into each of chosen languages

To run the code below, you'll need to provide an API key. I run it once and saved the results to `../output/translation.csv` so you could also skip to the cell that has `pd.read_csv(...)` to read it back in.

In [20]:
translations_per_lang = [[] for _ in target_languages]

In [39]:
def translate(phrases, target_lang, translations=[], batch_size=100, sleep_sec=1.5): 
    # big batch sizes might lead to DeadlineExceeded
    # sleeping to avoid hitting against the quota (ResourceExhausted error)
    num_translated = len(phrases) 
    for i in tqdm(range(len(translations), num_translated, batch_size), position=0, leave=True):
        orig_phrases = phrases[i:i+batch_size]
        response = client.translate_text(contents=orig_phrases, target_language_code=target_lang,
                                         parent=project, source_language_code='en',
                                         model=project + '/locations/global/models/general/nmt')
        translated_phrases = [t.translated_text for t in response.translations]
        assert len(translated_phrases) == len(orig_phrases)
        translations.extend(zip(orig_phrases, translated_phrases))
        time.sleep(sleep_sec)
    return translations

In [41]:
_ = translate(phrases, target_languages[0], translations_per_lang[0])

0it [00:00, ?it/s]


## Filter out some dubious phrases to save money

In [32]:
# many of the phrases are not actually English, we can ignore them
translations0 = translations_per_lang[0]
dubious_phrases = [src for src, dest in translations0 if src.lower() == dest.lower()]
true_english_phrases = [src for src, dest in translations0 if src.lower() != dest.lower()]

In [50]:
random.sample(dubious_phrases, 20)

['Socio-Med',
 'Direct',
 'borgyogyaszat',
 'Quimica-Bioquimica',
 'Adaptat',
 'Electrom',
 'Landwirtsch',
 'Svar',
 'Landentwickl',
 'WAERME-STOFFUEBERTRAG',
 'Enterol',
 'Nephrologische',
 'Intellig',
 'assistenza infermieristica',
 'Alergia',
 'TURBERK',
 'Beob',
 'Infektologie',
 'Ethno-Natl',
 'Irregul']

In [52]:
random.sample(true_english_phrases, 20)

['Environmental Sciences',
 'Methad',
 'healthc',
 'Period',
 'Kinemat',
 'coop',
 'account',
 'Cellular',
 'Fin',
 'Agronomy',
 'PlasticSurgery',
 'Facial',
 'biochemie',
 'ofOtolaryngology/Head',
 'Orb',
 "d'Electronique",
 'Documents',
 'russell-investments',
 'Educational',
 'PhysicalChemistry']

In [56]:
len(true_english_phrases)

7567

In [57]:
translations_per_lang[0] = [(src, dest) for src, dest in translations0 if src.lower() != dest.lower()]

## Translate remaining phrases into remaining languages

In [42]:
for target_lang, translations in zip(target_languages, translations_per_lang):
    print('Translating en --> %s' % target_lang)
    translate(true_english_phrases, target_lang, translations)    

0it [00:00, ?it/s]
0it [00:00, ?it/s]
  0%|          | 0/44 [00:00<?, ?it/s]

Translating en --> fr
Translating en --> es
Translating en --> pt


100%|██████████| 44/44 [01:18<00:00,  1.79s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> ca


100%|██████████| 76/76 [02:16<00:00,  1.80s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> nl


100%|██████████| 76/76 [02:11<00:00,  1.73s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> de


100%|██████████| 76/76 [02:33<00:00,  2.03s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> fi


100%|██████████| 76/76 [02:16<00:00,  1.80s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> da


100%|██████████| 76/76 [02:13<00:00,  1.76s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> no


100%|██████████| 76/76 [02:14<00:00,  1.77s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> tr


100%|██████████| 76/76 [02:11<00:00,  1.73s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> hu


100%|██████████| 76/76 [02:16<00:00,  1.80s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> pl


100%|██████████| 76/76 [02:13<00:00,  1.76s/it]
  0%|          | 0/76 [00:00<?, ?it/s]

Translating en --> el


100%|██████████| 76/76 [02:37<00:00,  2.08s/it]


## Eye-balling

In [53]:
random.sample(random.choice(translations_per_lang), 20)

[('andBiological', 'andBiological'),
 ('Gastroenterology/Hematology', 'Gastroenterology / Hematology'),
 ('Endocrinolgy', 'Endocrinolgy'),
 ('Science/Plant', 'Tudomány / Plant'),
 ('Vigil', 'Virrasztás'),
 ('Clinical Psychology', 'Klinikai pszichológia'),
 ('Self-Organiz', 'Self-Organiz'),
 ('Sport and Exercise Science', 'Sport és testmozgás tudomány'),
 ('Sport Science', 'Sporttudomány'),
 ('Gynaecology/', 'Nőgyógyászat/'),
 ('Antipod', 'Antipod'),
 ('Most', 'A legtöbb'),
 ('Crop/Soil', 'Crop / talaj'),
 ('Neurotraumatology', 'Neurotraumatology'),
 ('Disc', 'Lemez'),
 ('Non-West', 'Non-West'),
 ('Syntax-Semant', 'Syntax-Semant'),
 ('Drug-Facil', 'Kábítószer-Facil'),
 ('Histocompatibility', 'hisztokompatibilitási'),
 ('medycyna laboratoryjna', 'medycyna laboratórium')]

In [54]:
random.sample(random.choice(translations_per_lang), 20)

[('SurgeryK', 'Chirurgia'),
 ('Multitask', 'Wielozadaniowy'),
 ('Machinery', 'Maszyneria'),
 ('opencounter', 'OpenCounter'),
 ('Hydrobiology', 'Hydrobiologia'),
 ('Fright', 'Strach'),
 ('Ambassad', 'Ambasador'),
 ('Aqueous-Org', 'Org. Wodny'),
 ('Low-Temp', 'Niska temperatura'),
 ('Psychotherapy', 'Psychoterapia'),
 ('Tenn', 'Tenn'),
 ('computer-packages-inc', 'computer-Package-inc'),
 ('Foster', 'Sprzyjać'),
 ('Echography', 'Echografia'),
 ("d'Obstetricia", 'd&#39;Obstetricia'),
 ('State-Corp', 'State-Corp'),
 ('Cardiac Electrophysiology', 'Elektrofizjologia serca'),
 ('Cases', 'Skrzynie'),
 ('Inorganic', 'Nieorganiczny'),
 ('German-Dan', 'Niemiecki-Dan')]

In [55]:
random.sample(random.choice(translations_per_lang), 20)

[('Molecular Toxicology', 'Toxicología Molecular'),
 ('endokrynologia', 'endocrinologia'),
 ('Hematology/G03.647', 'Hematología / G03.647'),
 ('Medical Education', 'Educación médica'),
 ('autogrid-systems', 'sistemas de autogrid'),
 ('Community', 'Comunidad'),
 ('Economy', 'Economía'),
 ('Herbic', 'Herbic'),
 ('legal-science-partners', 'socios de ciencias jurídicas'),
 ('Animal Science', 'Ciencia Animal'),
 ('Padova/Math', 'Padua / Matemáticas'),
 ('Pestic', 'Pesticida'),
 ('Med./Radiology', 'Med./Radiology'),
 ('Rheum', 'Reuma'),
 ('medical oncology', 'Oncologia medica'),
 ('Stem Cell and Regenerative Medicine',
  'Células Madre y Medicina Regenerativa'),
 ('Amazon', 'Amazonas'),
 ('Midwifery', 'Partería'),
 ('Stem Cells', 'Células madre'),
 ('Cell Signaling', 'Señal telefónica')]

# Save results

In [58]:
all_translations = list(zip(target_languages, translations_per_lang))

In [59]:
with open('../output/translations.json', 'w') as f:
    json.dump(all_translations, f)

In [62]:
!head -c 2000 ../output/translations.json

[["fr", [["Marine", "Marin"], ["Radiation Therapy", "Radioth\u00e9rapie"], ["Immunology", "Immunologie"], ["Gastroenterol.and", "Gastroenterol.et"], ["HPV-Relat", "Relation HPV"], ["Bioengineering", "Bioing\u00e9nierie"], ["ofPoly", "dePoly"], ["Four-Vol", "Quatre-vol"], ["d'Hgmatologie", "d&#39;Hgmatologie"], ["Mange", "Gale"], ["Remodel", "Remodeler"], ["Approac", "Approche"], ["Immunology-IMM18", "Immunologie-IMM18"], ["Peruv", "P\u00e9ruv"], ["Chinese Pharmaceutics", "Pharmacie chinoise"], ["Iberoam", "Ib\u00e9ro"], ["nationwide-mutual-insurance-company", "compagnie d&#39;assurance mutuelle nationale"], ["process", "processus"], ["Manufacture", "Fabrication"], ["Truth-Tell", "V\u00e9rit\u00e9 dire"], ["Modern Textile", "Textile moderne"], ["Chest Diseases", "Maladies de la poitrine"], ["Aware", "Conscient"], ["Mat", "Tapis"], ["subst", "sous-marin"], ["Dairy-Deriv", "Produits laitiers d\u00e9riv\u00e9s"], ["Nutritional Sciences", "Sciences de la nutrition"], ["Hearing", "Audition"]

# Translate word lists

In [65]:
with open('../output/translations.json') as f:
    all_translations = json.load(f)

In [68]:
for target_lang, translations in all_translations:
    dictionary = dict(translations)
    for src_path in wordlist_paths:
        dest_path = src_path.replace('.txt', '.%s.txt' % target_lang)
        with open(src_path) as f:
            phrases = [p.strip() for p in f]
        translated_phrases = [dictionary[p] for p in phrases if p in dictionary]
        with open(dest_path, 'w') as f:
            for p in translated_phrases:
                f.write(p)
                f.write('\n')

In [80]:
!wc -l dicts/*.txt

   49457 dicts/acronym_whitelist.txt
   81257 dicts/allLocDict.txt
      18 dicts/blackListDict.txt
      21 dicts/commonSubjectsDict.ca.txt
      21 dicts/commonSubjectsDict.da.txt
      21 dicts/commonSubjectsDict.de.txt
      21 dicts/commonSubjectsDict.el.txt
      21 dicts/commonSubjectsDict.es.txt
      21 dicts/commonSubjectsDict.fi.txt
      21 dicts/commonSubjectsDict.fr.txt
      21 dicts/commonSubjectsDict.hu.txt
      21 dicts/commonSubjectsDict.nl.txt
      21 dicts/commonSubjectsDict.no.txt
      21 dicts/commonSubjectsDict.pl.txt
      21 dicts/commonSubjectsDict.pt.txt
      21 dicts/commonSubjectsDict.tr.txt
      23 dicts/commonSubjectsDict.txt
     253 dicts/companyNames.ca.txt
     253 dicts/companyNames.da.txt
     253 dicts/companyNames.de.txt
     253 dicts/companyNames.el.txt
     253 dicts/companyNames.es.txt
     253 dicts/companyNames.fi.txt
     253 dicts/companyNames.fr.txt
     253 dicts/companyNames.hu.txt
     253 dicts/companyNames.nl.txt
     253 dicts