In [1]:
from nltk.tokenize import WordPunctTokenizer
import sys

sys.path.append('../src/')

from database import query_by_alias, query_by_name


In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

ner_tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER") #NER Model
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/distilbert-NER")

nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy='average')
example = "My name is Wolfgang Johnson and I live in Berlin"

ner_results = nlp(example)
print(ner_results)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'PER', 'score': 0.9950732, 'word': 'Wolfgang Johnson', 'start': 11, 'end': 27}, {'entity_group': 'LOC', 'score': 0.9970612, 'word': 'Berlin', 'start': 42, 'end': 48}]


In [11]:
example = 'Is Texas the largest state in US?'
ner_results = nlp(example)



def merge_entities(ner_output):
    result = []

    for item in ner_output:
        name = item['word']
        ent_type = item['entity_group']

        result.append((name, ent_type))

    
    return result

merge_entities(ner_results)

[('Texas', 'LOC'), ('US', 'LOC')]

In [12]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
article_en = "The head of the United Nations says there is no military solution in Syria"
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt") 
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX") #Translator Model

model_inputs = tokenizer(article_en, return_tensors="pt")

tr_table = { #Translation table to take in our expected language names and use them for facebooks model
    'ar': 'ar_AR',
    'de': 'de_DE',
    'es': 'es_XX',
    'fr': 'fr_XX',
    'it': 'it_IT',
    'ja': 'ja_XX'
}

def translate(source, lang): #translate wrapper 
    lang = tr_table[lang]
    inputs = tokenizer(source, return_tensors="pt", padding=True, truncation=True, max_length=200)
    generated = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id[lang])
    return tokenizer.batch_decode(generated, skip_special_tokens=True)


batch_sources = [
    "The quick brown fox jumps over the lazy dog",
    "Artificial intelligence is revolutionizing the world",
    "I live in Germany, Texas which is in the US",
    "Why does it keep happening",
    "Why does US keep becoming something it is not",
    "Is Texas the biggest state in the US",
]


translate(batch_sources, 'es') #example 

['El fox bruno rápido salta sobre el perro escaso',
 'La Inteligencia Artificial está revolucionando el mundo',
 'Yo vivo en Alemania, Texas que es en los Estados Unidos',
 '¿Por qué sigue ocurriendo?',
 '¿Por qué los Estados Unidos siguen siendo algo que no es?',
 'El Presidente (habla en inglés): De conformidad con el entendimiento alcanzado en las consultas previas del Consejo, consideraré que el Consejo de Seguridad desea concluir su examen del tema que figura en el orden del día.']

In [56]:
def concat_mask(source:str, lang:str):
    res = nlp(source) #get entities 
    merged = merge_entities(res)

    translated = []

    for ent in merged:
        ent_name = ent[0]
        try:
            q = query_by_name(ent_name, projections=lang) #try to get by name
            if len(q) == 0:
                q = query_by_alias(ent_name) #try to get by alias
                q = query_by_name(q[0][0], projections=lang)
            

            ent_translated = q[0][0]
            translated.append(f'<{lang}> {ent_translated} </{lang}>')

        except:
            pass
        
    source = source + ' ' + ' '.join(translated)

    return source    
        
    
    
concat_mask(example, 'ar')


def substitution_mask(source:str, lang:str):
    res = nlp(source) #get entities 
    merged = merge_entities(res)

    translated = []

    for ent in merged:
        print(ent)
        ent_name = ent[0]
        try:
            q = query_by_name(ent_name, projections=lang) #try to get by name
            if len(q) == 0:
                q = query_by_alias(ent_name) #try to get by alias
                q = query_by_name(q[0][0], projections=lang)
            

            ent_translated = q[0][0]
            translated.append((ent_name ,f'<{lang}> {ent_translated} </{lang}>'))

        except:
            pass
    
    for name, translation in translated:
        source = source.replace(name, f'{translation}')
    

    return source   

def dumbass_mask(source:str, lang:str):
    tokens = WordPunctTokenizer().tokenize(source)
    merged = []
    for i,_ in enumerate(tokens):
        for j in range(i+1, len(tokens)):
            merged.append(' '.join(tokens[i:j]))

    translated = []
    for ent in merged:
        ent_name = ent
        try:
            q = query_by_name(ent_name, projections=lang) #try to get by name
            if len(q) == 0:
                q = query_by_alias(ent_name) #try to get by alias
                q = query_by_name(q[0][0], projections=lang)
            

            ent_translated = q[0][0]
            translated.append((ent_name ,f'<{lang}> {ent_translated} </{lang}>'))

        except:
            pass
    
    print(translated)
    return source

def remove_tokens(src:str, lang:str):
    src = src.replace(f'<{lang}> ', '')
    src = src.replace(f' </{lang}>', '')
    return src
 
example = "What kind of artwork is The Signal-Man?"
answer = "¿Qué tipo de obra artistica es El guardavía?"
lang = 'es'
a = substitution_mask(example, lang)
print(a)
a = translate(a, lang)
print(a)
print(remove_tokens(a[0], lang))
print(translate(example, lang))
print(answer)

dumbass_mask(example, lang)

('Signal', 'ORG')
('- Man', 'MISC')
What kind of artwork is The Signal-Man?
['¿Qué tipo de obra de arte es The Signal-Man?']
¿Qué tipo de obra de arte es The Signal-Man?
['¿Qué tipo de obra de arte es The Signal-Man?']
¿Qué tipo de obra artistica es El guardavía?
[('is', '<es> Islandia </es>'), ('Man', '<es> varón </es>')]


'What kind of artwork is The Signal-Man?'

In [124]:
# Need to make jsons with the form
# Predictions Format
# {
#   "id": "Q627784_0",
#   "prediction": "Come viene ricordato e onorato Yu il Grande nella storia e cultura cinese di oggi?",
# }

import os
import pandas as pd

def load_testing_data():
    """Load in all XC Testing data"""
    path = f'../data/XC_test_data'
    data = {}

    for lang in os.listdir(path): #iterate over the files to load in dfs
        file = f'{path}/{lang}/test.jsonl'
        frame = pd.read_json(file, lines=True)
        data[lang] = frame

    return data 

In [None]:
from tqdm import tqdm
import json

def run_test(name:str, mask_fn=lambda x: x, batch_size=32):
    """Run test over the XC dataset

    Args:
        - name (str): Name for the test
        - mask_fn (fn(str) -> str): a Masking function for the source sentence [Default returns itself]
    """

    data = load_testing_data()

    try:
        os.mkdir(f'../test_results/{name}')
    except FileExistsError:
        print('Path already exists')
        
    for lang, df in data.items(): #iterate over all dataframes

        masked_sources = [] #Get masked version of sentences
        row_ds = []
        for i, row in  df.iterrows():
            masked_source = mask_fn(row['source'])
            masked_sources.append(masked_source)
            row_ds.append(row['id'])

        size = len(masked_sources) #size for determining batch count
        
        pairs = []
        for batch in tqdm(range(round(size/batch_size)), desc=f'Translating batches for {lang}'): #batch 
            start = batch*batch_size
            end = (batch+1) * batch_size if (batch+1) * batch_size < size else size

            
            sources = masked_sources[start:end]
            #print(len(sources))
            translated = translate(sources, lang)
            #print(len(translated))

            id_translation_pairs = list(zip(row_ds[start:end], translated))
            pairs.extend(id_translation_pairs)

        
        with open(f'../test_results/{name}/{lang}.jsonl', 'w') as f:
            shit = [json.dumps({'id': id, 'prediction': prediction})+"\n" for id, prediction in pairs]
            f.writelines(shit)




In [21]:
s = 'testing a substitution jutsu'

ent_translation_pairs = [('a', '<es> de </es>'), ('jutsu', '<es> Shinigami fuckface </es>')]
for name, trans_token in ent_translation_pairs:
    s = s.replace(name, f'{name} {trans_token}')

s

'testing a <es> de </es> substitution jutsu <es> Shinigami fuckface </es>'

In [None]:

tokens = WordPunctTokenizer().tokenize(example)



['What']
['What', 'kind']
['What', 'kind', 'of']
['What', 'kind', 'of', 'artwork']
['What', 'kind', 'of', 'artwork', 'is']
['What', 'kind', 'of', 'artwork', 'is', 'The']
['What', 'kind', 'of', 'artwork', 'is', 'The', 'Signal']
['What', 'kind', 'of', 'artwork', 'is', 'The', 'Signal', '-']
['What', 'kind', 'of', 'artwork', 'is', 'The', 'Signal', '-', 'Man']
['kind']
['kind', 'of']
['kind', 'of', 'artwork']
['kind', 'of', 'artwork', 'is']
['kind', 'of', 'artwork', 'is', 'The']
['kind', 'of', 'artwork', 'is', 'The', 'Signal']
['kind', 'of', 'artwork', 'is', 'The', 'Signal', '-']
['kind', 'of', 'artwork', 'is', 'The', 'Signal', '-', 'Man']
['of']
['of', 'artwork']
['of', 'artwork', 'is']
['of', 'artwork', 'is', 'The']
['of', 'artwork', 'is', 'The', 'Signal']
['of', 'artwork', 'is', 'The', 'Signal', '-']
['of', 'artwork', 'is', 'The', 'Signal', '-', 'Man']
['artwork']
['artwork', 'is']
['artwork', 'is', 'The']
['artwork', 'is', 'The', 'Signal']
['artwork', 'is', 'The', 'Signal', '-']
['artwo