In [None]:
!pip install simpletransformers

In [None]:
import logging

import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel

# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)
# Restrict the "transformers" logger to WARNING and above.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
import pandas as pd
# Read the annotated corpus (path is relative to the working directory) into `df`.
df = pd.read_csv("annotated_corpus.csv")

In [None]:
# Label every row: 0 when the values at positions 4 through 8 are all zero,
# 1 as soon as any of them is non-zero.
hall = [
    0 if all(row[pos] == 0 for pos in range(4, 9)) else 1
    for row in df.values
]

df['hallucinate'] = hall
# Source and machine translation joined back to back, without a separator.
df['src_mt'] = df['src'].astype(str) + df['mt'].astype(str)

In [None]:
# Keep only the rows whose 'hallucinate' label is 0.
is_clean = df['hallucinate'] == 0
df_ = df[is_clean]

In [None]:
## Hallucination Induction 

In [None]:
# Half of the clean rows, drawn at random, will be perturbed; the rest stays
# untouched. The fixed random_state makes the draw repeatable after a kernel
# restart, and .copy() yields independent frames whose columns can be
# overwritten without touching df_.
df_perturbed = df_.sample(frac=0.5, random_state=4).copy()
df_nonperturbed = df_.loc[~df_.index.isin(df_perturbed.index)].copy()


In [None]:
# Misspelling perturbation: delete one randomly chosen character from one
# randomly chosen word of every perturbed source sentence.

import random

# Dedicated seeded generator so the same characters are removed on every run.
rng = random.Random(4)

misspelt = []  # perturbed sentences, in the row order of df_perturbed
word_po = []   # index of the altered word per sentence (None when nothing was altered)

for sentence in df_perturbed.src.values:
    words = sentence.split()
    if not words:
        # Empty or whitespace-only source: there is no word to corrupt and
        # randint(0, -1) would raise ValueError, so keep the text unchanged.
        word_po.append(None)
        misspelt.append(sentence)
        continue
    word_pos = rng.randint(0, len(words) - 1)  # randomly choose a word
    word_po.append(word_pos)
    w = words[word_pos]  # get the word
    # A separate name for the character index avoids clobbering the loop variable.
    char_pos = rng.randint(0, len(w) - 1)  # randomly choose the character to delete
    words[word_pos] = w[:char_pos] + w[char_pos + 1:]  # put the shortened word back in place
    misspelt.append(" ".join(words))

In [None]:
# Replace the source text of the perturbed rows with the misspelt sentences.
df_perturbed['src'] = misspelt 

In [None]:
# find top tokens in the text

from collections import Counter

# One whitespace-split token list per perturbed source sentence.
lis = [sentence.split() for sentence in df_perturbed.src.values]

def flatten(input):
    """Concatenate the items of every inner iterable into one flat list.

    Args:
        input: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding the inner items in their original order.
    """
    return [item for group in input for item in group]

l = flatten(lis)

# Token frequencies over the perturbed sources, most frequent first.
token_counts = pd.Series(l).value_counts()

# The first half of the rows gets a frequent token prepended, the remaining
# rows a rare one. Sizes are derived from the frame so the generated list
# always matches the number of rows (576 / 577 for a 1153-row frame).
n_most = len(df_perturbed) // 2
n_least = len(df_perturbed) - n_most
if len(token_counts) < max(n_most, n_least):
    raise ValueError(
        "not enough distinct tokens to prepend one to every perturbed sentence"
    )

most = token_counts.head(n_most).index.tolist()
least = token_counts.tail(n_least).index.tolist()

ins = []

# Prepend "<token> " to each sentence; .iloc makes the positional slicing explicit.
for (i, j) in zip(df_perturbed.src.iloc[:n_most].values, most):
    ins.append(j + " " + i)

for (i, j) in zip(df_perturbed.src.iloc[n_most:].values, least):
    ins.append(j + " " + i)


# The original cell referenced most1 / least1 / ins1, none of which are
# defined; the lists built in this cell are the ones to use.
df_perturbed['src'] = ins

In [None]:
# Recombine the perturbed half with the untouched half. Concatenating
# df_perturbed with itself would duplicate every row (so identical examples
# could land in both the training and the test split) and would discard the
# non-perturbed rows entirely.
df_ = pd.concat([df_perturbed, df_nonperturbed], ignore_index=True)

In [None]:
import numpy as np

# Shuffle once with a fixed seed, then cut 80% / 10% / 10% into df1 / df2 / df3
# (written out later as the train / dev / test files). Positional slicing
# replaces np.split, whose use on a DataFrame is deprecated in recent pandas.
shuffled = df_.sample(frac=1, random_state=4)
train_end = int(.8 * len(shuffled))
dev_end = int(0.9 * len(shuffled))
df1 = shuffled.iloc[:train_end]
df2 = shuffled.iloc[train_end:dev_end]
df3 = shuffled.iloc[dev_end:]

In [None]:
import torch 

# Ask PyTorch whether CUDA is available on this machine.
USE_CUDA = torch.cuda.is_available()

In [None]:
# Manual override: forces the flag to False regardless of the detection result.
# NOTE(review): no cell visible in this notebook reads USE_CUDA — confirm it is still needed.
USE_CUDA = False 

In [None]:
# Write the three splits to CSV files in the working directory.
for split_frame, csv_name in (
    (df1, "train_add.csv"),
    (df2, "dev_add.csv"),
    (df3, "test_add.csv"),
):
    split_frame.to_csv(csv_name)

In [None]:
# Columns removed from every split before the frames are handed to the model.
unused_columns = [
    'Unnamed: 0',
    'mt',
    'repetitions',
    'named-entities',
    'omission',
    'strong-unsupport',
    'full-unsupport',
    'hallucinate',
    'src_mt',
]

train_df = df1.drop(columns=unused_columns)

eval_df = df2.drop(columns=unused_columns)

test_df = df3.drop(columns=unused_columns)

In [None]:
# Rename 'src' -> 'input_text' and 'ref' -> 'target_text' in place on all
# three splits.
for split_frame in (train_df, eval_df, test_df):
    split_frame.rename(
        columns={'src': 'input_text', 'ref': 'target_text'},
        inplace=True,
    )

In [None]:
# Options handed to simpletransformers' Seq2SeqModel.
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 100,
    "train_batch_size": 16,
    "num_train_epochs": 10,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "evaluate_generated_text": True,
    "evaluate_during_training_verbose": True,
    "use_multiprocessing": False,
    "max_length": 400,
    "manual_seed": 4,
}

# Start from the pretrained "Helsinki-NLP/opus-mt-de-en" Marian checkpoint.
model = Seq2SeqModel(
    encoder_decoder_type="marian",
    encoder_decoder_name="Helsinki-NLP/opus-mt-de-en",
    args=model_args,
    # A hard-coded True aborts model construction on machines without CUDA;
    # use the GPU when one is present and fall back to CPU otherwise.
    # (torch is imported in an earlier cell.)
    use_cuda=torch.cuda.is_available(),
)

# Fine-tune on the training split.
model.train_model(train_df)

# Evaluate on the dev split.
results = model.eval_model(eval_df)

In [None]:
# Translate every test input with the fine-tuned model.
real_inp = test_df["input_text"].values.tolist()
output_preds = model.predict(real_inp)

In [None]:
import nltk
nltk.download('punkt')
# Recent NLTK releases load word_tokenize's sentence-splitting data from
# 'punkt_tab' rather than 'punkt'; fetch both so either version works.
nltk.download('punkt_tab')

In [None]:
# BLEU and METEOR scorers used by the later evaluation cells.
from nltk.translate.bleu_score import corpus_bleu

from nltk.translate.meteor_score import meteor_score, single_meteor_score
# Fetch the WordNet corpus. NOTE(review): presumably required by METEOR's synonym matching — confirm.
nltk.download('wordnet')

In [None]:
# Imported here so this cell does not depend on a later cell having been run.
from nltk.tokenize import word_tokenize

# One BLEU score per (prediction, reference) pair of the test split.
de_en_bleu = []

for prediction, reference in zip(output_preds, test_df.target_text.values.tolist()):
    hyp_tokens = word_tokenize(prediction)
    ref_tokens = word_tokenize(reference)
    # corpus_bleu(list_of_references, hypotheses) expects, per hypothesis, a
    # LIST of tokenised references. Passing [hyp_tokens], [ref_tokens] (as the
    # cell did before) makes NLTK treat each predicted word as a reference
    # made of characters and score the gold sentence against them.
    de_en_bleu.append(corpus_bleu([[ref_tokens]], [hyp_tokens]))


In [None]:
# One METEOR score (rounded to 4 decimals) per (prediction, reference) pair.
meteor = []
from nltk.tokenize import word_tokenize


for (prediction, reference) in zip(output_preds, test_df.target_text.values.tolist()):
    hyp_tokens = word_tokenize(prediction)
    ref_tokens = word_tokenize(reference)
    # single_meteor_score takes (reference, hypothesis) in that order; METEOR
    # weights precision and recall differently, so swapping them changes the
    # score. The score is computed once and no longer printed for every row.
    meteor.append(round(single_meteor_score(ref_tokens, hyp_tokens), 4))

In [None]:
import statistics
# Average of the per-sentence BLEU scores; displayed as the cell output.
statistics.mean(de_en_bleu) 

In [None]:
import statistics
# Average of the per-sentence METEOR scores; displayed as the cell output.
statistics.mean(meteor) 

In [None]:
# Store predictions, references and both per-sentence scores side by side.
# 'meteor' is the list filled by the METEOR cell; the previously referenced
# name 'meteor1' is never defined in this notebook.
pd.DataFrame(
    {'marian_mt': output_preds,
     'ground_truth':  test_df.target_text.values.tolist(),
     'bleu': de_en_bleu,
     'meteor': meteor
    }).to_csv("Marian_results_ann.csv")