### Download and display the data.

In [8]:
import pandas as pd

# I am using double space separator to parse data correctly.
df = pd.read_csv(filepath_or_buffer="../data/raw/filtered.tsv", sep='	', header=0)

# Dropping unnecessary column with row indices because they are already included in pandas DataFrame.
df = df.drop('Unnamed: 0', axis=1)

# Amend the spelling mistakes... 
df =  df.rename(columns={'lenght_diff': 'length_diff'})

df

Unnamed: 0,reference,translation,similarity,length_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
...,...,...,...,...,...,...
577772,You didn't know that Estelle had stolen some f...,you didn't know that Estelle stole your fish f...,0.870322,0.030769,0.000121,0.949143
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


### As was said in the task description, it is better for us to find pairs, where reference text has high toxicity level, and translation - low.
### We should be careful because there are references with low toxicity and translations with high toxicities.

### Swap pairs with trn_tox > ref_tox.

In [9]:
from tqdm import tqdm

for i, row in tqdm(df.iterrows()):
    if row['trn_tox'] > row['ref_tox']:
        
        # Swap toxicities
        copy_ref_tox = row['ref_tox']
        df.at[i, 'ref_tox'] = row['trn_tox']
        df.at[i, 'trn_tox'] = copy_ref_tox
        
        # Swap texts
        copy_ref_text = row['reference']
        df.at[i, 'reference'] = row['translation']
        df.at[i, 'translation'] = copy_ref_text
df

577777it [00:34, 16636.72it/s]


Unnamed: 0,reference,translation,similarity,length_diff,ref_tox,trn_tox
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t...",0.785171,0.010309,0.981983,0.014195
1,you're becoming disgusting.,Now you're getting nasty.,0.749687,0.071429,0.999039,0.065473
2,"well, we can spare your life.","Well, we could spare your life, for one.",0.919051,0.268293,0.985068,0.213313
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it.",0.664333,0.309524,0.994215,0.053362
4,I have orders to kill her.,I've got orders to put her down.,0.726639,0.181818,0.999348,0.009402
...,...,...,...,...,...,...
577772,you didn't know that Estelle stole your fish f...,You didn't know that Estelle had stolen some f...,0.870322,0.030769,0.949143,0.000121
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


# Drop rows with toxicity greater than 0.4.

In [10]:
df = df.drop(df.loc[df['trn_tox'] > 0.4].index)
df

Unnamed: 0,reference,translation,similarity,length_diff,ref_tox,trn_tox
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t...",0.785171,0.010309,0.981983,0.014195
1,you're becoming disgusting.,Now you're getting nasty.,0.749687,0.071429,0.999039,0.065473
2,"well, we can spare your life.","Well, we could spare your life, for one.",0.919051,0.268293,0.985068,0.213313
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it.",0.664333,0.309524,0.994215,0.053362
4,I have orders to kill her.,I've got orders to put her down.,0.726639,0.181818,0.999348,0.009402
...,...,...,...,...,...,...
577772,you didn't know that Estelle stole your fish f...,You didn't know that Estelle had stolen some f...,0.870322,0.030769,0.949143,0.000121
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


### Observe the length difference.

### I will not preprocess data based on length difference for two reasons:
### 1) The difference is insignificant.
### 2) If a model will paraphrase the sentence, then the lenght will change too, but not drastically.

In [11]:
df.loc[df['length_diff'] > 0.3]

Unnamed: 0,reference,translation,similarity,length_diff,ref_tox,trn_tox
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it.",0.664333,0.309524,0.994215,0.053362
14,he's the tallest son of a bitch.,So he's the Top dog.,0.611092,0.363636,0.999639,0.000920
22,"Real life starts the first time you fuck, kid.","boy, real life starts up first.",0.866697,0.319149,0.998222,0.000114
26,I like that shit.,I love it.,0.697344,0.388889,0.999594,0.000043
28,How is this not porn? This is porn that comes ...,and this doesn't feel like porn?,0.697579,0.365385,0.933948,0.009414
...,...,...,...,...,...,...
577742,"It's so simple, it's stupid.",it's quite simple.,0.723609,0.344828,0.999640,0.000053
577749,What the hell were you thinking?,what was that about?,0.624227,0.363636,0.965742,0.000045
577751,Because I've followed you for eight weeks now ...,"I followed you for eight weeks, and you ordere...",0.803549,0.357143,0.987531,0.000042
577771,"I thought American men were bad enough, but no...","an American man is worth nothing, but for you,...",0.671444,0.371212,0.999624,0.035941


### Observe similarity.


### We don't have similarity lower than 0.6.


### Similarity value is important here because it will affect the BLEU metric that we have chosen.

### The more similar the sentences, the better, so I have chosen to retain only rows with similarity > 0.75.

In [12]:
df = df.loc[df['similarity'] > 0.75]
df

Unnamed: 0,reference,translation,similarity,length_diff,ref_tox,trn_tox
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t...",0.785171,0.010309,0.981983,0.014195
2,"well, we can spare your life.","Well, we could spare your life, for one.",0.919051,0.268293,0.985068,0.213313
8,"Briggs, what the hell is going on?","Briggs, what the hell's happening?",0.920373,0.000000,0.841071,0.159096
9,"another simply didn't know what to do, so when...","Another one simply had no clue what to do, so ...",0.877540,0.101695,0.930472,0.055371
10,you'd probably want me to buy you some chocola...,I suppose you want me to buy you flowers and c...,0.800661,0.160000,0.980341,0.000078
...,...,...,...,...,...,...
577768,And now nobody wanted to face this madman from...,"now, however, no one wanted to face the madman...",0.799664,0.134328,0.933597,0.042693
577769,Come here and bite me some more.,come here and bite me a little bit.,0.842079,0.083333,0.859086,0.024409
577770,I am so crazy nuts about you guys.,I'm so crazy about you guys.,0.934512,0.171429,0.973442,0.000709
577772,you didn't know that Estelle stole your fish f...,You didn't know that Estelle had stolen some f...,0.870322,0.030769,0.949143,0.000121


# Split df into train, validation, test, and convert to Hugging Face Dataset to work with pretrained model.

In [13]:
train_size = int(len(df) * 0.75)
validation_size = int(len(df) * 0.05)
test_size = int(len(df) * 0.2)

test_size

58669

In [14]:
train_df = df[:train_size]
validation_df = df[train_size:train_size + validation_size]
test_df = df[train_size + validation_size:]

test_df

Unnamed: 0,reference,translation,similarity,length_diff,ref_tox,trn_tox
461915,"I am shocked—no, Captain, sickened—to hear any...","I'm shocked - no, Captain, I'm sick - when I s...",0.751051,0.139423,0.957330,0.010467
461920,we need that monkey!,We need the monkey.,0.890185,0.047619,0.624110,0.023241
461921,"DJ, kick it.","DJ, hit it!",0.782414,0.076923,0.978920,0.000202
461923,"this morning, every fantasy of the last few da...",All my imaginings of the last few days seemed ...,0.872262,0.045455,0.810160,0.069654
461924,That's a big-ass phone.,that's an evil phone.,0.769321,0.083333,0.878251,0.003998
...,...,...,...,...,...,...
577768,And now nobody wanted to face this madman from...,"now, however, no one wanted to face the madman...",0.799664,0.134328,0.933597,0.042693
577769,Come here and bite me some more.,come here and bite me a little bit.,0.842079,0.083333,0.859086,0.024409
577770,I am so crazy nuts about you guys.,I'm so crazy about you guys.,0.934512,0.171429,0.973442,0.000709
577772,you didn't know that Estelle stole your fish f...,You didn't know that Estelle had stolen some f...,0.870322,0.030769,0.949143,0.000121


In [15]:
import datasets
from datasets import Dataset, DatasetDict

paranmt_ds = DatasetDict()

paranmt_ds['train'] = Dataset.from_pandas(train_df)
paranmt_ds['validation'] = Dataset.from_pandas(validation_df)
paranmt_ds['test'] = Dataset.from_pandas(test_df)

print(paranmt_ds)

DatasetDict({
    train: Dataset({
        features: ['reference', 'translation', 'similarity', 'length_diff', 'ref_tox', 'trn_tox', '__index_level_0__'],
        num_rows: 220011
    })
    validation: Dataset({
        features: ['reference', 'translation', 'similarity', 'length_diff', 'ref_tox', 'trn_tox', '__index_level_0__'],
        num_rows: 14667
    })
    test: Dataset({
        features: ['reference', 'translation', 'similarity', 'length_diff', 'ref_tox', 'trn_tox', '__index_level_0__'],
        num_rows: 58671
    })
})


### Crop the dataset.

In [16]:
cropped_datasets = paranmt_ds
cropped_datasets['train'] = paranmt_ds['train'].select(range(10000))
cropped_datasets['validation'] = paranmt_ds['validation'].select(range(1000))
cropped_datasets['test'] = paranmt_ds['test'].select(range(1000))
print(cropped_datasets['train'][0])

{'reference': 'if Alkar floods her with her mental waste, it would explain the high levels of neurotransmitter.', 'translation': 'If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.', 'similarity': 0.785170556584, 'length_diff': 0.0103092783505154, 'ref_tox': 0.9819834232330322, 'trn_tox': 0.0141952484846115, '__index_level_0__': 0}


### Save datasets.

In [17]:
# Save full preprocessed Dataframe.
df.to_csv("..\data\interim\para-nmt-preprocessed.csv")

# Save preprocessed and cropped Hugging Face dataset.
cropped_datasets.save_to_disk("..\data\interim\para-nmt-preprocessed-cropped")

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]