### Download and display the data.

In [2]:
import pandas as pd

# I am using double space separator to parse data correctly.

df = pd.read_csv(filepath_or_buffer="D:/Py_Projects/Inno-PMLDL/data/raw/filtered.tsv", sep='	', header=0)

# Dropping unnecessary column with row indices because they are already included in pandas DataFrame.
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
...,...,...,...,...,...,...
577772,You didn't know that Estelle had stolen some f...,you didn't know that Estelle stole your fish f...,0.870322,0.030769,0.000121,0.949143
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


### As was said in the assignment task, it is better for us to find pairs, where reference text has high toxicity level, and translation - low.
### We should be careful because there are references with low toxicity and translations with high toxicities.

### Find pairs with ref_tox > trn_tox (for now). In the future, we can also try to compare toxicities difference with another small value to remove similar toxicities from the training.

In [22]:
df_train = df[df['ref_tox'] > df['trn_tox']]
df_train

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.703185,0.206522,0.950956,0.035846
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.618866,0.230769,0.999492,0.000131
7,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.720482,0.187500,0.963680,0.148710
11,"So now their spirits are cursed, walking back ...","their souls are cursed, they guard the paths, ...",0.755883,0.013245,0.842509,0.143992
13,"Come on, Cal, leave that shit alone.","come on, Cal, put it down.",0.660481,0.270270,0.999637,0.000279
...,...,...,...,...,...,...
577770,I am so crazy nuts about you guys.,I'm so crazy about you guys.,0.934512,0.171429,0.973442,0.000709
577771,"I thought American men were bad enough, but no...","an American man is worth nothing, but for you,...",0.671444,0.371212,0.999624,0.035941
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049


### Secondly, obviously, we need to perform text cleaning to reduce the data dimensionality that includes:
### lowercasing, removing punctuation, stop words, and numbers, tokenizing, and stemming both reference and translation columns.

### For this, I will use the code from the labs and slightly change it for our dataset.

In [23]:
import re

def lower_text(text: str):
    return text.lower()

def remove_numbers(text: str):
    """
    Substitute all punctuations with space in case of
    "there is5dogs".
    
    If subs with '' -> "there isdogs"
    With ' ' -> there is dogs
    """
    text_nonum = re.sub(r'\d+', ' ', text)
    return text_nonum

def remove_punctuation(text: str):
    """
    Substitute all punctiations with space in case of
    "hello!nice to meet you"
    
    If subs with '' -> "hellonice to meet you"
    With ' ' -> "hello nice to meet you"
    """
    text_nopunct = re.sub(r'[^a-z|\s]+', ' ', text)
    return text_nopunct

def remove_multiple_spaces(text: str):
    text_no_doublespace = re.sub('\s+', ' ', text).strip()
    return text_no_doublespace

### Test lowercasing, numbers, punctuation, and multiple spaces removal.

In [25]:
sample_text = df.reference[0]

_lowered = lower_text(sample_text)
_without_numbers = remove_numbers(_lowered)
_without_punct = remove_punctuation(_without_numbers)
_single_spaced = remove_multiple_spaces(_without_punct)

print("Sample text:\n", sample_text)
print('-'*10)
print("Lowercased:\n", _lowered)
print('-'*10)
print("Removed numbers:\n", _without_numbers)
print('-'*10)
print("Removed punctuation:\n", _without_punct)
print('-'*10)
print("Removed mult spaces:\n", _single_spaced)

Sample text:
 If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.
----------
Lowercased:
 if alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.
----------
Removed numbers:
 if alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.
----------
Removed punctuation:
 if alkar is flooding her with psychic waste  that explains the high level of neurotransmitters 
----------
Removed mult spaces:
 if alkar is flooding her with psychic waste that explains the high level of neurotransmitters


### Download nltk packages.

In [26]:
import nltk


nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alifa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alifa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def tokenize_text(text: str) -> list[str]:
    return word_tokenize(text)


def remove_stop_words(tokenized_text: list[str]) -> list[str]:
    stop_words = set(stopwords.words('english'))
    return [word for word in tokenized_text if word not in stop_words]


# TODO: Produces weird words, consider replacing with lemmatisation
def stem_words(tokenized_text: list[str]) -> list[str]:
    stemmer = PorterStemmer()
    return [stemmer.stem(plural) for plural in tokenized_text]

### Test tokenisation, stop words removal, and stemming.

In [28]:
_tokenized = tokenize_text(_single_spaced)
_without_sw = remove_stop_words(_tokenized)
_stemmed = stem_words(_without_sw)

print("Previous text:\n", _single_spaced)
print('-'*10)
print("Tokenised:\n", _tokenized)
print('-'*10)
print("Without stop words:\n", _without_sw)
print('-'*10)
print("Stemmed:\n", _stemmed)

Previous text:
 if alkar is flooding her with psychic waste that explains the high level of neurotransmitters
----------
Tokenised:
 ['if', 'alkar', 'is', 'flooding', 'her', 'with', 'psychic', 'waste', 'that', 'explains', 'the', 'high', 'level', 'of', 'neurotransmitters']
----------
Without stop words:
 ['alkar', 'flooding', 'psychic', 'waste', 'explains', 'high', 'level', 'neurotransmitters']
----------
Stemmed:
 ['alkar', 'flood', 'psychic', 'wast', 'explain', 'high', 'level', 'neurotransmitt']


In [29]:
def preprocessing_stage(text):
    _lowered = lower_text(text)
    _without_numbers = remove_numbers(_lowered)
    _without_punct = remove_punctuation(_without_numbers)
    _single_spaced = remove_multiple_spaces(_without_punct)
    _tokenized = tokenize_text(_single_spaced)
    _without_sw = remove_stop_words(_tokenized)
    _stemmed = stem_words(_without_sw)
    
    return _stemmed

def clean_text(df):
    df['reference'] = df['reference'].apply(preprocessing_stage)
    df['translation'] = df['translation'].apply(preprocessing_stage)
    return df

In [30]:
df_train_cleaned = clean_text(df_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reference'] = df['reference'].apply(preprocessing_stage)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['translation'] = df['translation'].apply(preprocessing_stage)


In [31]:
df_train_cleaned

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
5,"[gon, na, child, genet, disord, gon, na, die, l]","[go, breed, kid, genet, disord, make, die]",0.703185,0.206522,0.950956,0.035846
6,"[laugh, us, kick, ass]","[laugh, us, show]",0.618866,0.230769,0.999492,0.000131
7,"[main, short, black, peopl, back]","[much, black, main]",0.720482,0.187500,0.963680,0.148710
11,"[spirit, curs, walk, back, road, waterway, fin...","[soul, curs, guard, path, say, encount, unfait...",0.755883,0.013245,0.842509,0.143992
13,"[come, cal, leav, shit, alon]","[come, cal, put]",0.660481,0.270270,0.999637,0.000279
...,...,...,...,...,...,...
577770,"[crazi, nut, guy]","[crazi, guy]",0.934512,0.171429,0.973442,0.000709
577771,"[thought, american, men, bad, enough, none, ev...","[american, man, worth, noth, feel, like, helpl...",0.671444,0.371212,0.999624,0.035941
577773,"[il, suck, life]","[suck, life]",0.722897,0.058824,0.996124,0.215794
577774,"[fuckin, take, bruv]","[realli, take]",0.617511,0.212121,0.984538,0.000049
