## INITIAL DATA ANALYSIS 

In [10]:
import pandas as pd

In [11]:
#read data
df = pd.read_csv('../data/raw_filtered.tsv', sep='\t')

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 577777 entries, 0 to 577776
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   577777 non-null  int64  
 1   reference    577777 non-null  object 
 2   translation  577777 non-null  object 
 3   similarity   577777 non-null  float64
 4   lenght_diff  577777 non-null  float64
 5   ref_tox      577777 non-null  float64
 6   trn_tox      577777 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 30.9+ MB


## [DATA PREPROCESSING]()

#### [TEXT CLEANING AND NORMALIZATION]()

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SunagatullinAyaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SunagatullinAyaz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SunagatullinAyaz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
df['reference'] = df['reference'].apply(preprocess_text)
df['translation'] = df['translation'].apply(preprocess_text)

In [16]:
# save as "cleaned" file
df.to_csv('../data/inter_cleaned.tsv', sep='\t', index=False)

In [17]:
cleaned = pd.read_csv('../data/inter_cleaned.tsv', sep='\t')

In [18]:
cleaned.drop("Unnamed: 0", axis=1, inplace=True)

In [19]:
cleaned.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,alkar flood psychic waste explain high level n...,alkar flood mental waste would explain high le...,0.785171,0.010309,0.014195,0.981983
1,get nasty,become disgust,0.749687,0.071429,0.065473,0.999039
2,well could spare life one,well spare life,0.919051,0.268293,0.213313,0.985068
3,ah monkey get snap,monkey wake,0.664333,0.309524,0.053362,0.994215
4,get order put,order kill,0.726639,0.181818,0.009402,0.999348


#### [Handling NONE]()

In [20]:
# delete rows with NONE
cleaned = cleaned[cleaned['reference'].notna()]
cleaned = cleaned[cleaned['translation'].notna()]

In [21]:
# show length of reference and translation
cleaned['ref_len'] = cleaned['reference'].apply(lambda x: len(x.split()))
cleaned['trn_len'] = cleaned['translation'].apply(lambda x: len(x.split()))


cleaned.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,ref_len,trn_len
0,alkar flood psychic waste explain high level n...,alkar flood mental waste would explain high le...,0.785171,0.010309,0.014195,0.981983,8,9
1,get nasty,become disgust,0.749687,0.071429,0.065473,0.999039,2,2
2,well could spare life one,well spare life,0.919051,0.268293,0.213313,0.985068,5,3
3,ah monkey get snap,monkey wake,0.664333,0.309524,0.053362,0.994215,4,2
4,get order put,order kill,0.726639,0.181818,0.009402,0.999348,3,2


#### [HANDLING NUMERICAL VALUES]()

In [22]:
# replace numerical values with token "num"
cleaned['reference'] = cleaned['reference'].str.replace('\d+', 'num')
cleaned['translation'] = cleaned['translation'].str.replace('\d+', 'num')

In [23]:
#cleaned = cleaned.drop(['ref_len', 'trn_len'], axis=1)

In [24]:
cleaned.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,ref_len,trn_len
0,alkar flood psychic waste explain high level n...,alkar flood mental waste would explain high le...,0.785171,0.010309,0.014195,0.981983,8,9
1,get nasty,become disgust,0.749687,0.071429,0.065473,0.999039,2,2
2,well could spare life one,well spare life,0.919051,0.268293,0.213313,0.985068,5,3
3,ah monkey get snap,monkey wake,0.664333,0.309524,0.053362,0.994215,4,2
4,get order put,order kill,0.726639,0.181818,0.009402,0.999348,3,2


In [25]:
# if translation toxicity is higher than reference toxicity, then swap them
for i in range(len(cleaned)):
    if cleaned.iloc[i]['ref_tox'] < cleaned.iloc[i]['trn_tox']:
        cleaned.iloc[i]['reference'], cleaned.iloc[i]['translation'] = cleaned.iloc[i]['translation'], cleaned.iloc[i]['reference']
        cleaned.iloc[i]['ref_tox'], cleaned.iloc[i]['trn_tox'] = cleaned.iloc[i]['trn_tox'], cleaned.iloc[i]['ref_tox']
        
cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned.iloc[i]['reference'], cleaned.iloc[i]['translation'] = cleaned.iloc[i]['translation'], cleaned.iloc[i]['reference']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned.iloc[i]['ref_tox'], cleaned.iloc[i]['trn_tox'] = cleaned.iloc[i]['trn_tox'], cleaned.iloc[i]['ref_tox']


Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,ref_len,trn_len
0,alkar flood psychic waste explain high level n...,alkar flood mental waste would explain high le...,0.785171,0.010309,0.014195,0.981983,8,9
1,get nasty,become disgust,0.749687,0.071429,0.065473,0.999039,2,2
2,well could spare life one,well spare life,0.919051,0.268293,0.213313,0.985068,5,3
3,ah monkey get snap,monkey wake,0.664333,0.309524,0.053362,0.994215,4,2
4,get order put,order kill,0.726639,0.181818,0.009402,0.999348,3,2


In [27]:
# save cleaned data as "cleaned_new.tsv"
cleaned.to_csv('../data/final_cleaned.tsv', sep='\t', index=False)