# Create datasets with only translations or back-translations as augmented additions

In [1]:
# Import libraries
import pandas as pd
import os

## 1. Augmented dataset with only back-translations as augmentation

### Load datasets

In [2]:
# Load .csv with only back-translations
back = pd.read_csv('back-1-15.csv')
print(back.head)

<bound method NDFrame.head of                      id                                       comment_text  \
0      0002bcb3da6cb337                 FUCK OFF BEFORE YOU PISS IN MY JOB   
1      0002bcb3da6cb337                 COCKSUCKER BEFORE YOU SUCK MY WORK   
2      0002bcb3da6cb337     BLOWJOB BEFORE YOU GET PISSED OFF ABOUT MY JOB   
3      0002bcb3da6cb337          SUCK YOUR COCK BEFORE YOU PISS ON MY WORK   
4      0002bcb3da6cb337              COCKSUCKER BEFORE YOU PISS ON MY WORK   
...                 ...                                                ...   
75811  8adf8133ecb9c3b2  "\n\nLack of sources? The same Jewish newspape...   
75812  8adf8133ecb9c3b2  "\n\nLack of sources? The same Jewish newspape...   
75813  8adf8133ecb9c3b2  "\n\nLack of sources? The same Jewish newspape...   
75814  8adf8133ecb9c3b2  "\n\nLack of sources? The same Jewish newspape...   
75815  8adf8133ecb9c3b2  "\n\nShortage of sources? The same Jewish news...   

       toxic  severe_toxic  obsce

In [3]:
# Load original training data
df = pd.read_csv('jigsaw-toxic-comment-train.csv')

### Separate toxic and non-toxic samples of original training data

In [4]:

# Create 'tox' dataset where 'toxic' column is 1
tox = df[df['toxic'] == 1]

# Create 'nontox' dataset where 'toxic' column is 0
nontox = df[df['toxic'] == 0]

# Now 'tox' contains rows where 'toxic' == 1, and 'nontox' contains rows where 'toxic' == 0


### Concatenate augmented toxic data from back-translations and original toxic data

In [5]:
# Concatenate the 'tox' and 'back' DataFrames vertically (along rows)
concatenated_df = pd.concat([tox, back], ignore_index=True)


### Shuffle the non-toxic samples and truncate the dataframe at the same number, the toxic data has

Since the toxic data after augmentation with back-translation is still a less than the non-toxic data, the number of non-toxic data used for the new dataset is limited to the same length as the dataframe of the toxic data.

To ensure a randomness in the non-toxic data used, it is first shuffled.

In [9]:
# Get the length of concatenated_df
n = len(concatenated_df)

# Shuffle nontox
nontox_shuffled = nontox.sample(frac=1, random_state=42).reset_index(drop=True)

# Take the first 'n' rows
nontox_first_n = nontox_shuffled.head(n)

# Now nontox_first_n contains the first 'n' shuffled rows


### Concatenate the toxic and non-toxic

In [11]:
# Concatenate toxic and same number of samples of non-toxic
back_incl_non = pd.concat([concatenated_df, nontox_first_n], ignore_index=True)

### Shuffle the dataset to avoid having all toxic comments at the beginning

In [15]:
# Shuffle the dataset
back_incl_non = back_incl_non.sample(frac=1, random_state=42).reset_index(drop=True)

### Check result

In [16]:
print(back_incl_non.head)

<bound method NDFrame.head of                       id                                       comment_text  \
0       9391e0448ec3a85e  Mr Birdsmight, with due respect i have full ri...   
1       efd69e53c53a3127  So basically, the user Johnuniq is to follow t...   
2       b1ba84c35a13f661  I'm talking about the right comments, buddy. U...   
3       d3a86ff124309a1d  "\n\n Jimmy Wales used to make money from Porn...   
4       ec05157d086edc23  I did not ignore the hangon notice. I read you...   
...                  ...                                                ...   
194395  db304617e49b2f36  Seanad seats are the same as Dáil seats: in pr...   
194396  0f72773807de775f  Yadda Yadda Yadda. Nothing you just said is re...   
194397  6db49b1b04b6335a  I am sick of all these problems from incompete...   
194398  a0f795800ea397d1  We Bulgarians have nothing in common with Mong...   
194399  4b8465a54fa85619  If you don't see the idiotism or intentional d...   

        toxic  severe

### Save new dataset as .csv

In [27]:
back_incl_non.to_csv('back_incl_non.csv', index=False)

# 2. Augmented dataset with only translations as augmentation

## Repeat process for translations

### Load dataset with only translations

In [18]:
# Load file with only translations

tr = pd.read_csv('trans-1-15.csv')
print(tr.head)

<bound method NDFrame.head of                      id                                       comment_text  \
0      0002bcb3da6cb337                     İŞİME İŞEMEDEN ÖNCE SİKTİR GİT   
1      0002bcb3da6cb337          ХУЕСОС, ПРЕЖДЕ ЧЕМ ТЫ ОБСОСЁШЬ МОЮ РАБОТУ   
2      0002bcb3da6cb337         POMPINO PRIMA DI INCAZZARTI SUL MIO LAVORO   
3      0002bcb3da6cb337  SUÇON DE BITE AVANT QUE TU PISSES SUR MON TRAVAIL   
4      0002bcb3da6cb337  CHUPADOR DE PAU ANTES QUE VOCÊ MIJE NO MEU TRA...   
...                 ...                                                ...   
75811  8adf8133ecb9c3b2  "\n\nНехватка источников? Те же еврейские газе...   
75812  8adf8133ecb9c3b2  "\n\nCarenza di fonti? Gli stessi giornali ebr...   
75813  8adf8133ecb9c3b2  "\n\nManque de sources ? Les mêmes journaux ju...   
75814  8adf8133ecb9c3b2  "\n\nEscassez de fontes? Os mesmos jornais jud...   
75815  8adf8133ecb9c3b2  "\n\n¿Escasez de fuentes? Los mismos periódico...   

       toxic  severe_toxic  obsce

### Concatenate augmented toxic data from translations and original toxic data

In [19]:
# Concatenate the 'tox' and 'tr' DataFrames vertically (along rows)
concatenated_df = pd.concat([tox, tr], ignore_index=True)

### Shuffle the non-toxic samples and truncate the dataframe at the same number, the toxic data has

Since the toxic data after augmentation with translation is still a less than the non-toxic data, the number of non-toxic data used for the new dataset is limited to the same length as the dataframe of the toxic data.

To ensure a randomness in the non-toxic data used, it is first shuffled.

In [20]:
# Get the length of concatenated_df
n = len(concatenated_df)

# Shuffle nontox
nontox_shuffled = nontox.sample(frac=1, random_state=42).reset_index(drop=True)

# Take the first 'n' rows
nontox_first_n = nontox_shuffled.head(n)

# Now nontox_first_n contains the first 'n' shuffled rows


### Concatenate the toxic and non-toxic

In [21]:
# Concatenate toxic and same number of samples of non-toxic
tr_incl_non = pd.concat([concatenated_df, nontox_first_n], ignore_index=True)

### Shuffle the dataset to avoid having all toxic comments at the beginning

In [25]:
tr_incl_non = tr_incl_non.sample(frac=1, random_state=42).reset_index(drop=True)

### Check result

In [26]:
print(tr_incl_non.head)

<bound method NDFrame.head of                       id                                       comment_text  \
0       9391e0448ec3a85e  Mr Birdsmight, with due respect i have full ri...   
1       efd69e53c53a3127  So basically, the user Johnuniq is to follow t...   
2       b1ba84c35a13f661  Я говорю о правильных комментариях, приятель. ...   
3       d3a86ff124309a1d  "\n\n Jimmy Wales used to make money from Porn...   
4       ec05157d086edc23  I did not ignore the hangon notice. I read you...   
...                  ...                                                ...   
194395  db304617e49b2f36  Seanad seats are the same as Dáil seats: in pr...   
194396  0f72773807de775f  Yadda Yadda Yadda. Az önce söylediğin hiçbir ş...   
194397  6db49b1b04b6335a  I am sick of all these problems from incompete...   
194398  a0f795800ea397d1  Nous, les Bulgares, n'avons rien en commun ave...   
194399  4b8465a54fa85619  If you don't see the idiotism or intentional d...   

        toxic  severe

### Save new dataset as .csv

In [28]:
tr_incl_non.to_csv('tr_incl_non.csv', index=False)