# Create Different Augmented Datasets from the Chunks with Translations and Back-Translations

In [1]:
# Import libraries

import pandas as pd
import os

# Build the list of chunk files to merge (chunks 1 to 15).
# Chunks 3 and 8 are looked up as 'chunk-NN.csv'; every other chunk is looked up
# with a trailing underscore ('chunk-NN_.csv').
# NOTE(review): the previous comment said chunks 3 and 8 are left out, yet their
# un-suffixed files are merged whenever they exist - confirm which is intended.
csv_files = [
    f"chunk-{i:02d}.csv" if i in (3, 8) else f"chunk-{i:02d}_.csv"
    for i in range(1, 16)
]

# Only files present in the working directory are read. Report the ones that are
# skipped so that an incomplete merge does not go unnoticed.
found_files = [file for file in csv_files if os.path.exists(file)]
missing_files = [file for file in csv_files if file not in found_files]
if missing_files:
    print(f"Warning: {len(missing_files)} chunk file(s) not found and skipped: {missing_files}")

# Fail with a clear message instead of pandas' "No objects to concatenate"
if not found_files:
    raise FileNotFoundError(
        "None of the expected chunk files exist in the working directory: "
        + ", ".join(csv_files)
    )

# Read and concatenate all available .csv files into one DataFrame
df_list = [pd.read_csv(file) for file in found_files]
back_trans_1_15 = pd.concat(df_list, ignore_index=True)

# Save the concatenated DataFrame to a new .csv file
# This file contains all translations and back-translations, but not yet in the correct format for training
back_trans_1_15.to_csv('back-trans-1-15.csv', index=False)

print("CSV files have been concatenated and saved as 'back-trans-1-15.csv'.")


CSV files have been concatenated and saved as 'back-trans-1-15.csv'.


## Create new training sets with augmented data

trans-1-15.csv: Contains only the translations of the English comments into the six target languages (tr, ru, it, fr, pt, es)

back-1-15.csv: Contains only the back-translations of those six non-English translations into English

In [2]:
# Create the trans-1-15 and back-1-15 dataframes from the wide table back_trans_1_15.
# Every source row is expanded into one output row per language column, so each
# output has len(back_trans_1_15) * 6 rows.

# Required columns are identical to the training set
trans_columns = ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
back_columns = ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Columns with the translations in 6 languages
trans_comment_cols = ['trans_comment-tr', 'trans_comment-ru', 'trans_comment-it', 'trans_comment-fr', 'trans_comment-pt', 'trans_comment-es']
# Columns with the back-translations from 6 languages to English
back_comment_cols = ['back-trans_comment-tr', 'back-trans_comment-ru', 'back-trans_comment-it', 'back-trans_comment-fr', 'back-trans_comment-pt', 'back-trans_comment-es']


def _explode_comment_columns(wide, comment_cols, out_columns):
    """Turn a wide table with one text column per language into training-set rows.

    Parameters
    ----------
    wide : pd.DataFrame
        Table holding every column of `out_columns` except 'comment_text',
        plus every column listed in `comment_cols`.
    comment_cols : list of str
        Text columns to stack; each one becomes 'comment_text' in the output.
    out_columns : list of str
        Column names and order of the result; must include 'comment_text'.

    Returns
    -------
    pd.DataFrame
        One row per (source row, comment column) pair, ordered by source row and
        then by the position of the column in `comment_cols`, with a fresh
        0..n-1 index. All non-text columns are copied unchanged from the source row.
    """
    carried = [col for col in out_columns if col != 'comment_text']
    # Keep only the needed columns: melt() refuses a value_name that already
    # exists as a column, and the source table may carry its own 'comment_text'.
    narrow = wide[carried + list(comment_cols)].reset_index(drop=True)
    long = narrow.melt(
        id_vars=carried,
        value_vars=list(comment_cols),
        value_name='comment_text',
        ignore_index=False,
    )
    # melt() emits column after column; a stable sort on the preserved row
    # position restores the row-by-row order while keeping the language order.
    long = long.sort_index(kind='mergesort')
    return long[out_columns].reset_index(drop=True)


# Create the trans-1-15 dataframe and save it to a .csv file
trans_1_15 = _explode_comment_columns(back_trans_1_15, trans_comment_cols, trans_columns)
trans_1_15.to_csv('trans-1-15.csv', index=False)

# Create the back-1-15 dataframe and save it to a .csv file
back_1_15 = _explode_comment_columns(back_trans_1_15, back_comment_cols, back_columns)
back_1_15.to_csv('back-1-15.csv', index=False)

# Print confirmation
print("DataFrames 'trans-1-15' and 'back-1-15' have been created and saved.")


DataFrames 'trans-1-15' and 'back-1-15' have been created and saved.


In [3]:
# Report the (rows, columns) shape of both augmented datasets
print(f"Shape of trans-1-15: {trans_1_15.shape}")
print(f"Shape of back-1-15: {back_1_15.shape}")



Shape of trans-1-15: (75816, 8)
Shape of back-1-15: (75816, 8)


## Add augmented toxic data to the original training dataset

### Separate toxic and non-toxic samples of the original training data

In [4]:
# Load the original training data
df = pd.read_csv('jigsaw-toxic-comment-train.csv')

# Split on the 'toxic' label:
#   non_tox  -> rows with toxic == 0 (non-toxic comments)
#   orig_tox -> rows with toxic == 1 (toxic comments)
non_tox = df.loc[df['toxic'].eq(0)]
orig_tox = df.loc[df['toxic'].eq(1)]

# Persist both subsets as .csv files (without the index column)
non_tox.to_csv('non-tox.csv', index=False)
orig_tox.to_csv('orig-tox.csv', index=False)

# Confirm completion
print("DataFrames 'non-tox' and 'orig-tox' have been created and saved.")


DataFrames 'non-tox' and 'orig-tox' have been created and saved.


### Create dataset with all toxic data (original and translated and back-translated augmented data)

In [5]:
# Stack the original toxic rows, the translations and the back-translations
# (in that order) into one DataFrame with a fresh 0..n-1 index
all_tox = pd.concat(
    [
        orig_tox,
        trans_1_15,
        back_1_15,
    ],
    ignore_index=True,
)

# Write the combined DataFrame to disk
all_tox.to_csv('all-tox.csv', index=False)

# Confirm completion
print("DataFrame 'all-tox' has been created and saved.")


DataFrame 'all-tox' has been created and saved.


### Create training dataset with all toxic data and non-toxic data

In [6]:
# Append the non-toxic rows to the combined toxic rows
all_incl = pd.concat([all_tox, non_tox], ignore_index=True)

# Shuffle all rows with a fixed seed and renumber the index from 0
all_incl_shuffle = (
    all_incl
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Write the shuffled DataFrame to disk
all_incl_shuffle.to_csv('all_incl_aug_shuffle.csv', index=False)

# Confirm completion
print("DataFrame 'all_incl_aug_shuffle' has been created and saved.")


DataFrame 'all_incl_aug_shuffle' has been created and saved.


In [9]:
# Shape of the final augmented (shuffled) dataset
print(f"Shape of all_incl_shuffle: {all_incl_shuffle.shape}")

# Shape of the dataset that holds all toxic rows
print(f"Shape of all-tox: {all_tox.shape}")

Shape of all_incl_shuffle: (375181, 8)
Shape of all-tox: (173016, 8)
