# Data Preparation

In [10]:
import csv
import os

import pandas as pd
from numpy import random

from src.Normalizer import preprocess_data
from src.utils import load_config

## Tatoeba Data Cleaning and splitting

In [11]:
## Prepare the sentence pairs from https://tatoeba.org/en/downloads (downloaded on 16/07/2024).
## Read the TSV export and convert it into two plain-text files, one sentence per line.
#
# Parsing choices:
#   * header=None        - every row, including the first, is treated as data.
#   * dtype=str          - ids and sentences stay text; nothing is coerced to numbers.
#   * quoting=QUOTE_NONE - quote characters inside sentences are ordinary text, not
#                          CSV quoting that could strip them or merge several rows.
#   * keep_default_na=False - sentences such as "NA" or "None" stay as text instead of
#                          becoming missing values (which would be written as empty lines).
data = pd.read_csv(
    "./data/train/Sentence pairs in English-Afrikaans - 2024-07-16.tsv",
    sep="\t",
    header=None,
    dtype=str,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

# Columns are addressed by position: 1 is used as the English text and 3 as the
# Afrikaans text.
# NOTE(review): assumes the export layout is id / sentence / id / translation - confirm.
columns = list(data.columns)
en = columns[1]
af = columns[3]

# Write the sentences verbatim, one per line. Series.to_csv is avoided on purpose: it
# applies CSV quoting, wrapping any sentence that contains a comma or a quote character
# in extra double quotes.
for column, path in (
    (en, "./data/train/tatoeba.en.txt"),
    (af, "./data/train/tatoeba.af.txt"),
):
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{sentence}\n" for sentence in data[column])

In [12]:
## Split the parallel corpus into train and validation data
TRAIN_FRACTION = 0.7  # 70% for training, 30% for validation
SPLIT_SEED = 42  # fixed seed so that re-running the cell reproduces the same split

en_all_path = "./data/train/tatoeba.en.txt"
af_all_path = "./data/train/tatoeba.af.txt"

# Read both sides with an explicit encoding (the platform default is not always UTF-8).
# Every line is normalised to end in exactly one "\n" so that an unterminated last line
# cannot be glued to its neighbour once the order has been shuffled.
with open(en_all_path, "r", encoding="utf-8") as en_all, open(
    af_all_path, "r", encoding="utf-8"
) as af_all:
    en_lines = [line.rstrip("\n") + "\n" for line in en_all]
    af_lines = [line.rstrip("\n") + "\n" for line in af_all]

assert len(en_lines) == len(
    af_lines
), "The two files must have the same number of lines."

# Shuffle the sentence pairs together so that line i of the English file still
# corresponds to line i of the Afrikaans file afterwards.
combined_data = list(zip(en_lines, af_lines))
random.default_rng(SPLIT_SEED).shuffle(combined_data)
en_data, af_data = zip(*combined_data)

train_size = int(TRAIN_FRACTION * len(en_lines))
en_train_data, en_val_data = en_data[:train_size], en_data[train_size:]
af_train_data, af_val_data = af_data[:train_size], af_data[train_size:]

en_train_path = "./data/train/train_tatoeba.en.txt"
af_train_path = "./data/train/train_tatoeba.af.txt"
en_val_path = "./data/val/val_tatoeba.en.txt"
af_val_path = "./data/val/val_tatoeba.af.txt"

# Each output file receives its own slice. The loop variables deliberately use new
# names so that en_train_data, af_train_data, en_val_data and af_val_data still hold
# the full slices after this cell has run.
for out_path, out_lines in (
    (en_train_path, en_train_data),
    (af_train_path, af_train_data),
    (en_val_path, en_val_data),
    (af_val_path, af_val_data),
):
    # Create the target directory if it does not exist yet (e.g. ./data/val).
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(out_lines)

print("Files have been split successfully!")

Files have been split successfully!


## Augmented dataset

In [13]:
config = load_config()

# Four preprocess_data calls in total, in this order:
# train/english, train/afrikaans, validation/english, validation/afrikaans.
# preprocess_data comes from src.Normalizer; the arguments are passed positionally,
# see its definition for their exact meaning.
for raw_cfg, data_cfg, source_cfg, target_cfg in (
    (config.TRAIN_RAW, config.TRAIN_DATA, config.TRAIN_SOURCE, config.TRAIN_TARGET),
    (config.VAL_RAW, config.VAL_DATA, config.VAL_SOURCE, config.VAL_TARGET),
):
    preprocess_data(raw_cfg, data_cfg, source_cfg, "english")
    preprocess_data(raw_cfg, data_cfg, target_cfg, "afrikaans")

Done for english!
Done for afrikaans!
Done for english!
Done for afrikaans!


## SUN-only validation data

In [14]:
# Load the configuration from config_val_sun_only.json; this rebinds `config`.
config = load_config("config_val_sun_only.json")

# Two preprocess_data calls on the validation settings: the source file first,
# then the target file, each with its own label.
for file_cfg, label in (
    (config.VAL_SOURCE, "sun_english"),
    (config.VAL_TARGET, "sun_afrikaans"),
):
    preprocess_data(config.VAL_RAW, config.VAL_DATA, file_cfg, label)

Done for sun_english!
Done for sun_afrikaans!
