# Installing Dependencies

In [None]:
! pip install sentencepiece -q
! pip install wandb
import numpy
import pandas as pd
import torch
import numpy as np
import os
import random

import wandb

# Log in to Weights & Biases up front; the fairseq-train cell further down passes
# --wandb-project, so the run is expected to report there.
wandb.login()

def set_seed(seed):
    """Seed every random number generator this notebook uses in-process.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and sets the cuDNN flags
    below.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        NOTE(review): the ``!fairseq-*`` commands run as separate processes, so
        this call presumably does not seed them; confirm whether
        ``fairseq-train --seed`` should be set as well.
    """
    # PYTHONHASHSEED is read at interpreter start-up, so exporting it here matters
    # for processes launched after this call rather than for the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # Covers every visible GPU, which includes the current device.
    torch.cuda.manual_seed_all(seed)

    # Request deterministic kernels, turn off the auto-tuner and disable cuDNN.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
set_seed(7)
from google.colab import drive
drive.mount('/content/drive')
!mkdir checkpoint

In [None]:
# Clone fairseq and install it in editable mode from the checkout.
# The steps are order-dependent: enter the checkout, remove the preinstalled numpy,
# install wandb, install fairseq, then return to the parent directory.
# NOTE(review): the clone is not pinned to a tag or commit, so re-running later may
# pull a different fairseq revision - consider pinning.
# NOTE(review): numpy was already imported by the first cell before it is
# uninstalled here; a runtime restart may be needed if a different numpy version
# ends up installed - confirm.
# NOTE(review): wandb is already installed by the first cell; this second install
# looks redundant - confirm.
!git clone https://github.com/pytorch/fairseq -q
%cd fairseq
!pip uninstall numpy -q -y
!pip install wandb -q
!pip install --editable ./ -q
%cd ..


In [None]:
import torch

# Show which GPU the runtime provides. On a CPU-only runtime get_device_name(0)
# raises, so report the situation as a message instead of a traceback.
gpu_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "No CUDA device available - select a GPU runtime before training"
)
gpu_name

In [None]:
!wget -qq "https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model"
!wget -qq "https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt"
!wget -qq "https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt"
!wget -qq "https://dl.fbaipublicfiles.com/m2m_100/language_pairs_small_models.txt"
!wget     "https://dl.fbaipublicfiles.com/m2m_100/418M_last_checkpoint.pt"


# Data Preprocessing

In [None]:
!unzip /content/DATA.zip
!unzip /content/final_split.zip

In [8]:
# Quick look at the raw training CSV (column names, dtypes, non-null counts).
training_csv = "/content/final_training_set.csv"
df = pd.read_csv(training_csv)
df.info()

In [None]:
# Prepare the training and validation sets
PATH_TO_DATASET = "/content/"  # Where you stored the dataset


def load_parallel_corpus(csv_name, dataset_dir=PATH_TO_DATASET):
    """Load a Swahili/Arabic CSV and return it cleaned for export.

    The CSV must hold exactly two columns, Swahili first and Arabic second; the
    header found in the file is replaced by those two names.

    Args:
        csv_name: File name of the CSV inside ``dataset_dir``.
        dataset_dir: Directory that contains the CSV.

    Returns:
        DataFrame with the columns ``input_text`` (Swahili) and ``target_text``
        (Arabic): rows with a missing side dropped, exact duplicates removed,
        and both sides stripped and lowercased.
    """
    pairs = pd.read_csv(os.path.join(dataset_dir, csv_name))
    pairs.columns = ["Swahili", "Arabic"]

    # A row with a missing side is not a usable translation pair (and would make
    # the string clean-up below fail).
    pairs = pairs.dropna(subset=["Swahili", "Arabic"])

    # Remove any possible duplicates
    pairs = pairs.drop_duplicates(subset=["Arabic", "Swahili"])

    # Same clean-up on both sides, vectorised through the .str accessor.
    cleaned = {}
    for language in ("Swahili", "Arabic"):
        cleaned[language] = (
            pairs[language]
            .astype(str)
            # The corpus is exported one sentence per line, so a line break inside
            # a sentence would shift every following pair out of alignment.
            .str.replace(r"[\r\n]+", " ", regex=True)
            # Lowercase and remove leading/trailing spaces
            .str.strip()
            .str.lower()
        )
    pairs = pairs.assign(**cleaned)

    return pairs[["Swahili", "Arabic"]].rename(
        columns={"Swahili": "input_text", "Arabic": "target_text"}
    )


train = load_parallel_corpus("final_training_set.csv")

# The held-out split comes from final_test_set.csv and is used as validation data
validation = load_parallel_corpus("final_test_set.csv")

train.info()
validation.info()

In [17]:
def write_text_file(path, text):
    """Write ``text`` to ``path`` as UTF-8, replacing any existing file.

    The encoding is explicit because the corpus contains Arabic text and the
    platform default encoding is not guaranteed to be UTF-8; the ``with`` block
    closes the file even if the write fails.
    """
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(text)


# One sentence per line; line i of the Swahili file pairs with line i of the
# Arabic file.

# For the training set
train_txt = "\n".join(train.input_text.values.tolist())
train_target_txt = "\n".join(train.target_text.values.tolist())
# A line break inside any sentence would silently misalign the two files.
assert train_txt.count("\n") == train_target_txt.count("\n"), \
    "training source/target line counts differ"
write_text_file("Swahili_txt_train.txt", train_txt)
write_text_file("Arabic_txt_train.txt", train_target_txt)

# For the validation set
validation_txt = "\n".join(validation.input_text.values.tolist())
validation_target_txt = "\n".join(validation.target_text.values.tolist())
assert validation_txt.count("\n") == validation_target_txt.count("\n"), \
    "validation source/target line counts differ"
write_text_file("Swahili_txt_validation.txt", validation_txt)
write_text_file("Arabic_txt_validation.txt", validation_target_txt)

In [None]:
#BPE Tokenization
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=Arabic_txt_train.txt \
        --outputs=train.ar

!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=Swahili_txt_train.txt \
        --outputs=train.sw

!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=Arabic_txt_validation.txt \
        --outputs=val.ar
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=Swahili_txt_validation.txt \
        --outputs=val.sw

In [None]:
import os


def add_to_pythonpath(path):
    """Append ``path`` to the PYTHONPATH environment variable exactly once.

    Works when PYTHONPATH is unset or empty (a plain ``+=`` raises KeyError in
    the first case and leaves a leading separator in the second) and does
    nothing when ``path`` is already listed, so re-running the cell does not
    keep growing the variable.

    Args:
        path: Directory to add.
    """
    current = os.environ.get("PYTHONPATH", "")
    if path in current.split(os.pathsep):
        return
    os.environ["PYTHONPATH"] = f"{current}{os.pathsep}{path}" if current else path


# Value before the change
print(os.environ.get("PYTHONPATH", ""))

# Make the fairseq checkout importable for the commands launched from later cells
add_to_pythonpath("/content/fairseq/")

# Value after the change
print(os.environ["PYTHONPATH"])

In [None]:
# Next: prepare the data to be fed to the transformer

In [None]:
# Binarize the sentencepiece-encoded files for the sw -> ar direction into data_bin:
# --trainpref train picks up train.sw / train.ar, --validpref val picks up
# val.sw / val.ar. Both sides use model_dict.128k.txt as a fixed dictionary and
# both frequency thresholds are set to 0.
!fairseq-preprocess \
    --source-lang sw --target-lang ar \
    --trainpref train \
    --validpref val \
    --thresholdsrc 0 --thresholdtgt 0 \
    --destdir data_bin \
    --srcdict model_dict.128k.txt --tgtdict model_dict.128k.txt

# Training
The M2M-100 model was fine-tuned starting from the downloaded 418M checkpoint: `418M_last_checkpoint.pt`.

In [None]:
# Fine-tune on data_bin for the single pair sw-ar, starting from
# /content/418M_last_checkpoint.pt (--finetune-from-model).
# Checkpoints go to the Google Drive folder given by --save-dir, so Drive must be
# mounted first. --no-epoch-checkpoints is set, with update-based saves every
# 5000 updates (--save-interval-updates) and --keep-interval-updates 10.
# Two stop limits are given, --max-update 40000 and --max-epoch 15, together with
# --patience 10.
# Metrics are logged every 2 updates and sent to the W&B project named by
# --wandb-project.
# NOTE(review): `no_c10d` is presumably a legacy name for the DDP backend in recent
# fairseq versions - confirm it is still accepted by the cloned revision.
!fairseq-train data_bin \
  --finetune-from-model  "/content/418M_last_checkpoint.pt"\
  --save-dir '/content/drive/MyDrive/M2M_FINAL/SWTOAR' \
  --task translation_multi_simple_epoch \
  --encoder-normalize-before \
  --lang-pairs 'sw-ar' \
  --batch-size 16 \
  --decoder-normalize-before \
  --encoder-langtok src \
  --decoder-langtok \
  --criterion cross_entropy \
  --optimizer adafactor \
  --lr-scheduler cosine \
  --lr 3e-05 \
  --max-update 40000 \
  --update-freq 2 \
  --save-interval 1 \
  --save-interval-updates 5000 \
  --keep-interval-updates 10 \
  --no-epoch-checkpoints \
  --log-format simple \
  --log-interval 2 \
  --patience 10 \
  --arch transformer_wmt_en_de_big \
  --encoder-layers 12 --decoder-layers 12 \
  --share-decoder-input-output-embed --share-all-embeddings \
  --ddp-backend no_c10d \
  --max-epoch 15 \
  --wandb-project "Swahili TO ARABIC M2M"

# Evaluation & Calculating BLEU Score

In [21]:
# !rm -rf data_bin

In [22]:
# Binarize the val.sw / val.ar files again, this time as the `test` subset
# (--testpref val), into the same data_bin directory used for training.
# NOTE(review): this cell uses data_dict.128k.txt while the training preprocess
# cell uses model_dict.128k.txt, and both write into data_bin - confirm that the
# dictionary switch is intended.
!fairseq-preprocess \
    --source-lang sw --target-lang ar \
    --testpref val \
    --thresholdsrc 0 --thresholdtgt 0 \
    --destdir data_bin \
    --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt

2023-12-07 23:12:06.724866: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-07 23:12:06.724928: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-07 23:12:06.724966: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-07 23:12:06.735795: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
INFO:fairseq.tasks.text_to_speech:Please i

In [None]:
#for calculating BLEU score with BPE
!fairseq-generate "data_bin/"  --batch-size 32 \
      --path "/content/checkpoint/checkpoint_last.pt" \
      --fixed-dictionary model_dict.128k.txt \
      -s sw -t ar \
      --beam 5 \
      --task translation_multi_simple_epoch \
      --lang-pairs language_pairs_small_models.txt \
      --decoder-langtok \
      --encoder-langtok src \
      --gen-subset test > outb.txt
#for calculating BLEU score without BPE
# !fairseq-generate "data_bin/"  --batch-size 32 \
#       --path "/content/checkpoint/checkpoint_last.pt" \
#       --fixed-dictionary model_dict.128k.txt \
#       -s sw -t ar \
#       --beam 5 \
#       --task translation_multi_simple_epoch \
#       --lang-pairs language_pairs_small_models.txt \
#       --remove-bpe 'sentencepiece' \
#       --decoder-langtok \
#       --encoder-langtok src \
#       --gen-subset test > outb.txt


In [None]:
# Without BPE (generated with --remove-bpe 'sentencepiece')
# Generate test with beam=5: BLEU4 = 12.55, 36.8/17.2/9.0/5.0 (BP=0.965, ratio=0.966, syslen=49535, reflen=51302)

# With BPE (no --remove-bpe, i.e. scored on sentencepiece pieces)
# Generate test with beam=5: BLEU4 = 26.28, 51.5/31.0/21.5/15.1 (BP=0.979, ratio=0.979, syslen=110406, reflen=112742)