In [None]:
# Report the NVIDIA GPU(s) visible to this runtime before anything is installed.
!nvidia-smi

In [None]:
!pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio===0.9.1 -f  https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple some-package

In [None]:
# Clone the fairseq sources quietly (-q) into ./fairseq. Later cells install the
# package from this checkout and run fairseq/scripts/spm_encode.py from it.
# NOTE(review): no tag or commit is pinned, so every run gets the current default
# branch, which may not match the torch 1.9.1 pin above — consider checking out a
# fixed revision. Re-running this cell when ./fairseq already exists makes git
# report an error instead of updating the checkout.
!git clone https://github.com/pytorch/fairseq -q

In [None]:
# Install the cloned fairseq checkout in editable (development) mode.
# The working directory is switched into ./fairseq for the install and restored
# afterwards, so every later cell runs from the notebook's original directory.
%cd ./fairseq
# NOTE(review): numpy is removed first, presumably so that the fairseq install
# below pulls in a numpy build it is compatible with — confirm this is still needed.
!pip uninstall numpy -q -y
!pip install --editable ./ -q
%cd ..

In [None]:
# Extra packages: sentencepiece backs the spm_encode.py tokenisation cells below.
# NOTE(review): tensorboardX is presumably for fairseq's optional TensorBoard
# logging; no --tensorboard-logdir is passed to fairseq-train in this notebook.
# NOTE(review): neither package is version-pinned.
!pip install sentencepiece -q
!pip install tensorboardX

In [None]:
!wget "http://dl.fbaipublicfiles.com/m2m_100/spm.128k.model"
!wget "http://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt"
!wget "http://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt"
!wget "http://dl.fbaipublicfiles.com/m2m_100/language_pairs_small_models.txt"
!wget "http://dl.fbaipublicfiles.com/m2m_100/language_pairs_small_models.txt"
!wget "https://dl.fbaipublicfiles.com/m2m_100/418M_last_checkpoint.pt"

In [None]:
import pandas as pd
import torch
import numpy as np
import os
import random
import threading 


def set_seed(seed):
    """Seed every random number generator used inside this notebook's kernel.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to its most conservative
    settings.

    Args:
        seed: Integer seed applied to all generators.

    Notes:
        * ``PYTHONHASHSEED`` is read by Python at interpreter start-up, so
          setting it here only affects processes launched afterwards; the
          already-running kernel keeps its hash seed.
        * ``cudnn.enabled = False`` turns cuDNN off entirely for this process,
          trading speed for determinism.
        * Only this kernel is seeded. Commands started with ``!`` run in their
          own process and need their own seed option.
    """
    # Python-level generators and the hash seed inherited by child processes.
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # One call per generator family is enough: manual_seed_all covers every
    # CUDA device, so the extra per-device and repeated calls were dropped.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Force deterministic kernels, disable autotuning, then disable cuDNN.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False


set_seed(7)

In [None]:
PATH_TO_DATASET = "./"  # Directory where dataset.csv is stored
SPLIT_SEED = 7  # Same value as set_seed(7); keeps the split stable when this cell is re-run

# Load the parallel corpus; the CSV must provide "korean" and "english" columns.
corpus = pd.read_csv(os.path.join(PATH_TO_DATASET, "dataset.csv"))

# A row missing either side cannot form a sentence pair. Without this, a missing
# Korean cell crashed the normalisation below and a missing English cell ended
# up as the literal target text "nan".
corpus = corpus.dropna(subset=["korean", "english"])

# Remove any possible duplicates
corpus = corpus.drop_duplicates(subset=["korean", "english"])

# Keep only the two text columns, as strings.
corpus = corpus[["korean", "english"]].astype(str)

# Normalise the source side: strip leading/trailing whitespace and lowercase.
# The English side is intentionally left as-is (original casing preserved).
corpus["korean"] = corpus["korean"].str.strip().str.lower()

corpus.columns = ["input_text", "target_text"]

# Train 95% / Validation 5% split. An explicit random_state makes the split
# independent of whatever has consumed the global NumPy RNG before this cell.
validation = corpus.sample(frac=0.05, random_state=SPLIT_SEED)
train = corpus.drop(index=validation.index)


In [None]:
# Dump the training split as two plain-text files, one sentence per line, where
# line i of the Korean file pairs with line i of the English file.
# The files are written as UTF-8 explicitly so the Korean text does not depend
# on the platform's default encoding, and `with` closes each handle even if the
# write fails.
train_txt = "\n".join(train.input_text.values.tolist())
with open("korean_txt_train.txt", "w", encoding="utf-8") as korean_file:
    korean_file.write(train_txt)

train_target_txt = "\n".join(train.target_text.values.tolist())
with open("english_txt_train.txt", "w", encoding="utf-8") as english_file:
    english_file.write(train_target_txt)

In [None]:
# Dump the validation split as two plain-text files, one sentence per line, where
# line i of the Korean file pairs with line i of the English file.
# The files are written as UTF-8 explicitly so the Korean text does not depend
# on the platform's default encoding, and `with` closes each handle even if the
# write fails.
validation_txt = "\n".join(validation.input_text.values.tolist())
with open("korean_txt_validation.txt", "w", encoding="utf-8") as korean_file:
    korean_file.write(validation_txt)

validation_target_txt = "\n".join(validation.target_text.values.tolist())
with open("english_txt_validation.txt", "w", encoding="utf-8") as english_file:
    english_file.write(validation_target_txt)

In [None]:
# Tokenise the Korean training sentences with the downloaded SentencePiece model.
# Reads korean_txt_train.txt and writes the subword pieces (--output_format=piece)
# to train.ko, the source-side file fairseq-preprocess picks up via --trainpref train.
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=korean_txt_train.txt \
        --outputs=train.ko

In [None]:
# Tokenise the English training sentences with the same SentencePiece model.
# Reads english_txt_train.txt and writes the subword pieces to train.en, the
# target-side counterpart of train.ko.
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=english_txt_train.txt \
        --outputs=train.en

In [None]:
# Tokenise the Korean validation sentences with the same SentencePiece model.
# Reads korean_txt_validation.txt and writes the subword pieces to val.ko, the
# source-side file fairseq-preprocess picks up via --validpref val.
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=korean_txt_validation.txt \
        --outputs=val.ko

In [None]:
# Tokenise the English validation sentences with the same SentencePiece model.
# Reads english_txt_validation.txt and writes the subword pieces to val.en, the
# target-side counterpart of val.ko.
!python fairseq/scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=english_txt_validation.txt \
        --outputs=val.en

In [None]:
# Binarise the tokenised ko->en data for fairseq and write the result to data_bin/.
# The train/val prefixes combined with the ko/en language codes select the
# train.ko, train.en, val.ko and val.en files produced by the cells above.
# Both sides use the downloaded model_dict.128k.txt as their dictionary, so the
# token ids line up with the pretrained checkpoint's vocabulary.
# NOTE(review): the 0 thresholds presumably mean no token is dropped for being
# rare — confirm against the fairseq-preprocess help text.
!fairseq-preprocess \
    --source-lang ko --target-lang en \
    --trainpref train \
    --validpref val \
    --thresholdsrc 0 --thresholdtgt 0 \
    --destdir data_bin \
    --srcdict model_dict.128k.txt --tgtdict model_dict.128k.txt

In [None]:
!mkdir checkpoint
!ls

In [None]:
# Set this event (checkpoint_cleanup_stop.set()) to end the periodic scan below.
checkpoint_cleanup_stop = threading.Event()


def remove_checkpoints(interval=5, directory="checkpoint", delete=False):
    """Periodically look for checkpoint files other than ``checkpoint_best.pt``.

    Each call re-schedules itself on a background timer and then scans
    ``directory`` once. Every ``*.pt`` file except ``checkpoint_best.pt`` is
    reported and, only when ``delete`` is true, removed from disk.

    Args:
        interval: Seconds between two scans.
        directory: Folder that holds the checkpoints.
        delete: When false (the default) nothing is deleted and the files are
            only listed. Deletion was already disabled in this notebook, so the
            default keeps that behaviour, but the message no longer claims a
            removal that did not happen.

    Notes:
        * The timer thread is a daemon, so it never keeps the process alive.
        * Calling ``checkpoint_cleanup_stop.set()`` stops the scans.
        * With ``delete=True`` every other ``.pt`` file is removed, including
          one the trainer may still be writing — use with care.
    """
    if checkpoint_cleanup_stop.is_set():
        return

    # Schedule the next scan first so a failure below cannot break the chain.
    timer = threading.Timer(
        interval, remove_checkpoints, args=(interval, directory, delete)
    )
    timer.daemon = True
    timer.start()

    # Nothing to do until the checkpoint directory has been created.
    if not os.path.isdir(directory):
        return

    for name in os.listdir(directory):
        if name == "checkpoint_best.pt" or not name.endswith(".pt"):
            continue
        if not delete:
            print("X Would remove " + name)
            continue
        try:
            os.remove(os.path.join(directory, name))
        except FileNotFoundError:
            # Another process already deleted the file between listing and removal.
            continue
        print("X Removed " + name)


remove_checkpoints()

In [None]:
# Fine-tune the downloaded M2M-100 418M checkpoint on the binarised ko->en data
# in data_bin/, saving checkpoints to ./checkpoint.
#   - Model shape: transformer_wmt_en_de_big with 12 encoder / 12 decoder layers,
#     pre-norm on both sides and all embeddings shared; presumably this has to
#     match the architecture stored in the checkpoint — confirm.
#   - Language tokens: a source-language token on the encoder side and a language
#     token on the decoder side, with the single pair 'ko-en'.
#   - Optimisation: Adafactor, cosine schedule, lr 3e-05, batch size 10 with
#     gradients accumulated over 2 batches (--update-freq 2).
#   - Checkpointing: per-epoch files are disabled (--no-epoch-checkpoints); an
#     update-based checkpoint is written every 5000 updates and 10 of them are kept.
#   - Stopping: 40000 updates, 10 epochs, or --patience 10 — presumably whichever
#     is reached first; confirm against the fairseq-train help text.
# NOTE(review): this command runs in its own process, so the seeds set by
# set_seed(7) in the kernel do not apply to it; no --seed is passed here.
!fairseq-train data_bin \
  --finetune-from-model  "418M_last_checkpoint.pt"\
  --save-dir checkpoint \
  --task translation_multi_simple_epoch \
  --encoder-normalize-before \
  --lang-pairs 'ko-en' \
  --batch-size 10 \
  --decoder-normalize-before \
  --encoder-langtok src \
  --decoder-langtok \
  --criterion cross_entropy \
  --optimizer adafactor \
  --lr-scheduler cosine \
  --lr 3e-05 \
  --max-update 40000 \
  --update-freq 2 \
  --save-interval 1 \
  --save-interval-updates 5000 \
  --keep-interval-updates 10 \
  --no-epoch-checkpoints \
  --log-format simple \
  --log-interval 2 \
  --patience 10 \
  --arch transformer_wmt_en_de_big \
  --encoder-layers 12 --decoder-layers 12 \
  --share-decoder-input-output-embed --share-all-embeddings \
  --ddp-backend no_c10d \
  --max-epoch 10 