In [3]:
import os

# Translation direction: one source language paired with each target language.
source_language = "en"
target_languages = ["zu", "sw", "ki", "kam", "ig"]

# Preprocessing switches.
lc = False   # When True, the corpora are lowercased before being written out.
seed = 42    # Random seed used when shuffling the data.

# Give a unique name to your folder, so that you don't overwrite any models
# you've already submitted.
tag = "baseline"


In [None]:
# Install opus-tools
! pip install opustools-pkg

In [None]:
# TODO: look into the working-directory handling here

In [None]:
def download_corpus(source_language, target_language):
    """Download the JW300 corpus for one language pair using ``opus_read``.

    The two language codes and the notebook-level ``tag`` are exported as the
    environment variables ``src``, ``tgt`` and ``tag`` so the shell commands
    can refer to them. All files are produced in the current working
    directory. Exit codes of the shell commands are not checked.

    Args:
        source_language: Source language code (exported as ``$src``).
        target_language: Target language code (exported as ``$tgt``).
    """
    os.environ.update({"src": source_language, "tgt": target_language, "tag": tag})

    shell_steps = (
        # Ask opus_read for Moses-format output in jw300.$src / jw300.$tgt.
        "opus_read -d JW300 -s $src -t $tgt -wm moses -w jw300.$src jw300.$tgt -q",
        # Extract the downloaded corpus archive.
        "gunzip JW300_latest_xml_$src-$tgt.xml.gz",
    )
    for step in shell_steps:
        os.system(step)

In [None]:
def download_test_set(source_language, target_language):
    """Download the English/target test files for one target language.

    Fetches ``test.en-<target>.en`` and ``test.en-<target>.<target>`` from the
    masakhane repository and renames them to ``test.en`` and
    ``test.<target>`` in the current working directory. The language codes
    are exported as the environment variables ``src`` and ``trg``; the URLs
    themselves always use ``en`` as the source side.

    Args:
        source_language: Source language code (exported as ``$src``).
        target_language: Target language code (exported as ``$trg``).
    """
    os.environ["src"] = source_language
    os.environ["trg"] = target_language

    base_url = "https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/"
    renames = (
        ("test.en-$trg.en", "test.en"),
        ("test.en-$trg.$trg", "test.$trg"),
    )
    for fetched_name, final_name in renames:
        os.system("wget " + base_url + fetched_name)
        os.system("mv " + fetched_name + " " + final_name)

In [4]:
#make directories for all the 5 langs, where all the downloaded data will be placed
#enzu,ensw,enki,enkam,enig

for target_language in target_languages:
    data_path = source_language + target_language
    # exist_ok lets the cell be re-run without failing on existing folders.
    os.makedirs(data_path, exist_ok=True)

    # os.system("cd ...") only changes directory inside a short-lived
    # subshell, so it has no effect on later commands. os.chdir changes the
    # directory of this process, which the download commands then inherit.
    previous_dir = os.getcwd()
    os.chdir(data_path)
    try:
        download_corpus(source_language, target_language)
        download_test_set(source_language, target_language)
    finally:
        # Always return, even if a download step raises, so the next
        # language does not end up nested inside this one.
        os.chdir(previous_dir)

In [None]:
# Download the global test set.
! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en

In [None]:
#check your directory before
def filter_test_data(filter_test_sents="test.en-any.en"):
    """Load the global English test sentences used to filter train/dev data.

    Each line of the file is stripped and stored in a set for quick
    membership checks. The set is published as the notebook-level name
    ``en_test_sents`` (which ``create_data_frame`` and ``filter`` look up)
    and is also returned. Call this once, from the directory that holds the
    file, before running the preprocessing.

    Args:
        filter_test_sents: Path of the test-sentence file, one sentence per
            line. Defaults to ``"test.en-any.en"`` in the current directory.

    Returns:
        The set of stripped test sentences.
    """
    global en_test_sents

    with open(filter_test_sents, encoding="utf-8") as f:
        stripped_lines = [line.strip() for line in f]

    en_test_sents = set(stripped_lines)
    # Report the number of lines read (duplicates included), as a sanity check.
    print('Loaded {} global test sentences to filter from the training/dev data.'.format(len(stripped_lines)))
    return en_test_sents

In [5]:
import pandas as pd

def create_data_frame(source_file, target_file, test_sents=None):
    """Read a parallel corpus into a DataFrame, dropping test-set sentences.

    A line pair is skipped when the stripped source line is contained in
    ``test_sents``; the target line at the same position is skipped with it.

    Args:
        source_file: Path to the source-language file, one sentence per line.
        target_file: Path to the target-language file, line-aligned with
            ``source_file``.
        test_sents: Collection of source sentences to exclude. When omitted,
            the notebook-level ``en_test_sents`` is used.

    Returns:
        A DataFrame with the columns ``source_sentence`` and
        ``target_sentence``.
    """
    if test_sents is None:
        test_sents = en_test_sents

    source = []
    # A set keeps the per-line lookup below constant-time; with a list the
    # target pass was quadratic in the number of skipped lines.
    skip_lines = set()
    total_lines = 0
    with open(source_file, encoding="utf-8") as f:
        for i, line in enumerate(f):
            total_lines += 1
            sentence = line.strip()
            # Skip sentences that are contained in the test set.
            if sentence in test_sents:
                skip_lines.add(i)
            else:
                source.append(sentence)

    with open(target_file, encoding="utf-8") as f:
        # Only keep target lines whose corresponding source was not skipped.
        target = [line.strip() for j, line in enumerate(f) if j not in skip_lines]

    # total_lines is a real count, so this also works for an empty file.
    print('Loaded data and skipped {}/{} lines since contained in test set.'.format(len(skip_lines), total_lines))

    # list(...) because some pandas versions reject a bare zip iterator.
    return pd.DataFrame(list(zip(source, target)), columns=['source_sentence', 'target_sentence'])

In [6]:
! pip install fuzzywuzzy
! pip install python-Levenshtein

import time
from fuzzywuzzy import process
import numpy as np
from os import cpu_count
from functools import partial
from multiprocessing import Pool

# Filtering function. Adjust pad to narrow down the candidate matches to
# within a certain length of characters of the given sample.
def fuzzfilter(sample, candidates, pad):
  """Return the best fuzzy-match score of `sample` among similar-length candidates.

  Only candidates whose length is within `pad` characters of the length of
  `sample` are compared. The result is element [1] of what
  `process.extractOne` returns for the remaining candidates, or NaN when no
  candidate passes the length check.
  """
  shortest, longest = len(sample) - pad, len(sample) + pad
  shortlist = [cand for cand in candidates if shortest <= len(cand) <= longest]
  if not shortlist:
    return np.nan
  return process.extractOne(sample, shortlist)[1]


def filter(df):
    """Deduplicate, shuffle and remove near-duplicates of the test set.

    Note: this function shadows Python's built-in ``filter`` in the notebook.

    Steps:
      1. Drop exact duplicate rows, then rows that repeat a source sentence
         or a target sentence (conflicting translations).
      2. Shuffle with the notebook-level ``seed``.
      3. Score every source sentence against the notebook-level
         ``en_test_sents`` with ``fuzzfilter`` (in parallel) and keep only
         rows whose score is below 95.

    Args:
        df: DataFrame with ``source_sentence`` and ``target_sentence`` columns.

    Returns:
        The filtered DataFrame, with an extra ``index`` column (position
        after shuffling) and a ``scores`` column.
    """
    # Chained, non-inplace calls avoid mutating a frame derived from `df`.
    # Dropping conflicting translations is optional and something you might
    # want to remove depending on the size of your corpus.
    df_pp = (
        df.drop_duplicates()
        .drop_duplicates(subset='source_sentence')
        .drop_duplicates(subset='target_sentence')
    )

    # Shuffle the data to remove bias in dev set selection.
    df_pp = df_pp.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Keep the shuffled position as an explicit 'index' column.
    df_pp = df_pp.reset_index(drop=False)

    start_time = time.time()
    # Iterating over DataFrame rows is slow, so the scoring is spread over
    # worker processes. cpu_count() can be None or 1; Pool needs at least
    # one worker, so clamp the size.
    n_workers = max(1, (cpu_count() or 1) - 1)
    with Pool(n_workers) as pool:
        scores = pool.map(partial(fuzzfilter, candidates=list(en_test_sents), pad=5), df_pp['source_sentence'])
    hours, rem = divmod(time.time() - start_time, 3600)
    minutes, seconds = divmod(rem, 60)
    print("done in {}h:{}min:{}seconds".format(hours, minutes, seconds))

    # Filter out "almost overlapping samples".
    df_pp = df_pp.assign(scores=scores)
    # NOTE(review): rows whose score is NaN (fuzzfilter found no test
    # sentence of similar length) also fail this comparison and are dropped.
    # Confirm that this is intended.
    df_pp = df_pp[df_pp['scores'] < 95]

    return df_pp
    



In [None]:
# This section does the split between train/dev for the parallel corpora then saves them as separate files
# We use 1000 dev test and the given test set.
import csv
def save_files(df_pp, source_language, target_language, num_dev_patterns=1000, lowercase=None):
    """Split the corpus into train/dev and write them as parallel text files.

    The last ``num_dev_patterns`` rows become the dev set and the remaining
    rows the training set. Four files are written to the current working
    directory: ``train.<source>``, ``train.<target>``, ``dev.<source>`` and
    ``dev.<target>``, one sentence per line. The test sets are already
    generated elsewhere and are not touched here.

    Args:
        df_pp: DataFrame with ``source_sentence`` and ``target_sentence``
            columns. It is not modified.
        source_language: Source language code, used as file extension.
        target_language: Target language code, used as file extension.
        num_dev_patterns: Number of trailing rows used as the dev set.
        lowercase: Whether to lowercase both sides. When omitted, the
            notebook-level ``lc`` setting is used.
    """
    if lowercase is None:
        lowercase = lc

    # Optional: lowercasing makes it easier to generalize, but loses proper
    # casing. assign() builds a new frame so the caller's data stays intact.
    if lowercase:
        df_pp = df_pp.assign(
            source_sentence=df_pp["source_sentence"].str.lower(),
            target_sentence=df_pp["target_sentence"].str.lower(),
        )

    dev = df_pp.tail(num_dev_patterns)
    stripped = df_pp.drop(dev.index)

    for prefix, frame in (("train.", stripped), ("dev.", dev)):
        with open(prefix + source_language, "w", encoding="utf-8") as src_file, \
                open(prefix + target_language, "w", encoding="utf-8") as trg_file:
            src_file.writelines(sentence + "\n" for sentence in frame["source_sentence"])
            trg_file.writelines(sentence + "\n" for sentence in frame["target_sentence"])

    # Doublecheck the format below. There should be no extra quotation marks or weird characters.
    os.system("head train.*")
    os.system("head dev.*")

In [None]:
from os import path
def tokenize(source_language, target_language):
    """Learn a joint BPE model on the training files and apply it to all splits.

    The language codes are exported as the environment variables ``src`` and
    ``tgt`` and the work is done by ``subword-nmt`` through IPython shell
    escapes, so this function only runs inside IPython/Jupyter. It expects
    ``train.*``, ``dev.*`` and ``test.*`` for both languages in the current
    working directory and writes ``bpe.codes.4000``, ``vocab.<lang>`` and
    the ``*.bpe.<lang>`` files there.

    Args:
        source_language: Source language code (exported as ``$src``).
        target_language: Target language code (exported as ``$tgt``).
    """
    os.environ["src"] = source_language # Sets them in bash as well, since we often use bash scripts
    os.environ["tgt"] = target_language

    # Learn BPEs on the training data.
    #os.environ["data_path"] = path.join("..","joeynmt", "data", source_language + target_language) # Herman!

    # Show the working directory and its contents, to check that the
    # expected input files are present.
    !pwd;ls

    # Learn the joint BPE codes (passed -s 4000) and per-language vocabularies.
    ! subword-nmt learn-joint-bpe-and-vocab --input train.$src train.$tgt -s 4000 -o bpe.codes.4000 --write-vocabulary vocab.$src vocab.$tgt

    # Apply BPE splits to the training data.
    ! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < train.$src > train.bpe.$src
    ! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < train.$tgt > train.bpe.$tgt

    # Apply BPE splits to the development and test data.
    ! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < dev.$src > dev.bpe.$src
    ! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < dev.$tgt > dev.bpe.$tgt
    ! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < test.$src > test.bpe.$src
    ! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < test.$tgt > test.bpe.$tgt
    
    
    
    
# Create directory, move everyone we care about to the correct location
# ! mkdir -p $data_path
# ! cp train.* $data_path
# ! cp test.* $data_path
# ! cp dev.* $data_path
# ! cp bpe.codes.4000 $data_path
# ! ls $data_path

In [None]:
###### figure out how to call this properly

#build vocab and print sample sents
# ! sudo chmod 777 ../joeynmtmaml/scripts/build_vocab.py
# ! ../joeynmtmaml/scripts/build_vocab.py ../joeynmt/data/$src$tgt/train.bpe.$src ../joeynmt/data/$src$tgt/train.bpe.$tgt --output_path ../joeynmt/data/$src$tgt/vocab.txt

# # Some output
# ! echo "BPE Xhosa Sentences"
# ! tail -n 5 test.bpe.$tgt
# ! echo "Combined BPE Vocab"
# ! tail -n 10 ../joeynmt/data/$src$tgt/vocab.txt



### JoeyNMTMaml installation

In [None]:
!git clone https://github.com/Freshia/joeynmtmaml.git
!cd joeynmtmaml; pip3 install .

#### Bringing it all together!!!

In [None]:
#cd to lang specific directory??
def pre_process_data(source_language, target_language):
    """Run the whole preprocessing pipeline for one language pair.

    Reads ``jw300.<source>`` and ``jw300.<target>`` from the current working
    directory, builds the parallel DataFrame, filters it, writes the
    train/dev files and finally applies BPE tokenization.

    Args:
        source_language: Source language code.
        target_language: Target language code.
    """
    source_file, target_file = (
        'jw300.' + code for code in (source_language, target_language)
    )
    for corpus_path in (source_file, target_file):
        print(corpus_path)

    raw_corpus = create_data_frame(source_file, target_file)
    filtered_corpus = filter(raw_corpus)
    save_files(filtered_corpus, source_language, target_language)

    tokenize(source_language, target_language)
    
    #make vocab file
    


In [None]:
#process data for all languages, and store in their respective directories
for target_language in target_languages:
    data_path = source_language + target_language

    # os.system("cd ...") only changes directory inside a short-lived
    # subshell and has no effect here. os.chdir moves this process into the
    # language folder, so the files are read and written there.
    previous_dir = os.getcwd()
    os.chdir(data_path)
    try:
        pre_process_data(source_language, target_language)
    finally:
        # Return even on failure so the next language starts from the
        # original directory.
        os.chdir(previous_dir)

### Torch and Learn2Learn Installations

In [None]:
pip install torch==1.8.0 torchvision==0.9.0 

In [None]:
!pip3 install learn2learn

### Creating the JoeyNMT Config

In [None]:
# This creates the config file for our JoeyNMT system. It might seem overwhelming so we've provided a couple of useful parameters you'll need to update
# (You can of course play with all the parameters if you'd like!)

name = 'langexp'
#gdrive_path = os.environ["gdrive_path"]

# Create the config.
# The template uses {name} as a placeholder. It is filled in with
# str.replace rather than str.format because the template also contains a
# {gdrive_path} placeholder (in a commented-out YAML line) for which no
# value is defined in this notebook; str.format would raise KeyError on it.
config = """
name: "{name}_transformer"

data:
    src: ["swen","kien","kamen","igen","zuen"]
    trg: ["sw","ki","kam","ig","zu"]
    train: "data/langdata/train.bpe"
    dev:   "data/langdata/dev.bpe"
    test:  "data/langdata/test.bpe"
    level: "bpe"
    lowercase: False
    max_sent_length: 100
    #src_vocab: "data/{name}/vocab.txt"
    #trg_vocab: "data/{name}/vocab.txt"

testing:
    beam_size: 5
    alpha: 1.0

training:
    #load_model: "{gdrive_path}/models/{name}_transformer/1.ckpt" # if uncommented, load a pre-trained model from this checkpoint
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999] 
    scheduling: "plateau"           # TODO: try switching from plateau to Noam scheduling
    patience: 5                     # For plateau: decrease learning rate by decrease_factor if validation score has not improved for this many validation rounds.
    learning_rate_factor: 0.5       # factor for Noam scheduler (used with Transformer)
    learning_rate_warmup: 1000      # warmup steps for Noam scheduler (used with Transformer)
    decrease_factor: 0.7
    loss: "crossentropy"
    learning_rate: 0.0002
    learning_rate_min: 0.00000001
    maml_lr: 0.001
    weight_decay: 0.0
    label_smoothing: 0.1
    batch_size: 4096
    batch_type: "token"
    eval_batch_size: 3600
    valid_batch_size: 3600       #for validation per task
    eval_batch_type: "token"
    batch_multiplier: 1
    early_stopping_metric: "ppl"
    epochs: 30                     # TODO: Decrease for when playing around and checking of working. Around 30 is sufficient to check if its working at all
    iterations: 50
    adaptation_steps: 5000
    validation_freq: 5
    valid_config: "joeynmtmaml/configs/transformer_ensw_validation.yaml"
    logging_freq: 100
    eval_metric: "bleu"
    model_dir: "models/{name}_transformer"
    overwrite: False               # TODO: Set to True if you want to overwrite possibly existing models. 
    shuffle: True
    use_cuda: True
    max_output_length: 100
    print_valid_sents: [0, 1, 2, 3]
    keep_last_ckpts: 3

model:
    initializer: "xavier"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4             # TODO: Increase to 8 for larger data.
        embeddings:
            embedding_dim: 256   # TODO: Increase to 512 for larger data.
            scale: True
            dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 256         # TODO: Increase to 512 for larger data.
        ff_size: 1024            # TODO: Increase to 2048 for larger data.
        dropout: 0.3
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4              # TODO: Increase to 8 for larger data.
        embeddings:
            embedding_dim: 256    # TODO: Increase to 512 for larger data.
            scale: True
            dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 256         # TODO: Increase to 512 for larger data.
        ff_size: 1024            # TODO: Increase to 2048 for larger data.
        dropout: 0.3
""".replace("{name}", name)
# The `training` section previously listed label_smoothing twice (0.0 and
# 0.1); only the later value, 0.1, is kept.
with open("joeynmtmaml/configs/transformer_lang_maml_exp.yaml", 'w') as f:
    f.write(config)

### Creating the validation config for MAML (Swahili)

In [None]:
# Create the validation config.
# The YAML text below is written verbatim to
# joeynmtmaml/configs/transformer_ensw_validation.yaml. No placeholder
# substitution is applied to it, so the {gdrive_path} and {name} markers in
# the commented-out load_model line are written literally.
config = """
name: "transformer_langexp_valid.yaml"

data:
    src: "swen"
    trg: "sw"
    train: "data/langdata/train.bpe"
    dev:   "data/langdata/dev.bpe"
    test:  "data/langdata/test.bpe"
    level: "bpe"
    lowercase: False
    max_sent_length: 100
    src_vocab: "data/ensw/vocab.txt"
    trg_vocab: "data/ensw/vocab.txt"

testing:
    beam_size: 5
    alpha: 1.0

training:
    #load_model: "{gdrive_path}/models/{name}_transformer/1.ckpt" # if uncommented, load a pre-trained model from this checkpoint
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999] 
    scheduling: "plateau"           # TODO: try switching from plateau to Noam scheduling
    patience: 5                     # For plateau: decrease learning rate by decrease_factor if validation score has not improved for this many validation rounds.
    learning_rate_factor: 0.5       # factor for Noam scheduler (used with Transformer)
    learning_rate_warmup: 1000      # warmup steps for Noam scheduler (used with Transformer)
    decrease_factor: 0.7
    loss: "crossentropy"
    learning_rate: 0.0003
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    batch_size: 4096
    batch_type: "token"
    eval_batch_size: 3600
    eval_batch_type: "token"
    batch_multiplier: 1
    early_stopping_metric: "ppl"
    epochs: 30                     # TODO: Decrease for when playing around and checking of working. Around 30 is sufficient to check if its working at all
    validation_freq: 1000          # TODO: Set to at least once per epoch.
    logging_freq: 100
    eval_metric: "bleu"
    model_dir: "models/ensw_valid_transformer"
    overwrite: True               # TODO: Set to True if you want to overwrite possibly existing models. 
    shuffle: True
    use_cuda: True
    max_output_length: 100
    print_valid_sents: [0, 1, 2, 3]
    keep_last_ckpts: 3

model:
    initializer: "xavier"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4             # TODO: Increase to 8 for larger data.
        embeddings:
            embedding_dim: 256   # TODO: Increase to 512 for larger data.
            scale: True
            dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 256         # TODO: Increase to 512 for larger data.
        ff_size: 1024            # TODO: Increase to 2048 for larger data.
        dropout: 0.3
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4              # TODO: Increase to 8 for larger data.
        embeddings:
            embedding_dim: 256    # TODO: Increase to 512 for larger data.
            scale: True
            dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 256         # TODO: Increase to 512 for larger data.
        ff_size: 1024            # TODO: Increase to 2048 for larger data.
        dropout: 0.3
"""
# This is the path that the MAML config's `valid_config` entry refers to.
with open("joeynmtmaml/configs/transformer_ensw_validation.yaml",'w') as f:
    f.write(config)

### Train the model

In [None]:
!cd joeynmt;python3 -m joeynmt train ../joeynmtmaml/configs/transformer_lang_maml_exp.yaml/