In [None]:
#!pip install "adapter-transformers@git+https://github.com/akufeldt/adapter-transformers.git@debug#egg=adapter-transformers&subdirectory=adapter-transformers"

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Softmax

from typing import List, Optional, Tuple, Union, Dict, Any

from datasets import load_dataset, Dataset, DatasetDict, load_metric, load_from_disk, concatenate_datasets
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from transformers import PreTrainedModel, TrainingArguments, Trainer
#from transformers.adapters import AdapterTrainer

import pandas as pd
import numpy as np
import evaluate

import random
import math
import time
from tqdm import tqdm
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
_numpy_rng = np.random.default_rng(seed)
random.seed(seed)
np.random.seed(seed)
torch.use_deterministic_algorithms(False)
os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
os.environ["WANDB_DISABLED"] = "true"

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load in model

In [5]:
model_name = 'm2m100_418M'
experiment = 'en-ha_finetune_base_model-1'
dataset_name = 'data/en-ha'

In [6]:
model = M2M100ForConditionalGeneration.from_pretrained(f"facebook/{model_name}")
# model = torch.nn.DataParallel(model, device_ids=[2, 3, 4])
model = model.to(device)
tokenizer = M2M100Tokenizer.from_pretrained(f"facebook/{model_name}")

# Prepare data

In [22]:
src_lang = 'en'
tgt_lang = 'ha'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [23]:
# Load the datasets from Hugging Face Hub
original_train_dataset = load_dataset("pranjali97/ha-en_RL-grow1_train", split='train')
original_valid_dataset = load_dataset("pranjali97/ha-en_RL-grow1_valid", split='train')  # Assuming the split is also 'train'


In [24]:
# Filter the datasets to only include samples with a score > 0.6
filtered_train_dataset = original_train_dataset.filter(lambda example: example['score'] > 0.6)
filtered_valid_dataset = original_valid_dataset.filter(lambda example: example['score'] > 0.6)

In [25]:
filtered_train_dataset = filtered_train_dataset.remove_columns(['score', 'ref'])
filtered_valid_dataset = filtered_valid_dataset.remove_columns(['score', 'ref'])

In [28]:
train_dataset = load_dataset('csv', data_files='/home/spandey7/Language-Adapters/Data/en-ha/cleaned_train.csv', split='train')
valid_dataset = load_dataset('csv', data_files='/home/spandey7/Language-Adapters/Data/en-ha/cleaned_dev.csv', split='train')

In [30]:
train_dataset

Dataset({
    features: ['en', 'ha'],
    num_rows: 9818
})

In [31]:
train_dataset = train_dataset.rename_column('ha', 'src')
valid_dataset = valid_dataset.rename_column('ha', 'src')
train_dataset = train_dataset.rename_column('en', 'mt')
valid_dataset = valid_dataset.rename_column('en', 'mt')



In [32]:
train_dataset

Dataset({
    features: ['mt', 'src'],
    num_rows: 9818
})

In [33]:
train_dataset = concatenate_datasets([train_dataset, filtered_train_dataset])
valid_dataset = concatenate_datasets([valid_dataset, filtered_valid_dataset])  
train_dataset = train_dataset.shuffle(seed=42)
valid_dataset = valid_dataset.shuffle(seed=42)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [34]:
# Define the preprocess function
def preprocess_function(examples):
    inputs = examples['src']  # Hausa sentences
    targets = examples['mt']  # English translations
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    # Return the tokenized inputs and labels
    return {'input_ids': model_inputs['input_ids'], 'attention_mask': model_inputs['attention_mask'], 'labels': labels['input_ids']}

# Apply the preprocess function to the datasets
tokenized_train_dataset = train_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['src',  'mt']  # Specify the correct columns to remove
)
tokenized_valid_dataset = valid_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['src',  'mt']  # Specify the correct columns to remove
)

# Create the DatasetDict
tokenized_dataset = DatasetDict({
    'train': tokenized_train_dataset,  # Directly assign the processed dataset
    'validation': tokenized_valid_dataset  # Directly assign the processed dataset
})

Map: 100%|██████████| 12638/12638 [00:03<00:00, 3344.36 examples/s]
Map: 100%|██████████| 1449/1449 [00:00<00:00, 3428.60 examples/s]


In [35]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12638
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1449
    })
})

# not run

In [None]:
dataset = DatasetDict({'train':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_train.csv')).shuffle(seed=seed),
                        'validation':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_dev.csv')).shuffle(seed=seed),
                        'test':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/test.csv')).shuffle(seed=seed),
})

In [None]:
dataset['test'] = dataset['test'].rename_column('sentence_eng_Latn','en')
dataset['test'] = dataset['test'].rename_column('sentence_hau_Latn','ha')

In [None]:
dataset

In [None]:
def preprocess_function(examples):
    inputs = [example for example in examples[src_lang]]
    targets = [example for example in examples[tgt_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")
    return model_inputs

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names['train'])

In [None]:
tokenized_dataset

# Training Setup

In [36]:
sacrebleu = evaluate.load("sacrebleu")
wer = evaluate.load("wer")

In [37]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    labels = eval_preds.label_ids
    pred_ids = eval_preds.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    
    preds = np.argmax(pred_ids, axis=-1)

    # removeme
    #import warnings
    #warnings.warn(f"unprocessed preds: {preds[0]}\n)")
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 

    # removeme
    #warnings.warn(f"unprocessed decoded labels: {tokenizer.batch_decode(labels)[0]}\n)")

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # remove me
    #inputs = eval_preds.input_ids
    #decoded_inputs = tokenizer.batch_decode(inputs)
    
    # Removeme
    import warnings
    warnings.warn(f"preds: {decoded_preds[0]}\n)")
    warnings.warn(f"labels: {decoded_labels[0]}\n)")

    bleu_result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    metrics = {"bleu": bleu_result["score"]}

    flattened_decoded_labels = [' '.join([str(x) for x in l]) for l in decoded_labels]
    wer_score = wer.compute(predictions=decoded_preds, references=flattened_decoded_labels)
    metrics["wer"] = wer_score

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    metrics["gen_len"] = np.mean(prediction_lens)
    metrics = {k: round(v, 4) for k, v in metrics.items()}
    return metrics


In [38]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [39]:
training_args = TrainingArguments(
    f"./lang_adapters/{experiment}/model",
    # evaluation_strategy="steps",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15,
    warmup_steps=0,
    # lr_scheduler_type='cosine_with_restarts',
    # gradient_accumulation_steps=4,
    eval_accumulation_steps=16,
    # gradient_checkpointing=True,
    # predict_with_generate=True,
    fp16=True,
    do_train=True,
    do_eval=True,
    logging_steps=5,
    # eval_steps=5,
    save_strategy="epoch",
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    #optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Wer,Gen Len
1,0.2106,0.26061,25.2531,0.9944,33.4092
2,0.2085,0.241981,28.1086,0.9944,33.4092
3,0.1684,0.23536,29.9674,0.9941,33.4092
4,0.1378,0.237399,30.6495,0.9943,33.4092
5,0.1269,0.239288,31.4478,0.9942,33.4092
6,0.0904,0.243654,31.8386,0.9946,33.4092
7,0.0728,0.246907,32.488,0.9947,33.4092
8,0.0618,0.254085,32.7722,0.9945,33.4092
9,0.041,0.258592,32.92,0.9948,33.4092
10,0.0336,0.264998,33.3092,0.9949,33.4092


)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=11850, training_loss=0.12267591539157342, metrics={'train_runtime': 7012.8254, 'train_samples_per_second': 27.032, 'train_steps_per_second': 1.69, 'total_flos': 1.0270450485559296e+17, 'train_loss': 0.12267591539157342, 'epoch': 15.0})

In [43]:
base_model_path = f'./base_model/{experiment}'

if not os.path.exists(base_model_path):
    os.makedirs(base_model_path)

trainer.save_model(base_model_path)

In [42]:
# Save model
if not os.path.exists(f'./base_model/{experiment}'):
    os.mkdir(f'/base_model/{experiment}')
    
trainer.save_model(f"./base_model/{experiment}")

FileNotFoundError: [Errno 2] No such file or directory: '/base_model/en-ha_finetune_base_model-1'

# Eval finetuned model on test set

In [44]:
# Evaluate performance
src_lang = 'en'
tgt_lang = 'ha'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [45]:
model = M2M100ForConditionalGeneration.from_pretrained(f"./base_model/{experiment}")
# model = torch.nn.DataParallel(model, device_ids=[2, 3, 4])
model = model.to(device)
tokenizer = M2M100Tokenizer.from_pretrained(f"./base_model/{experiment}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [51]:
test_dataset = load_dataset('csv', data_files='/home/spandey7/Language-Adapters/Data/en-ha/test.csv', split='train')

In [52]:
test_dataset

Dataset({
    features: ['sentence_eng_Latn', 'sentence_hau_Latn'],
    num_rows: 1012
})

In [53]:
#rename 'sentence_eng_Latn' to 'en' and 'sentence_hau_Latn' to 'ha'
test_dataset = test_dataset.rename_column('sentence_eng_Latn','mt')
test_dataset = test_dataset.rename_column('sentence_hau_Latn','src')

In [54]:
#tokenize the dataset
tokenized_test_dataset = test_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['src',  'mt']  # Specify the correct columns to remove
)

Map: 100%|██████████| 1012/1012 [00:00<00:00, 3014.62 examples/s]


In [57]:
#convert the dataset into dataset dict
tokenized_test_dataset = DatasetDict({
    'test': tokenized_test_dataset  # Directly assign the processed dataset
})

In [58]:
tokenized_test_dataset

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1012
    })
})

In [60]:
#run evaluation
eval_results = trainer.evaluate(tokenized_test_dataset["test"])

)
)


In [61]:
eval_results

{'eval_loss': 0.46018558740615845,
 'eval_bleu': 14.5322,
 'eval_wer': 0.9948,
 'eval_gen_len': 34.2757,
 'eval_runtime': 174.9085,
 'eval_samples_per_second': 5.786,
 'eval_steps_per_second': 1.446,
 'epoch': 15.0}

# 0.7 Filtered Data

In [7]:
src_lang = 'en'
tgt_lang = 'ha'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [9]:
# Load the datasets from Hugging Face Hub
original_train_dataset = load_dataset("pranjali97/ha-en_RL-grow1_train", split='train')
original_valid_dataset = load_dataset("pranjali97/ha-en_RL-grow1_valid", split='train')

new_filtered_train_dataset = original_train_dataset.filter(lambda example: example['score'] > 0.6)
new_filtered_valid_dataset = original_valid_dataset.filter(lambda example: example['score'] > 0.6)
new_filtered_train_dataset = new_filtered_train_dataset.remove_columns(['score', 'ref'])
new_filtered_valid_dataset = new_filtered_valid_dataset.remove_columns(['score', 'ref'])

train_dataset = load_dataset('csv', data_files='/home/spandey7/Language-Adapters/Data/en-ha/cleaned_train.csv', split='train')
valid_dataset = load_dataset('csv', data_files='/home/spandey7/Language-Adapters/Data/en-ha/cleaned_dev.csv', split='train')

train_dataset = train_dataset.rename_column('ha', 'src')
valid_dataset = valid_dataset.rename_column('ha', 'src')
train_dataset = train_dataset.rename_column('en', 'mt')
valid_dataset = valid_dataset.rename_column('en', 'mt')

new_train_dataset = concatenate_datasets([train_dataset, new_filtered_train_dataset])
new_valid_dataset = concatenate_datasets([valid_dataset, new_filtered_valid_dataset])  
new_train_dataset = train_dataset.shuffle(seed=42)
new_valid_dataset = valid_dataset.shuffle(seed=42)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [10]:
new_train_dataset

Dataset({
    features: ['mt', 'src'],
    num_rows: 9818
})

In [11]:
# Define the preprocess function
def preprocess_function(examples):
    inputs = examples['src']  # Hausa sentences
    targets = examples['mt']  # English translations
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    # Return the tokenized inputs and labels
    return {'input_ids': model_inputs['input_ids'], 'attention_mask': model_inputs['attention_mask'], 'labels': labels['input_ids']}

# Apply the preprocess function to the datasets
new_tokenized_train_dataset = new_train_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['src',  'mt']  # Specify the correct columns to remove
)
new_tokenized_valid_dataset = new_valid_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['src',  'mt']  # Specify the correct columns to remove
)

# Create the DatasetDict
new_tokenized_dataset = DatasetDict({
    'train': new_tokenized_train_dataset,  # Directly assign the processed dataset
    'validation': new_tokenized_valid_dataset  # Directly assign the processed dataset
})

Map: 100%|██████████| 9818/9818 [00:03<00:00, 2980.45 examples/s]
Map: 100%|██████████| 1113/1113 [00:00<00:00, 2963.97 examples/s]


In [12]:
new_tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9818
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1113
    })
})

In [13]:
sacrebleu = evaluate.load("sacrebleu")
wer = evaluate.load("wer")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    labels = eval_preds.label_ids
    pred_ids = eval_preds.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    
    preds = np.argmax(pred_ids, axis=-1)

    # removeme
    #import warnings
    #warnings.warn(f"unprocessed preds: {preds[0]}\n)")
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 

    # removeme
    #warnings.warn(f"unprocessed decoded labels: {tokenizer.batch_decode(labels)[0]}\n)")

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # remove me
    #inputs = eval_preds.input_ids
    #decoded_inputs = tokenizer.batch_decode(inputs)
    
    # Removeme
    import warnings
    warnings.warn(f"preds: {decoded_preds[0]}\n)")
    warnings.warn(f"labels: {decoded_labels[0]}\n)")

    bleu_result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    metrics = {"bleu": bleu_result["score"]}

    flattened_decoded_labels = [' '.join([str(x) for x in l]) for l in decoded_labels]
    wer_score = wer.compute(predictions=decoded_preds, references=flattened_decoded_labels)
    metrics["wer"] = wer_score

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    metrics["gen_len"] = np.mean(prediction_lens)
    metrics = {k: round(v, 4) for k, v in metrics.items()}
    return metrics



In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [17]:
training_args = TrainingArguments(
    f"./lang_adapters/{experiment}/model",
    # evaluation_strategy="steps",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15,
    warmup_steps=0,
    # lr_scheduler_type='cosine_with_restarts',
    # gradient_accumulation_steps=4,
    eval_accumulation_steps=16,
    # gradient_checkpointing=True,
    # predict_with_generate=True,
    fp16=True,
    do_train=True,
    do_eval=True,
    logging_steps=5,
    # eval_steps=5,
    save_strategy="epoch",
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_tokenized_dataset["train"],
    eval_dataset=new_tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    #optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Wer,Gen Len
1,0.2927,0.283912,24.0292,0.9941,35.2453
2,0.2628,0.266505,26.9938,0.9954,35.2345
3,0.2128,0.258874,28.8538,0.9948,35.2453
4,0.1743,0.253786,29.8319,0.9953,35.2453
5,0.1355,0.257171,30.0749,0.995,35.2453
6,0.1306,0.25912,31.0308,0.9952,35.2453
7,0.1074,0.261078,31.3419,0.9946,35.2453
8,0.0842,0.267473,31.6124,0.9951,35.2453
9,0.0699,0.272087,31.793,0.9948,35.2453
10,0.0619,0.276109,32.2084,0.9946,35.2453


)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=4605, training_loss=0.12034673149759166, metrics={'train_runtime': 3580.185, 'train_samples_per_second': 41.135, 'train_steps_per_second': 1.286, 'total_flos': 7.978737368825856e+16, 'train_loss': 0.12034673149759166, 'epoch': 15.0})

In [19]:
base_model_path = f'./0.7_base_model/{experiment}'

if not os.path.exists(base_model_path):
    os.makedirs(base_model_path)

trainer.save_model(base_model_path)

In [20]:
test_dataset = load_dataset('csv', data_files='/home/spandey7/Language-Adapters/Data/en-ha/test.csv', split='train')

#rename 'sentence_eng_Latn' to 'en' and 'sentence_hau_Latn' to 'ha'
test_dataset = test_dataset.rename_column('sentence_eng_Latn','mt')
test_dataset = test_dataset.rename_column('sentence_hau_Latn','src')

#tokenize the dataset
tokenized_test_dataset = test_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['src',  'mt']  # Specify the correct columns to remove
)

#convert the dataset into dataset dict
tokenized_test_dataset = DatasetDict({
    'test': tokenized_test_dataset  # Directly assign the processed dataset
})

new_eval_results = trainer.evaluate(tokenized_test_dataset["test"])

Map: 100%|██████████| 1012/1012 [00:00<00:00, 3288.45 examples/s]


)
)


In [21]:
new_eval_results

{'eval_loss': 0.41089722514152527,
 'eval_bleu': 15.2546,
 'eval_wer': 0.9941,
 'eval_gen_len': 34.2757,
 'eval_runtime': 110.5506,
 'eval_samples_per_second': 9.154,
 'eval_steps_per_second': 1.149,
 'epoch': 15.0}