In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Softmax

from typing import List, Optional, Tuple, Union, Dict, Any

from datasets import load_dataset, Dataset, DatasetDict, load_metric, load_from_disk
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from transformers import PreTrainedModel, TrainingArguments

from nltk.translate.bleu_score import sentence_bleu

import pandas as pd
import numpy as np

import random
import math
import time
from tqdm import tqdm
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


# Load Model

In [2]:
model_name = 'm2m100_418M'
experiment = 'en-ha-finetune'
dataset_name = 'data/en-ha'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
model = M2M100ForConditionalGeneration.from_pretrained(f"facebook/{model_name}")
model = model.to(device)
tokenizer = M2M100Tokenizer.from_pretrained(f"facebook/{model_name}")

# Load Data

In [4]:
src_lang = 'en'
tgt_lang = 'ha'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [5]:
dataset = DatasetDict({'train':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_train.csv')),
                        'validation':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_dev.csv'))})

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ha'],
        num_rows: 9818
    })
    validation: Dataset({
        features: ['en', 'ha'],
        num_rows: 1113
    })
})

In [7]:
def preprocess_function(examples):
    inputs = [example for example in examples[src_lang]]
    targets = [example for example in examples[tgt_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")
    return model_inputs

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names['train'])

Map: 100%|█████████████████████████| 9818/9818 [00:03<00:00, 2882.24 examples/s]
Map: 100%|█████████████████████████| 1113/1113 [00:00<00:00, 2856.14 examples/s]


In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9818
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1113
    })
})

# Finetune Model

In [10]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    labels = eval_preds.label_ids
    pred_ids = eval_preds.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    
    preds = np.argmax(pred_ids, axis=-1)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Removeme
    import warnings
    warnings.warn(f"preds: {decoded_preds[0]}\n)")
    warnings.warn(f"labels: {decoded_labels[0]}\n)")

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [11]:
training_args = TrainingArguments(
    f"./base_finetune/{experiment}/model",
    # evaluation_strategy="steps",
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    warmup_steps=1000,
    # lr_scheduler_type='constant',
    # gradient_accumulation_steps=4,
    eval_accumulation_steps=16,
    # gradient_checkpointing=True,
    # predict_with_generate=True,
    fp16=True,
    do_train=True,
    do_eval=True,
    logging_steps=5,
    # eval_steps=5,
    save_strategy="epoch",
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    #optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics,
)

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).

In [None]:
trainer.train()

# Grow Step

In [None]:
# Grow function
def grow(model, input_texts, num_samples=5):
    generated_texts = []
    for text in input_texts:
        # Generate multiple translations for each input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=100)
        # do we need many samples or just 1?
        outputs = model.generate(**inputs, num_return_sequences=num_samples)
        generated_texts.extend([tokenizer.decode(output, skip_special_tokens=True) for output in outputs])
    return generated_texts

In [None]:
# Using sentence_bleu to start with. Replace with BleuRT and Comet-QE
# def reward_function(predictions, references):
#     return sentence_bleu(references, predictions)

In [None]:
# def improve(model, generated_texts, original_texts, tokenizer, batch_size=8, num_epochs=1, learning_rate=5e-5):
#     # Rank and filter the generated texts using the reward function
#     scores = [reward_function(text, original_texts) for text in generated_texts]
#     print(len(scores))
    
#     # Sort the generated texts based on their scores
#     sorted_texts = [x for _, x in sorted(zip(scores, generated_texts), key=lambda pair: pair[0], reverse=True)]
#     print(sorted_texts)
    
#     # Use the top-ranked texts for fine-tuning
#     # For simplicity, let's use the top 50% of the sorted_texts
#     training_data = sorted_texts[:len(sorted_texts) // 2]
#     print(training_data)
#     return None
    
    # # Convert texts to DataLoader for training
    # inputs = tokenizer(training_data, return_tensors="pt", padding=True, truncation=True, max_length=100)
    # dataset = torch.utils.data.TensorDataset(inputs["input_ids"], inputs["attention_mask"])
    # dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # # Define optimizer and scheduler
    # optimizer = AdamW(model.parameters(), lr=learning_rate)
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * num_epochs)
    
    # # Fine-tuning loop
    # model.train()
    # for epoch in range(num_epochs):
    #     for batch in dataloader:
    #         input_ids, attention_mask = batch
    #         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
    #         loss = outputs.loss
    #         loss.backward()
    #         optimizer.step()
    #         scheduler.step()
    #         optimizer.zero_grad()
    
    # return model

In [None]:
# Reinforced Self-Training
def reinforced_self_loop(model, unsupervised_data, supervised_data, optimizer, num_iterations):
    model.train()
    for iteration in range(num_iterations):
        # 1. Translate the unsupervised data using the current model
        inputs = tokenizer(unsupervised_data, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(**inputs)
        pseudo_translations = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

        # 2. Compute the reward for the pseudo-translations
        rewards = [reward_function(pred, [ref]) for pred, ref in zip(pseudo_translations, unsupervised_data)]
        # 3. Update the model using the pseudo-translations and their rewards
        # This part is tricky since the M2M100 model isn't directly designed for RL.
        # You'd typically need to define a custom loss function that incorporates the rewards.
        # For simplicity, this step is omitted in this outline.
        # 4. Fine-tune the model on the supervised data"

In [None]:
#Data Lodaing Function
# Here, you'd typically load your data. For the sake of this example, let's use dummy data:
unsupervised_data = ["This is an unsupervised sentence."] * 10
supervised_data = [("This is a source sentence.", "This is a target sentence.")] * 10

In [None]:
# load data
import pandas as pd
dev = pd.read_csv('en-ha/cleaned_dev.csv')
train = pd.read_csv('en-ha/cleaned_train.csv')

In [None]:
train.shape

In [None]:
# create finetuned model

# Load pre-trained M2M100 model and tokenizer
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

In [None]:
# call grow step
# call reward function and score samples
# call improve step
    # for th in threshold
        #  fine-tune model and check performance for improvement
# return model 


In [None]:
# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the model using Reinforced Self-Training
reinforced_self_training(model, unsupervised_data, supervised_data, optimizer, num_iterations=1000)