In [2]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install unbabel-comet

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.14

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Softmax

from typing import List, Optional, Tuple, Union, Dict, Any

from datasets import load_dataset, Dataset, DatasetDict, load_metric, load_from_disk
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from transformers import PreTrainedModel, TrainingArguments

from nltk.translate.bleu_score import sentence_bleu

import pandas as pd
import numpy as np

import random
import math
import time
from tqdm import tqdm
import os
import json

# Load Model

In [4]:
# Hub checkpoint name; it is later expanded to "facebook/<model_name>".
model_name = 'm2m100_418M'

# Labels for this run.
# NOTE(review): the label reads "en-ha" while the later cells translate ha -> en — confirm.
experiment = 'en-ha-finetune'
dataset_name = 'data/en-ha'

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Load the pretrained translation model straight onto the selected device,
# then the tokenizer that belongs to the same checkpoint.
model = M2M100ForConditionalGeneration.from_pretrained(f"facebook/{model_name}").to(device)
tokenizer = M2M100Tokenizer.from_pretrained(f"facebook/{model_name}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

# Load Data

In [6]:
# Translation direction for this notebook: 'ha' source -> 'en' target.
src_lang = 'ha'
tgt_lang = 'en'

# Configure the tokenizer from the variables above so the two can never drift apart.
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang

In [7]:
# Read each cleaned CSV into pandas and wrap it as one split of a DatasetDict.
dataset = DatasetDict({
    split: Dataset.from_pandas(pd.read_csv(csv_path))
    for split, csv_path in (
        ('train', '/content/cleaned_train.csv'),
        ('validation', '/content/cleaned_dev.csv'),
    )
})

# Grow Step

In [8]:
def generate_text_and_add_to_dataset(dataset, model, tokenizer, num_samples,
                                     source_column="ha", target_lang="en",
                                     max_source_length=128):
    """Attach model-generated translations to every example of `dataset`.

    Args:
        dataset: Object exposing `.map(fn)`; every example must contain
            `source_column`.
        model: Generation model exposing `.generate(...)` and `.device`.
        tokenizer: Tokenizer paired with `model`; must be callable and provide
            `get_lang_id` and `decode`.
        num_samples: Number of sequences returned per source sentence
            (forwarded as `num_return_sequences`).
        source_column: Name of the field holding the source sentence.
        target_lang: Language code whose id is forced as the first generated token.
        max_source_length: Source sentences are truncated to this many tokens.

    Returns:
        The result of `dataset.map(...)`, where each example has an extra
        "generated_text" field holding a list of decoded strings.
    """
    # Loop-invariant: resolve the forced target-language token once.
    target_lang_id = tokenizer.get_lang_id(target_lang)

    def generate_text(prompt):
        # One sentence at a time needs no padding. The whole encoding (including
        # the attention mask) is forwarded instead of `input_ids` alone, and it is
        # moved to the device the model actually lives on rather than a global.
        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_source_length,
        ).to(model.device)
        output_ids = model.generate(
            **encoded,
            forced_bos_token_id=target_lang_id,
            num_return_sequences=num_samples,
        )
        return [tokenizer.decode(output, skip_special_tokens=True) for output in output_ids]

    def add_generated_text(example):
        example["generated_text"] = generate_text(example[source_column])
        return example

    # Apply the generation step to each example in the dataset.
    return dataset.map(add_generated_text)

In [9]:
# Generate three candidate translations for every validation sentence.
dataset_validation = generate_text_and_add_to_dataset(
    dataset['validation'],
    model,
    tokenizer,
    num_samples=3,
)

Map:   0%|          | 0/1113 [00:00<?, ? examples/s]

In [10]:
# Show the dataset summary to confirm the 'generated_text' column was added.
dataset_validation

Dataset({
    features: ['en', 'ha', 'generated_text'],
    num_rows: 1113
})

In [11]:
# Generate three candidate translations for every training sentence.
dataset_train = generate_text_and_add_to_dataset(
    dataset['train'],
    model,
    tokenizer,
    num_samples=3,
)

Map:   0%|          | 0/9818 [00:00<?, ? examples/s]

In [12]:
# Show the dataset summary to confirm the 'generated_text' column was added.
dataset_train

Dataset({
    features: ['en', 'ha', 'generated_text'],
    num_rows: 9818
})

In [15]:
def fixer_function(dataset):
  """Flatten per-example candidate lists into one row per candidate.

  Each example of `dataset` must provide "ha" (source sentence),
  "generated_text" (iterable of candidate translations) and "en" (reference).
  The source and reference are repeated once for every candidate.

  Returns:
    A `Dataset` built from the columns 'input', 'reference' and 'generated'.
  """
  # One (source, reference, candidate) triple per generated candidate.
  triples = [
      (source, target, candidate)
      for source, candidates, target in (
          (row["ha"], row["generated_text"], row["en"]) for row in dataset
      )
      for candidate in candidates
  ]

  columns = {
      'input': [source for source, _, _ in triples],
      'reference': [target for _, target, _ in triples],
      'generated': [candidate for _, _, candidate in triples],
  }
  return Dataset.from_dict(columns)

In [16]:
# Flatten the validation examples into one row per generated candidate.
valid_dataset = fixer_function(dataset_validation)

In [21]:
# Show the flattened validation set (columns: input, reference, generated).
valid_dataset

Dataset({
    features: ['input', 'reference', 'generated'],
    num_rows: 3339
})

In [22]:
# Flatten the training examples into one row per generated candidate.
train_dataset = fixer_function(dataset_train)

In [23]:
# Show the flattened training set (columns: input, reference, generated).
train_dataset

Dataset({
    features: ['input', 'reference', 'generated'],
    num_rows: 29454
})

In [26]:
from comet import download_model, load_from_checkpoint

# COMET checkpoint used to score every generated candidate. It is bound to its
# own name so the translation model stays available as `model` and the
# generation cells remain re-runnable after this cell has executed.
model_path = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(model_path)

def score_samples(dataset, batch_size=8, gpus=None):
  """Score every row of `dataset` with the COMET model and add a 'score' column.

  Args:
    dataset: Dataset with columns 'input', 'generated' and 'reference'
      (or the already-renamed 'src', 'mt', 'ref').
    batch_size: Batch size forwarded to the COMET `predict` call.
    gpus: Number of GPUs forwarded to `predict`. When None, one GPU is used
      if CUDA is available and the CPU otherwise.

  Returns:
    The dataset with columns renamed to 'src', 'mt', 'ref' and a 'score'
    column holding one score per row.
  """
  if gpus is None:
    gpus = 1 if torch.cuda.is_available() else 0

  # Rename only the columns that are still present, and drop a stale 'score'
  # column, so calling this again on its own output does not fail.
  wanted = {"input": "src", "generated": "mt", "reference": "ref"}
  renames = {old: new for old, new in wanted.items() if old in dataset.column_names}
  if renames:
    dataset = dataset.rename_columns(renames)
  if "score" in dataset.column_names:
    dataset = dataset.remove_columns("score")

  model_output = comet_model.predict(dataset, batch_size=batch_size, gpus=gpus)
  return dataset.add_column("score", model_output['scores'])


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)5ec7e72/hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

Downloading (…)0f75ec7e72/README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

Downloading (…)080f75ec7e72/LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

Downloading (…)c7e72/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.1.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:177: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [27]:
# Add a COMET 'score' column to every training row; the name is rebound to the scored dataset.
train_dataset = score_samples(train_dataset)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 3682/3682 [16:30<00:00,  3.72it/s]


In [28]:
# Add a COMET 'score' column to every validation row; the name is rebound to the scored dataset.
valid_dataset = score_samples(valid_dataset)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 418/418 [01:50<00:00,  3.79it/s]


In [29]:
# Persist the scored validation rows to the local "valid_generations" directory.
valid_dataset.save_to_disk("valid_generations")

Saving the dataset (0/1 shards):   0%|          | 0/3339 [00:00<?, ? examples/s]

In [30]:
# Persist the scored training rows to the local "train_generations" directory.
train_dataset.save_to_disk("train_generations")

Saving the dataset (0/1 shards):   0%|          | 0/29454 [00:00<?, ? examples/s]

In [31]:
! huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [32]:
# Upload the scored training rows to the Hugging Face Hub dataset repo named below.
train_dataset.push_to_hub('pranjali97/ha-en_RL-grow1_train')

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

In [33]:
# Upload the scored validation rows to the Hugging Face Hub dataset repo named below.
valid_dataset.push_to_hub('pranjali97/ha-en_RL-grow1_valid')

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]