# MODEL TRAINING

In [1]:
# Check which GPU (if any) is attached to this runtime before training.

!nvidia-smi

Wed Oct 11 04:30:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [29]:
import os
import sys
import transformers
from datasets import load_dataset
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=WarningType)

In [5]:
# Identifier of the pretrained checkpoint; the tokenizer and the model are
# both loaded from this same name further down.
model_checkpoint = "t5-small"

In [38]:
# Load the single 'train' split of the corpus and carve out 20% of it as a
# held-out set (used later as validation data).
dataset = load_dataset('findnitai/english-to-hinglish', split='train').train_test_split(
    test_size=0.2,
    seed=42,  # fixed seed so the same rows land in train/test on every run
)
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 151281
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 37821
    })
})

In [8]:
# Tokenizer matching the checkpoint selected above; use_fast=True explicitly
# requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [11]:
def preprocess_function(examples):
    """Tokenize a batch of English -> Hinglish translation pairs.

    Args:
        examples: A batch of dataset rows, i.e. a dict whose 'translation'
            entry is a list with one dict per example.

    Returns:
        The tokenizer output for the batch: 'input_ids' and 'attention_mask'
        for the English source text and 'labels' for the Hinglish target.
    """
    # NOTE(review): no definition of this function exists in the notebook, so
    # this cell (and the later dataset.map call) failed with a NameError on a
    # fresh kernel. The body is reconstructed from the sample output shown for
    # this cell: no task prefix on the inputs and no truncation visible.
    # The "en" / "hi_ng" keys follow the dataset card - confirm against
    # dataset['train'][0] before relying on them.
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    targets = [pair["hi_ng"] for pair in pairs]
    return tokenizer(sources, text_target=targets)


# example
preprocess_function(dataset['train'][:2])



{'input_ids': [[363, 31, 7, 8, 564, 13, 8, 1974, 1], [2018, 6, 8, 3, 14369, 35, 11395, 2604, 19, 248, 68, 8, 10531, 6800, 2604, 1330, 3, 9, 385, 731, 3, 9, 1974, 13, 48, 463, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[814, 3, 1258, 3, 3781, 9, 3, 29, 9, 265, 4244, 23, 1], [3, 13363, 8323, 6, 6819, 9, 3, 107, 76, 9, 11395, 2604, 954, 1024, 152, 4244, 23, 6, 90, 2917, 10531, 6800, 2604, 19, 4740, 9, 208, 14748, 3, 1050, 15, 3, 18118, 51, 142, 3, 189, 32, 26, 9, 6511, 50, 5497, 9, 4244, 23, 5, 1]]}

In [39]:
# Run the tokenization over both splits; batched=True hands
# preprocess_function whole batches of rows instead of single examples.
tokenized_data = dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/151281 [00:00<?, ? examples/s]



Map:   0%|          | 0/37821 [00:00<?, ? examples/s]

In [13]:
# Load the pretrained seq2seq model for the checkpoint named in
# model_checkpoint (the TensorFlow variant, per the TFAuto* class).
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


### Hyperparameter Tuning

In [14]:
# Training configuration consumed by the dataset, optimizer and fit cells below.
num_train_epochs = 5   # passed to model.fit as `epochs`
batch_size = 64        # batch size for both the training and validation datasets
learning_rate = 1e-3   # passed to AdamWeightDecay
weight_decay = 1e-2    # passed to AdamWeightDecay as `weight_decay_rate`

In [40]:
# Collator used when building the training and validation tf.data pipelines;
# return_tensors='tf' makes it emit TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors='tf',
)

In [41]:
# Second collator, identical to data_collator except for pad_to_multiple_of=128.
# NOTE(review): no later cell shown in this notebook references it.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors='tf',
    pad_to_multiple_of=128,
)

### Defining the training and validation datasets


In [42]:
# Training pipeline: tokenized "train" split, shuffled, collated into batches
# of `batch_size` by data_collator.
train_dataset = model.prepare_tf_dataset(
    tokenized_data["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# The previously split-off "test" set doubles as the validation data.
# It is intentionally NOT shuffled: prepare_tf_dataset defaults
# `drop_remainder` to the value of `shuffle`, so shuffle=True would silently
# drop the final partial batch and the validation metrics would not cover the
# whole held-out set. Example order has no bearing on evaluation.
validation_dataset = model.prepare_tf_dataset(
    tokenized_data["test"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

### Model Compilation

In [18]:
# Optimizer configured from the hyperparameter cell.
optimizer = AdamWeightDecay(
    learning_rate=learning_rate,
    weight_decay_rate=weight_decay,
)

# No explicit loss is passed to compile(); only the optimizer and an accuracy
# metric are supplied.
# NOTE(review): presumably the model falls back to its built-in loss - confirm.
model.compile(
    optimizer=optimizer,
    metrics=['accuracy'],
)

### Model Training

In [43]:
# Fine-tune for num_train_epochs epochs, evaluating on validation_dataset.
# The returned History object is the cell's last expression, so it is displayed.
model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=num_train_epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x788e24264ee0>

### Save the trained model

In [44]:
# Write the fine-tuned model to the relative directory "saved_models/".
# Only the model is saved here; the tokenizer is not written alongside it.
model.save_pretrained("saved_models/")


### Uploading the saved_models folder as a zip to Google Drive


In [49]:
import shutil
from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Absolute path of the directory written by model.save_pretrained.
# NOTE(review): this assumes the working directory is /content - confirm.
folder_to_zip = '/content/saved_models/'

# make_archive appends the ".zip" extension to base_name and returns the path
# of the archive it created; as the last expression it is shown as cell output.
shutil.make_archive(
    base_name='/content/drive/MyDrive/saved_models',
    format='zip',
    root_dir=folder_to_zip,
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/saved_models.zip'