# Imports and Initializations

In [1]:
from datasets import Dataset
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler
torch.backends.cuda.matmul.allow_tf32 = True
import pandas as pd

num_epochs = 4
lr = 5e-5                                           
batch_size = 5
warmup_steps= 750

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


# Initialising model

In [2]:
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

optimizer = AdamW(model.parameters(), lr=lr)

# Dataset

In [3]:
dataset = pd.read_csv('malayalam_dataset.csv', nrows=100000)['Text']
dataset

0        മറ്റുള്ളവരെ കുറിച്ചു ചിന്തിക്കുന്നത്‌ മറ്റൊരു ...
1        ഓസ്ട്രേലിയൻ സർവീസ് ടീമിനെതിരെ ബാറ്റ് ചെയ്ത ബ്ര...
2                                        പുറത്ത്‌ മഞ്ഞുമഴ.
3                               അതിന് ശേഷം ഇലക്ട്രിക് ബസ്.
4        മലയാളം, കന്നഡ, തമിഴ് തുടങ്ങിയ ഭാഷകളിൽ ശ്രദ്ധേയ...
                               ...                        
99995    സര്‍ക്കാര്‍ ഇക്കാര്യം ഇതുവരെ ഔദ്യോഗികമായി പുറത...
99996                             വാര്‍ത്താ സമ്മേളനത്തില്‍
99997                   എന്ന പേരിനോടു സാമ്യമുള്ള അർദാശിർ. 
99998    ഇത് 27 bhp കരുത്തും 28 Nm torque ഉം പരമാവധി സൃ...
99999    പക്ഷെ ടെസ്റ്റ് ക്രിക്കറ്റ് തികച്ചും വ്യത്യസ്തമ...
Name: Text, Length: 100000, dtype: object

In [4]:
new_dataset = {'input_ids': [], 'attention_mask': []}   
wrong_data = 0
for i,data in enumerate(dataset):
    if i%1e5 == 0: print(i)
    try:
        encoded_data = tokenizer(data+'', truncation=True, max_length=768, padding="max_length")
        new_dataset['input_ids'].append(encoded_data['input_ids'])
        new_dataset['attention_mask'].append(encoded_data['attention_mask'])
    except:
        wrong_data += 1
wrong_data

0


0

In [7]:
new_dataset = Dataset.from_dict(new_dataset)
new_dataset.set_format("torch")
dataloader = DataLoader(new_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)


# Inits

In [8]:
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)
model.resize_token_embeddings(len(tokenizer))
model.to(device)
model = torch.compile(model)
scaler = GradScaler()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

# Training loop

In [9]:
save_loc = '/Model_save/test_save.pt'
save_loc2 = '/Model_save/test_save2.pt'

In [10]:
progress_bar = tqdm(range(num_training_steps-1), desc='Training', unit='steps')
model.train()
ep = 0
prev_avg_train_loss = 1
for epoch in range(num_epochs):
    total_train_loss = 0
    for batch in dataloader:
        batch_data = batch['input_ids'].to(device)
        attention = batch['attention_mask'].to(device)

        optimizer.zero_grad()

        with autocast():
            outputs = model(batch_data,
                            labels=batch_data,
                            attention_mask=attention,
                            token_type_ids=None
                            )

            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)

    avg_train_loss = total_train_loss / len(dataloader)
    ep += 1
    print('Epoch:', ep, 'Average training loss =', avg_train_loss)
    if abs(prev_avg_train_loss - avg_train_loss) < 0.0001:
        model.save_pretrained(save_loc2)
        tokenizer.save_pretrained(save_loc2)
        print("Loss is very small")
        break
    prev_avg_train_loss = avg_train_loss
    model.save_pretrained(save_loc)
    tokenizer.save_pretrained(save_loc)


Training:   0%|          | 0/19999 [00:00<?, ?steps/s]

26.329669952392578
19.366365432739258
29.764446258544922
22.0225887298584
21.419431686401367
32.90776062011719
24.520666122436523
20.886516571044922
30.28919219970703
24.455791473388672
23.001537322998047
20.35142707824707
18.025114059448242
19.71027374267578
23.416698455810547
20.192453384399414
19.301483154296875
18.62240219116211
13.820021629333496
22.14207649230957
17.58741569519043
14.035262107849121
17.973556518554688
17.366151809692383
13.137988090515137
15.302386283874512
13.356385231018066
13.932629585266113
12.347043991088867
10.809679985046387
10.122955322265625
10.074874877929688
9.48206901550293
10.110921859741211
8.390628814697266
7.797052383422852
7.928176403045654
7.299664497375488
7.358172416687012
7.017761707305908
6.1608500480651855
6.0184326171875
6.165712356567383
5.659091949462891
6.972015857696533
5.730923175811768
5.400989532470703
5.781001567840576
4.952193260192871
4.683191299438477
4.800012588500977
4.803822040557861
3.8565216064453125
4.566543102264404
4.770

KeyboardInterrupt: 