# Vitality-Mini_Project

## Importing Modules

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import accelerate
import torch
# Ask PyTorch to release cached GPU memory it is not currently using, so the
# notebook starts from as clean a CUDA state as possible.
torch.cuda.empty_cache()

## Preprocessing

In [None]:
# Read the whole raw dialogue dump into memory.
with open("final_dataset.txt", "r", encoding="utf-8") as file:
    file_content = file.read()

# Split on the literal two-character sequence backslash + "n" (NOT a real
# newline) -- presumably how the dump encodes turn boundaries; confirm against
# the data file.
# NOTE: this variable used to be called `list`, which shadowed the builtin for
# the remainder of the notebook session.
lines = file_content.split("\\n")

# Turns are expected to alternate: even positions belong to the user, odd
# positions to the bot. Rewrite any speaker tag that disagrees with its
# position, then terminate every turn with a real newline.
# (`str.replace` is already a no-op when the tag is absent, so no `find`
# guard is needed.)
for position, turn in enumerate(lines):
    if position % 2 == 0:
        turn = turn.replace("[BOT]", "[USER]")
    else:
        turn = turn.replace("[USER]", "[BOT]")
    lines[position] = turn + "\n"

print(len(lines))
print(lines[:50])

# Glue every user turn to the bot turn that follows it, giving one training
# sample per exchange. `zip` stops at the shorter slice, so a trailing
# unpaired turn is dropped.
newList = [
    user_turn + bot_turn
    for user_turn, bot_turn in zip(lines[::2], lines[1::2])
]
print(len(newList))
print(newList[0:50])

### Creating the final dataset on which the model will be trained

In [None]:
# Cap the training set at the first 25,000 paired dialogues.
final_dataset = newList[:25000]

In [None]:
# Bare expression: shows the first 50 entries of final_dataset as the cell output.
final_dataset[:50]

## Training

### Initializing Model

In [None]:
# Load the pretrained "gpt2-medium" tokenizer and language-model head.
checkpoint_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_name)
model = GPT2LMHeadModel.from_pretrained(checkpoint_name)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

### Initializing tokenizer padding

In [None]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably required because this tokenizer has no pad token
# of its own and the data collator pads batches -- confirm.
tokenizer.pad_token = tokenizer.eos_token

### Tokenizing the preprocessed dataset

In [None]:
chatbot_dataset = final_dataset

# Encoding options shared by every dialogue: anything longer than 1024 tokens
# is truncated to that length.
encode_options = {
    "add_special_tokens": True,
    "truncation": True,
    "max_length": 1024,
}

# One list of token ids per dialogue, in the same order as chatbot_dataset.
tokenized_chatbot_dataset = [
    tokenizer.encode(text, **encode_options) for text in chatbot_dataset
]

### Defining Custom class for dataset

In [None]:
class ChatbotDataset(Dataset):
    """Map-style dataset over already-tokenized dialogues.

    Every item is a dict with a single ``input_ids`` entry holding whatever
    is stored at that position of ``data``.
    """

    def __init__(self, data):
        # Only a reference is kept; nothing is copied or converted here.
        self.data = data

    def __len__(self):
        """Return how many dialogues are stored."""
        dialogue_count = len(self.data)
        return dialogue_count

    def __getitem__(self, idx):
        """Return the dialogue at ``idx`` wrapped under the ``input_ids`` key."""
        return dict(input_ids=self.data[idx])

# Wrap the tokenized dialogues so they can be passed to the Trainer as train_dataset.
text_dataset = ChatbotDataset(tokenized_chatbot_dataset)

### Initializing Training Arguments

In [None]:
# Training configuration, collected in one mapping and expanded into
# TrainingArguments below.
training_config = {
    "output_dir": "./vitality",
    "overwrite_output_dir": True,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 2,
    "save_steps": 10_000,
    "save_total_limit": 2,
}
training_args = TrainingArguments(**training_config)

In [None]:
# Collator that batches samples for language modelling; mlm=False turns off
# the masked-language-modelling mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### Initializing Trainer

In [None]:
# Bundle the model, its training configuration, the collator, and the
# tokenized dataset into a single Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=text_dataset,
)
trainer = Trainer(**trainer_kwargs)

### Defining a Custom DataLoader (to prevent running out of memory)

In [None]:
# Size cap handed to CustomDataLoader as ``max_size_split``: 3000 MiB
# expressed in bytes.
# NOTE(review): the loader compares this value against a sample *length*,
# not a byte size -- confirm which unit is actually intended.
max_size_mb = 3000
max_size_split = max_size_mb * 1024 ** 2

class CustomDataLoader(DataLoader):
    """DataLoader that drops over-long samples from every batch it yields.

    The filter is applied to each batch *after* ``collate_fn`` has run, so it
    expects batches that are still an iterable of individual samples (for
    example when ``collate_fn`` is the identity function).

    The previous implementation put the filter in ``__next__``. A
    ``DataLoader`` is an iterable, not an iterator: ``for batch in loader``
    goes through ``__iter__`` and never calls ``__next__`` on the loader, so
    the filter never ran. It now lives in ``__iter__``.

    Args:
        *args: Positional arguments forwarded unchanged to ``DataLoader``.
        max_size_split: Largest sample length allowed to stay in a batch.
            ``None`` or ``0`` disables filtering.
        **kwargs: Keyword arguments forwarded unchanged to ``DataLoader``.
    """

    def __init__(self, *args, max_size_split=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_size_split = max_size_split

    @staticmethod
    def _sample_length(sample):
        """Return the length used to decide whether ``sample`` is kept.

        For dict samples carrying ``input_ids`` the token count is used;
        ``len()`` of such a dict would only count its keys. Anything else
        falls back to ``len(sample)``.
        """
        if isinstance(sample, dict) and "input_ids" in sample:
            return len(sample["input_ids"])
        return len(sample)

    def __iter__(self):
        """Yield batches from the parent loader, minus over-long samples.

        A batch whose samples are all removed is still yielded (as an empty
        list), so the number of batches matches ``len(self)``.
        """
        limit = self.max_size_split
        for batch in super().__iter__():
            if limit:
                batch = [
                    sample
                    for sample in batch
                    if self._sample_length(sample) <= limit
                ]
            yield batch

# Loader over the tokenized dialogues. The identity collate_fn leaves each
# batch as a plain list of samples rather than stacking it.
dataloader = CustomDataLoader(
    text_dataset,
    batch_size=4,
    collate_fn=lambda samples: samples,
    pin_memory=True,
    max_size_split=max_size_split,
)

### Assigning DataLoader to trainer

In [None]:
# Store the custom loader on the trainer as a plain attribute.
# NOTE(review): Trainer.train() appears to build its loader by calling
# get_train_dataloader() and does not read this attribute, so this
# assignment likely has no effect -- confirm against the installed
# transformers version. Do not simply redirect get_train_dataloader to this
# loader: it uses an identity collate_fn, so its batches are unpadded lists
# of dicts rather than the tensors the model expects.
trainer.train_dataloader = dataloader

### Training The Model

In [None]:
# Run training with the arguments configured above; the return value is
# shown as the cell output.
trainer.train()

### Saving The Model

In [None]:
# Persist the model first, then the tokenizer, into the same "vitality"
# directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("vitality")