### Install Dependencies

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally.

    Drop-in replacement for ``locale.getpreferredencoding``. The
    ``do_setlocale`` argument is accepted so existing call sites keep
    working, but it is ignored.
    """
    return "UTF-8"


# Patch the stdlib so every later lookup of the preferred encoding reports
# UTF-8 (presumably a workaround for the notebook runtime's locale — confirm).
locale.getpreferredencoding = getpreferredencoding

In [None]:
# Install the third-party packages for this notebook (the leading `!` runs the
# line as a shell command in the notebook runtime).
!pip install accelerate transformers bitsandbytes datasets numpy einops torchvision matplotlib
# Upgrade tensorflow-io to its latest release.
# NOTE(review): no visible cell imports tensorflow-io — confirm it is still needed.
!pip install --upgrade tensorflow-io

### Import Libraries

In [None]:
import os
import torchvision.models as models
import pandas as pd
import numpy as np
from torchvision import transforms
import torch.nn as nn
import torch
import matplotlib.pylab as plt
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

### Set Up Model and Tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Model

# Hub identifier of the checkpoint used for both the tokenizer and the model.
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token so that batched
# tokenization with padding (used in a later cell) has a pad token available.
tokenizer.pad_token = tokenizer.eos_token
# `trust_remote_code` is intentionally not enabled: "gpt2" is an architecture
# implemented inside transformers itself, so there is no repository code to
# run, and leaving the flag on would only widen what a checkpoint may execute.
# `device_map="auto"` lets the library decide weight placement; anything it
# offloads to disk goes to `offload_folder`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir='/content',
    device_map="auto",
    offload_folder="offload",
)

In [None]:
# Print the loaded model's module structure for inspection.
print(model)

### Load Dataset

In [None]:
from datasets import load_dataset

# Load the "generation" configuration of the "truthful_qa" dataset. Later
# cells read the 'question' and 'best_answer' fields of its 'validation' split.
data = load_dataset("truthful_qa", "generation")

In [None]:
# Padding during batch tokenization needs a pad token; reuse end-of-sequence.
tokenizer.pad_token = tokenizer.eos_token


def _to_training_text(example):
    """Join a question and its best answer into one newline-separated string."""
    return {"input_text": f"{example['question']}\n{example['best_answer']}"}


# Adds an 'input_text' column to every row of the 'validation' split.
train_dataset = data['validation'].map(_to_training_text)

# Tokenize the datasets: truncate to at most 256 tokens, pad every sequence
# to a common length, and return PyTorch tensors.
train_encodings = tokenizer(
    train_dataset['input_text'],
    truncation=True,
    padding=True,
    max_length=256,
    return_tensors='pt',
)

In [None]:
class OpenAssistantDataset(Dataset):
    """Map-style dataset over tokenizer encodings for causal-LM fine-tuning.

    Args:
        encodings: Mapping from field name (must include ``"input_ids"``) to
            an indexable collection with one entry per example. Entries may
            be ``torch.Tensor`` rows or plain Python sequences.

    Each item is a dict holding an independent tensor copy of every field,
    plus a ``"labels"`` tensor that is a copy of ``"input_ids"``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    @staticmethod
    def _copy_as_tensor(value):
        """Return an independent tensor copy of *value*.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning
        on every call, so tensors are copied with ``detach().clone()`` and
        only non-tensor data goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._copy_as_tensor(val[idx]) for key, val in self.encodings.items()}
        # NOTE(review): labels mirror input_ids at every position, so any
        # padded positions present in the encodings also count as targets.
        item["labels"] = item["input_ids"].clone()
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized encodings in the Dataset class; this rebinds
# `train_dataset`, which previously held the mapped text dataset.
train_dataset = OpenAssistantDataset(train_encodings)

In [None]:
def generate(index):
    """Print the model's completion for one validation question.

    Looks up row ``index`` of ``data['validation']``, feeds its question to
    the module-level ``model`` and prints the question, the decoded model
    output, and the dataset's reference ``best_answer``.

    Args:
        index: Row index into ``data['validation']``.
    """
    example = data['validation'][index]
    example_text = example['question']
    correct_answer = example['best_answer']

    print("Question:")
    print(example_text)

    # Put the inputs wherever the model's weights are rather than assuming a
    # GPU, so the function also works on CPU-only runtimes.
    encoding = tokenizer(example_text, return_tensors="pt").to(model.device)
    # Greedy decoding. The previous sampling setup with a near-zero
    # temperature aimed at the same deterministic result, but dividing logits
    # by such a tiny value is numerically fragile.
    output = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        max_new_tokens=100,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        # Set explicitly so generate() does not have to guess a pad token.
        pad_token_id=tokenizer.eos_token_id,
    )

    print("Answer:")
    print(tokenizer.decode(output[0], skip_special_tokens=True))

    print("Best Answer:")
    print(correct_answer)

    print()

In [None]:
# Show the model's answer to validation question 0 (this cell sits before the
# training cell below).
generate(0)

### Training

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import get_scheduler
from tqdm.auto import tqdm

# Hyperparameters.
batch_size = 8
learning_rate = 1e-4
num_epochs = 100

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# One scheduler step per batch: linear schedule with no warmup steps.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        # Move every tensor of the batch onto the training device.
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**inputs, return_dict=True)
        loss = outputs.loss

        # Accumulate the scalar loss for the per-epoch average.
        total_loss += loss.item()

        # Backward pass, parameter update, schedule update, gradient reset.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss}")

# Persist the fine-tuned weights; a later cell reloads them from this path.
model.save_pretrained("/content/fine_tuned_gpt2")

In [None]:
# Switch the model out of training mode before generating text.
model.eval()

In [None]:
# Re-select the device (GPU when available, else CPU) and move the model to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Show the model's answer to validation question 5 (this cell sits after the
# training cell).
generate(5)

In [None]:
# Print the model's module structure again for inspection.
print(model)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Model, GPT2Tokenizer

# Reload the weights written by the training cell and the stock GPT-2 tokenizer.
# `trust_remote_code` is not enabled: the checkpoint was saved by this notebook
# and uses an architecture implemented inside transformers.
model_path = "/content/fine_tuned_gpt2"
model = AutoModelForCausalLM.from_pretrained(model_path, cache_dir='/content', offload_folder="offload")
tokenizer = AutoTokenizer.from_pretrained("gpt2")


# Simple console chat: read a line, generate a continuation, print it.
# Typing "exit" (any casing, surrounding whitespace ignored) ends the loop.
while True:
    user_input = input("You: ")
    if user_input.strip().lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    if not user_input.strip():
        # A blank line would tokenize to an empty prompt, which generate()
        # cannot continue from; just ask again.
        continue

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask; both tensors are moved to wherever the model's weights live.
    encoding = tokenizer(user_input, return_tensors="pt").to(model.device)

    output = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        # Budget counts only newly generated tokens, so a long prompt still
        # gets a reply (a total-length cap would be eaten by the prompt).
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The decoded text contains the prompt followed by the continuation.
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print("Chatbot:", response)

In [None]:
from datasets import load_metric

# Evaluate the causal LM with quantities computable directly from its outputs:
# mean next-token loss, perplexity, and next-token accuracy.
#
# The previous version of this cell could not run: "truthful_qa" names a
# dataset rather than a metric, `eval_dataloader` was never defined, and the
# argmax predictions were compared to the labels without the one-token shift
# a causal LM requires.
import math

# NOTE(review): this reuses `train_dataset` (built from data['validation']),
# so the numbers measure fit to the training data, not generalization.
# Swap in a held-out dataset if one becomes available.
eval_dataloader = DataLoader(train_dataset, batch_size=batch_size)

model.to(device)
model.eval()

total_nll = 0.0      # sum of per-token negative log-likelihoods
correct_tokens = 0   # next-token predictions that match the target
scored_tokens = 0    # number of non-padding target tokens

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Exclude padded positions from the loss (-100 is the ignore index).
    batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    with torch.no_grad():
        outputs = model(**batch)

    # Logits at position t predict the token at position t + 1.
    targets = batch["labels"][:, 1:]
    predictions = torch.argmax(outputs.logits[:, :-1, :], dim=-1)
    scored = targets != -100
    n_scored = int(scored.sum().item())
    if n_scored == 0:
        # Nothing to score in this batch; its mean loss would be undefined.
        continue

    # outputs.loss is the mean over scored tokens; re-weight by their count
    # so the final average is per token rather than per batch.
    total_nll += outputs.loss.item() * n_scored
    correct_tokens += int(((predictions == targets) & scored).sum().item())
    scored_tokens += n_scored

mean_loss = total_nll / max(scored_tokens, 1)
eval_results = {
    "loss": mean_loss,
    "perplexity": math.exp(mean_loss),
    "token_accuracy": correct_tokens / max(scored_tokens, 1),
}
eval_results