In [9]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import functional as F

# Loading the dataset
data = pd.read_csv('output.csv')
print(data.shape)
print(data.head())

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token for consistency

# Prepare data for training
X = []
Y = []

def tokenize(text_in, text_out):
    inputs = tokenizer(text_in, padding="max_length", truncation=True, return_tensors="pt", max_length=256)
    labels = tokenizer(text_out, padding="max_length", truncation=True, return_tensors="pt", max_length=256)["input_ids"]
    
    X.append({
        'input_ids': inputs["input_ids"].squeeze(0),
        'attention_mask': inputs["attention_mask"].squeeze(0)
    })
    Y.append(labels.squeeze(0))

# Tokenizing the data
for index, row in data.iterrows():
    tokenize(row['Input'], row['Output'])
    if len(X) == 100 and len(Y) == 100:  
        break  

# Create the dataset
dataset = Dataset.from_dict({
    "input_ids": [x["input_ids"].tolist() for x in X],  # No need to clone, just convert to list
    "attention_mask": [x["attention_mask"].tolist() for x in X],
    "labels": [y.tolist() for y in Y]
})

# Split the dataset into training and validation sets
split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

# Check dataset lengths
print(f"Train dataset length: {len(train_dataset)}")
print(f"Validation dataset length: {len(val_dataset)}")

# Convert to DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    for batch in train_dataloader:
        input_ids = torch.stack(batch['input_ids']).to(model.device)
        attention_mask = torch.stack(batch['attention_mask']).to(model.device)
        labels = torch.stack(batch['labels']).to(model.device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print loss every 10 steps
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

# Save the model weights
torch.save(model.state_dict(), "model_weights.pth")
# Save the tokenizer
tokenizer.save_pretrained("./tokenizer")



(101034, 2)
                                               Input  \
0  I would love to try the local food with my fri...   
1  I would love to try the local food with my fri...   
2  I would love to try the local food with my fri...   
3  I would love to try the local food with my fri...   
4  I would love to try the local food with my fri...   

                                              Output  
0                                 What's your name?   
1   Nice to meet you Gavin. What kind of movies d...  
2   I can relate to that. I like to watch movies ...  
3            What are some of your favorite movies?   
4   Those are all great movies! I love "The Shaws...  
Train dataset length: 80
Validation dataset length: 20


('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/merges.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')