# In [None]:

import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
import nltk
from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

DATA_DIR = "/Desktop/Dataset"  # Change this path to your dataset folder
A_PATH = DATA_DIR + "/userA_chats.csv"
B_PATH = DATA_DIR + "/userB_chats.csv"

# Both CSVs must provide a 'message' column.
userA = pd.read_csv(A_PATH)
userB = pd.read_csv(B_PATH)

# Row i of user B's messages is the prompt and row i of user A's messages is
# the reply (the two columns are aligned on the default row index).
conversations = pd.DataFrame({'input': userB['message'], 'output': userA['message']})

# Index alignment fills the shorter file with NaN, and empty CSV cells are read
# as NaN as well. A NaN is a float, which the tokenizer rejects, so drop every
# incomplete pair and make sure what is left is plain text.
conversations = conversations.dropna(subset=['input', 'output']).astype(str)
conversations = conversations.reset_index(drop=True)

# Hold out 10% of the pairs for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_df, test_df = train_test_split(conversations, test_size=0.1, random_state=42)

class ChatDataset(Dataset):
    """Prompt/reply pairs encoded as single sequences for causal-LM training.

    Each item is laid out as ``prompt <eos> reply <eos> <pad>...`` and is
    exactly ``max_length`` tokens long. A causal language model compares the
    logits at position ``i`` with the label at position ``i + 1`` of the *same*
    sequence, so ``labels`` mirrors ``input_ids`` position by position; the
    prompt, the separator and the padding are set to ``-100`` so that only the
    reply (and its closing EOS) contributes to the loss.

    Labels are masked by position rather than by comparing against
    ``pad_token_id``: when the pad token is the EOS token, a value-based mask
    would also hide the genuine end-of-reply marker.

    Args:
        dataframe: Table with an ``'input'`` column (prompts) and an
            ``'output'`` column (replies).
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            (a list of ints) and exposing ``eos_token_id`` / ``pad_token_id``.
        max_length: Fixed length of every encoded example.

    Raises:
        ValueError: If ``max_length`` is smaller than 2 (one separator plus at
            least one supervised token).
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        if max_length < 2:
            raise ValueError("max_length must be at least 2")
        self.tokenizer = tokenizer
        self.input_texts = dataframe['input'].tolist()
        self.target_texts = dataframe['output'].tolist()
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def _encode(self, text):
        """Return the token ids of *text* without padding or special tokens."""
        encoding = self.tokenizer(
            text,
            add_special_tokens=False,
            truncation=True,
            max_length=self.max_length,
        )
        return list(encoding['input_ids'])

    def __getitem__(self, idx):
        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id

        prompt_ids = self._encode(self.input_texts[idx])
        reply_ids = self._encode(self.target_texts[idx]) + [eos_id]

        # One slot is always reserved for the separator. When the pair does not
        # fit, the prompt keeps whatever the reply leaves free, but never less
        # than half of the remaining room; the reply then fills the rest.
        room = self.max_length - 1
        if len(prompt_ids) + len(reply_ids) > room:
            prompt_cap = max(room - len(reply_ids), room // 2)
            prompt_ids = prompt_ids[:prompt_cap]
            reply_ids = reply_ids[:room - len(prompt_ids)]

        context_ids = prompt_ids + [eos_id]
        token_ids = context_ids + reply_ids
        pad_count = self.max_length - len(token_ids)

        input_ids = token_ids + [pad_id] * pad_count
        attention_mask = [1] * len(token_ids) + [0] * pad_count
        # -100 is the index ignored by the cross-entropy loss.
        labels = [-100] * len(context_ids) + reply_ids + [-100] * pad_count

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }


# GPT-2 is published without a padding token, so the end-of-sequence token is
# reused whenever a batch has to be padded.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights and move them to the selected device.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to(DEVICE)

# Wrap both splits with the same tokenizer (default max_length).
train_dataset, test_dataset = (
    ChatDataset(split_df, tokenizer) for split_df in (train_df, test_df)
)


# Fine-tuning configuration.
# Mixed precision is only enabled when a CUDA device is available.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    # Where checkpoints go, and how many are kept.
    output_dir="./chat_gpt2",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    push_to_hub=False,
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    fp16=use_fp16,
    # Logging and periodic evaluation on the held-out split.
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=4,
)

# The Trainer drives both the optimisation loop and the periodic evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

# Generate Replies

def generate_reply(user_b_message, max_length=50):
    """Sample a reply to *user_b_message* from the fine-tuned model.

    Args:
        user_b_message: The incoming message, as text.
        max_length: Maximum number of tokens to generate for the reply. The
            prompt is not counted, so a long message can no longer use up the
            whole generation budget.

    Returns:
        The generated reply as text. The prompt is not included, and special
        tokens are stripped.
    """
    model.eval()

    # An EOS token closes the incoming message so the model starts a new
    # utterance instead of continuing the sender's sentence.
    prompt_ids = tokenizer.encode(user_b_message) + [tokenizer.eos_token_id]

    # Build the tensors on the model's own device: after training the model
    # may no longer be where it was first placed.
    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=model.device)
    # The pad token equals EOS here, so the mask cannot be inferred from the
    # ids; every prompt position is real and is marked as such.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_length,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )

    # generate() returns prompt + continuation; keep only the continuation.
    reply_ids = output_ids[0, input_ids.shape[-1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

# Quick smoke test: sample one reply to a greeting and show it.
sample_reply = generate_reply("Hey, how are you?")
print(sample_reply)
# Evaluation using BLEU

def evaluate_bleu(dataset):
    """Return the mean sentence-level BLEU of generated replies over *dataset*.

    For every example the prompt text is recovered from ``input_ids``, a reply
    is generated for it, and that reply is scored against the reference decoded
    from ``labels``.

    Args:
        dataset: Indexable collection of dicts holding ``'input_ids'`` and
            ``'labels'`` tensors, where ``-100`` marks label positions to
            ignore.

    Returns:
        The average BLEU score as a float, or ``0.0`` for an empty dataset.

    Note:
        ``nltk.word_tokenize`` needs NLTK's tokenizer data to be installed
        (``nltk.download('punkt')``; recent NLTK releases name it
        ``'punkt_tab'``) - TODO confirm for the NLTK version in use.
    """
    if len(dataset) == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0

    eos_id = tokenizer.eos_token_id
    # Chat replies are short; without smoothing a single missing n-gram order
    # collapses the sentence score to (almost) zero.
    smoothing = nltk.translate.bleu_score.SmoothingFunction().method1

    scores = []
    for i in range(len(dataset)):
        sample = dataset[i]  # encode the example once, not once per field

        label_ids = [token for token in sample['labels'].tolist() if token != -100]
        reference_text = tokenizer.decode(label_ids, skip_special_tokens=True)

        # The prompt is everything before the first EOS token; EOS is also the
        # pad token, so this removes the padding as well.
        prompt_ids = sample['input_ids'].tolist()
        if eos_id in prompt_ids:
            prompt_ids = prompt_ids[:prompt_ids.index(eos_id)]
        prompt_text = tokenizer.decode(prompt_ids, skip_special_tokens=True)

        # generate_reply expects text, not an array of token ids.
        predicted_text = generate_reply(prompt_text)

        reference_tokens = nltk.word_tokenize(reference_text)
        predicted_tokens = nltk.word_tokenize(predicted_text)
        score = sentence_bleu(
            [reference_tokens],
            predicted_tokens,
            smoothing_function=smoothing,
        )
        scores.append(score)
    return sum(scores) / len(scores)

# Score the held-out split and report the average.
bleu_score = evaluate_bleu(test_dataset)
print("BLEU Score:", bleu_score)
