In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

# Seed PyTorch's global random number generator
torch.manual_seed(42)

# Read the CSV, keep only the 'query' and 'response' columns, and drop every
# row in which either of the two is missing
df = (
    pd.read_csv('/kaggle/input/dataset12/merged_cleaned-data.csv')
    .loc[:, ['query', 'response']]
    .dropna()
)

# Tokenization
def tokenize(text):
    """Lowercase ``text``, delete every character that is not a-z, 0-9 or
    whitespace, and return the whitespace-separated tokens as a list."""
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return cleaned.split()

# Build vocab
tokenized_inputs = [tokenize(s) for s in df['query']]
tokenized_outputs = [tokenize(s) for s in df['response']]

# Flatten in a single pass. sum(list_of_lists, []) builds a brand-new list on
# every addition, which is quadratic in the total number of tokens.
all_tokens = [tok for seq in tokenized_inputs + tokenized_outputs for tok in seq]

# The four special tokens take ids 0-3 ('<pad>' is 0); the distinct corpus
# tokens follow in sorted order, so ids are stable across runs on the same data.
vocab = ['<pad>', '<sos>', '<eos>', '<unk>'] + sorted(set(all_tokens))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
vocab_size = len(word2idx)

# Encode
MAX_LEN = 20

def encode(tokens):
    """Turn a sequence of tokens into vocabulary ids framed by <sos> and <eos>.

    Tokens absent from ``word2idx`` map to the ``<unk>`` id. Only the first
    ``MAX_LEN - 2`` token ids are kept, so the returned list holds at most
    ``MAX_LEN`` entries including the two framing ids.
    """
    word_ids = []
    for word in tokens:
        word_ids.append(word2idx.get(word, word2idx['<unk>']))
    kept = word_ids[:MAX_LEN - 2]
    return [word2idx['<sos>'], *kept, word2idx['<eos>']]

class ChatDataset(Dataset):
    """Paired query/response dataset yielding padded token-id tensors.

    Each query and response is tokenized and encoded once at construction and
    kept as a plain list of ids in ``self.inputs`` / ``self.outputs``. Items
    are right-padded with the ``<pad>`` id up to ``MAX_LEN`` when fetched.

    Args:
        queries: Iterable of query strings.
        responses: Iterable of response strings, aligned with ``queries``.
    """

    def __init__(self, queries, responses):
        self.inputs = [encode(tokenize(s)) for s in queries]
        self.outputs = [encode(tokenize(s)) for s in responses]

    def __len__(self):
        """Number of (query, response) pairs."""
        return len(self.inputs)

    @staticmethod
    def _pad(ids):
        """Return a NEW list holding ``ids`` right-padded with <pad> to MAX_LEN."""
        return ids + [word2idx['<pad>']] * (MAX_LEN - len(ids))

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` tensors for pair ``idx``."""
        # Pad copies rather than using ``+=`` on the stored lists: in-place
        # extension would silently rewrite self.inputs / self.outputs the
        # first time each item is read.
        input_ids = self._pad(self.inputs[idx])
        target_ids = self._pad(self.outputs[idx])
        return torch.tensor(input_ids), torch.tensor(target_ids)

# DataLoader
# Hold out 10% of the rows for validation; random_state pins the split.
train_data, val_data = train_test_split(df, random_state=42, test_size=0.1)

# Encode each split once up front
train_ds = ChatDataset(queries=train_data['query'], responses=train_data['response'])
val_ds = ChatDataset(queries=val_data['query'], responses=val_data['response'])

# Only the training loader shuffles
train_dl = DataLoader(dataset=train_ds, batch_size=16, shuffle=True)
val_dl = DataLoader(dataset=val_ds, batch_size=16)

# Positional Encoding
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to a batch of embeddings.

    Even feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``; the table is computed once at construction.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        max_len: Number of positions to precompute. Inputs whose dimension 1
            is longer than this cannot be broadcast against the table.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * float(-np.log(10000.0) / d_model)
        )
        angles = position * div_term  # one column per sine frequency

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angles to fit instead of failing on assignment.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a non-persistent buffer: it follows the module through
        # .to()/.cuda()/.half() (no per-forward device copy) while staying out
        # of state_dict, so checkpoint keys are unchanged.
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, feature): the table is sliced
        along dimension 1 and broadcast over dimension 0.
        """
        return x + self.pe[:, :x.size(1)]

# Transformer Model
class TransformerChatbot(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target logits.

    Source and target share one embedding table. Embeddings are scaled by
    ``sqrt(d_model)``, passed through the positional encoder, fed to
    ``nn.Transformer`` (batch-first) and projected to vocabulary size.

    Args:
        vocab_size: Number of rows in the embedding table and of output logits.
        d_model: Embedding / model width.
        nhead: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
        dim_ff: Width of the feed-forward sublayers.
        pad_idx: Id of the padding token used to build the padding masks.
            When ``None`` (the default) it is read from the module-level
            ``word2idx['<pad>']`` at construction time.
    """

    def __init__(self, vocab_size, d_model=256, nhead=8, num_layers=4, dim_ff=512, pad_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_ff,
            dropout=0.1,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model
        # Resolve the pad id once so forward() does not depend on a global.
        self.pad_idx = word2idx['<pad>'] if pad_idx is None else pad_idx
        # Plain Python float: avoids allocating a tensor and moving it to the
        # device on every forward pass.
        self.embed_scale = d_model ** 0.5

    def forward(self, src, tgt):
        """Compute logits for every target position.

        Args:
            src: Source token ids, indexed (batch, source position).
            tgt: Decoder input token ids, indexed (batch, target position).

        Returns:
            Tensor of logits with one ``vocab_size`` vector per target position.
        """
        src_emb = self.pos_encoder(self.embedding(src) * self.embed_scale)
        tgt_emb = self.pos_encoder(self.embedding(tgt) * self.embed_scale)

        # Causal mask: a target position may not attend to later positions.
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        # True marks padding, which attention must ignore.
        src_key_padding_mask = (src == self.pad_idx)
        tgt_key_padding_mask = (tgt == self.pad_idx)

        out = self.transformer(
            src_emb, tgt_emb,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask
        )
        return self.fc_out(out)

# Initialize Model
# Use the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerChatbot(vocab_size).to(device)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)
# Padding positions contribute nothing to the loss
loss_fn = nn.CrossEntropyLoss(ignore_index=word2idx['<pad>'])

# Training Loop with Accuracy
EPOCHS = 100
pad_id = word2idx['<pad>']  # constant for the whole run, so look it up once

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0

    for src, tgt in train_dl:
        src, tgt = src.to(device), tgt.to(device)
        # Teacher forcing: the model receives the target without its last
        # position and is scored against the target shifted left by one.
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:].reshape(-1)

        optimizer.zero_grad()
        output = model(src, tgt_input)
        # The model returns one logit vector per decoder input position, so no
        # slicing is needed; just flatten to line up with the flat targets.
        output = output.reshape(-1, output.shape[-1])

        loss = loss_fn(output, tgt_output)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Token accuracy over non-padding positions only; bookkeeping needs no
        # autograd tracking.
        with torch.no_grad():
            preds = output.argmax(dim=1)
            mask = tgt_output != pad_id
            total_correct += (preds == tgt_output)[mask].sum().item()
            total_tokens += mask.sum().item()

    # Guard both averages so an empty loader reports zeros instead of raising
    # ZeroDivisionError.
    num_batches = len(train_dl)
    avg_loss = total_loss / num_batches if num_batches > 0 else 0
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0
    print(f"Epoch {epoch+1} | Train Loss: {avg_loss:.4f} | Accuracy: {accuracy * 100:.2f}%")




Epoch 1 | Train Loss: 6.2097 | Accuracy: 20.37%
Epoch 2 | Train Loss: 4.8090 | Accuracy: 31.90%
Epoch 3 | Train Loss: 4.4148 | Accuracy: 35.83%
Epoch 4 | Train Loss: 4.1596 | Accuracy: 37.98%
Epoch 5 | Train Loss: 3.9699 | Accuracy: 39.67%
Epoch 6 | Train Loss: 3.8207 | Accuracy: 40.84%
Epoch 7 | Train Loss: 3.6891 | Accuracy: 42.15%
Epoch 8 | Train Loss: 3.5720 | Accuracy: 42.99%
Epoch 9 | Train Loss: 3.4628 | Accuracy: 44.14%
Epoch 10 | Train Loss: 3.3633 | Accuracy: 44.90%
Epoch 11 | Train Loss: 3.2686 | Accuracy: 45.86%
Epoch 12 | Train Loss: 3.1768 | Accuracy: 46.74%
Epoch 13 | Train Loss: 3.0873 | Accuracy: 47.56%
Epoch 14 | Train Loss: 3.0062 | Accuracy: 48.36%
Epoch 15 | Train Loss: 2.9272 | Accuracy: 49.22%
Epoch 16 | Train Loss: 2.8418 | Accuracy: 50.15%
Epoch 17 | Train Loss: 2.7639 | Accuracy: 51.06%
Epoch 18 | Train Loss: 2.6871 | Accuracy: 51.85%
Epoch 19 | Train Loss: 2.6149 | Accuracy: 52.82%
Epoch 20 | Train Loss: 2.5444 | Accuracy: 53.67%
Epoch 21 | Train Loss: 2.4683

In [7]:
# Save the model
model_save_path = "transformer_chatbot6.pth"
torch.save(model.state_dict(), model_save_path)
# Print the path that was actually written, so the message and the file name
# cannot disagree.
print(f"Model saved to {model_save_path}")


Model saved to transformer_chatbot.pth


In [8]:
import json

# Save vocabulary
# Write each lookup table to its own JSON file. NOTE: idx2word has int keys,
# and JSON object keys are always strings, so they are stored as "0", "1", ...
# and come back as str when the file is read again.
for json_name, table in (("word2idx.json", word2idx), ("idx2word.json", idx2word)):
    with open(json_name, "w") as f:
        json.dump(table, f)



In [9]:
from IPython.display import FileLink

# Show a clickable link to the saved weights file as the cell's output
FileLink(path="transformer_chatbot6.pth")


In [10]:
import pickle

# Destination of the pickled vocabulary; edit to write somewhere else
vocab_save_path7 = 'vocab.pkl'

# Store both lookup tables together as one (word2idx, idx2word) tuple
with open(vocab_save_path7, mode='wb') as f:
    pickle.dump((word2idx, idx2word), f)

print("Vocabulary saved to " + vocab_save_path7)


Vocabulary saved to vocab.pkl


In [11]:
from IPython.display import FileLink

# Show a clickable link to the pickled vocabulary as the cell's output
FileLink(path='vocab.pkl')
