### Importing Libraries

In [36]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel


In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Loading Dataser

In [38]:

def load_data(file_path):
    pairs = []
    with open(file_path, "r") as f:
        for line in f:
            if '\t' in line:
                input_text, target_text = line.strip().split("\t")
                pairs.append((input_text, target_text))
    return pairs


In [39]:
# Prepare Dataset and DataLoader
data_file = "dialogs.txt"
pairs = load_data(data_file)

In [40]:
# Step 1: Dataset Preparation
class DialogDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_length):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        input_text, target_text = self.pairs[idx]
        input_enc = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        target_enc = self.tokenizer(
            target_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return input_enc["input_ids"].squeeze(), target_enc["input_ids"].squeeze()

In [41]:
train_pairs = pairs[:int(0.8 * len(pairs))]
val_pairs = pairs[int(0.8 * len(pairs)):] 

max_length = 50
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_dataset = DialogDataset(train_pairs, tokenizer, max_length)
val_dataset = DialogDataset(val_pairs, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [42]:
class ChatTransformerWithCNN(nn.Module):
    def __init__(self):
        super(ChatTransformerWithCNN, self).__init__()
        self.encoder = BertModel.from_pretrained("bert-base-uncased")
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=256, out_channels=768, kernel_size=3, padding=1)
        self.decoder = nn.Transformer(
            d_model=768, num_encoder_layers=4, num_decoder_layers=4
        )
        self.fc_out = nn.Linear(768, tokenizer.vocab_size)

    def forward(self, input_ids, target_ids):
        encoder_outputs = self.encoder(input_ids)["last_hidden_state"]
        cnn_out = self.conv1(encoder_outputs.permute(0, 2, 1))
        cnn_out = nn.ReLU()(cnn_out)
        cnn_out = self.conv2(cnn_out)
        cnn_out = nn.ReLU()(cnn_out)
        cnn_out = self.conv3(cnn_out).permute(0, 2, 1)
        decoder_outputs = self.decoder(
            cnn_out.permute(1, 0, 2), cnn_out.permute(1, 0, 2)
        )
        logits = self.fc_out(decoder_outputs)
        return logits

In [43]:
model = ChatTransformerWithCNN()
model.to(device)



ChatTransformerWithCNN(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [44]:
# Step 3: Training and Validation
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def load_model(model, optimizer, path="model_checkpoint.pth"):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    return model, optimizer, start_epoch

In [None]:
load_model(model, optimizer, path="model_checkpoint.pth")

In [45]:
def train_fn(loader, model, optimizer, criterion):
    model.train()
    total_loss = 0
    for input_ids, target_ids in loader:
        input_ids, target_ids = input_ids.to(device), target_ids.to(device)
        optimizer.zero_grad()

        logits = model(input_ids, target_ids)
        logits = logits.view(-1, logits.size(-1))
        target_ids = target_ids.view(-1)

        loss = criterion(logits, target_ids)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(loader)

In [46]:
def eval_fn(loader, model, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for input_ids, target_ids in loader:
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)

            logits = model(input_ids, target_ids)
            logits = logits.view(-1, logits.size(-1))
            target_ids = target_ids.view(-1)

            loss = criterion(logits, target_ids)
            total_loss += loss.item()
    return total_loss / len(loader)

In [47]:
epoch=0
for epoc in range(5):
    epoch=epoc
    train_loss = train_fn(train_loader, model, optimizer, criterion)

    print(f"Epoch {epoch + 1}, Train Loss: {train_loss}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1, Train Loss: 1.7900475397466975
Epoch 2, Train Loss: 1.6398147006723331
Epoch 3, Train Loss: 1.6290377148969926
Epoch 4, Train Loss: 1.6246088794208466
Epoch 5, Train Loss: 1.6216084957122803


In [48]:
val_loss = eval_fn(val_loader, model, criterion)
print(f" Val Loss: {val_loss}")

 Val Loss: 1.845977450938935


In [49]:
def save_model(model, optimizer, epoch, path="model_checkpoint.pth"):
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch
    }, path)


In [50]:
save_model(model, optimizer, epoch, path="model_checkpoint.pth")