In [1]:
!nvidia-smi``

Sat Jan 18 12:08:03 2025       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:47:00.0 Off |                    0 |
| N/A   41C    P0   128W / 400W |  81222MiB / 81251MiB |     89%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Model

In [9]:
from collections import Counter
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.utils.data
import math
import torch.nn.functional as F

In [10]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention for batch-first tensors.

    Args:
        d_model: Feature width of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            # Fail at construction time instead of inside ``view`` on the first forward pass.
            raise ValueError(
                "d_model ({}) must be divisible by num_heads ({})".format(d_model, num_heads)
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.head_dim = d_model // num_heads
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)
        self.output_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, length, d_model) into (batch, num_heads, length, head_dim)."""
        return projected.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: Tensor of shape (batch, query_len, d_model).
            key: Tensor of shape (batch, key_len, d_model).
            value: Tensor of shape (batch, key_len, d_model).
            mask: Optional tensor broadcastable to
                (batch, num_heads, query_len, key_len). Positions where the
                mask equals 0 are excluded from attention.

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size = query.size(0)

        # Project the inputs and split the feature axis into heads.
        Q = self._split_heads(self.query_linear(query), batch_size)
        K = self._split_heads(self.key_linear(key), batch_size)
        V = self._split_heads(self.value_linear(value), batch_size)

        # Scale by sqrt(head_dim): each dot product only spans one head's
        # features, so dividing by sqrt(d_model) would over-flatten the softmax.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if mask is not None:
            # NOTE: a query row whose keys are all masked becomes all -inf and
            # therefore NaN after the softmax.
            scores = scores.masked_fill(mask == 0, float("-inf"))

        attention = torch.softmax(scores, dim=-1)
        context = torch.matmul(attention, V)

        # Merge the heads back into a single feature axis and mix them.
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.output_linear(context)

# Define the Feed-Forward Neural Network Layer
class FeedForward(nn.Module):
    """Position-wise MLP: Linear(d_model -> d_ff), ReLU, Dropout, Linear(d_ff -> d_model).

    Args:
        d_model: Width of the input and output features.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Sub-modules are created in a fixed order so seeded initialisation
        # and state_dict key order stay stable.
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the two-layer MLP to the last axis of ``x``."""
        hidden = self.linear_1(x)
        activated = self.relu(hidden)
        return self.linear_2(self.dropout(activated))
    
# Define the positional encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings.

    Args:
        d_model: Feature width of the embeddings (odd values are supported).
        max_len: Longest sequence length that can be encoded.
    """

    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine column.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # The buffer keeps its (max_len, 1, d_model) layout so previously
        # saved state_dicts still load.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of each sequence position.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` given at construction.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(0):
            raise ValueError(
                "sequence length {} exceeds max_len {}".format(seq_len, self.pe.size(0))
            )
        # Index by the sequence axis (dim 1), not the batch axis (dim 0), then
        # move positions onto dim 1 so the table broadcasts over the batch.
        return x + self.pe[:seq_len].transpose(0, 1)

# Define the Transformer Model
class Transformer(nn.Module):
    """Encoder-decoder stack built from attention and feed-forward blocks.

    One embedding table, sized by ``src_vocab_size``, is used for both the
    source and the target tokens. Each encoder layer is an (attention,
    feed-forward) pair and each decoder layer is a (self-attention,
    encoder-attention, feed-forward) triple; the sub-layers are chained
    directly, with no residual connections or normalisation between them.

    Args:
        d_model: Embedding and hidden width.
        num_heads: Number of heads in every attention block.
        d_ff: Hidden width of every feed-forward block.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        src_vocab_size: Number of rows in the shared embedding table.
        tgt_vocab_size: Size of the output projection.
        max_len: Maximum length handed to the positional encoding.
        dropout: Dropout probability for embeddings and feed-forward blocks.
    """

    def __init__(self, d_model, num_heads, d_ff, num_encoder_layers, num_decoder_layers, src_vocab_size, tgt_vocab_size, max_len=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(src_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len=max_len)
        self.dropout = nn.Dropout(dropout)

        # Encoder: one [attention, feed-forward] pair per layer.
        encoder_stack = []
        for _ in range(num_encoder_layers):
            encoder_stack.append(nn.ModuleList([
                MultiHeadAttention(d_model, num_heads),
                FeedForward(d_model, d_ff, dropout=dropout),
            ]))
        self.encoder_layers = nn.ModuleList(encoder_stack)

        # Decoder: one [self-attention, encoder-attention, feed-forward] triple per layer.
        decoder_stack = []
        for _ in range(num_decoder_layers):
            decoder_stack.append(nn.ModuleList([
                MultiHeadAttention(d_model, num_heads),
                MultiHeadAttention(d_model, num_heads),
                FeedForward(d_model, d_ff, dropout=dropout),
            ]))
        self.decoder_layers = nn.ModuleList(decoder_stack)

        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def _embed(self, tokens):
        """Look up embeddings, add positional encodings, then apply dropout."""
        return self.dropout(self.positional_encoding(self.embedding(tokens)))

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Source token ids (integer tensor; presumably (batch, src_len) - confirm with caller).
            tgt: Target token ids (integer tensor; presumably (batch, tgt_len) - confirm with caller).
            src_mask: Optional mask passed to encoder attention and to the
                decoder's encoder-attention.
            tgt_mask: Optional mask passed to decoder self-attention.

        Returns:
            Tensor whose last axis has size ``tgt_vocab_size``.
        """
        memory = self._embed(src)
        for attend, ffn in self.encoder_layers:
            memory = ffn(attend(memory, memory, memory, mask=src_mask))

        hidden = self._embed(tgt)
        for attend_self, attend_memory, ffn in self.decoder_layers:
            hidden = attend_self(hidden, hidden, hidden, mask=tgt_mask)
            hidden = attend_memory(hidden, memory, memory, mask=src_mask)
            hidden = ffn(hidden)

        return self.fc_out(hidden)

In [None]:
class EncoderLayer(nn.Module):
    """Encoder block: self-attention and feed-forward, each followed by a
    residual connection, dropout and LayerNorm.

    Args:
        d_model: Feature width of the inputs and outputs.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability for the sub-layer outputs and inside the
            feed-forward block.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # Uses the FeedForward block defined in this notebook; the previously
        # referenced PositionWiseFeedForward has no visible definition and
        # would raise NameError on construction.
        self.feed_forward = FeedForward(d_model, d_ff, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run one encoder block.

        Args:
            x: Input tensor whose last axis has size ``d_model``.
            mask: Optional attention mask forwarded to the self-attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [None]:
class DecoderLayer(nn.Module):
    """Decoder block: self-attention, cross-attention over the encoder output
    and feed-forward, each followed by a residual connection, dropout and
    LayerNorm.

    Args:
        d_model: Feature width of the inputs and outputs.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability for the sub-layer outputs and inside the
            feed-forward block.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        # Uses the FeedForward block defined in this notebook; the previously
        # referenced PositionWiseFeedForward has no visible definition and
        # would raise NameError on construction.
        self.feed_forward = FeedForward(d_model, d_ff, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Run one decoder block.

        Args:
            x: Decoder input whose last axis has size ``d_model``.
            enc_output: Encoder output used as keys and values for cross-attention.
            src_mask: Optional mask forwarded to the cross-attention.
            tgt_mask: Optional mask forwarded to the self-attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

# Dataset

In [None]:
from collections import Counter
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

import torch.utils.data
import math
import torch.nn.functional as F
from tqdm import tqdm

In [None]:
# Load a text corpus from a file and return as a list of lines
def load_corpus(file_path):
    """Read a text file and return its lines, line endings included.

    Bytes that cannot be decoded are dropped silently (``errors='ignore'``).

    Args:
        file_path: Path of the file to read.

    Returns:
        List of the file's lines, in order.
    """
    with open(file_path, mode='r', errors='ignore') as handle:
        corpus_lines = list(handle)
    return corpus_lines

# Create a dictionary mapping line IDs to their corresponding text
def create_line_dict(lines):
    """Map the first ' +++$+++ '-separated field of each line to its last field.

    The last field is stored untouched, so a trailing newline is kept. When
    two lines share the same first field, the later line wins.

    Args:
        lines: Iterable of raw corpus lines.

    Returns:
        Dict from first field (the line ID) to last field (the text).
    """
    field_sep = " +++$+++ "
    split_rows = (row.split(field_sep) for row in lines)
    return {fields[0]: fields[-1] for fields in split_rows}

# Remove punctuations and convert text to lowercase
def clean_text(text):
    """Remove ASCII punctuation from ``text`` and lowercase the remaining characters.

    Characters are lowercased one at a time; whitespace, digits and any
    symbol not in the punctuation set below are kept as-is.

    Args:
        text: Input string.

    Returns:
        The cleaned string (empty if ``text`` is empty).
    """
    # The backslash is written as an explicit escape: the bare backslash-comma
    # form is an invalid escape sequence and triggers a SyntaxWarning on
    # Python 3.12+. The set of removed characters is unchanged.
    punctuations = frozenset('''!()-[]{};:'"\\,<>./?@#$%^&*_~''')
    return ''.join(char.lower() for char in text if char not in punctuations)

# Create question-answer pairs from conversations
def create_qa_pairs(conversations, line_dict):
    """Build (question, answer) token-list pairs from consecutive utterances.

    The last ' +++$+++ '-separated field of each conversation line is parsed
    as a Python list literal of line IDs. Every adjacent pair of IDs yields
    one [question_tokens, answer_tokens] entry; both sides are cleaned with
    ``clean_text``, split on whitespace and cut to ``max_sequence_length``
    tokens (a module-level setting).

    Args:
        conversations: Iterable of raw conversation lines.
        line_dict: Mapping from line ID to utterance text.

    Returns:
        List of ``[question_tokens, answer_tokens]`` lists.

    Raises:
        KeyError: If a referenced line ID is missing from ``line_dict``.
        ValueError, SyntaxError: If the ID field is not a valid Python literal.
    """
    import ast

    qa_pairs = []
    for conversation in conversations:
        if not conversation.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no IDs.
            continue
        # SECURITY: this field comes straight from a data file, so it is parsed
        # with ast.literal_eval (literals only) rather than eval, which would
        # execute arbitrary expressions embedded in the corpus.
        ids = ast.literal_eval(conversation.split(" +++$+++ ")[-1].strip())
        for question_id, answer_id in zip(ids, ids[1:]):
            question = clean_text(line_dict[question_id].strip())
            answer = clean_text(line_dict[answer_id].strip())
            qa_pairs.append([question.split()[:max_sequence_length], answer.split()[:max_sequence_length]])
    return qa_pairs

# Encode reply text to integer values
def encode_reply(words, word_map, max_length=max_sequence_length):
    """Encode a reply as ``<start>`` + token ids + ``<end>``, padded to ``max_length``.

    Two slots are reserved for the ``<start>``/``<end>`` markers, so at most
    ``max_length - 2`` words are kept. This makes every result exactly
    ``max_length`` ids long (for ``max_length >= 2``); without the
    reservation, replies of ``max_length - 1`` or more words came out longer
    than ``max_length`` and produced ragged rows.

    Args:
        words: Sequence of reply tokens.
        word_map: Mapping from token to id; must contain ``<start>``,
            ``<end>``, ``<pad>`` and, for out-of-vocabulary tokens, ``<unk>``.
        max_length: Target length of the encoded sequence.

    Returns:
        List of integer ids.
    """
    kept_words = list(words)[:max(max_length - 2, 0)]
    encoded = [word_map['<start>']]
    encoded += [word_map.get(word, word_map['<unk>']) for word in kept_words]
    encoded.append(word_map['<end>'])
    # A non-positive difference multiplies to an empty list, i.e. no padding.
    padding_needed = max_length - len(encoded)
    encoded.extend([word_map['<pad>']] * padding_needed)
    return encoded

# Encode question text to integer values
def encode_question(words, word_map, max_length=max_sequence_length):
    """Translate question tokens into ids and right-pad with ``<pad>``.

    Tokens missing from ``word_map`` are mapped to the ``<unk>`` id. Inputs
    already longer than ``max_length`` are returned unpadded and untruncated.

    Args:
        words: Iterable of question tokens.
        word_map: Mapping from token to id; must contain ``<pad>`` and, for
            out-of-vocabulary tokens, ``<unk>``.
        max_length: Length to pad the encoded sequence up to.

    Returns:
        List of integer ids.
    """
    token_ids = []
    for token in words:
        token_ids.append(word_map.get(token, word_map['<unk>']))
    # A non-positive count multiplies to an empty list, so nothing is appended.
    pad_count = max_length - len(token_ids)
    return token_ids + [word_map['<pad>']] * pad_count


In [None]:
# Read both raw corpus files into memory.
conversations = load_corpus(movie_conversations_path)
lines = load_corpus(movie_lines_path)

# Index utterances by line ID, then pair each utterance with the one that follows it.
line_dict = create_line_dict(lines)
qa_pairs = create_qa_pairs(conversations, line_dict)

# Tally how often each token occurs across all questions and answers.
word_frequency = Counter()
for question_words, answer_words in qa_pairs:
    word_frequency.update(question_words)
    word_frequency.update(answer_words)

# Keep tokens seen strictly more than min_frequency times. Regular ids start
# at 1 because 0 is reserved for <pad>.
min_frequency = 5
vocab = [word for word in word_frequency if word_frequency[word] > min_frequency]
word_map = dict(zip(vocab, range(1, len(vocab) + 1)))

# Special tokens take the three ids right after the regular vocabulary.
word_map.update({
    token: len(vocab) + offset
    for offset, token in enumerate(('<unk>', '<start>', '<end>'), start=1)
})
word_map['<pad>'] = 0

# Persist the vocabulary so later runs can reuse the same ids.
with open('WORDMAP_corpus.json', 'w') as wordmap_file:
    json.dump(word_map, wordmap_file)

# Encode every pair: questions are padded id lists, replies additionally get
# <start>/<end> markers.
pairs_encoded = [
    [encode_question(question_words, word_map), encode_reply(answer_words, word_map)]
    for question_words, answer_words in qa_pairs
]

# Save the encoded pairs to a JSON file for future use.
with open('pairs_encoded.json', 'w') as pairs_file:
    json.dump(pairs_encoded, pairs_file)
