In [5]:
!pip install datasets



In [6]:
import pandas as pd
df=pd.read_csv('/kaggle/input/ahmed-data-set/samsum-train.csv')
df = df.dropna()
df.to_csv('/kaggle/working/samsum-train.csv', index=False)

In [7]:
import pandas as pd

# Paths to the individual CSV files
train_file = '/kaggle/working/samsum-train.csv'  # Update these paths accordingly
test_file = '/kaggle/input/ahmed-data-set/samsum-test.csv'
validation_file = '/kaggle/input/ahmed-data-set/samsum-validation.csv'

# Load the data into pandas DataFrames
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
validation_data = pd.read_csv(validation_file)

# Check if all files have the same columns and structure
print(train_data.columns)
print(test_data.columns)
print(validation_data.columns)

# Merge all three files into a single DataFrame
merged_data = pd.concat([train_data, test_data, validation_data], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_file = '/kaggle/working/First_Row_New_Text_Document.csv'
merged_data.to_csv(merged_file, index=False)

print(f"Data has been successfully merged and saved to {merged_file}.")


Index(['id', 'dialogue', 'summary'], dtype='object')
Index(['id', 'dialogue', 'summary'], dtype='object')
Index(['id', 'dialogue', 'summary'], dtype='object')
Data has been successfully merged and saved to /kaggle/working/First_Row_New_Text_Document.csv.


In [8]:
import pandas as pd

def remove_long_rows_from_csv(input_file, output_file, max_dialogue_tokens=400, max_summary_tokens=50):
    """
    Remove rows from a CSV file where the `dialogue` column has token count >= max_dialogue_tokens
    or the `summary` column has token count >= max_summary_tokens.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to save the filtered CSV file.
        max_dialogue_tokens (int): Maximum allowed tokens in `dialogue` column.
        max_summary_tokens (int): Maximum allowed tokens in `summary` column.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Ensure 'dialogue' and 'summary' columns exist
    if 'dialogue' not in df.columns or 'summary' not in df.columns:
        raise ValueError("Input file must contain 'dialogue' and 'summary' columns.")

    # Calculate token counts for `dialogue` and `summary`
    df['dialogue_token_count'] = df['dialogue'].apply(lambda x: len(str(x).split()))
    df['summary_token_count'] = df['summary'].apply(lambda x: len(str(x).split()))

    # Filter rows based on token count limits
    filtered_df = df[
        (df['dialogue_token_count'] < max_dialogue_tokens) &
        (df['summary_token_count'] < max_summary_tokens)
    ]

    # Drop the temporary token count columns
    filtered_df = filtered_df.drop(columns=['dialogue_token_count', 'summary_token_count'])

    # Save the filtered DataFrame back to a CSV file
    filtered_df.to_csv(output_file, index=False)

# Example usage:
input_csv_path = "/kaggle/working/First_Row_New_Text_Document.csv"  # Replace with the path to your input CSV file
output_csv_path = "/kaggle/working/First_Row_New_Text_Document.csv"  # Replace with the path to save the filtered CSV

remove_long_rows_from_csv(input_csv_path, output_csv_path, max_dialogue_tokens=400, max_summary_tokens=50)

print(f"Filtered data saved to {output_csv_path}")


Filtered data saved to /kaggle/working/First_Row_New_Text_Document.csv


##Data Cleaning

In [9]:
import pandas as pd
import re
df = pd.read_csv('/kaggle/working/First_Row_New_Text_Document.csv')
#df = df.drop(columns=['id'])
df['dialogue'] = df['dialogue'].str.replace(r'\r\n', ' ', regex=True)
df['summary'] = df['summary'].str.replace(r'\r\n', ' ', regex=True)
def clean_text(text):
    # Remove everything except letters
    return re.sub(r'[^a-zA-Z\s]', '', text)
# Apply the cleaning function to the 'dialogue' column
df['dialogue'] = df['dialogue'].apply(clean_text)
df['summary'] = df['summary'].apply(clean_text)
# Save the cleaned dataset to a new CSV file
df = df.rename(columns={'dialogue': 'en', 'summary': 'it'})
df.to_csv('/kaggle/working/First_Row_New_Text_Document.csv', index=False)
print(df.iloc[:, -3:])
print("The text has been cleaned and saved to 'cleaned.csv'")


             id                                                 en  \
0      13818513  Amanda I baked  cookies Do you want some Jerry...   
1      13728867  Olivia Who are you voting for in this election...   
2      13681000  Tim Hi whats up Kim Bad mood tbh I was going t...   
3      13730747  Edward Rachel I think Im in ove with Bella rac...   
4      13728094  Sam hey  overheard rick say something Sam i do...   
...         ...                                                ...   
15969  13611821  Chris Im on my way Peter ok great Chris are we...   
15970  13829423  Carla Ive got it Diego what Carla my date for ...   
15971  13829261  Julia Greg just texted me Robert ugh delete hi...   
15972  13680226  Marry I broke my nail  Tina oh no Marry u know...   
15973  13862383  Paige I asked them to wait and send the declar...   

                                                      it  
0      Amanda baked cookies and will bring Jerry some...  
1      Olivia and Olivier are voting for 

##Data Conversion


In [10]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the dataset from the CSV file
file_path = "/kaggle/working/First_Row_New_Text_Document.csv"

# Step 1: Load the dataset without forcing column names and using the correct delimiter
df = pd.read_csv(file_path, delimiter=",", header=0)

# Step 2: Check if the columns loaded correctly
print("Loaded DataFrame:")
print(df.head())

# Step 3: Add a unique 'id' column
df["id"] = df.index.astype(str)

# Step 4: Create a 'translation' column
df["translation"] = df[["en", "it"]].apply(lambda x: {"en": x["en"], "it": x["it"]}, axis=1)

# Step 5: Drop the original columns to keep only 'id' and 'translation'
df = df[["id", "translation"]]

# Step 6: Convert to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df)

# Step 7: Wrap it into a DatasetDict
dataset_dict = DatasetDict({"train": hf_dataset})

# Step 8: Save the dataset locally
output_path = "./converted_conversation_dataset"
dataset_dict.save_to_disk(output_path)

print(f"Dataset converted and saved successfully at: {output_path}")


Loaded DataFrame:
         id                                                 en  \
0  13818513  Amanda I baked  cookies Do you want some Jerry...   
1  13728867  Olivia Who are you voting for in this election...   
2  13681000  Tim Hi whats up Kim Bad mood tbh I was going t...   
3  13730747  Edward Rachel I think Im in ove with Bella rac...   
4  13728094  Sam hey  overheard rick say something Sam i do...   

                                                  it  
0  Amanda baked cookies and will bring Jerry some...  
1  Olivia and Olivier are voting for liberals in ...  
2  Kim may try the pomodoro technique recommended...  
3  Edward thinks he is in love with Bella Rachel ...  
4  Sam is confused because he overheard Rick comp...  


Saving the dataset (0/1 shards):   0%|          | 0/15974 [00:00<?, ? examples/s]

Dataset converted and saved successfully at: ./converted_conversation_dataset


In [11]:
from datasets import load_from_disk

# Load the saved dataset
dataset = load_from_disk("/kaggle/working/converted_conversation_dataset")

# Inspect the structure
print(dataset)
print(dataset["train"][0])  # First record


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 15974
    })
})
{'id': '0', 'translation': {'en': 'Amanda I baked  cookies Do you want some Jerry Sure Amanda Ill bring you tomorrow ', 'it': 'Amanda baked cookies and will bring Jerry some tomorrow'}}


## Embeding Scaling and Forward


In [12]:
import torch
import torch.nn as nn
import math

class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embedding(x) * math.sqrt(self.d_model)

## Positional Embeding


In [13]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model))
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)
        return self.dropout(x)

## Layer Normalization

In [14]:
class LayerNormalization(nn.Module):
    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

## Feed Forward

In [15]:
class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

## Multi Head Attentation

In [16]:
class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model # Embedding vector size
        self.h = h # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len) # Apply softmax
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)


## Residual Connection

In [17]:
class ResidualConnection(nn.Module):

        def __init__(self, features: int, dropout: float) -> None:
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            self.norm = LayerNormalization(features)

        def forward(self, x, sublayer):
            return x + self.dropout(sublayer(self.norm(x)))

## Encoder

In [18]:
class EncoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

In [19]:
class Encoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

## Decoder

In [20]:
class DecoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

In [21]:
class Decoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

## Projection Layer

In [22]:
class ProjectionLayer(nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)

## Transformer

In [23]:
class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)

In [24]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    # Create the embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the positional encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(d_model, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)

    # Create the encoder and decoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))

    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return transformer

## Greedy Decode

In [25]:
import torch
import torch.nn as nn

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from pathlib import Path
from torch.utils.data import Dataset,DataLoader,random_split

In [26]:
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # Precompute the encoder output and reuse it for every step
    encoder_output = model.encode(source, source_mask)
    # Initialize the decoder input with the sos token
    decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)
    while True:
        if decoder_input.size(1) == max_len:
            break

        # build mask for target
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        # calculate output
        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # get next token
        prob = model.project(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        decoder_input = torch.cat(
            [decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1
        )

        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)



In [27]:
!pip install torchmetrics



## Run Validation

In [28]:
import torchmetrics
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples=2):
    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    try:
        # get the console window width
        with os.popen('stty size', 'r') as console:
            _, console_width = console.read().split()
            console_width = int(console_width)
    except:
        # If we can't get the console width, use 80 as default
        console_width = 80

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

            # check that the batch size is 1
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            # Print the source, target and model output
            print_msg('-'*console_width)
            print_msg(f"{f'SOURCE: ':>12}{source_text}")
            print_msg(f"{f'TARGET: ':>12}{target_text}")
            print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

    if writer:
        # Evaluate the character error rate
        # Compute the char error rate
        metric = torchmetrics.CharErrorRate()
        cer = metric(predicted, expected)
        writer.add_scalar('validation cer', cer, global_step)
        writer.flush()

        # Compute the word error rate
        metric = torchmetrics.WordErrorRate()
        wer = metric(predicted, expected)
        writer.add_scalar('validation wer', wer, global_step)
        writer.flush()

        # Compute the BLEU metric
        metric = torchmetrics.BLEUScore()
        bleu = metric(predicted, expected)
        writer.add_scalar('validation BLEU', bleu, global_step)
        writer.flush()

## Tokenzier

In [29]:
def get_all_sentences(ds, lang):
    for item in ds:
        yield item['translation'][lang]

In [30]:
def get_or_build_tokenizer(config, ds, lang):
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if not Path.exists(tokenizer_path):
        # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer

In [31]:
from datasets import load_from_disk
def get_ds(config):
    # It only has the train split, so we divide it overselves
    dataset_path = f"{config['datasource']}"
    ds_raw = load_from_disk(dataset_path)
    ds_raw = ds_raw['train']

    # Build tokenizers
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

    # Keep 90% for training, 10% for validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    # Find the maximum length of each sentence in the source and target sentence
    max_len_src = 0
    max_len_tgt = 0

    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')


    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)
    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

## Get Model

In [32]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    model = build_transformer(vocab_src_len, vocab_tgt_len, config["seq_len"], config['seq_len'], d_model=config['d_model'])
    return model

## Train Model

In [33]:
import os
from pathlib import Path
def train_model(config):
    # Define the device
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.has_mps or torch.backends.mps.is_available() else "cpu"
    print("Using device:", device)
    if (device == 'cuda'):
        print(f"Device name: {torch.cuda.get_device_name(device.index)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device.index).total_memory / 1024 ** 3} GB")
    elif (device == 'mps'):
        print(f"Device name: <mps>")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")
    device = torch.device(device)

    # Make sure the weights folder exists
    Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    model_filename = latest_weights_file_path(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None
    if model_filename:
        print(f'Preloading model {model_filename}')
        state = torch.load(model_filename)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    for epoch in range(initial_epoch, config['num_epochs']):
        if epoch==1 or epoch==0:
            print("nothing to remove")
        elif epoch <= 11:
            epoch_num=epoch-2
            os.remove(f"/kaggle/working/converted_conversation_dataset_weights/tmodel_0{epoch_num}.pt")
        else:
            epoch_num=epoch-2
            os.remove(f"/kaggle/working/converted_conversation_dataset_weights/tmodel_{epoch_num}.pt")
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # Run validation at the end of every epoch
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

        # Save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)


## Running the model

In [43]:
import warnings
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
if __name__ == '__main__':
    warnings.filterwarnings("ignore")
    config = get_config()
    train_model(config)

Using device: cuda
Device name: Tesla P100-PCIE-16GB
Device memory: 15.887939453125 GB
Max length of source sentence: 399
Max length of target sentence: 49
No model to preload, starting from scratch
nothing to remove


Processing Epoch 00: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=5.539]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Martin my new neighbor is crazy Martin i cant stand him Christian Hes Joes brother right Martin YES Martin Which makes complaining about him really awkward  Christian Why do you say hes crazy Martin his apartment smells so bad that I can smell it Martin he plays loud music all the time Martin his voice is so thunderous that I hear his conversations when hes on the phone Martin he has a dog that wont stop barking Martin Strangers keep coming in and out of his apartment Martin I tried to introduce myself the day he moved in but he ignored me Christian That sounds like the neighbor from hell Martin he is Thats him Christian Maybe you could talk to Joe Martin No thats fine were grown adults Martin If it escalates Ill deal with him directly Martin i think thats better Christian Good idea
    TARGET: Martin doesnt like his new neighbor Although hes Joes brother he ignored Martin Smelly place loud mus

Processing Epoch 01: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=5.680]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Skyler filephoto Devante tx Skyler is it legible enough Devante yeah its fine Devante thanks
    TARGET: Skyler sends the photo to Devante 
 PREDICTED: is in the
--------------------------------------------------------------------------------
    SOURCE: Olivia filephoto Can you guess where I am Julia Luis Even I can guess and Im jealous  Julia Ive just been in the hell that is Smyths Toy Store Im with kids in town When is gin oclock  Olivia Sorry for that Julia Ive just bought some baklavas Luis Whenever but not for those who are gluttons for punishment  Julia Olivia  in my current mental state I initially read that as balaclavas  Thanks for sympathy Luis  Luis  Just boarding yet another plane sympathy running thin  Julia Business or pleasure Maybe you should have been an air hostess Luis Nah too grumpy  Taking the kids and my wife but I also have some meetings and a presentation at a business

Processing Epoch 02: 100%|██████████| 1797/1797 [10:40<00:00,  2.80it/s, loss=5.213]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Pam Weee aree the champioooons my frieeeend Kitty Haha last exam done lets go celebrate Pam How about a nice dinner Kitty You always have the best ideas  Pam Shrimp house Kitty Very fancy Sure why not Pam I always wanted to go there Kitty Is it like very expensive Pam Not that much PLN for a plate Kitty Good Just remember passing exams doesnt mean were not on a student budget Pam Let me pretend Im rich for a day  Kitty Are we going today Pam Yeah why not Ill clean my room first because this week I spent all time studying Kitty I am going to be lazy and do nothing And when Im hungry lets say pm we can meet at this Shrimp house Deal Pam Deal Looking forward to it Kitty Me too See you in a bit
    TARGET: Pam and Kitty will meet at the Shrimp House at  PM to celebrate finishing their exams Plates at the Shrimp House cost  PLN Pam will clean her room first Pam spent the whole week studying
 PREDICT

Processing Epoch 03: 100%|██████████| 1797/1797 [10:40<00:00,  2.80it/s, loss=5.453]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Lenny So what about the exam Anyone who passed it Tanja I failed  Chris Me too shes a bitch Greg I passed but I got C Lenny Well it doesnt seem its gonna be easy Tanja Shes really tough I studied a lot really I spent last  weeks cramming Chris They say she always fail  of the students Lenny My summer is ruined what a bitch Tanja Guys I mean we can always ask others about the questions and keep our fingers crossed Chris OK yeah its not gonna be that bad So she asked me about Bolivian revolution and agriculture in Chile in the s Lenny OMG Tanja Mine were the geopolitical situation of Caribbean after II WW Lenny This ones better Greg I got the Cuban revolution  Tanja Lucky you If she asked me that question Id surely pass Chris What about others  Lenny I will ask Christina and Jason they have the exam today Greg I can ask Pauline she made a list with possible questions This may help Tanja Oh thats 

Processing Epoch 04: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=5.273]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Wendy Hey Pam has her birthday on th
Diana Hey Wendy yes I remember
Wendy I want to make her happy 
Diana I was thinking of a big cake decorations inviting close friends
Wendy Surprise party Oh I see 
Diana Yeah Im sure she wont do anything by herself
Wendy That might be true Lets invite people and think of a plan on how to arrange this
Diana We can come to her place and after some time well tell her to go to the store
Wendy Yes and when shes gone we will let people in brilliant 
Diana Its gonna be legenwait for itdary  
Wendy Hope it all works out  Gotta go Speak soon
Diana Sure take care
    TARGET: Wendy and Diana are planning a surprise birthday party for Pam They are going to invite some guests over to Pams and let them in after she goes out to a shop
 PREDICTED: Jane and Sue are going to a party on Saturday at pm They will meet at the party
------------------------------------------------

Processing Epoch 05: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=4.799]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Nathalie Ive just been to the most amazing salsa lesson You should regret ditching me P
Sara Come on Nat we didnt ditch you I was just busy 
Gabrielle If having a flu is ditching someone then yes I definitely ditched you
Mary Tell me more about the salsa 
Nathalie The teachers amazing Shes so talented plays great music and really can teach Even I picked up on something  first classes
Mary Sounds great when are the classes
Nathalie Twice a week Wednesday and Saturday
Sara What time at Saturday
Nathalie   I cant wait Im so excited
Mary Id like to come if youre saying its so great but Saturday at  may be tricky especially after a night out
Gabrielle Well if you stay up until  then yes but you dont have to 
    TARGET: Nathalie went to a salsa lesson which she really enjoyed She will attend classes every Wednesday and Saturday Her friend may join her
 PREDICTED: Mary and Mary are going to the gym o

Processing Epoch 06: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=4.411]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Bobby heyuhm I got a problem Bobby I got stuck in the toilet Alexis what Ru serious Bobby yeah Ive tried a few times the door wont open Alexis shit Alexis Why do you always get into strange situations Alexis Ill go and fetch someone Bobby filegif Bobby thx
    TARGET: Bobby got stuck in the toilet so Alexis will go and fetch someone
 PREDICTED: The key is broken but it was not available so he will call him to call him
--------------------------------------------------------------------------------
    SOURCE: Julio hey u home
Byron yeah i am the rest went to church
Julio ok ill be there in 
Byron ok
    TARGET: Julio is going to visit Byron in  minutes Byron is alone at home
 PREDICTED: is coming home to the vet
--------------------------------------------------------------------------------


Processing Epoch 07: 100%|██████████| 1797/1797 [10:39<00:00,  2.81it/s, loss=4.673]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Uncle Billy Your scissors came in the post today your funny scissors Abbie Theyre moms For the cats claws Uncle Billy Thats a relief Abbie Why Uncle Billy I thought you were planning to trim my nose hairs
    TARGET: Moms scissors for the cats claws have arrived today
 PREDICTED: Billy has lost his phone and has a card He will give it to his wife today
--------------------------------------------------------------------------------
    SOURCE: Jude ill be in warsaw at the beginning of december so we could meet again Leon  Leon at the beginning means Leon cuz I wont be here during the first weekend Jude  Jude but i think its a monday so never mind i guess D Leon yeah monday doesnt really work for me D Leon  Jude oh well next time d Leon yeah
    TARGET: Jude is coming to Warsaw on the th of December and wants to see Leon Leon has no time
 PREDICTED: will meet at the on the th of December th and 

Processing Epoch 08: 100%|██████████| 1797/1797 [10:39<00:00,  2.81it/s, loss=3.955]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Miranda u done Danny almost finished Danny guess  mins tops Miranda ok Ill be waiting downstairs Danny ok
    TARGET: Miranda will be ready in no more than  minutes 
 PREDICTED: Georgia will wait inside for Danny in minutes
--------------------------------------------------------------------------------
    SOURCE: Max I hate shopping Payton Y Max I treat it as a necessity Payton Still y do u h shopping Max I hate having to choose the things to try them on and go from shop to shop in the hope of buying something but eventually returning home emptyhanded Payton So u h buying clothes Max Yup Payton What about other stuff Max Like what Payton Food Max Thats easy I just go to the supermarket and pick up some groceries No problem Payton Books Electronics And so on Max Online Those are the things I can buy online and it doesnt take much time  Payton So its not the shopping u h but shopping  clothes M

Processing Epoch 09: 100%|██████████| 1797/1797 [10:38<00:00,  2.81it/s, loss=3.946]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Leonardo turn on the TV  Now Channel   Zachary wait Leonardo are you watching this Zachary hahahahah is that Jake Leonardo yes hahahaha such a TV star hhaha Zachary filegif
    TARGET: Leonardo and Zachary are watching Jake on Channel  
 PREDICTED: Franklin is going to play FIFA for the first time
--------------------------------------------------------------------------------
    SOURCE: Nathalie Ive just been to the most amazing salsa lesson You should regret ditching me P
Sara Come on Nat we didnt ditch you I was just busy 
Gabrielle If having a flu is ditching someone then yes I definitely ditched you
Mary Tell me more about the salsa 
Nathalie The teachers amazing Shes so talented plays great music and really can teach Even I picked up on something  first classes
Mary Sounds great when are the classes
Nathalie Twice a week Wednesday and Saturday
Sara What time at Saturday
Nathalie   I cant

Processing Epoch 10: 100%|██████████| 1797/1797 [10:39<00:00,  2.81it/s, loss=3.628]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: John Have you stolen my pen steve No i swear i havent stolen your pen John haha I believe you steve Oh Thank God
    TARGET: John suspects Steve stole his pen but he didnt
 PREDICTED: Steve didnt sleep well
--------------------------------------------------------------------------------
    SOURCE: Ralf Hi man Did you watch the election Jim Sure did Ralf What do you think Jim Nothing much Ralf What Did your guy win or lose Jim He won But hell do nothing Ralf Why is that Jim He is a fucking politician The never do anything Ralf I wonder why you watched the election Jim Me too Fuck
    TARGET: Jim watched the election but it was a waste of time His guy won but he knows he will do nothing 
 PREDICTED: The show of the of the series is bad and its not a fan of them
--------------------------------------------------------------------------------


Processing Epoch 11: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=3.653]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Andrew When was the last time when you smoked Conrad  months ago Andrew Thats an achievement Keep it up
    TARGET: The last time Conrad smoked was  months ago
 PREDICTED: Andrew is not satisfied with the of because the reason was
--------------------------------------------------------------------------------
    SOURCE: Mary Hello love Welcome to the green side  Anna News spreads fast I see Mary They sure do Especially the good ones  Im really happy for you Anna I must tell you I feel really motivated Its been only a month but I really want to keep it going Mary I understand Ive been through it and it can be a bumpy road but its worth it Anna Have you eaten any meat since you decided to become a vegetarian Mary Unfortunately yes as I said  bumpy road  Whats important is to keep going I once had some dumplings with meat and chicken broth Anna Werent you tempted to go back then Mary No not real

Processing Epoch 12: 100%|██████████| 1797/1797 [10:39<00:00,  2.81it/s, loss=3.334]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Ella hi i had a crazy night Scarlett what happened Ella Adrien was at my place last night Ella it was quite nice but strange Scarlett why Ella Im not sure whats between us Scarlett maybe you just need some time Ella Possible Ella But I feel attracted and at the same time I find him almost repulsive Scarlett Why repulsive Ella Hes a kind of a bad boy Scarlett what do you mean Ella hes very destructive doing strange things exaggerating with pot etc Ella He tired to commit suicide  for example  Scarlett so a guy with problems Ella very much so He has scares on his wrists Scarlett But are you afraid of it Ella He just seems to be  yo while he is  Its not a guy you can settle with I am afraid Scarlett I see not good Ella I will tell you more tonight Scarlett ok
    TARGET: Ella spent the night with Adrien She is attracted to him She is also very concerned about his mental state and past destructive 

Processing Epoch 13: 100%|██████████| 1797/1797 [10:39<00:00,  2.81it/s, loss=2.795]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Simon Hi  Derek Whats up Simon Everytihngs fine I wondered if youre going to Hannahs party on Friday Derek No I dont think so Simon What Why Derek Didnt you hear what happend at her last party Simon No what Derek Oh I dont want to tell rumors so better ask Nate about it Simon I dont really care I wanted to ask you a favor Derek What is it Simon You know Yasmine Derek Yes shes my sisters friend Simon Exactly I really like her and maybe you could tell her something nice about me Derek No problem I think she likes you too Simon Hopefully thanks bro 
    TARGET: Derek is not going to Hannahs party on Friday but he will tell Yasmine some nice things about Derek anyway
 PREDICTED: Simon is at a party on Friday Simon is not sure if he wants Simon to ask his sister for a wedding Simon who is going to ask Simon if he wants him to ask him for a wedding
----------------------------------------------------

Processing Epoch 14: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=2.872]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Kat your new tattoo is so beautiful Melanie thank you xx Melanie look its healed now Melanie filephoto Kat so pretty Kat i was thinking about getting one but i am still not  sure Melanie just go for it Kat some day i will Kat back to work have a nice day  Melanie you too 
    TARGET: Melanie has a new tattoo Kat will maybe have one someday
 PREDICTED: Kat has a new tattoo showing her a picture of her work
--------------------------------------------------------------------------------
    SOURCE: Kian Im still in class Denise I am soooooo hungry Kian I finish in  mins then when I come back i will cook Denise Dont worry Kian So if u can wait or eat something from the fridge like eggs There should be a chicken breast in the freezer in a nylon bag Take it out to defrost please Denise I am not gonna be home until pm Kian Ah ok Kian U still at the training I mean is the training that long Denise pm 

Processing Epoch 15: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=2.664]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Susie ask me where i am
Evan where are you
Susie in bed with my laptop aaaaaaaaahahahahaha
Susie filegif
Chad filegif
Evan hate you
    TARGET: Susie is in bed with her laptop
 PREDICTED: Susie forgot the charger Evan is in her fridge
--------------------------------------------------------------------------------
    SOURCE: Emily Should we meet at the Javits Center at  James I have to stay longer today were closing a project Emily oh no is it bad James very the atmosphere is horrible if were late we may loose  mln bucks Emily I see must be stressful  James it is ill write you as soon as Im free James but it can happen I will have to stay here till Midnight Emily ok stay strong were in touch James 
    TARGET: James is closing a project today so he cannot meet with Emily James is very stressed because this project is worth five million dollars James will talk to Emily as soon as he finishes pr

Processing Epoch 16: 100%|██████████| 1797/1797 [10:40<00:00,  2.81it/s, loss=2.509]
stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: Karen I need the titles to finish this presentation Stanford What titles Karen Of the books I need for it Stanford Comon just research it Karen You could do at least anything in this project
    TARGET: Karen asks Stanford to provide book titles for the presentation 
 PREDICTED: Karen has written a presentation on the presentation
--------------------------------------------------------------------------------
    SOURCE: Philip Do you think its possible to have legal pot in Poland anytime soon Chris I doubt it Its more like  years from now Philip  Chris Yeah I feel you man Chris Im waiting for that moment too Chris When youre not scared to have a plant on your own to grow yourself a recreational MJ to chill out in the evening Philip Seems like a dream Chris Yup A dream Because if cops find such a plant in your house right now you get a criminal record and will be charged with drug trafficking 

Processing Epoch 17:  12%|█▏        | 213/1797 [01:15<09:24,  2.80it/s, loss=2.215]


KeyboardInterrupt: 

## Attention Scores

In [52]:
import torch
import torch.nn as nn
import altair as alt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [53]:
# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [55]:
config = get_config()
train_dataloader, val_dataloader, vocab_src, vocab_tgt = get_ds(config)
model = get_model(config, vocab_src.get_vocab_size(), vocab_tgt.get_vocab_size()).to(device)

# Load the pretrained weights
model_filename = get_weights_file_path(config, f"16")
state = torch.load(model_filename)
model.load_state_dict(state['model_state_dict'])

Max length of source sentence: 399
Max length of target sentence: 49


<All keys matched successfully>

In [56]:
def load_next_batch():
    # Load a sample batch from the validation set
    batch = next(iter(val_dataloader))
    encoder_input = batch["encoder_input"].to(device)
    encoder_mask = batch["encoder_mask"].to(device)
    decoder_input = batch["decoder_input"].to(device)
    decoder_mask = batch["decoder_mask"].to(device)

    encoder_input_tokens = [vocab_src.id_to_token(idx) for idx in encoder_input[0].cpu().numpy()]
    decoder_input_tokens = [vocab_tgt.id_to_token(idx) for idx in decoder_input[0].cpu().numpy()]

    # check that the batch size is 1
    assert encoder_input.size(
        0) == 1, "Batch size must be 1 for validation"

    model_out = greedy_decode(
        model, encoder_input, encoder_mask, vocab_src, vocab_tgt, config['seq_len'], device)
    
    return batch, encoder_input_tokens, decoder_input_tokens

In [57]:
def mtx2df(m, max_row, max_col, row_tokens, col_tokens):
    return pd.DataFrame(
        [
            (
                r,
                c,
                float(m[r, c]),
                "%.3d %s" % (r, row_tokens[r] if len(row_tokens) > r else "<blank>"),
                "%.3d %s" % (c, col_tokens[c] if len(col_tokens) > c else "<blank>"),
            )
            for r in range(m.shape[0])
            for c in range(m.shape[1])
            if r < max_row and c < max_col
        ],
        columns=["row", "column", "value", "row_token", "col_token"],
    )

def get_attn_map(attn_type: str, layer: int, head: int):
    if attn_type == "encoder":
        attn = model.encoder.layers[layer].self_attention_block.attention_scores
    elif attn_type == "decoder":
        attn = model.decoder.layers[layer].self_attention_block.attention_scores
    elif attn_type == "encoder-decoder":
        attn = model.decoder.layers[layer].cross_attention_block.attention_scores
    return attn[0, head].data

def attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len):
    df = mtx2df(
        get_attn_map(attn_type, layer, head),
        max_sentence_len,
        max_sentence_len,
        row_tokens,
        col_tokens,
    )
    return (
        alt.Chart(data=df)
        .mark_rect()
        .encode(
            x=alt.X("col_token", axis=alt.Axis(title="")),
            y=alt.Y("row_token", axis=alt.Axis(title="")),
            color="value",
            tooltip=["row", "column", "value", "row_token", "col_token"],
        )
        #.title(f"Layer {layer} Head {head}")
        .properties(height=400, width=400, title=f"Layer {layer} Head {head}")
        .interactive()
    )

def get_all_attention_maps(attn_type: str, layers: list[int], heads: list[int], row_tokens: list, col_tokens, max_sentence_len: int):
    charts = []
    for layer in layers:
        rowCharts = []
        for head in heads:
            rowCharts.append(attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len))
        charts.append(alt.hconcat(*rowCharts))
    return alt.vconcat(*charts)

In [58]:
batch, encoder_input_tokens, decoder_input_tokens = load_next_batch()
print(f'Source: {batch["src_text"][0]}')
print(f'Target: {batch["tgt_text"][0]}')
sentence_len = encoder_input_tokens.index("[PAD]")

Source: Tracy hey guys
Tracy any idea how to save a burnt cake
Nick filegif
Dylan hahahaha
Tracy xD
Tracy ok yeah i know but now seriously xD
Nick hmmmm peel it with a knife
Dylan not sure its a good idea
Dylan it will be ragged
Nick good point
Tracy Im desperate 
Nick you think ppl will notice 
Nick maybe put some bitter chocolate on it
Tracy filephoto
Nick shit nevermind xD
Dylan ok it doesnt look good D
Dylan have you tried the unburnt side
Dylan can you taste the coal D
Tracy yeah it seems ok
Dylan try a grater
Tracy 
Dylan like a cheese grater Try to grate the burnt layer off
Nick sounds like it could workD
Tracy oh ok Ill try
Tracy hm it does look better
Tracy filephoto
Nick filegif
Dylan filegif
Tracy thanks guys 
Target: Tracy burnt a cake and shes looking for advice how to save it Nick suggests to peel it with a knife Dylan thinks grating off is better Tracy grates the burnt part off


In [59]:
layers = [0, 1, 2]
heads = [0, 1, 2, 3, 4, 5, 6, 7]

# Encoder Self-Attention
get_all_attention_maps("encoder", layers, heads, encoder_input_tokens, encoder_input_tokens, min(20, sentence_len))

In [60]:
# Encoder Self-Attention
get_all_attention_maps("decoder", layers, heads, decoder_input_tokens, decoder_input_tokens, min(20, sentence_len))

In [61]:
# Encoder Self-Attention
get_all_attention_maps("encoder-decoder", layers, heads, encoder_input_tokens, decoder_input_tokens, min(20, sentence_len))

## Remove the previous trained model

In [None]:
import os
os.remove("/kaggle/working/converted_conversation_dataset_weights/tmodel_22.pt")

## Data set configration

In [37]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BilingualDataset(Dataset):

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Transform the text into tokens
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Add sos, eos and padding to each sentence
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2  # We will add <s> and </s>
        # We will only add <s>, and </s> only on the label
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # Make sure the number of padding tokens is not negative. If it is, the sentence is too long
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        # Add <s> and </s> token
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Add only <s> token
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Add only </s> token
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Double check the size of the tensors to make sure they are all seq_len long
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

In [38]:
def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

In [44]:
# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
config = get_config()
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

# Load the pretrained weights
model_filename = latest_weights_file_path(config)
state = torch.load(model_filename)
model.load_state_dict(state['model_state_dict'])

Using device: cuda
Max length of source sentence: 399
Max length of target sentence: 49


<All keys matched successfully>

In [45]:
run_validation(
    model,
    val_dataloader,
    tokenizer_src,
    tokenizer_tgt,
    config['seq_len'],
    device,
    lambda msg: print(msg),
    0,
    None,
    num_examples=1
)


--------------------------------------------------------------------------------
    SOURCE: Trevor Ive got a bit of a problem Uncle Richard Have you Trevor Trevor Yes Uncle I need to get you advice on how to break this to Dad Richard You havent gone and got that Abigail up the duff have you Trevor Thats the long and short of it Uncle Richard Shit How did that happen Trevor Well its her fathers fault Uncle Richard Her fathers fault And him a church pastor Trevor Yes Uncle She has sneaked me into the house really quietly and we are in her bedroom doing it Richard Fucking hell Do go on Trevor And we havent got any condoms so I am planning to pull out at the last moment Richard Unbelieveable Bloody idiot What then Trevor So I am just pulling out and all of a sudden in bursts her father and delivers me such a kick up the arse that I am back in there coming Richard So you say it is all his fault then Incredible Trevor Yes  Richard So maybe we can get him to pay the child support instead of 

stty: 'standard input': Inappropriate ioctl for device


In [50]:
t = Summery("Trevor Ive got a bit of a problem Uncle Richard Have you Trevor Trevor Yes Uncle I need to get you advice on how to break this to Dad Richard You havent gone and got that Abigail up the duff have you Trevor Thats the long and short of it Uncle Richard Shit How did that happen Trevor Well its her fathers fault Uncle Richard Her fathers fault And him a church pastor Trevor Yes Uncle She has sneaked me into the house really quietly and we are in her bedroom doing it Richard Fucking hell Do go on Trevor And we havent got any condoms so I am planning to pull out at the last moment Richard Unbelieveable Bloody idiot What then Trevor So I am just pulling out and all of a sudden in bursts her father and delivers me such a kick up the arse that I am back in there coming Richard So you say it is all his fault then Incredible Trevor Yes  Richard So maybe we can get him to pay the child support instead of you then eh")
print(t)

Using device: cuda
    SOURCE: Trevor Ive got a bit of a problem Uncle Richard Have you Trevor Trevor Yes Uncle I need to get you advice on how to break this to Dad Richard You havent gone and got that Abigail up the duff have you Trevor Thats the long and short of it Uncle Richard Shit How did that happen Trevor Well its her fathers fault Uncle Richard Her fathers fault And him a church pastor Trevor Yes Uncle She has sneaked me into the house really quietly and we are in her bedroom doing it Richard Fucking hell Do go on Trevor And we havent got any condoms so I am planning to pull out at the last moment Richard Unbelieveable Bloody idiot What then Trevor So I am just pulling out and all of a sudden in bursts her father and delivers me such a kick up the arse that I am back in there coming Richard So you say it is all his fault then Incredible Trevor Yes  Richard So maybe we can get him to pay the child support instead of you then eh
 PREDICTED: Trevor got Abigail got too pregnant ou

## Config

In [40]:
def get_config():
    return {
        "batch_size": 8,
        "num_epochs": 60,
        "lr": 10**-4,
        "seq_len": 500,
        "d_model": 512,
        "datasource": '/kaggle/working/converted_conversation_dataset',  # Local path
        "lang_src": "en",
        "lang_tgt": "it",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }

In [41]:
def get_weights_file_path(config, epoch: str):
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    model_filename = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.') / model_folder / model_filename)

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    model_filename = f"{config['model_basename']}*"
    weights_files = list(Path(model_folder).glob(model_filename))
    if len(weights_files) == 0:
        return None
    weights_files.sort()
    return str(weights_files[-1])

In [42]:
from pathlib import Path
from tokenizers import Tokenizer
from datasets import load_dataset
import torch
import sys
from datasets import load_from_disk
def Summery(sentence: str):
    # Define the device, tokenizers, and model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    config = get_config()
    tokenizer_src = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_src']))))
    tokenizer_tgt = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_tgt']))))
    model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(), config["seq_len"], config['seq_len'], d_model=config['d_model']).to(device)

    # Load the pretrained weights
    model_filename = latest_weights_file_path(config)
    state = torch.load(model_filename)
    model.load_state_dict(state['model_state_dict'])

    # if the sentence is a number use it as an index to the test set
    label = ""
    if type(sentence) == int or sentence.isdigit():
        id = int(sentence)
        dataset_path = f"{config['datasource']}"
        ds_raw = load_from_disk(dataset_path)
        ds = ds_raw['train']
        ds = BilingualDataset(ds, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
        sentence = ds[id]['src_text']
        label = ds[id]["tgt_text"]
    seq_len = config['seq_len']

    # translate the sentence
    model.eval()
    with torch.no_grad():
        # Precompute the encoder output and reuse it for every generation step
        source = tokenizer_src.encode(sentence)
        source = torch.cat([
            torch.tensor([tokenizer_src.token_to_id('[SOS]')], dtype=torch.int64),
            torch.tensor(source.ids, dtype=torch.int64),
            torch.tensor([tokenizer_src.token_to_id('[EOS]')], dtype=torch.int64),
            torch.tensor([tokenizer_src.token_to_id('[PAD]')] * (seq_len - len(source.ids) - 2), dtype=torch.int64)
        ], dim=0).to(device)
        source_mask = (source != tokenizer_src.token_to_id('[PAD]')).unsqueeze(0).unsqueeze(0).int().to(device)
        encoder_output = model.encode(source, source_mask)

        # Initialize the decoder input with the sos token
        decoder_input = torch.empty(1, 1).fill_(tokenizer_tgt.token_to_id('[SOS]')).type_as(source).to(device)

        # Print the source sentence and target start prompt
        if label != "": print(f"{f'ID: ':>12}{id}")
        print(f"{f'SOURCE: ':>12}{sentence}")
        if label != "": print(f"{f'TARGET: ':>12}{label}")
        print(f"{f'PREDICTED: ':>12}", end='')

        # Generate the translation word by word
        while decoder_input.size(1) < seq_len:
            # build mask for target and calculate output
            decoder_mask = torch.triu(torch.ones((1, decoder_input.size(1), decoder_input.size(1))), diagonal=1).type(torch.int).type_as(source_mask).to(device)
            out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

            # project next token
            prob = model.project(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            decoder_input = torch.cat([decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1)

            # print the translated word
            print(f"{tokenizer_tgt.decode([next_word.item()])}", end=' ')

            # break if we predict the end of sentence token
            if next_word == tokenizer_tgt.token_to_id('[EOS]'):
                break

    # convert ids to tokens
    return tokenizer_tgt.decode(decoder_input[0].tolist())

#read sentence from argument
#translate(sys.argv[1] if len(sys.argv) > 1 else "I am not a very good a student.")