#### Transformer-like LLM Implementation for Text Summarization

##### Importing Necessary Libraries

In [None]:
import os
import re
import math
import time
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import tensorflow as tf
import torch.optim as optim
from datasets import Dataset
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization

  from .autonotebook import tqdm as notebook_tqdm


##### Tokenization

In [2]:
# Load the tokenizer published with the "facebook/bart-large-cnn" checkpoint.
# It is reused further down both to tokenize the datasets and to encode/decode
# text for the pretrained summarization example.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


##### Datasets Selected

In [5]:
# Read every split from its CSV file and discard rows containing any missing
# value in the same chained step.
train = pd.read_csv('Datasets/samsum-train.csv').dropna()
test = pd.read_csv('Datasets/samsum-test.csv').dropna()
val = pd.read_csv('Datasets/samsum-validation.csv').dropna()

# Preview the first five dialogue/summary pairs of the training split.
print(train.loc[:, ['dialogue', 'summary']].head(5))

                                            dialogue  \
0  Amanda: I baked  cookies. Do you want some?\r\...   
1  Olivia: Who are you voting for in this electio...   
2  Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...   
3  Edward: Rachel, I think I'm in ove with Bella....   
4  Sam: hey  overheard rick say something\r\nSam:...   

                                             summary  
0  Amanda baked cookies and will bring Jerry some...  
1  Olivia and Olivier are voting for liberals in ...  
2  Kim may try the pomodoro technique recommended...  
3  Edward thinks he is in love with Bella. Rachel...  
4  Sam is confused, because he overheard Rick com...  


##### Data Cleaning

In [6]:
def clean_tags(text):
    """Strip tag-like spans and content-free speaker lines from a text.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every ``<...>`` span removed (non-greedy, within a
        single line) and with every line dropped that ends in a colon
        followed only by optional whitespace (e.g. ``"Amanda:"``). Lines are
        split and re-joined on ``'\\n'``, so any ``'\\r'`` characters of the
        kept lines are preserved.
    """
    # Raw-string patterns: the original non-raw '.*:\s*$' relied on the invalid
    # escape sequence '\s', which makes Python emit a warning at compile time.
    without_tags = re.sub(r'<.*?>', '', text)

    kept_lines = [
        line
        for line in without_tags.split('\n')
        if not re.match(r'.*:\s*$', line)
    ]

    return '\n'.join(kept_lines)

def clean_df(df, cols):
    """Apply ``clean_tags`` to the given columns of a DataFrame.

    Args:
        df: DataFrame whose columns are cleaned. It is modified in place.
        cols: Iterable of column names to clean.

    Returns:
        The same ``df`` object, with missing values in each listed column
        replaced by ``''`` and every value passed through ``clean_tags``.
    """
    for column in cols:
        filled = df[column].fillna('')
        df[column] = filled.apply(clean_tags)
    return df

# Run the same text cleaning over the dialogue and summary columns of every split.
train, test, val = [
    clean_df(split, ['dialogue', 'summary'])
    for split in (train, test, val)
]

# Spot-check five randomly sampled cleaned training dialogues.
print(train['dialogue'].sample(n=5))

11348    Nelly: Beer after work?\r\nNina: Can't, not to...
10583    James: hiya do you know whats wring with our w...
2272     Ben: Tomorrow is the submission deadline.\r\nJ...
14689    Alex: Did you hear the newest song from Anne M...
25       Julius: dude, your assessment of manutd\r\nLaw...
Name: dialogue, dtype: object


  clean = '\n'.join([line for line in clean.split('\n') if not re.match('.*:\s*$', line)])


##### Dataset Format Selected

In [7]:
# Wrap each cleaned pandas split in a Hugging Face `Dataset`.
train_ds, test_ds, val_ds = [
    Dataset.from_pandas(frame) for frame in (train, test, val)
]

# Ask every dataset to expose the two text columns in the 'tensorflow' format.
for split_ds in (train_ds, test_ds, val_ds):
    split_ds.set_format(type='tensorflow', columns=['dialogue', 'summary'])

# Print the three dataset summaries, separated by the same blank-line gap.
for position, split_ds in enumerate((train_ds, test_ds, val_ds)):
    if position > 0:
        print('\n' * 2)
    print(split_ds)

Dataset({
    features: ['id', 'dialogue', 'summary', '__index_level_0__'],
    num_rows: 14731
})



Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 819
})



Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 818
})


##### Removing Columns

In [8]:
# Remove the non-text helper columns from the training dataset.
# `__index_level_0__` only exists when the pandas index was not the default
# range (here: because `dropna()` removed rows), so drop just the columns that
# are actually present instead of failing on a missing column name.
# NOTE(review): test_ds / val_ds keep their `id` column, exactly as before.
columns_to_drop = [
    name for name in ('__index_level_0__', 'id') if name in train_ds.column_names
]
if columns_to_drop:
    train_ds = train_ds.remove_columns(columns_to_drop)

print(train_ds.column_names)
print(test_ds.column_names)
print(val_ds.column_names)

['dialogue', 'summary']
['id', 'dialogue', 'summary']
['id', 'dialogue', 'summary']


##### Selecting Hyperparameters

In [9]:
# Hyperparameters shared by the data pipeline and the model classes defined below.
# NOTE(review): the model instantiation is not part of this section, so the
# model-related meanings are inferred from the matching constructor parameter
# names — confirm where the model is built.
num_layers = 8  # presumably the number of stacked DecoderLayer blocks
d_model = 512  # presumably the embedding / hidden width; MultiHeadAttention asserts it is divisible by num_heads
num_heads = 8  # presumably the number of attention heads
dff = 2048  # presumably the inner width of the feed-forward network
max_len = 256  # token length every dialogue and summary is padded/truncated to during tokenization
dropout_rate = 0.1  # presumably the rate passed to the Dropout layers
EPOCHS = 2  # presumably the number of training epochs — usage not visible here
vocab_size = tokenizer.vocab_size  # vocabulary size reported by the loaded tokenizer
batch_size = 32  # batch size passed to tokenize_and_create_dataset_hf below

##### Creation of Datasets

In [10]:
def tokenize_and_create_dataset_hf(dataset, batch_size, shuffle=True):
    """Tokenize a dialogue/summary dataset and wrap it in a batched tf.data pipeline.

    Relies on the module-level ``tokenizer`` and ``max_len``.

    Args:
        dataset: Object indexable by ``'dialogue'`` and ``'summary'``, each
            yielding items whose ``.numpy()`` result can be UTF-8 decoded.
        batch_size: Number of examples per batch.
        shuffle: When true, shuffle with a buffer covering the whole dataset
            before batching.

    Returns:
        A ``tf.data.Dataset`` of ``(features, target_ids)`` batches, where
        ``features`` is a dict with ``"input_ids"`` and ``"attention_mask"``
        of the tokenized dialogues and ``target_ids`` are the token ids of the
        tokenized summaries. Both sides are padded/truncated to ``max_len``.
    """
    def _as_text(column):
        # Turn every tensor item of the column into a plain Python string.
        return [str(item.numpy().decode('utf-8')) for item in dataset[column]]

    dialogues = _as_text('dialogue')
    summaries = _as_text('summary')

    # Identical tokenizer settings for the source and the target side.
    encode_options = dict(
        max_length=max_len,
        truncation=True,
        padding="max_length",
        return_tensors="tf",
    )

    # Source side: keep both the ids and the attention mask.
    encoded_dialogues = tokenizer(dialogues, **encode_options)
    features = {
        "input_ids": encoded_dialogues["input_ids"],
        "attention_mask": encoded_dialogues["attention_mask"],
    }

    # Target side: only the token ids are used.
    labels = tokenizer(summaries, **encode_options)["input_ids"]

    pipeline = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        pipeline = pipeline.shuffle(buffer_size=len(features["input_ids"]))

    return pipeline.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [11]:
# Build the batched tf.data pipelines. The training split uses the function's
# default shuffling; the test and validation splits keep their order.
train_dataset = tokenize_and_create_dataset_hf(train_ds, batch_size=batch_size)
test_dataset = tokenize_and_create_dataset_hf(test_ds, batch_size=batch_size, shuffle=False)
val_dataset = tokenize_and_create_dataset_hf(val_ds, batch_size=batch_size, shuffle=False)

# Show the element spec of each pipeline.
for built_dataset in (train_dataset, test_dataset, val_dataset):
    print(built_dataset)

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 256), dtype=tf.int32, name=None))>
<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 256), dtype=tf.int32, name=None))>
<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 256), dtype=tf.int32, name=None))>


##### Positional Encoding

In [12]:
class PositionalEncoding(Layer):
    """Layer that adds a fixed sinusoidal positional encoding to its input.

    The encoding table is computed once at construction time for ``max_len``
    positions and ``d_model`` channels and is sliced to the input's sequence
    length on every call.
    """

    def __init__(self, d_model, max_len=5000):
        """Precompute the encoding table.

        Args:
            d_model: Size of the last (channel) axis of the inputs.
            max_len: Number of positions to precompute.
        """
        super(PositionalEncoding, self).__init__()
        self.positional_encoding = self.compute_positional_encoding(d_model, max_len)

    def compute_positional_encoding(self, d_model, max_len):
        """Build the sinusoidal table.

        Even channels hold ``sin(position * div_term)`` and odd channels hold
        ``cos(position * div_term)``.

        Returns:
            A ``tf.float32`` tensor of shape ``(1, max_len, d_model)``.
        """
        position = np.arange(max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe = np.zeros((max_len, d_model))
        pe[:, 0::2] = np.sin(position * div_term)
        # For an odd d_model there is one fewer odd channel than even channel,
        # so trim div_term to match; for an even d_model the slice is a no-op.
        pe[:, 1::2] = np.cos(position * div_term[: d_model // 2])
        return tf.cast(pe[np.newaxis, ...], dtype=tf.float32)  # Shape: (1, max_len, d_model)

    def call(self, x):
        """Return ``x`` plus the encoding for its first ``seq_len`` positions.

        ``x`` is indexed as ``(batch, seq_len, d_model)``: axis 1 selects the
        positions and the table broadcasts over axis 0.
        """
        seq_len = tf.shape(x)[1]  # Get the sequence length
        return x + self.positional_encoding[:, :seq_len, :]  # Add positional encoding


##### Multi-Head Attention Layer

In [13]:
class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected to ``d_model`` channels,
    split into ``num_heads`` heads of ``d_model // num_heads`` channels each,
    attended per head, merged again and passed through a final projection.
    """

    def __init__(self, d_model, num_heads):
        """Create the four projection layers.

        Args:
            d_model: Total projection width; must be divisible by ``num_heads``.
            num_heads: Number of attention heads.
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        # Query / key / value projections followed by the output projection.
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq_len, d_model)`` to ``(batch, num_heads, seq_len, depth)``."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            v: Value input.
            k: Key input.
            q: Query input.
            mask: Optional tensor added to the attention logits after being
                multiplied by ``-1e9``, so non-zero entries suppress the
                corresponding positions.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size = tf.shape(q)[0]

        # Project, then break every projection up into heads.
        queries = self.split_heads(self.wq(q), batch_size)
        keys = self.split_heads(self.wk(k), batch_size)
        values = self.split_heads(self.wv(v), batch_size)

        # Dot-product scores scaled by the square root of the per-head depth.
        key_depth = tf.cast(tf.shape(keys)[-1], tf.float32)
        logits = tf.matmul(queries, keys, transpose_b=True) / tf.math.sqrt(key_depth)

        if mask is not None:
            logits = logits + (mask * -1e9)

        weights = tf.nn.softmax(logits, axis=-1)
        context = tf.matmul(weights, values)  # (batch_size, num_heads, seq_len, depth)

        # Move the head axis back next to depth and merge the two.
        context = tf.transpose(context, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, depth)
        merged = tf.reshape(context, (batch_size, -1, self.num_heads * self.depth))  # (batch_size, seq_len, d_model)

        return self.dense(merged)


##### Feed Forward Network

In [14]:
# Feed-Forward Network
class PointWiseFeedForwardNetwork(Layer):
    """Two-layer feed-forward block: Dense(dff, relu) followed by Dense(d_model)."""

    def __init__(self, d_model, dff):
        """Create the expansion layer (``dff`` units, ReLU) and the projection layer (``d_model`` units)."""
        super().__init__()
        self.dense1 = Dense(dff, activation='relu')
        self.dense2 = Dense(d_model)

    def call(self, x):
        """Apply the expansion layer, then the projection layer, to ``x``."""
        hidden = self.dense1(x)
        return self.dense2(hidden)

##### Decoder Layer

In [15]:
class Decoder(Layer):
    """Stack of ``DecoderLayer`` blocks over token embeddings, ending in a vocabulary projection.

    Token ids are embedded, scaled by ``sqrt(d_model)``, given positional
    encodings, passed through dropout and ``num_layers`` decoder layers, and
    finally projected to ``vocab_size`` outputs per position.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, max_len, dropout_rate=0.1):
        """Create the embedding, positional encoding, decoder layers and output projection.

        Args:
            num_layers: Number of ``DecoderLayer`` blocks.
            d_model: Embedding width, forwarded to every sub-layer.
            num_heads: Forwarded to each ``DecoderLayer``.
            dff: Forwarded to each ``DecoderLayer``.
            vocab_size: Size of the embedding table and of the output projection.
            max_len: Number of positions precomputed by ``PositionalEncoding``.
            dropout_rate: Rate of the dropout applied after the positional
                encoding and forwarded to each ``DecoderLayer``.
        """
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)  # Embedding layer
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        self.dropout = Dropout(dropout_rate)
        self.final_layer = Dense(vocab_size)

    def call(self, x, training=False, look_ahead_mask=None):
        """Run the decoder stack.

        Args:
            x: Token ids to embed.
            training: Forwarded to the dropout layers and decoder layers.
            look_ahead_mask: Optional mask forwarded to every decoder layer's
                self-attention.

        Returns:
            The output of the final Dense layer (``vocab_size`` values per position).
        """
        x = self.embedding(x) * tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # Embedding with scaling
        x = self.pos_encoding(x)  # Add positional encoding
        x = self.dropout(x, training=training)

        # Iterate the layer list directly rather than indexing by position.
        for dec_layer in self.dec_layers:
            x = dec_layer(x, training=training, look_ahead_mask=look_ahead_mask)

        return self.final_layer(x)


In [16]:
class DecoderTransformer(tf.keras.Model):
    """Keras model that wraps a single ``Decoder``."""

    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, max_len, dropout_rate=0.1):
        """Build the wrapped ``Decoder``; every argument is forwarded to it unchanged."""
        super().__init__()
        self.decoder = Decoder(
            num_layers,
            d_model,
            num_heads,
            dff,
            vocab_size,
            max_len,
            dropout_rate,
        )

    def call(self, tar, training=None, look_ahead_mask=None):
        """Return the decoder output for ``tar``, forwarding ``training`` and ``look_ahead_mask``."""
        decoder_output = self.decoder(
            tar,
            training=training,
            look_ahead_mask=look_ahead_mask,
        )
        return decoder_output


In [17]:
class DecoderLayer(Layer):
    """One decoder block: self-attention and a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        """Create the attention, feed-forward, normalization and dropout sub-layers."""
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PointWiseFeedForwardNetwork(d_model, dff)

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, training, look_ahead_mask):
        """Apply the block to ``x``.

        Args:
            x: Input tensor; used as value, key and query of the self-attention.
            training: Forwarded to both dropout layers.
            look_ahead_mask: Mask handed to the self-attention layer.

        Returns:
            The normalized output of the feed-forward sub-layer.
        """
        # Self-attention sub-layer with residual connection and normalization.
        attended = self.dropout1(
            self.mha(x, x, x, look_ahead_mask),  # (batch_size, seq_len, d_model)
            training=training,
        )
        attention_block = self.layernorm1(x + attended)

        # Feed-forward sub-layer with residual connection and normalization.
        transformed = self.dropout2(
            self.ffn(attention_block),  # (batch_size, seq_len, d_model)
            training=training,
        )
        return self.layernorm2(attention_block + transformed)


##### Summarizing the Text

In [18]:
from transformers import AutoModelForSeq2SeqLM

# Load pre-trained summarization model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Example input texts (first 5 test dialogues)
example_inputs = test['dialogue'].head(5).tolist()

# Tokenize as one padded batch (shorter dialogues are padded to the longest one)
inputs = tokenizer(example_inputs, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Generate summaries. Because the batch is padded, hand the tokenizer's
# attention mask to `generate` explicitly so padding positions are ignored,
# instead of relying on the mask being inferred from the input ids.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=60,
    num_beams=4,
    early_stopping=True,
)

# Decode summaries
summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

# Print input dialogues and generated summaries
for i, (dialogue, summary) in enumerate(zip(example_inputs, summaries)):
    print(f"\nExample {i+1}:\nDialogue:\n{dialogue}\n\nGenerated Summary:\n{summary}")



Example 1:
Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Generated Summary:
Hannah: Hey, do you have Betty's number? Amanda: Lemme check. Ask Larry. He called her last time we were at the park together. He's very nice, he's a good guy. Amanda texts him: Urgh.. Alright.

Example 2:
Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them