In [1]:
!pip install torch transformers datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
import torch.nn as nn
from torch import matmul
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import softmax
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from datasets import load_metric
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
import re
import time
from collections import defaultdict
import math
from torch.nn import functional as F
import transformers
from tqdm import tqdm

<h2>Load Dataset and Preprocess Data</h2>  
WikiHow dataset is a sumarization dataset found at https://github.com/mahnazkoupaee/WikiHow-Dataset and contains 200000 long sequence pairs of articles and their summaries. Dataset structure: each article consists of multiple paragraphs, and subheadings that are concatenated into summarization of an article.  
  
title - article titles  
headline - concatenation of summary sentences forming the summary  
text - concatenation of all paragraphs of an article

<h3>Load</h3>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# # Load the dataset
# df = pd.read_csv(r'/content/drive/MyDrive/wikihowAll2.csv')
df = pd.read_csv(r'./data/wikihowAll.csv')
df = df.astype(str)
df.shape

(215365, 3)

In [5]:
df.head()

Unnamed: 0,headline,title,text
0,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1,"If you're a photographer, keep all the necess..."
1,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work,See the image for how this drawing develops s...
2,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1,It is possible to become a VFX artist without...
3,\nStart with some experience or interest in ar...,How to Become an Art Investor,The best art investors do their research on t...
4,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2,"As you start planning for a project or work, ..."


<h3>Preprocess</h3>    

1. remove newline chars and extra commas

2. drop short articles with long summaries and drop articles with no summaries    

3. drop duplicates  
4. drop rows where headline is empty string

In [7]:
# 1 remove newline chars and extra commas
df['headline'] = df.headline.apply(lambda s: re.sub(".,",".",s.strip().title()))
df['headline'] = df.headline.apply(lambda s: re.sub("\n","", s.strip().title()))


# 2 drop short articles with long summaries and no summaries
df['headline length'] = df.headline.str.len().astype(int)
df['text length treshold'] = df.text.str.len().astype(int) * 0.75
df = df[df['headline length'] < df['text length treshold']]
# drop unnecessary columns
df = df.drop(['headline length', 'text length treshold','title'], axis=1)
df.shape

(181130, 2)

In [8]:
# 3 drop duplicates
df = df.drop_duplicates()

# 4 drop rows where headline is empty
df = df.loc[(df.headline.str.len() != 0)]
df.shape

(181071, 2)

In [9]:
# check are there nan values 
df.isnull().values.any()

False

In [10]:
# Rename column names and apply padding to summaries
df.rename(columns = {'text':'document', 'headline':'summary'}, inplace = True)
# df['summary'] = df.summary.apply(lambda s: '<go> ' + s + ' <stop>')
df.head()

Unnamed: 0,summary,document
0,Keep Related Supplies In The Same Area.Make An...,"If you're a photographer, keep all the necess..."
1,Create A Sketch In The Neopoprealist Manner Of...,See the image for how this drawing develops s...
2,Get A Bachelor’S Degree.Enroll In A Studio-Bas...,It is possible to become a VFX artist without...
3,Start With Some Experience Or Interest In Art....,The best art investors do their research on t...
4,Keep Your Reference Material. Sketche. Article...,"As you start planning for a project or work, ..."


In [11]:
# Split the dataset into train, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Max token length for input and output
max_input_length = 512
max_output_length = 256

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


<h3>Custom Dataset for WikiHow Data</h3>

In [None]:
# Custom Dataset for WikiHow data
class WikiHowDataset(Dataset):
    def __init__(self, df, max_input_length, max_output_length):
        self.df = df
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        example = self.df.iloc[idx]
        input_text = example["document"] 
        output_text = example["summary"]
        input_tokens = tokenizer.encode_plus(input_text, max_length=max_input_length, padding="max_length", truncation=True, return_tensors="pt")
        output_tokens = tokenizer.encode_plus(output_text, max_length=max_output_length, padding="max_length", truncation=True, return_tensors="pt")
        input_ids = input_tokens["input_ids"].squeeze()
        attention_mask = input_tokens["attention_mask"].squeeze()
        decoder_input_ids = output_tokens["input_ids"].squeeze()[:-1]
        decoder_attention_mask = output_tokens["attention_mask"].squeeze()[:-1]
        return {
            "document": input_ids, 
            "summary": decoder_input_ids,
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask
        }


<h1>Model Design</h1>  
Below we define our Transformer Summarizer model and train it.
<h3>Encoder</h3>  
Encoder consists of following layers:    

1. T5 encoder layer   

2. 6 transformers layers    

3. 12 head attention layer

<h3>Decoder</h3>  
Decoder is composed of 6 Transformer layers.

In [12]:
# Max token length for input and output
max_input_length = 768
max_output_length = 256

# Encoder class 
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.t5 = transformers.T5EncoderModel.from_pretrained('t5-base')
        self.transformer_layers = nn.TransformerEncoderLayer(d_model=768, nhead=12, dim_feedforward=3072)
        self.multi_attention_layer = nn.MultiheadAttention(embed_dim=768, num_heads=12, dropout=0.1)
        self.proj = nn.Linear(768, 768)  # change output size to 768
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_layers, num_layers=3)

    def forward(self, input_ids, attention_mask):
        t5_output = self.t5(input_ids=input_ids, attention_mask=attention_mask)[0]
        t5_output = t5_output.permute(1, 0, 2)
        transformer_output, _ = self.multi_attention_layer(t5_output, t5_output, t5_output)
        transformer_output = self.proj(transformer_output)
        transformer_output = self.transformer_encoder(transformer_output)
        return transformer_output

# Define the decoder class
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(768, hidden_size)
        self.transformer_layers = nn.TransformerDecoderLayer(d_model=embedding_size, nhead=12, dim_feedforward=3072)
        self.transformer_decoder = nn.TransformerDecoder(self.transformer_layers, num_layers=num_layers)
        self.fc = nn.Linear(embedding_size, vocab_size)

    def forward(self, x, hidden):
        hidden = self.linear(hidden.permute(1, 0, 2)).permute(1, 0, 2)
        embedded = self.embedding(x)
        output = self.transformer_decoder(embedded, hidden)
        output = self.fc(output)
        return output, hidden

    def generate(self, input_seq, hidden, max_length=20):
        with torch.no_grad():
            output_seq = []
            input_tensor = input_seq.unsqueeze(0)
            for i in range(max_length):
                output, hidden = self.forward(input_tensor, hidden)
                output = output.squeeze(0)
                _, topi = output.topk(1)
                output_seq.append(topi.item())
                if topi == EOS_token:
                    break
                input_tensor = topi.unsqueeze(0)
            return output_seq, hidden


Some weights of the model checkpoint at t5-base were not used when initializing T5EncoderModel: ['decoder.block.1.layer.0.SelfAttention.v.weight', 'decoder.block.10.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.o.weight', 'decoder.block.8.layer.1.layer_norm.weight', 'decoder.block.6.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.1.EncDecAttention.o.weight', 'decoder.block.9.layer.2.DenseReluDense.wi.weight', 'decoder.block.10.layer.1.EncDecAttention.o.weight', 'decoder.block.2.layer.1.EncDecAttention.k.weight', 'decoder.block.9.layer.1.EncDecAttention.o.weight', 'decoder.block.11.layer.1.layer_norm.weight', 'decoder.block.5.layer.2.DenseReluDense.wo.weight', 'decoder.block.6.layer.0.layer_norm.weight', 'decoder.block.5.layer.2.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.q.weight', 'decoder.block.9.layer.1.EncDecAttention.q.weight', 'decoder.block.10.layer.0.SelfAttention.k.weight', 'decoder.block.4.layer.1.EncDecAttention.v.weight', 'decod

RuntimeError: ignored

<h3>Encoder Decoder</h3>

In [None]:
# Encoder Decoder 
class Summarizer(nn.Module):
    def __init__(self, vocab_size):
        super(Summarizer, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(vocab_size=vocab_size, embedding_size=256, hidden_size=768, num_layers=2)

    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        decoder_output = self.decoder(x=decoder_input_ids, hidden=encoder_output)
        return decoder_output

<h2>Train</h2>

In [None]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define tokenizer
tokenizer = transformers.T5Tokenizer.from_pretrained('t5-base')

# # Define dataset object
# train_dataset = WikiHowDataset(train_df, max_input_length=max_input_length, max_output_length=max_output_length)

# # Define data loaders
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
# Define dataset object
train_dataset = WikiHowDataset(valid_df, max_input_length=max_input_length, max_output_length=max_output_length)

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Define model object
model = Summarizer(vocab_size=tokenizer.vocab_size)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Train model
model.train()
model.to(device)
num_epochs = 1
for epoch in tqdm(range(num_epochs)):
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch["document"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        decoder_input_ids = batch["summary"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)

        optimizer.zero_grad()

        output = model(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask=decoder_attention_mask)

        loss = criterion(output.view(-1, tokenizer.vocab_size), decoder_input_ids.view(-1))

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')
torch.save(model.state_dict(), 'trained_model.pth')

<h1>Evaluation</h1>

In [None]:
!pip install rouge

In [None]:
# Load test dataset
test_dataset = WikiHowDataset(test_df, max_input_length=max_input_length, max_output_length=max_output_length)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [None]:
# Create function to generate summary using the model
def generate_summary(model, tokenizer, input_text):
    input_tokens = tokenizer.encode_plus(input_text, max_length=max_input_length, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = input_tokens["input_ids"].to(device)
    attention_mask = input_tokens["attention_mask"].to(device)
    generated_ids = model.decoder.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_output_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )
    generated_summary = tokenizer.decode(generated_ids.squeeze(), skip_special_tokens=True)
    return generated_summary


In [None]:
# Create function to calculate ROUGE scores
def calculate_rouge_scores(hypotheses, references):
    rouge_evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    scores = rouge_evaluator.get_scores(hypotheses, references)
    rouge1_f1 = np.mean([score['rouge-1']['f'] for score in scores])
    rouge2_f1 = np.mean([score['rouge-2']['f'] for score in scores])
    rougel_f1 = np.mean([score['rouge-l']['f'] for score in scores])
    return rouge1_f1, rouge2_f1, rougel_f1


In [None]:
# Load test dataset
test_dataset = WikiHowDataset(test_df, max_input_length=max_input_length, max_output_length=max_output_length)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [None]:
# Evaluate model on test dataset
model.eval()
generated_summaries = []
reference_summaries = []
for batch in test_loader:
    input_text = batch['document']
    output_text = batch['summary']
    generated_summary_batch = []
    for i in range(len(input_text)):
        input_text_i = input_text[i]
        input_text_i_str = tokenizer.decode(input_text_i)
        generated_summary_i = generate_summary(model, tokenizer, input_text_i_str)
        
        # generated_summary_i = generate_summary(model, tokenizer, input_text_i)
        generated_summary_batch.append(generated_summary_i)
    generated_summaries.extend(generated_summary_batch)
    reference_summaries.extend(output_text)

# Calculate ROUGE scores
rouge1, rouge2, rougel = calculate_rouge_scores(generated_summaries, reference_summaries)
print("ROUGE-1: {:.4f}".format(rouge1))
print("ROUGE-2: {:.4f}".format(rouge2))
print("ROUGE-L: {:.4f}".format(rougel))

# Print some examples of summaries
for i in range(5):
    print("Example {}".format(i+1))
    print("Input text: {}".format(test_df.iloc[i]["document"]))
    print("Generated summary: {}".format(generated_summaries[i]))
    print("Reference summary: {}".format(reference_summaries[i]))
    print()