In [1]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path

In [2]:
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

In [3]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
from tqdm.auto import tqdm

In [4]:
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

In [5]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 10,10

In [6]:
pl.seed_everything(42)

Global seed set to 42


42

In [7]:
df = pd.read_csv("news_summary.csv", encoding="latin-1")

In [8]:
df

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...
...,...,...,...,...,...,...
4509,Mansha Mahajan,"24 Feb 2017,Friday",Rasna seeking ?250 cr revenue from snack categ...,http://indiatoday.intoday.in/story/rasna-eyes-...,Fruit juice concentrate maker Rasna is eyeing ...,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m..."
4510,Dishant Sharma,"03 Aug 2017,Thursday",Sachin attends Rajya Sabha after questions on ...,http://indiatoday.intoday.in/story/sachin-tend...,Former Indian cricketer Sachin Tendulkar atten...,Former cricketer Sachin Tendulkar was spotted ...
4511,Tanya Dhingra,"03 Aug 2017,Thursday",Shouldn't rob their childhood: Aamir on kids r...,http://www.hindustantimes.com/bollywood/secret...,"Aamir Khan, while talking about reality shows ...","Aamir Khan, whose last film Dangal told the st..."
4512,Pragya Swastik,"07 Dec 2016,Wednesday","Asha Bhosle gets ?53,000 power bill for unused...",http://indiatoday.intoday.in/story/singer-asha...,The Maharashtra government has initiated an in...,Maharahstra Power Minister Chandrashekhar Bawa...


In [9]:
df = df[["text","ctext"]]

In [10]:
df.columns = ["summary", "text"]

In [11]:
df
df = df.dropna()

In [12]:
df

Unnamed: 0,summary,text
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...
...,...,...
4509,Fruit juice concentrate maker Rasna is eyeing ...,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m..."
4510,Former Indian cricketer Sachin Tendulkar atten...,Former cricketer Sachin Tendulkar was spotted ...
4511,"Aamir Khan, while talking about reality shows ...","Aamir Khan, whose last film Dangal told the st..."
4512,The Maharashtra government has initiated an in...,Maharahstra Power Minister Chandrashekhar Bawa...


In [13]:
df.shape

(4396, 2)

In [14]:
train_df, test_df = train_test_split(df,test_size=0.1)

In [15]:
train_df.shape,test_df.shape

((3956, 2), (440, 2))

In [16]:
class NewsSummaryDataset(Dataset):

  def __init__(
    self,
    data: pd.DataFrame, 
    tokenizer: T5Tokenizer,
    text_max_token_len: int = 512,        
    summary_max_token_len: int = 128   
    ):       
       self.tokenizer = tokenizer          
       self.data = data            
       self.text_max_token_len = text_max_token_len       
       self.summary_max_token_len = summary_max_token_len

  def __len__(self):
    return len(self.data)

  
  def __getitem__(self, index: int ):
    data_row = self.data.iloc[index]

    text = data_row["text"]
    text_encoded = tokenizer(text, 
                                  max_length = self.text_max_token_len, 
                                  padding = "max_length",
                                  truncation = True,
                                  return_attention_mask = True,
                                  add_special_tokens = True,
                                  return_tensors = "pt")

    summary = data_row["summary"]
    summary_encoding = self.tokenizer(summary, 
                                  max_length = self.summary_max_token_len, 
                                  padding = "max_length",
                                  truncation = True,
                                  return_attention_mask = True,
                                  add_special_tokens = True,
                                  return_tensors = "pt")
    labels = summary_encoding["input_ids"]
    labels[labels == 0] = -100

    return dict(text = text, summary = data_row["summary"], text_input_ids = text_encoded["input_ids"].flatten(),text_attention_mask= text_encoded["attention_mask"].flatten(),
                labels = labels.flatten(), labels_attention_mask = summary_encoding["attention_mask"].flatten())
    

In [17]:
#data module for the pythorch lightning 

class NewsSummaryDataModule(pl.LightningDataModule):
  def __init__(
      self, 
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer: T5Tokenizer,
      batch_size: int = 8,
      text_max_token_len: int = 512,
      summary_max_token_len: int = 128,
      
  ):

    super().__init__()

    self.train_df = train_df
    self.test_df = test_df

    self.batch_size = batch_size
    self.tokenizer = tokenizer
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len


  def setup(self, stage=None):
    self.train_dataset = NewsSummaryDataset(
        self.train_df,
        self.tokenizer,
        self.text_max_token_len,
        self.summary_max_token_len
    )

    self.test_dataset = NewsSummaryDataset(
        self.test_df,
        self.tokenizer,
        self.text_max_token_len,
        self.summary_max_token_len
       )
    
  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle = True, num_workers=2 ) 
  
  def val_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle = False, num_workers=2 ) 
  
  def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size = self.batch_size, shuffle = False, num_workers=2)




In [18]:
MODEL_NAME = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [19]:
text_token_counts, summary_token_counts = [],[]

for _, row in train_df.iterrows():
  text_token_count = len(tokenizer.encode(row["text"]))
  text_token_counts.append(text_token_count)

  summary_token_count = len(tokenizer.encode(row["summary"]))
  summary_token_counts.append(summary_token_count)

Token indices sequence length is longer than the specified maximum sequence length for this model (689 > 512). Running this sequence through the model will result in indexing errors


In [20]:
N_EPOCHS = 3
BATCH_SIZE = 10

data_module = NewsSummaryDataModule(train_df, test_df, tokenizer, batch_size = BATCH_SIZE)

In [21]:
#Model

class NewSummaryModel(pl.LightningModule):

  def __init__(self):
   super().__init__()
   self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)
  
  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels = None):

    output = self.model(input_ids,
                        attention_mask = attention_mask,
                        labels = labels,
                        decoder_attention_mask = decoder_attention_mask
                        )
    return output.loss, output.logits
  
  def training_step(self,batch, batch_idx):
    input_ids = batch["text_input_ids"]
    attention_mask = batch["text_attention_mask"]
    labels = batch["labels"]
    labels_attention_mask = batch["labels_attention_mask"]


    loss,outputs = self(
        input_ids = input_ids,
        attention_mask = attention_mask,
        decoder_attention_mask = labels_attention_mask,
        labels = labels 
        )
    
    self.log("train_loss", loss, prog_bar= True, logger=True)
    return loss

  def validation_step(self,batch, batch_idx):
    input_ids = batch["text_input_ids"]
    attention_mask = batch["text_attention_mask"]
    labels = batch["labels"]
    labels_attention_mask = batch["labels_attention_mask"]


    loss,outputs = self(
        input_ids = input_ids,
        attention_mask = attention_mask,
        decoder_attention_mask = labels_attention_mask,
        labels = labels 
        )
    
    self.log("val_loss", loss, prog_bar= True, logger=True)
    return loss

  def tes_step(self,batch, batch_idx):
    input_ids = batch["text_input_ids"]
    attention_mask = batch["text_attention_mask"]
    labels = batch["labels"]
    labels_attention_mask = batch["labels_attention_mask"]


    loss,outputs = self(
        input_ids = input_ids,
        attention_mask = attention_mask,
        decoder_attention_mask = labels_attention_mask,
        labels = labels 
        )
    
    self.log("test_loss", loss, prog_bar= True, logger=True)
    return loss
  
  def configure_optimizers(self):
      return AdamW(self.parameters(), lr= 0.0001)
  



In [22]:
model = NewSummaryModel()

In [23]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best_checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

logger = TensorBoardLogger("lightning logs", name="news-summary")

trainer = pl.Trainer(
    logger= logger, 
    checkpoint_callback= checkpoint_callback,
    max_epochs = N_EPOCHS,
    accelerator="cpu",
    progress_bar_refresh_rate= 30
    )


  rank_zero_deprecation(
  rank_zero_deprecation(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, data_module)

Missing logger folder: lightning logs/news-summary

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
