In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
file_path = '/content/gdrive/My Drive/News summary/news_summary.csv'

In [3]:
!pip install --quiet transformers==4.5.0
!pip install --quiet pytorch-lightning==1.2.7

[K     |████████████████████████████████| 2.2MB 27.5MB/s 
[K     |████████████████████████████████| 839kB 28.9MB/s 
[K     |████████████████████████████████| 276kB 45.3MB/s 
[?25h  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone


In [4]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from transformers import(
    
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    BartForConditionalGeneration,
    BartTokenizerFast as BartTokenizer,
    PegasusForConditionalGeneration,
    PegasusTokenizerFast as PegasusTokenizer
)

from tqdm.auto import tqdm

In [5]:
pl.__version__
#transformers.__version__

'1.2.7'

In [6]:
from pytorch_lightning.core import LightningModule

In [7]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

sns.set(style = 'whitegrid', palette = 'muted', font_scale = 1.2)
rcParams['figure.figsize'] = 16, 10

In [8]:
pl.seed_everything(786)

Global seed set to 786


786

In [9]:
df = pd.read_csv(file_path, engine = 'python')
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [10]:
df = df[['text', 'ctext']]
df.head()

Unnamed: 0,text,ctext
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [11]:
df.columns = ['summary', 'text']

In [12]:
df.shape

(4514, 2)

In [13]:
# DataFrame.dropna() returns a new frame and leaves the original untouched,
# so the result must be assigned back for rows with a missing summary or
# article to actually be removed before the train/test split.
df = df.dropna()
df.shape, df.head()

((4514, 2),
                                              summary                                               text
 0  The Administration of Union Territory Daman an...  The Daman and Diu administration on Wednesday ...
 1  Malaika Arora slammed an Instagram user who tr...  From her special numbers to TV?appearances, Bo...
 2  The Indira Gandhi Institute of Medical Science...  The Indira Gandhi Institute of Medical Science...
 3  Lashkar-e-Taiba's Kashmir commander Abu Dujana...  Lashkar-e-Taiba's Kashmir commander Abu Dujana...
 4  Hotels in Maharashtra will train their staff t...  Hotels in Mumbai and other Indian cities are t...)

In [14]:
# Shuffle the rows and hold out 10% of them as the test frame; the fixed
# random_state keeps the partition identical across runs.
train_df, test_df = train_test_split(
    df,
    test_size = 0.1,
    shuffle = True,
    random_state = 786,
)
train_df.shape, test_df.shape

((4062, 2), (452, 2))

In [18]:
type(df.iloc[100].summary)

str

In [20]:
class NewsSummaryDataset(Dataset):
  """Map-style dataset that tokenizes (article, summary) pairs on demand.

  Each item is built from one row of ``data``: the ``text`` column is the
  model input and the ``summary`` column is the generation target.

  Args:
    data: Frame with ``text`` and ``summary`` columns.
    tokenizer: Callable tokenizer that returns ``input_ids`` and
      ``attention_mask`` tensors (a T5 fast tokenizer in this notebook).
    text_max_token_len: Length the article is padded/truncated to.
    summary_max_token_len: Length the summary is padded/truncated to.
  """

  def __init__( self, data : pd.DataFrame, tokenizer : "T5Tokenizer", text_max_token_len : int = 512, summary_max_token_len : int = 128):

    self.tokenizer = tokenizer
    self.data = data
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

  def __len__(self):
    """Return the number of rows in the underlying frame."""
    return len(self.data)

  def _encode(self, content : str, max_length : int):
    """Tokenize ``content`` padded/truncated to exactly ``max_length`` tokens."""
    # Always go through the tokenizer handed to the constructor, never a
    # notebook-level global, so the dataset is self-contained.
    return self.tokenizer(
        content,
        max_length = max_length,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = 'pt',
    )

  def __getitem__(self, idx):
    """Build one training example.

    Returns:
      dict with the raw ``text`` and ``summary`` strings, the flattened
      ``text_input_ids`` / ``text_attention_mask`` of the article, the
      flattened ``labels`` (summary token ids with padding replaced by
      -100) and the summary's ``labels_attention_mask``.
    """
    data_row = self.data.iloc[idx]
    text = str(data_row['text'])
    summary = str(data_row['summary'])

    text_encoding = self._encode(text, self.text_max_token_len)
    # The target uses its own (shorter) limit, not the article limit.
    summary_encoding = self._encode(summary, self.summary_max_token_len)

    # Use the tokenizer's own pad id when it exposes one; fall back to 0,
    # the value this code previously hard-coded.
    pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
    if pad_token_id is None:
      pad_token_id = 0

    # Clone so the tokenizer's output is not modified in place. -100 marks
    # positions that the model's cross-entropy loss ignores.
    labels = summary_encoding['input_ids'].clone()
    labels[labels == pad_token_id] = -100

    return {
        'text' : text,
        'summary' : summary,
        'text_input_ids' : text_encoding['input_ids'].flatten(),
        'text_attention_mask' : text_encoding['attention_mask'].flatten(),
        'labels' : labels.flatten(),
        'labels_attention_mask' : summary_encoding['attention_mask'].flatten(),
    }


In [18]:
class NewsSummaryDataModule(pl.LightningDataModule):
  """LightningDataModule that serves the news-summary train and test frames.

  The held-out ``test_df`` backs both the validation and the test loader,
  so validation metrics are measured on the test split.

  Args:
    train_df: Frame of training rows (``text`` / ``summary`` columns).
    test_df: Frame of held-out rows with the same columns.
    tokenizer: Tokenizer forwarded to every ``NewsSummaryDataset``.
    batch_size: Batch size used by all three loaders.
    text_max_token_len: Token limit forwarded for the article.
    summary_max_token_len: Token limit forwarded for the summary.
  """

  def __init__(self, train_df : pd.DataFrame, test_df : pd.DataFrame, tokenizer : T5Tokenizer, batch_size : int = 8, text_max_token_len : int = 512, summary_max_token_len : int = 128):
    super().__init__()

    self.train_df = train_df
    self.test_df = test_df

    self.batch_size = batch_size
    self.tokenizer = tokenizer

    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

  def setup(self, stage = None):
    """Create the train and test datasets from the stored frames."""
    self.train_dataset = NewsSummaryDataset(
        self.train_df,
        self.tokenizer,
        self.text_max_token_len,
        self.summary_max_token_len
    )

    self.test_dataset = NewsSummaryDataset(
        self.test_df,
        self.tokenizer,
        self.text_max_token_len,
        self.summary_max_token_len
    )

  # The loader hooks must be methods of the class (not functions nested in
  # setup) for them to be callable on the data module instance.

  def train_dataloader(self):
    """Return the shuffled loader over the training dataset."""
    return DataLoader(self.train_dataset, shuffle = True, batch_size = self.batch_size, num_workers = 2, pin_memory = True)

  def val_dataloader(self):
    """Return the loader over the held-out dataset in fixed order."""
    # Evaluation does not benefit from shuffling; a fixed order keeps
    # validation batches identical from epoch to epoch.
    return DataLoader(self.test_dataset, shuffle = False, batch_size = self.batch_size, num_workers = 2, pin_memory = True)

  def test_dataloader(self):
    """Return the loader over the held-out dataset in fixed order."""
    return DataLoader(self.test_dataset, shuffle = False, batch_size = self.batch_size, num_workers = 2, pin_memory = True)


In [21]:
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
#text_token_counts, summary_token_counts = [], []

#for _, row#

In [22]:
N_EPOCHs = 3
BATCH_SIZE = 8

#data_module = NewsSummaryDataModule(train_df, test_df, tokenizer , batch_size = BATCH_SIZE)

In [23]:
class NewsSummaryModel(LightningModule):
  """LightningModule that fine-tunes a pretrained T5 model for summarization.

  Batches are expected to be the dicts produced by ``NewsSummaryDataset``
  (keys ``text_input_ids``, ``text_attention_mask``, ``labels`` and
  ``labels_attention_mask``).
  """

  def __init__(self):
    super().__init__()
    # `model_name` is the notebook-level checkpoint identifier.
    self.model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict = True)

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels = None):
    """Run the wrapped model and return ``(loss, logits)``.

    Args:
      input_ids: Token ids of the article.
      attention_mask: Attention mask matching ``input_ids``.
      decoder_attention_mask: Attention mask of the target sequence.
      labels: Target token ids; passed through to the wrapped model, which
        reports the loss on its output.
    """
    output = self.model(
        input_ids,
        attention_mask = attention_mask,
        labels = labels,
        decoder_attention_mask = decoder_attention_mask
    )

    return output.loss, output.logits

  def _compute_loss(self, batch):
    """Unpack a dataset batch, run the forward pass and return the loss."""
    loss, _ = self(
        input_ids = batch['text_input_ids'],
        attention_mask = batch['text_attention_mask'],
        decoder_attention_mask = batch['labels_attention_mask'],
        # The targets are the summary token ids, not the 0/1 attention mask.
        labels = batch['labels']
    )
    return loss

  def training_step(self, batch, batch_idx):
    """Compute, log (as ``train_loss``) and return the loss for one batch."""
    loss = self._compute_loss(batch)
    self.log('train_loss', loss, prog_bar = True, logger = True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute, log (as ``val_loss``) and return the loss for one batch."""
    loss = self._compute_loss(batch)
    self.log('val_loss', loss, prog_bar = True, logger = True)
    return loss

  def test_step(self, batch, batch_idx):
    """Compute, log (as ``test_loss``) and return the loss for one batch."""
    loss = self._compute_loss(batch)
    self.log('test_loss', loss, prog_bar = True, logger = True)
    return loss

  def configure_optimizers(self):
    """Return an AdamW optimizer over all parameters with lr 1e-4."""
    return AdamW(self.parameters(), lr = 0.0001)
    


In [24]:
model = NewsSummaryModel()

In [25]:
# Build one dataset per split; the two length limits are passed by keyword
# (512 for text_max_token_len, 128 for summary_max_token_len).
train_dataset = NewsSummaryDataset(
    train_df,
    tokenizer,
    text_max_token_len = 512,
    summary_max_token_len = 128,
)
test_dataset = NewsSummaryDataset(
    test_df,
    tokenizer,
    text_max_token_len = 512,
    summary_max_token_len = 128,
)

In [26]:
# Both loaders take their batch size from the notebook-level BATCH_SIZE
# constant instead of repeating the literal. Only the training loader
# shuffles; the held-out loader (used for validation during fit) keeps a
# fixed order so its batches are the same every epoch.
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle = False, num_workers = 2)

In [19]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

Reusing TensorBoard on port 6006 (pid 313), started 0:13:09 ago. (Use '!kill 313' to kill it.)

<IPython.core.display.Javascript object>

In [27]:
# Keep only the single best checkpoint, ranked by the lowest logged 'val_loss'.
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename = 'best-checkpoint',
    save_top_k = 1,
    verbose = True,
    monitor = 'val_loss',
    mode = 'min'
)

# TensorBoard event files are written under lightning_logs/news-summary.
logger = TensorBoardLogger('lightning_logs', name = 'news-summary')

trainer = pl.Trainer(

    logger = logger,
    # ModelCheckpoint instances are registered through `callbacks`; the
    # Trainer's `checkpoint_callback` argument is a boolean switch and
    # passing a callback object to it is deprecated.
    callbacks = [checkpoint_callback],
    max_epochs = N_EPOCHs,
    gpus = 1,
    progress_bar_refresh_rate = 30

)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [28]:
trainer.fit(model,train_loader, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




RuntimeError: ignored