<a href="https://colab.research.google.com/github/786aafreen/PythonJenkins/blob/main/Summerization_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

# Google's T5 Abstractive Summarization Tutorial
## Overview

Downloaded on 11-Feb-22 from https://www.udemy.com/course/pytorch-deep-learning-hero/learn/lecture/24607924#overview (chapter 39).

T5 (Text-to-Text Transfer Transformer) is a transfer-learning model that frames every task as text-to-text: it takes text as input and produces text as output.

Examples of text-to-text are
1. translation
2. question answering
3. classification
4. summarization


There are two main types of summarization: extractive summarization, in which passages are excerpted from the original document, and abstractive (generative) summarization, in which the content of the original text is condensed and rewritten in different words.
In general, abstractive summarization is the more difficult of the two.

In this lesson, we will use T5 for abstractive summarization.


![image.png](

In [None]:
!pip install wandb

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

In [None]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration

In [None]:
import wandb

In [None]:
!nvidia-smi

In [None]:
from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
device

In [None]:
!wandb login --relogin

In [None]:
!pip install sentencepiece

# Prepare Data

In [None]:
# Load the news-summary CSV from the Colab working area, decoding it with the
# 'latin' codec.
df = pd.read_csv(
    '/content/news_summary (2).csv',
    encoding='latin',
)

In [None]:
# `.iat` reads a single cell by integer position, so `.iat[0]` is the value in
# the first row of the `ctext` column (the text later fed to the model as input).
df['ctext'].iat[0]

In [None]:
df.text.iat[0]

In [None]:
df.columns

In [None]:
# Keep only the two columns used later: 'text' (the target summary) and
# 'ctext' (the source text that gets summarized).
df = df.loc[:, ['text', 'ctext']]

In [None]:
len(df.index)

In [None]:
# Work on a small subset so the tutorial runs quickly.
# `random_state` pins which rows are drawn; without it the seeded
# train/validation split further down would still change on every run.
# `min()` avoids the ValueError pandas raises when asked for more rows than
# the frame holds.
df = df.sample(n=min(100, len(df)), random_state=0)

In [None]:
df.head()

In [None]:
# Fraction of the rows that goes into the training set; the remainder is held
# out for validation.
train_size = 0.8

# Draw the training rows with a fixed seed so the split is repeatable.
train_dataset = df.sample(frac=train_size, random_state=0)

# The validation rows are those whose index labels were not drawn above.
val_dataset = df[~df.index.isin(train_dataset.index)].reset_index(drop=True)

# reset_index(drop=True) throws the old row labels away (it does NOT add them
# as a column) and renumbers the rows 0..n-1.
train_dataset = train_dataset.reset_index(drop=True)

In [None]:
len(train_dataset.index)

In [None]:
len(val_dataset.index)

In [None]:
class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one (source text, summary) pair per item.

  Each row of ``dataframe`` supplies ``ctext`` (the text to summarize) and
  ``text`` (the reference summary). Both are whitespace-normalized, tokenized,
  and padded/truncated to a fixed length so a DataLoader can stack them.

  Args:
    dataframe: pandas DataFrame with ``text`` and ``ctext`` columns.
    tokenizer: object exposing ``batch_encode_plus`` that returns a mapping with
      ``'input_ids'`` and ``'attention_mask'`` tensors.
    source_len: fixed token length for the source text.
    summ_len: fixed token length for the summary.
  """

  def __init__(self, dataframe, tokenizer, source_len, summ_len):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = summ_len
    self.text = self.data.text
    self.ctext = self.data.ctext

  def __len__(self):
    """Return the number of rows (one sample per row)."""
    return len(self.text)

  def _encode(self, raw, max_length):
    """Collapse whitespace in ``raw`` and tokenize it to exactly ``max_length`` ids."""
    cleaned = ' '.join(str(raw).split())
    # padding='max_length' + truncation=True is the explicit replacement for the
    # deprecated pad_to_max_length=True: every sample comes out the same length.
    return self.tokenizer.batch_encode_plus(
        [cleaned],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

  def __getitem__(self, index):
    """Return the tokenized sample at integer position ``index``.

    Returns:
      dict with ``'source_ids'``, ``'source_mask'``, ``'target_ids'`` and
      ``'target_mask'``; each is the tokenizer output with the leading batch
      axis of size 1 removed.
    """
    # .iloc is positional, so the dataset also works when the frame's index is
    # not 0..n-1 (e.g. a sampled frame whose index was never reset).
    source = self._encode(self.ctext.iloc[index], self.source_len)
    target = self._encode(self.text.iloc[index], self.summ_len)

    # squeeze(0) removes only the batch axis; a bare squeeze() would also drop
    # the sequence axis if it happened to have length 1.
    return {
        'source_ids' : source['input_ids'].squeeze(0),
        # The attention mask marks real tokens vs. padding so the model can
        # ignore the padded positions.
        'source_mask' : source['attention_mask'].squeeze(0),
        'target_ids' : target['input_ids'].squeeze(0),
        'target_mask' : target['attention_mask'].squeeze(0)
    }

In [None]:
# The tokenizer turns raw text into the integer token ids (plus an attention
# mask) that the model consumes, and decodes generated ids back into text.
# Here it is loaded from the pretrained 't5-small' checkpoint.
tokenizer = T5TokenizerFast.from_pretrained('t5-small')

In [None]:
# Quick look at what the tokenizer returns for one sentence padded to 32 tokens.
# padding='max_length' / truncation=True replace the deprecated
# pad_to_max_length=True.
tokenizer.batch_encode_plus(['I can do it better'], max_length=32, padding='max_length', truncation=True, return_tensors='pt')

In [None]:

# Indexing a CustomDataset returns a dict of tensors with the keys
# 'source_ids', 'source_mask', 'target_ids' and 'target_mask'.
# Source texts use a length of 512 tokens, summaries a length of 150.
training_set = CustomDataset(dataframe=train_dataset, tokenizer=tokenizer, source_len=512, summ_len=150)
val_set = CustomDataset(dataframe=val_dataset, tokenizer=tokenizer, source_len=512, summ_len=150)

In [None]:
# DataLoader settings for training: batches of 2, reshuffled, loaded in the
# main process (no worker subprocesses).
train_params = dict(batch_size=2, shuffle=True, num_workers=0)

In [None]:
# DataLoader settings for validation: batches of 2 in dataset order, loaded in
# the main process (no worker subprocesses).
val_params = dict(batch_size=2, shuffle=False, num_workers=0)

In [None]:
# Wrap both datasets in DataLoaders using the parameter dicts defined above.
training_loader = DataLoader(dataset=training_set, **train_params)
val_loader = DataLoader(dataset=val_set, **val_params)

In [None]:
# Print only the first training batch as a sanity check, then stop iterating.
for batch in training_loader:
  print(batch)
  break

In [None]:
# Load the pretrained 't5-small' weights and move the model to the selected device.
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
  """Run one training epoch over ``loader``.

  Args:
    epoch: zero-based epoch number; only used in the progress message.
    tokenizer: tokenizer whose ``pad_token_id`` marks padding in the targets.
    model: the seq2seq model being fine-tuned; called with ``input_ids``,
      ``attention_mask`` and ``labels``, and expected to return the loss as
      the first element of its output.
    device: device the batch tensors are moved to.
    loader: iterable of batches with ``'source_ids'``, ``'source_mask'`` and
      ``'target_ids'`` tensors.
    optimizer: optimizer stepping the model parameters.
  """
  model.train()
  for i, data in enumerate(loader, 0):
    ids = data['source_ids'].to(device, dtype=torch.long)
    mask = data['source_mask'].to(device, dtype=torch.long)

    # clone() so masking the labels never writes into the batch tensor itself
    # (.to() returns the same tensor when device and dtype already match).
    lm_labels = data['target_ids'].to(device, dtype=torch.long).clone()
    # -100 is the label value the loss ignores, so padding does not contribute.
    lm_labels[lm_labels == tokenizer.pad_token_id] = -100

    # Pass the full target as `labels` only: T5 derives the decoder inputs by
    # shifting the labels right and prepending the decoder start token.
    # Slicing the target by hand (decoder_input_ids=y[:, :-1], labels=y[:, 1:])
    # never feeds that start token and never trains the first target token,
    # which does not match how generate() starts decoding.
    outputs = model(input_ids=ids, attention_mask=mask, labels=lm_labels)
    loss = outputs[0]

    if i % 10 == 0:
      wandb.log({"Training Loss": loss.item()})

    if i % 500 == 0:
      print(f'Epoch:{epoch+1}, Loss:{loss.item()}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
def validate(epoch, tokenizer, model, device, loader):
  """Generate a summary for every batch in ``loader`` and decode it to text.

  Args:
    epoch: accepted for symmetry with ``train``; not used here.
    tokenizer: provides ``decode`` to turn token ids back into strings.
    model: model exposing ``eval`` and ``generate``.
    device: device the batch tensors are moved to.
    loader: iterable of batches with ``'source_ids'``, ``'source_mask'`` and
      ``'target_ids'`` tensors.

  Returns:
    ``(predictions, actuals)``: two lists of strings in loader order, the
    generated summaries and the decoded reference summaries.
  """
  def _to_text(token_ids):
    # Same decode options for generated and reference sequences.
    return tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

  # Beam-search settings passed to every generate() call.
  generation_kwargs = dict(
      max_length=150,
      num_beams=2,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
  )

  predictions, actuals = [], []
  model.eval()

  # Inference only: no gradients are needed.
  with torch.no_grad():
    for step, batch in enumerate(loader):
      reference_ids = batch['target_ids'].to(device, dtype=torch.long)
      input_ids = batch['source_ids'].to(device, dtype=torch.long)
      attention_mask = batch['source_mask'].to(device, dtype=torch.long)

      generated_ids = model.generate(
          input_ids=input_ids,
          attention_mask=attention_mask,
          **generation_kwargs
      )

      predictions += [_to_text(sequence) for sequence in generated_ids]
      actuals += [_to_text(sequence) for sequence in reference_ids]

      if step % 100 == 0:
        print(f'Completed: {step}')

  return predictions, actuals

In [None]:
def main():
  """Fine-tune the module-level ``model`` and write validation predictions to CSV.

  Starts a Weights & Biases run, records the hyperparameters, trains for
  ``TRAIN_EPOCHS`` epochs on ``training_loader``, then generates summaries for
  ``val_loader`` and saves them next to the reference summaries.

  Note: the batch-size and length values stored in the config are only logged
  with the run. The datasets and loaders are built at module level with their
  own literal values, so changing them here does not change the data pipeline.
  """
  import os  # local import: only needed for the output-path check below

  wandb.init(project="transformers_tutorials_summarization")

  config = wandb.config
  config.TRAIN_BATCH_SIZE = 2
  config.VALID_BATCH_SIZE = 2
  config.TRAIN_EPOCHS = 2
  config.VAL_EPOCHS = 1
  config.LEARNING_RATE = 1e-4
  config.SEED = 42
  config.MAX_LEN = 512
  config.SUMMARY_LEN = 150

  # Reproducibility
  torch.manual_seed(config.SEED)
  np.random.seed(config.SEED)
  torch.backends.cudnn.deterministic = True

  optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
  wandb.watch(model, log="all")

  for epoch in range(config.TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

  output_path = 'gdrive/My Drive/predictions.csv'
  if not os.path.isdir(os.path.dirname(output_path)):
    # The target folder only exists once Google Drive is mounted. Fall back to
    # the working directory rather than losing the results of a finished
    # training run to a failed write.
    fallback_path = os.path.basename(output_path)
    print(f'{output_path!r} is not available (is Google Drive mounted?); writing {fallback_path!r} instead')
    output_path = fallback_path

  for epoch in range(config.VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions, 'Actual Text':actuals})

    final_df.to_csv(output_path)
    print('Ouput files generated for review')

In [None]:
main()