In [None]:
#Import packages 

#data processing 
import pandas as pd
import numpy as np

#For text extraction
import re         
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords
import time

pd.set_option("display.max_colwidth", 200)

#For modeling
#import tf
import tensorflow as tf

# import keras
from keras.preprocessing.text import Tokenizer 



In [None]:
# Fetch the dataset. The Colab upload widget is only opened when the CSV is
# not already on disk, so re-running the notebook does not block on a prompt,
# and the cell no longer crashes when the notebook runs outside Colab.
import os

if os.path.exists('sum1.csv'):
    # File already present: nothing to upload.
    uploaded = {}
else:
    try:
        from google.colab import files
    except ImportError:
        # Not running in Colab; the CSV has to be provided manually.
        uploaded = {}
        print("google.colab is unavailable: place sum1.csv next to this notebook.")
    else:
        uploaded = files.upload()


In [None]:
#Importing data
df = pd.read_csv('sum1.csv')

# Only the last expression of a cell is rendered, so the intermediate
# checks are printed explicitly instead of being silently discarded.

# Rows and columns in the dataset
print("Rows, columns:", df.shape)

# Check for any null values (count per column)
print(df.isnull().sum())

# Preview of the first rows (last expression, rendered as a table)
df.head()



# Text pre-processing

In [None]:
#Text preprocessing function

def txt_preprocessing(txt):
    """Lower-case ``txt`` and replace punctuation characters with spaces.

    Every occurrence of one of the characters
    ``_ " - ; % ( ) | + & = * . , ! ? : # $ @ [ ] / '`` is replaced by a
    single space (runs of spaces are NOT collapsed), then the text is
    lower-cased.

    Parameters
    ----------
    txt : str or missing value
        Raw text. ``None`` and float NaN (how pandas represents an empty
        cell) yield an empty string; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    # Missing cells arrive as None / NaN; NaN is the only float != itself.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ''
    if not isinstance(txt, str):
        txt = str(txt)
    # Single pass: the apostrophe is handled by the same character class.
    txt = re.sub(r"""[_"\-;%()|+&=*.,!?:#$@\[\]/']""", ' ', txt)
    return txt.lower()

  
# Run txt_preprocessing over both text columns of df.
# NOTE(review): neither list is referenced again in the visible part of the
# notebook; the models below are fed the raw df columns.

# headline text cleaning
headlines_processed = [txt_preprocessing(headline) for headline in df.headlines]

# news text cleaning
clean_news = [txt_preprocessing(article) for article in df.text]


# Simple Model 

In [None]:
#connect to hugging face
!git clone https://github.com/huggingface/transformers \
&& cd transformers \

#install
!pip install -q ./transformers

#Import
import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

torch_device = 'cpu'

#Function to summarize
def summarize_news(input_text, maximum_length, minimum_length):
  """Generate a summary of ``input_text`` with the module-level BART model.

  Relies on the globals ``tokenizer``, ``model`` and ``torch_device``.

  Parameters
  ----------
  input_text : str
      Text to summarize; it is truncated to at most 1024 tokens.
  maximum_length, minimum_length : int-like
      Bounds passed to ``model.generate`` as ``max_length`` / ``min_length``
      (each is converted with ``int()``).

  Returns
  -------
  str
      The decoded summary with special tokens removed.
  """

  #Tokenize (truncation is requested explicitly so max_length is enforced)
  encoded = tokenizer([input_text], return_tensors='pt', max_length=1024, truncation=True)
  input_txt_ids = encoded['input_ids'].to(torch_device)

  #summarize
  ids_sum = model.generate(input_txt_ids, max_length=int(maximum_length), min_length=int(minimum_length))

  #get the text summary: one input was encoded, so take the first sequence
  output_sum = tokenizer.decode(ids_sum[0], skip_special_tokens=True)
  return output_sum


In [None]:
# Sample article, written as adjacent string literals (one per sentence)
# that Python concatenates into a single string.
input_text = (
    "According to a Hindustan Times report, the 14 Keralites were among the terrorists and militants freed by the Taliban from Bagram jail. "
    "As of now, unconfirmed reports state that two Pakistani residents were detained by the Sunni Pashtun terrorist group for trying to blow off an Improvised Explosive Device (IED) device outside Turkmenistan embassy in Kabul on August 26. "
    "And as intelligence reports indicate, an IED was recovered from the two Pakistani nationals soon after the Kabul airport blast. "
    "As per reports, a Kerala resident contacted his home, while the remaining 13 are still in Kabul with the ISIS-K terrorist group. "
    "After Syria and Levant occupied Mosul in 2014, people from the Malappuram, Kasaragod and Kannur districts left India and joined the jihadist group in West Asia from where a few Keralites came down to Nangarhar province of Afghanistan."
)

#Generate summary using function (length bounds are passed to model.generate)
summarize_news(input_text, maximum_length=20, minimum_length=10)


# Transfer learning: Custom training on top of pre-trained model:

# BART: Simple-Transformer Pretrained model:

In [None]:
#Installing transformers
!pip install simpletransformers 
!pip install transformers 

#Importing necessary libraries from simple transformers 
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel,Seq2SeqArgs 


In [None]:
#Rename columns as per pretrained model required format
df = df.rename(columns=dict(headlines='target_text', text='input_text'))

model_args = Seq2SeqArgs()

# Options applied on top of the Seq2SeqArgs defaults: a single training
# epoch, the no_save flag, and the evaluation-related flags.
# NOTE(review): exact effect of each flag is defined by simpletransformers;
# confirm against its documentation.
for option, value in (
    ('num_train_epochs', 1),
    ('no_save', True),
    ('evaluate_generated_text', True),
    ('evaluate_during_training', True),
    ('evaluate_during_training_verbose', True),
):
    setattr(model_args, option, value)


In [None]:
import torch
from sklearn.model_selection import train_test_split

# Initialize the model with type as 'bart' and provide model args
# NOTE: this rebinds `model`, which summarize_news reads as a global.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
    # Use the GPU when one is attached instead of hard-coding True,
    # so the cell does not fail on a CPU-only runtime.
    use_cuda=torch.cuda.is_available(),
)

#Splitting data into train-test (fixed seed so the split is reproducible)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Printed explicitly: a bare expression in the middle of a cell is not shown.
print("Train / test shapes:", train_df.shape, test_df.shape)

#Training the model and keeping eval dataset as test data
model.train_model(train_df, eval_data=test_df)


In [None]:
#Evaluating the model on the news test data
results = model.eval_model(test_df)

#print the evaluation results (a bare `results` mid-cell would not be shown)
print(results)

#Original test data text summary for top 10 news
for headline in test_df.target_text[:10]:
  print(headline)


In [None]:
#Predicted summary for the first 10 test articles, i.e. the same rows whose
#reference headlines are printed with test_df.target_text[:10].
# All inputs go through a single predict() call instead of one call per row.
predicted_headlines = model.predict(test_df.input_text[:10].tolist())

for summary in predicted_headlines:
  print(summary)


# T5 Pretrained Model

In [None]:
#Installing simple-T5
! pip install simplet5 -q   

#Import the library
from simplet5 import SimpleT5


In [None]:
# Model expects dataframe to have 2 column names
# Input as "source_text", Summary as "target_text", let us rename accordingly.
# Both the raw column names (headlines/text) and the names used for the BART
# section (target_text/input_text) are mapped, so this cell works whether or
# not the BART rename cell was executed first.
df = df.rename(columns={
    "headlines": "target_text",
    "text": "source_text",
    "input_text": "source_text",
})

# .copy() gives an independent frame, so the assignment below does not write
# into a slice of the original.
df = df[['source_text', 'target_text']].copy()

# Let us add a prefix "summarize: " for all source text.
# Rows that already carry the prefix are skipped, so re-running the cell
# does not produce "summarize: summarize: ...".
prefix = "summarize: "
needs_prefix = ~df['source_text'].str.startswith(prefix, na=False)
df.loc[needs_prefix, 'source_text'] = prefix + df.loc[needs_prefix, 'source_text']

# Preview only the first rows rather than the whole frame
df.head()


In [None]:
#Splitting data into train and test

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2)
train_df.shape, test_df.shape

#Initializing the model
model = SimpleT5()   

#Importing pretrained t5 model
model.from_pretrained(model_type="t5", model_name="t5-base")    

#Import torch as this model is built on top of pytorch
import torch
torch.cuda.empty_cache()

#Training the model with 5 epochs 

model.train(train_df=train_df,
            eval_df=test_df, 
            source_max_token_len=128, 
            target_max_token_len=50, 
            batch_size=8, max_epochs=5, use_gpu=True)

#Models built at each epoch
! ( cd outputs; ls )   


In [None]:
#Loading model saved with lowest loss
# The checkpoint directory name embeds the loss of that particular run, so it
# is discovered from disk instead of being hard-coded.
import re
from pathlib import Path


def _train_loss(checkpoint_dir):
    """Return the training loss encoded in a checkpoint directory name.

    Looks for ``train-loss-<number>`` in the name; returns ``None`` when the
    name does not contain it.
    """
    match = re.search(r'train-loss-(\d+(?:\.\d+)?)', checkpoint_dir.name)
    return float(match.group(1)) if match else None


_checkpoints = [
    path for path in Path("outputs").glob("*")
    if path.is_dir() and _train_loss(path) is not None
]
if not _checkpoints:
    raise FileNotFoundError(
        "No checkpoint directories found under 'outputs/'; run the training cell first."
    )

best_checkpoint = min(_checkpoints, key=_train_loss)
print("Loading checkpoint:", best_checkpoint)
model.load_model("t5", str(best_checkpoint), use_gpu=True)


In [None]:
#Original headlines: the target_text column of the test split

test_df.target_text


In [None]:
#Model summarized headlines: one predict() call per test article,
#printing whatever predict() returns for it
for source in test_df['source_text'].tolist():
  prediction = model.predict(source)
  print(prediction)


# BLEU Score

In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Score only the held-out rows: rows in train_df were seen during training.
# The reference for each prediction is the human-written headline
# (target_text), not the source article.
references = test_df['target_text'].tolist()
predictions = [model.predict(source)[0] for source in test_df['source_text']]

# sentence_bleu expects token lists (a list of reference token lists and one
# hypothesis token list); passing raw strings would compare single characters.
# NOTE(review): without a smoothing function, short headlines with no 4-gram
# overlap score (close to) zero; consider nltk's SmoothingFunction.
total_bleu = 0.0
for reference, prediction in zip(references, predictions):
  total_bleu += sentence_bleu(
    [reference.split()],
    prediction.split(),
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
  )

#Average BLEU score over the evaluated rows
total_bleu / len(references) if references else 0.0
