# Text Summarization - Encoder-Decoder with Attention Mechanism

### Importing Basic libraries

In [None]:
import random
import re

import contractions
import nltk
import numpy as np
import pandas as pd

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cp /content/drive/MyDrive/Data/news_summary.csv /content

### Importing Data

In [None]:
# Path of the news-summary CSV that is read with pandas.
data_path = '/content/news_summary.csv'

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [None]:
stop_words = stopwords.words('english')
# Frozen-set copy of the stop words: membership tests are O(1) instead of
# scanning the whole list once per token.
_stop_word_set = frozenset(stop_words)


def preprocess(text):
    """Normalise a raw sentence or paragraph for the summarisation model.

    Steps, in order: lower-case, expand contractions token by token, drop
    English stop words, remove possessive "'s", remove parenthesised spans,
    replace every character other than letters, digits, '.' and ' ' with a
    space, put a space after each '.', collapse whitespace runs and trim
    the ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with single spaces between tokens and no
        leading or trailing whitespace.
    """
    tokens = [contractions.fix(word) for word in text.lower().split()]
    # An expanded contraction can contain several words ("we've" -> "we have"),
    # so re-tokenise before filtering stop words.
    tokens = " ".join(tokens).split()
    text = " ".join(word for word in tokens if word not in _stop_word_set)
    text = text.replace("'s", '')
    # Match each parenthesised span on its own. A greedy `.*` would delete
    # everything between the first "(" and the last ")" in the text.
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^a-zA-Z0-9. ]', ' ', text)
    text = re.sub(r'\.', '. ', text)
    text = re.sub(r'\s+', ' ', text)
    # Trim the ends: a trailing blank would turn into an empty-string token
    # when the sentence is later split on ' '.
    return text.strip()


sample = '''Hello! This is our 'EE626' (PRML) course project. We've tried implementing a simple "encoder-decoder" model'''
print(preprocess(sample))

hello ee626 course project. tried implementing simple encoder decoder model


In [None]:
# Clean both columns with the shared preprocessing routine (headlines first, then text).
for column in ('headlines', 'text'):
    data[column] = data[column].apply(preprocess)

print(data['headlines'][0], data['text'][0], sep='\n')

upgrad learner switches career ml al 90 salary hike
saurav kant alumnus upgrad iiit b pg program machine learning artificial intelligence sr systems engineer infosys almost 5 years work experience. program upgrad 360 degree career support helped transition data scientist tech mahindra 90 salary hike. upgrad online power learning powered 3 lakh careers. 


In [None]:
# x holds the cleaned article texts, y the cleaned headlines used as summaries.
x = data['text']
y = data['headlines']

# Print the first 20 (summary, text) pairs, each followed by an empty line.
for idx in range(20):
    print(f'Summary: {y[idx]}')
    print(f'Text:    {x[idx]}')
    print()

Summary: upgrad learner switches career ml al 90 salary hike
Text:    saurav kant alumnus upgrad iiit b pg program machine learning artificial intelligence sr systems engineer infosys almost 5 years work experience. program upgrad 360 degree career support helped transition data scientist tech mahindra 90 salary hike. upgrad online power learning powered 3 lakh careers. 

Summary: delhi techie wins free food swiggy one year cred
Text:    kunal shah credit card bill payment platform cred gave users chance win free food swiggy one year. pranav kaushik delhi techie bagged reward spending 2000 cred coins. users get one cred coin per rupee bill paid used avail rewards brands like ixigo bookmyshow ubereats cult. fit more. 

Summary: new zealand end rohit sharma led india 12 match winning streak
Text:    new zealand defeated india 8 wickets fourth odi hamilton thursday win first match five match odi series. india lost international match rohit sharma captaincy 12 consecutive victories dating 

**PyTorch Setup**

In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Reserved vocabulary indices for the start-of-sequence and end-of-sequence markers.
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary that maps words to integer indices and counts occurrences.

    Attributes are populated incrementally through addSentence / addWord.
    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers, so the
    first real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        # Number of known entries, including the two reserved markers.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Add `word` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
def readLangs(text, summary):
    """Pair up texts with their summaries and create two empty vocabularies.

    Args:
        text: Sequence of source strings (for example a pandas Series).
        summary: Sequence of target strings, the same length as `text`.

    Returns:
        A tuple `(input_lang, output_lang, pairs)` where `pairs` is a list
        of `[text, summary]` lists and both `Lang` objects are still empty.

    Raises:
        ValueError: If `text` and `summary` differ in length.
    """
    if len(text) != len(summary):
        raise ValueError(
            f'text and summary must have the same length, got {len(text)} and {len(summary)}'
        )

    # Pair positionally via zip so the result does not depend on the
    # sequences having a 0..n-1 index.
    pairs = [[source, target] for source, target in zip(text, summary)]

    # Lang.name is a label. Passing the whole data object there makes any
    # later print of `.name` dump the data, so use the sequence's own
    # `name` attribute when it has one and a generic label otherwise.
    input_lang = Lang(str(getattr(text, 'name', None) or 'text'))
    output_lang = Lang(str(getattr(summary, 'name', None) or 'summary'))

    return input_lang, output_lang, pairs

In [None]:
def prepareData(lang1, lang2):
    """Build the source and target vocabularies from paired sentences.

    Args:
        lang1: Sequence of source texts.
        lang2: Sequence of target summaries aligned with `lang1`.

    Returns:
        A tuple `(input_lang, output_lang, pairs)` with both vocabularies
        filled from every pair.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2)

    # Report the number of pairs only. Interpolating the list itself prints
    # every pair and overruns the notebook's IOPub data-rate limit.
    print(f'Read {len(pairs)} sentence pairs', end='\n\n')

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    print(input_lang.name, input_lang.n_words, end='\n\n')
    print(output_lang.name, output_lang.n_words, end='\n\n')
    return input_lang, output_lang, pairs

In [None]:
# Build both vocabularies and the pair list from the cleaned text (x) and headline (y) columns.
input_lang, output_lang, pairs = prepareData(x, y)

# Show one randomly chosen pair.
print(random.choice(pairs))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



0        saurav kant alumnus upgrad iiit b pg program m...
1        kunal shah credit card bill payment platform c...
2        new zealand defeated india 8 wickets fourth od...
3        aegon life iterm insurance plan customers enjo...
4        speaking sexual harassment allegations rajkuma...
                               ...                        
98396    crpf jawan tuesday axed death sharp edged weap...
98397     uff yeh first song sonakshi sinha starrer upc...
98398    according reports new version 1999 science fic...
98399    new music video shows rapper snoop dogg aiming...
98400    madhesi morcha alliance seven political partie...
Name: text, Length: 98401, dtype: object 100136

0        upgrad learner switches career ml al 90 salary...
1         delhi techie wins free food swiggy one year cred
2        new zealand end rohit sharma led india 12 matc...
3        aegon life iterm insurance plan helps customer...
4                 known hirani yrs metoo claims true sonam
       

## Model

In [None]:
# Default `max_length`: number of rows in the encoder-output buffer, output width of the
# attention layer, and the cap on decoding steps in evaluate().
MAX_LENGTH = 50

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per forward call.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Width of both the embeddings and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU width, so embeddings feed the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run one GRU step and return `(output, hidden)`."""
        # Reshape the embedding to a 1 x 1 x hidden_size tensor before the GRU.
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder without attention that emits log-probabilities per step.

    Args:
        hidden_size: Width of both the embeddings and the GRU state.
        output_size: Number of rows in the embedding table and number of
            output classes (vocabulary size).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step and return `(log_probs, hidden)`."""
        # Embed the token, reshape to 1 x 1 x hidden_size, then apply ReLU.
        step_input = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(step_input, hidden)
        # Project the single time step onto the vocabulary and normalise.
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed-size buffer of encoder outputs.

    Args:
        hidden_size: Width of the embeddings and of the GRU state.
        output_size: Vocabulary size (embedding rows and output classes).
        dropout_p: Dropout probability applied to the embedded input token.
        max_length: Number of attention slots, i.e. the number of rows the
            `encoder_outputs` buffer passed to `forward` must have.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Scores one weight per encoder slot from [embedded token ; hidden state].
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Mixes [embedded token ; attention context] back down to hidden_size.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            `(log_probs, hidden, attn_weights)`: log-probabilities over the
            vocabulary, the updated GRU state, and the softmax-normalised
            attention weights over the encoder slots.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention weights are computed from the current token and the previous state.
        attn_input = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_input), dim=1)
        # Weighted sum of the encoder outputs (batched matmul with batch size 1).
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_out, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
def indexesFromSentence(lang, sentence):
    """Translate `sentence` into a list of vocabulary indices.

    The sentence is split on single spaces and each token is looked up in
    `lang.word2index`; an unknown token raises KeyError.
    """
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lang.word2index[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of token indices ending in EOS.

    Returns:
        A `torch.long` tensor of shape (number_of_tokens + 1, 1) on `device`.
    """
    token_ids = indexesFromSentence(lang, sentence)
    token_ids.append(EOS_token)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Encode a `[text, summary]` pair as `(input_tensor, target_tensor)`.

    The text is encoded with `input_lang`, the summary with `output_lang`.
    """
    source_sentence = pair[0]
    target_sentence = pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence),
    )

In [None]:
# Probability that a training step feeds the ground-truth token (rather than the
# decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    Args:
        input_tensor: Column tensor of source token indices.
        target_tensor: Column tensor of target token indices.
        encoder: Encoder module exposing `initHidden()` and `hidden_size`.
        decoder: Decoder module called as
            `decoder(decoder_input, decoder_hidden, encoder_outputs)`.
        encoder_optimizer: Optimiser for the encoder parameters.
        decoder_optimizer: Optimiser for the decoder parameters.
        criterion: Loss applied to each decoder output and target token.
        max_length: Number of rows in the encoder-output buffer. Source
            tokens beyond this limit are ignored.

    Returns:
        The summed loss of the step divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # The encoder-output buffer has exactly `max_length` rows. Clamp the
    # number of encoded tokens so a longer input is truncated instead of
    # raising IndexError on the buffer write below.
    input_length = min(input_tensor.size(0), max_length)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    # The decoder starts from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Feed the ground-truth token as the next input.
            decoder_input = target_tensor[di]
        else:
            # Feed the decoder's own best guess, detached so gradients do
            # not flow through the choice of input token.
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math

def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'.

    Fractional seconds are dropped, so a float duration prints as whole
    seconds (for example '2m 11s') rather than a long float.

    Args:
        s: Duration in seconds (int or float).

    Returns:
        The formatted string.
    """
    minutes, seconds = divmod(math.floor(s), 60)
    return f'{minutes}m {seconds}s'

def timeSince(since, percent):
    """Describe elapsed time and a linear estimate of the time remaining.

    Args:
        since: Start timestamp as returned by `time.time()`.
        percent: Fraction of the work completed so far (must be non-zero).

    Returns:
        A string of the form 'Time: <elapsed> (ETA: <remaining>)'.
    """
    elapsed = time.time() - since
    # Projected total duration assuming constant speed, minus the time already spent.
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return f'Time: {asMinutes(elapsed)} (ETA: {asMinutes(remaining)})'

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot a sequence of averaged loss values as a line chart.

    Args:
        points: Sequence of loss values, one per plotting interval.
    """
    # plt.subplots() creates the figure itself. A preceding plt.figure()
    # call only leaves an extra, empty figure behind.
    fig, ax = plt.subplots()
    # Place a major y-axis tick at every multiple of 0.2.
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)
    ax.set_xlabel('Plot interval')
    ax.set_ylabel('Average loss')

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the encoder/decoder on randomly sampled pairs and plot the loss.

    Args:
        encoder: Encoder module to optimise.
        decoder: Decoder module to optimise.
        n_iters: Number of single-pair training steps.
        print_every: Print timing and the average loss every this many steps.
        plot_every: Record one averaged loss point every this many steps.
        learning_rate: Learning rate for both SGD optimisers.
    """
    print("Training....")
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Sample (with replacement) and tensorise every training pair up front.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    # `step` rather than `iter`, so the builtin is not shadowed.
    for step in range(1, n_iters + 1):
        if step % 1000 == 0:
            # The total is n_iters, not n_iters + 1.
            print(step, "/", n_iters)
        input_tensor, target_tensor = training_pairs[step - 1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            # One f-string for the whole line. The earlier mix of f-string
            # and %-formatting had unbalanced parentheses and did not parse.
            print(f'{timeSince(start, progress)} (iter: {step} percent: {progress * 100:.0f}%) {print_loss_avg:.4f}')

        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode a summary for `sentence`.

    Both modules are switched to eval mode for the duration of the call,
    which disables dropout, and are restored to their previous mode
    afterwards.

    Args:
        encoder: Trained encoder module.
        decoder: Trained attention decoder module.
        sentence: Pre-processed source text whose tokens are all present
            in `input_lang`.
        max_length: Number of rows in the encoder-output buffer and the
            maximum number of decoding steps. Source tokens beyond this
            limit are ignored.

    Returns:
        A tuple `(decoded_words, attentions)`: the predicted tokens (ending
        in '<EOS>' if the decoder emitted EOS) and the attention weights of
        every decoding step that was run.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    # torch.no_grad() only disables gradient tracking. Dropout is switched
    # off by putting the modules in eval mode.
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            # Clamp to the buffer size so a long input is truncated instead
            # of raising IndexError on the buffer write below.
            input_length = min(input_tensor.size()[0], max_length)
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                # Greedy decoding: always take the single most likely token.
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            # Keep only the attention rows of the steps that were executed.
            return decoded_words, decoder_attentions[:di + 1]
    finally:
        # Restore whichever mode each module was in before the call.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print the text, reference summary and prediction for `n` random pairs."""
    for _ in range(n):
        sample = random.choice(pairs)
        source_text = sample[0]
        reference = sample[1]
        print('Text:       ', source_text)
        print('Summary:    ', reference)
        # The attention weights are returned too but are not shown here.
        predicted_words, _attentions = evaluate(encoder, decoder, source_text)
        print('Prediction: ', ' '.join(predicted_words))
        print('')

In [None]:
# Width shared by the embeddings and GRU states of both networks.
hidden_size = 200
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# Run 50,000 single-pair training steps, reporting the average loss every 1,000 steps.
trainIters(encoder1, attn_decoder1, 50000, print_every=1000)

Training....
1000 / 50001
2m 11s (- 107m 29s) (1000 2%) 6.9665
2000 / 50001
4m 21s (- 104m 41s) (2000 4%) 7.2466
3000 / 50001
6m 33s (- 102m 46s) (3000 6%) 7.1989
4000 / 50001
8m 46s (- 100m 51s) (4000 8%) 7.1776
5000 / 50001
10m 58s (- 98m 45s) (5000 10%) 7.0914
6000 / 50001
13m 10s (- 96m 33s) (6000 12%) 7.0301
7000 / 50001
15m 21s (- 94m 18s) (7000 14%) 7.0375
8000 / 50001
17m 33s (- 92m 10s) (8000 16%) 7.0442
9000 / 50001
19m 45s (- 90m 2s) (9000 18%) 7.0591
10000 / 50001
21m 57s (- 87m 50s) (10000 20%) 7.0480
11000 / 50001
24m 8s (- 85m 36s) (11000 22%) 6.9976
12000 / 50001
26m 20s (- 83m 24s) (12000 24%) 6.9607
13000 / 50001
28m 30s (- 81m 9s) (13000 26%) 6.9743
14000 / 50001
30m 42s (- 78m 57s) (14000 28%) 7.0121
15000 / 50001
32m 53s (- 76m 44s) (15000 30%) 6.9018
16000 / 50001
35m 4s (- 74m 31s) (16000 32%) 6.9362
17000 / 50001
37m 15s (- 72m 20s) (17000 34%) 6.9019
18000 / 50001
39m 27s (- 70m 9s) (18000 36%) 6.9120
19000 / 50001
41m 38s (- 67m 56s) (19000 38%) 7.0318
20000 /

In [None]:
# Save only the parameters (state_dict) of each network to the current working directory.
torch.save(encoder1.state_dict(), './enc.w')
torch.save(attn_decoder1.state_dict(), './att.w')

In [None]:
# Print predictions for randomly drawn pairs (10 by default).
evaluateRandomly(encoder1, attn_decoder1)

> us tuesday imposed fresh sanctions 13 chinese north korean organisations accusing supporting north korea nuclear programme trade commodities like coal helping evade nuclear restrictions .  comes us president donald trump declared north korea state sponsor terrorism .  notably north korea 90 trade china . n
= us announces sanctions china n korea trade
< us n n korea n n korea n korea <EOS>

> us startup elysium space partnered elon muskled spacex launch ashes 300 people space .  spacecraft reservations cost 1 . 6 lakh per person travel around earth pass every location world .  following twoyear journey spacecraft reenter earth orbit burn reentry . 
= spacex send ashes 300 people space 1 . 6 lakh
< us musk launches first first time <EOS>

> cbi joint director rajiv singh heading investigations 2 . 1billion pnb fraud involving nirav modi mehul choksi prematurely repatriated home cadre tripura .  move comes agency plans file red corner notice nirav choksi .  three bureaucrats also repatr