In [None]:
!pip install transformers nltk

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
import random
import math
import time
import torch
import torch.nn.functional as F
import spacy
from tqdm import tqdm
spacy_en = spacy.load('en_core_web_sm')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 1) LSTM (Encoder + Decoder)

In [None]:
# NLTK's English stopword list; preprocess() drops every token found in it.
stop_words = stopwords.words('english')

def preprocess(text, stop_word_list=None):
    """Normalise a raw news string into lowercase, stopword-free text.

    Steps, in order: lowercase; split on whitespace and drop stopwords;
    delete every "'s"; delete parenthesised spans; delete every character
    that is not an ASCII letter, digit, period or space; surround each
    period with spaces.

    Args:
        text: Raw input string.
        stop_word_list: Optional iterable of (lowercase) words to drop.
            When omitted, the module-level ``stop_words`` list is used.

    Returns:
        The cleaned string. Double spaces can remain where characters or
        parenthesised spans were removed.
    """
    # A set gives constant-time membership tests; the result is the same as
    # scanning the list.
    active_stop_words = set(stop_words if stop_word_list is None else stop_word_list)

    # str.split() without arguments already collapses whitespace runs, so a
    # single split is enough before filtering.
    kept_words = [word for word in text.lower().split() if word not in active_stop_words]
    text = " ".join(kept_words)

    text = text.replace("'s", '')
    # NOTE(review): `.*` is greedy, so everything between the first "(" and
    # the last ")" is removed, not just each bracketed span. Left as is so
    # that existing vocabularies and outputs do not change.
    text = re.sub(r'\(.*\)', '', text)
    text = re.sub(r'[^a-zA-Z0-9. ]', '', text)
    text = re.sub(r'\.', ' . ', text)
    return text

# Download the news-summary CSV, then clean both text columns with preprocess().
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/sunnysai12345/News_Summary/master/news_summary_more.csv",
    encoding='utf-8',
)
df['headlines'] = df['headlines'].map(preprocess)
df['text'] = df['text'].map(preprocess)
df.head()

Unnamed: 0,headlines,text
0,upgrad learner switches career ml al 90 salar...,saurav kant alumnus upgrad iiitb pg program ma...
1,delhi techie wins free food swiggy one year cred,kunal shah credit card bill payment platform c...
2,new zealand end rohit sharmaled india 12match ...,new zealand defeated india 8 wickets fourth od...
3,aegon life iterm insurance plan helps customer...,aegon life iterm insurance plan customers enjo...
4,known hirani yrs metoo claims true sonam,speaking sexual harassment allegations rajkuma...


In [None]:
### create vocab index

# Reserved token ids. They occupy the first indices so they can never collide
# with word ids, which Lang hands out starting right after the largest
# reserved id.
PAD_TOKEN = 0
SOS_token = 1  # start of sentence
EOS_token = 2  # end of sentence

class Lang:
    """Vocabulary that maps words to integer ids and back.

    Attributes:
        name: Free-form label for this vocabulary.
        word2index: word -> id.
        word2count: word -> number of times the word was added.
        index2word: id -> word, pre-populated with the reserved tokens.
        n_words: Next id to assign. It is always larger than every id in
            ``index2word``, so it can be used as an embedding-table size.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS", PAD_TOKEN: "PAD"}
        # Start numbering words after the largest reserved id so that a word
        # can never overwrite SOS / EOS / PAD.
        self.n_words = max(self.index2word) + 1

    def addSentence(self, sentence):
        """Tokenise ``sentence`` with the spaCy tokenizer and register every token."""
        for word in spacy_en.tokenizer(sentence):
            self.addWord(word.text)

    def addWord(self, word):
        """Register one occurrence of ``word``, assigning a new id on first sight."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# One vocabulary per column. The constructor argument is only a label, so pass
# the column name rather than the whole Series.
trg_lang = Lang('headlines')
src_lang = Lang('text')

for line in df['headlines'].tolist():
    trg_lang.addSentence(line)

for line in df['text'].tolist():
    src_lang.addSentence(line)

# Vocabulary sizes (reserved tokens included); used later as embedding sizes.
print(trg_lang.n_words)
print(src_lang.n_words)

41147
99924


In [None]:
### dataset

from torch.utils.data import DataLoader, Dataset, random_split

class News_Dataset(Dataset):
    """Map-style dataset of (article, headline) pairs as fixed-length id lists.

    Every sequence is laid out as ``SOS, word ids..., EOS, PAD...`` and is
    truncated / padded to a fixed length.

    Args:
        src: List of source (article) strings.
        trg: List of target (headline) strings.
        src_lang: Vocabulary whose ``word2index`` covers every source token.
        trg_lang: Vocabulary whose ``word2index`` covers every target token.
        max_len: Stored on the instance for backward compatibility; it does
            not control truncation.
        src_max_len: Fixed length of every encoded source sequence.
        trg_max_len: Fixed length of every encoded target sequence.
    """

    def __init__(self, src, trg, src_lang, trg_lang, max_len=256,
                 src_max_len=128, trg_max_len=32):
        self.max_len = max_len
        self.src = self.tokenize(src, src_lang, src_max_len)
        self.trg = self.tokenize(trg, trg_lang, trg_max_len)

    def tokenize(self, sentence_list, lang, max_len):
        """Encode each sentence as a list of exactly ``max_len`` token ids.

        Raises:
            KeyError: If a token is missing from ``lang.word2index``.
        """
        token_out = []
        for sentence in sentence_list:
            token = [SOS_token]
            token.extend(lang.word2index[word.text] for word in spacy_en.tokenizer(sentence))
            # Leave room for EOS, then right-pad up to the fixed length.
            token = token[:max_len - 1]
            token.append(EOS_token)
            token.extend([PAD_TOKEN] * (max_len - len(token)))
            token_out.append(token)
        return token_out

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as a dict of tensors under 'src' and 'trg'."""
        return {"src": torch.tensor(self.src[idx]),
                "trg": torch.tensor(self.trg[idx])}

TEST_SIZE = 20000   # number of examples held out for evaluation
SPLIT_SEED = 42     # fixed seed so the train/test split is reproducible

news_dataset = News_Dataset(df['text'].tolist(), df['headlines'].tolist(), src_lang, trg_lang)
train_dataset, test_dataset = random_split(
    news_dataset,
    [len(news_dataset) - TEST_SIZE, TEST_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [None]:
### modeling

class Encoder(torch.nn.Module):
    """Embeds source token ids and runs them through a unidirectional LSTM.

    Only the final hidden and cell states are returned; the per-step outputs
    of the LSTM are discarded.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = torch.nn.Embedding(input_dim, emb_dim)
        # batch_first=True only affects the input/output layout
        # (batch, seq, feature); hidden and cell keep the layer dimension first.
        self.rnn = torch.nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=False,
        )
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` ([batch size, src len]) into final LSTM states.

        Returns:
            (hidden, cell), each [n layers, batch size, hid dim].
        """
        # token_vectors = [batch size, src len, emb dim]
        token_vectors = self.dropout(self.embedding(src))

        # The first return value (top-layer output at every step) is not needed.
        _, final_state = self.rnn(token_vectors)
        hidden, cell = final_state
        return hidden, cell

class Decoder(torch.nn.Module):
    """Single-step LSTM decoder.

    Takes one target token per example plus the previous LSTM state and
    returns a score for every word of the output vocabulary together with
    the updated state.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = torch.nn.Embedding(output_dim, emb_dim)
        # The decoder is always unidirectional.
        self.rnn = torch.nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=False,
        )
        # Projects the LSTM output onto the output vocabulary.
        self.fc_out = torch.nn.Linear(hid_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, trg, hidden, cell):
        """Run one decoding step.

        Args:
            trg: Current input token ids, [batch size].
            hidden: Previous hidden state, [n layers, batch size, hid dim].
            cell: Previous cell state, [n layers, batch size, hid dim].

        Returns:
            (prediction, hidden, cell) where prediction is
            [batch size, output dim] and the states keep their input shape.
        """
        # Add a sequence axis of length 1: [batch size] -> [batch size, 1].
        step_tokens = trg.unsqueeze(1)

        # step_vectors = [batch size, 1, emb dim]
        step_vectors = self.dropout(self.embedding(step_tokens))

        # rnn_out = [batch size, 1, hid dim]
        rnn_out, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))

        # Drop the length-1 sequence axis before projecting.
        prediction = self.fc_out(rnn_out.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(torch.nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module with ``hid_dim`` / ``n_layers`` attributes whose
            forward returns ``(hidden, cell)`` for a source batch.
        decoder: Module with ``hid_dim`` / ``n_layers`` / ``output_dim``
            attributes whose forward maps ``(tokens, hidden, cell)`` to
            ``(prediction, hidden, cell)``.
        device: Kept on the instance for callers that read it. The output
            buffer is allocated on the device of ``trg`` instead, so the
            module keeps working after ``.to(...)`` and on CPU-only machines.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` from ``src``.

        Args:
            src: Source token ids, [batch size, src len].
            trg: Target token ids, [batch size, trg len]; column 0 is fed to
                the decoder as the first input.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of
                the model's own prediction as the next input.

        Returns:
            Tensor [batch size, trg len, output dim] on ``trg``'s device.
            Position 0 along the sequence axis is never written and stays zero.
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate directly where the targets live: no CPU -> device copy and
        # no dependency on a separately configured device string.
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=trg.device)

        # Final encoder states initialise the decoder.
        hidden, cell = self.encoder(src)

        # First decoder input is the first target column (the <sos> tokens).
        dec_input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(dec_input, hidden, cell)
            outputs[:, t, :] = output

            # One coin flip per time step decides the next input for the batch.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            dec_input = trg[:, t] if teacher_force else top1

        return outputs

In [None]:
# seq2seq model's config variables
INPUT_DIM = src_lang.n_words
OUTPUT_DIM = trg_lang.n_words
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# initialize seq2seq model
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, DEVICE)

In [None]:
### testing
# Shape smoke test on a single training batch.
batch = next(iter(train_loader))

# Call the modules themselves rather than .forward() so that any registered
# hooks run as well.
hidden, cell = enc(batch['src'])
print('Encoder Output:', hidden.size(), cell.size())

dec_input = batch['trg'][:, 0]
print('Decoder Input', dec_input.shape)
output, hidden, cell = dec(dec_input, hidden, cell)
print('Decoder Output:', output.size(), cell.size())
print('Decoder Output First Letter: ', output.argmax(1))

final_output = model(batch['src'], batch['trg'])
print('Final Output:', final_output.size())

# Manual first step of the decoding loop. The buffer dimensions are read from
# the tensors so this keeps working if the target length or vocabulary changes.
outputs = torch.zeros(batch['trg'].size(0), batch['trg'].size(1), output.size(-1))
outputs[:, 0, :] = output
teacher_force = random.random() < 0.8
dec_input = batch['trg'][:, 1] if teacher_force else output.argmax(1)

Encoder Output: torch.Size([2, 32, 256]) torch.Size([2, 32, 256])
Decoder Input torch.Size([32])
Decoder Output: torch.Size([32, 41147]) torch.Size([2, 32, 256])
Decoder Output First Letter:  tensor([36728, 36728, 36728, 25453, 14006, 36728, 36728, 36433, 36728, 36728,
        36728, 36728, 36433, 36728, 36728, 36728, 36728, 36728, 36728, 36728,
        36728, 36728, 36728, 10167, 36728, 36728, 36433, 36728, 36728, 36728,
         8761, 36728])
Final Output: torch.Size([32, 32, 41147])


In [None]:
### trainer

class Seq2Seq_trainer(object):
    """Training / evaluation harness for a sequence-to-sequence model.

    The wrapped model is called as ``model(src, trg[, teacher_forcing_ratio])``
    and must return scores shaped [batch size, trg len, output dim]. Every
    batch yielded by the iterators is a mapping with 'src' and 'trg' tensors.
    """

    def __init__(self, model, train_iterator, valid_iterator, pad_index, device, clip, learning_rate):
        """
        Args:
            model: Module to train; it is moved to ``device``.
            train_iterator: Sized iterable of training batches.
            valid_iterator: Sized iterable of validation batches.
            pad_index: Target id ignored by the loss.
            device: Device the model and every batch are moved to.
            clip: Maximum gradient norm.
            learning_rate: Adam learning rate.
        """
        self.model = model.to(device)
        self.train_iterator = train_iterator
        self.valid_iterator = valid_iterator
        self.clip = clip
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_index)
        # named_parameters() already recurses into sub-modules, so one call on
        # the root initialises every parameter exactly once. Going through
        # model.apply() would re-draw nested parameters once per ancestor.
        self.init_weights(self.model)
        self.device = device
        print(f'The model has {self.count_parameters(self.model):,} trainable parameters')

    def init_weights(self, m):
        """Re-draw every parameter of ``m`` (sub-modules included) from U(-0.08, 0.08)."""
        for _, param in m.named_parameters():
            torch.nn.init.uniform_(param.data, -0.08, 0.08)

    def count_parameters(self, model):
        """Return the number of trainable scalar parameters in ``model``."""
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    def _batch_loss(self, batch, teacher_forcing_ratio=None):
        """Forward one batch and return its loss.

        With ``teacher_forcing_ratio=None`` the model's own default ratio is used.
        """
        src = batch['src'].to(self.device)
        trg = batch['trg'].to(self.device)
        if teacher_forcing_ratio is None:
            output = self.model(src, trg)
        else:
            output = self.model(src, trg, teacher_forcing_ratio)

        # Position 0 is skipped on both sides, then the batch and time axes
        # are flattened:
        #   output -> [(trg len - 1) * batch size, output dim]
        #   trg    -> [(trg len - 1) * batch size]
        output = output[:, 1:, :].reshape(-1, output.shape[-1])
        trg = trg[:, 1:].reshape(-1)
        return self.criterion(output, trg)

    def train(self):
        """Run one optimisation pass over ``train_iterator``; return the mean batch loss."""
        self.model.train()
        epoch_loss = 0

        for batch in self.train_iterator:
            self.optimizer.zero_grad()
            loss = self._batch_loss(batch)
            loss.backward()

            # Clip to prevent exploding gradients.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
            self.optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / len(self.train_iterator)

    def evaluate(self):
        """Return the mean batch loss over ``valid_iterator`` with teacher forcing off."""
        self.model.eval()
        epoch_loss = 0
        with torch.no_grad():
            for batch in self.valid_iterator:
                epoch_loss += self._batch_loss(batch, 0).item()
        return epoch_loss / len(self.valid_iterator)

    def epoch_time(self, start_time, end_time):
        """Split an elapsed time in seconds into whole (minutes, seconds)."""
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def fit(self, nepochs):
        """Train for ``nepochs`` epochs, printing timing, loss and perplexity per epoch."""
        best_valid_loss = float('inf')

        for epoch in tqdm(range(nepochs)):
            start_time = time.time()
            train_loss = self.train()
            valid_loss = self.evaluate()
            end_time = time.time()
            epoch_mins, epoch_secs = self.epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                # torch.save(model.state_dict(), 'tut1-model.pt')
                print(f'Epoch with best validation loss: {epoch + 1:02}')

            print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss) : 7.3f}')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss) : 7.3f}')

    def predict(self, iterator):
        """Decode every batch of ``iterator`` without teacher forcing.

        Returns:
            Long tensor [number of examples, trg len] of arg-max token ids,
            or an empty long tensor when ``iterator`` yields nothing.
        """
        self.model.eval()
        batch_predictions = []
        with torch.no_grad():
            for batch in tqdm(iterator):
                src = batch['src'].to(self.device)
                trg = batch['trg'].to(self.device)
                # turn off teacher forcing
                output = self.model(src, trg, 0)
                batch_predictions.append(torch.argmax(output, -1))

        if not batch_predictions:
            return torch.empty(0, dtype=torch.long)
        # Concatenate once at the end instead of re-copying on every batch.
        return torch.cat(batch_predictions, 0)

In [None]:
# seq2seq model's config variables
INPUT_DIM = src_lang.n_words
OUTPUT_DIM = trg_lang.n_words
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# initialize seq2seq model
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, DEVICE)

# define data loader
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

# The loss ignores the padding id, so pass the named constant, not a literal.
trainer = Seq2Seq_trainer(model, train_loader, test_loader, PAD_TOKEN, DEVICE, 1, 1e-3)
trainer.fit(10)
result = trainer.predict(test_loader).to('cpu').numpy()
# An id with no vocabulary entry would make .get() return None and crash
# " ".join, so fall back to a placeholder.
print(" ".join([trg_lang.index2word.get(int(x), "UNK") for x in result[99]]))

The model has 30,475,067 trainable parameters


  0%|          | 0/10 [00:00<?, ?it/s]

# 2) Abstractive Summarization

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig

# Reload the raw CSV; this section works on the original, unprocessed text.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/sunnysai12345/News_Summary/master/news_summary_more.csv",
    encoding='utf-8',
)

# Summarisation model and its tokenizer come from the same checkpoint.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
df.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [None]:
# Summarise a single article (row 2 of the frame).
test_str = df['text'].values[2:3].tolist()
# Calling the tokenizer directly replaces the deprecated batch_encode_plus;
# truncation=True caps over-long inputs at the tokenizer's configured maximum.
inputs = tokenizer(test_str, return_tensors='pt', padding=True, truncation=True)

summary_ids = model.generate(
    inputs['input_ids'],
    # Tell the model which positions are padding so that batches with more
    # than one article are handled correctly.
    attention_mask=inputs['attention_mask'],
    early_stopping=False,
    repetition_penalty=1.4,
    length_penalty=1,
    min_length=20,
    max_length=100,
)

bart_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print('Original Text', test_str)
print('Summary',bart_summary)

Original Text ["New Zealand defeated India by 8 wickets in the fourth ODI at Hamilton on Thursday to win their first match of the five-match ODI series. India lost an international match under Rohit Sharma's captaincy after 12 consecutive victories dating back to March 2018. The match witnessed India getting all out for 92, their seventh lowest total in ODI cricket history."]
Summary New Zealand defeated India by 8 wickets in the fourth ODI at Hamilton. India lost an international match under Rohit Sharma's captaincy after 12 consecutive victories dating back to March 2018. The match witnessed India getting all out for 92, their seventh lowest total in ODI cricket history.


# 3) Extractive Summary

In [None]:
import re
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

# Reload the raw CSV for the extractive-summary section.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/sunnysai12345/News_Summary/master/news_summary_more.csv",
    encoding='utf-8',
)
df.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [None]:
def split_sentence(text):
    """Split ``text`` into sentences on ". " and each sentence into words.

    Words are obtained by splitting on single spaces and are otherwise left
    untouched (case, digits and punctuation are kept), because the same
    tokens are later joined back together to print the summary.

    Empty or whitespace-only sentences, such as the one produced by a
    trailing ". ", are skipped. The last real sentence is kept.

    Args:
        text: Input string.

    Returns:
        List of sentences, each a list of word strings.
    """
    sentences = []
    for sentence in text.split(". "):
        # Skip only empty fragments; an unconditional pop() of the last
        # element would also discard a real final sentence.
        if not sentence.strip():
            continue
        sentences.append(sentence.split(" "))
    return sentences

def sentence_similarity(sent1, sent2, stopwords=None):
    """Cosine similarity between the word-count vectors of two sentences.

    Words are lowercased before counting and words listed in ``stopwords``
    are not counted. For non-degenerate inputs this matches
    ``1 - cosine_distance(vector1, vector2)``.

    Args:
        sent1: First sentence as a list of word strings.
        sent2: Second sentence as a list of word strings.
        stopwords: Optional iterable of lowercase words to ignore.

    Returns:
        A float in [0, 1]. Returns 0.0 when either sentence has no countable
        words, where the plain cosine formula would divide by zero and yield NaN.
    """
    ignored = set(stopwords) if stopwords is not None else set()

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # One vector slot per distinct word; the dict replaces repeated
    # list.index() scans.
    slot_of = {word: slot for slot, word in enumerate(set(sent1 + sent2))}
    vector1 = np.zeros(len(slot_of))
    vector2 = np.zeros(len(slot_of))

    for w in sent1:
        if w not in ignored:
            vector1[slot_of[w]] += 1

    for w in sent2:
        if w not in ignored:
            vector2[slot_of[w]] += 1

    norm_product = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))
    if norm_product == 0:
        return 0.0
    return float(np.dot(vector1, vector2) / norm_product)

def build_similarity_matrix(sentences, stop_words):
    """Return the pairwise sentence-similarity matrix.

    Entry [i][j] is ``sentence_similarity(sentences[i], sentences[j],
    stop_words)`` for i != j; the diagonal is left at zero.
    """
    n_sentences = len(sentences)
    similarity_matrix = np.zeros((n_sentences, n_sentences))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            # A sentence is not compared with itself.
            if row != col:
                similarity_matrix[row][col] = sentence_similarity(left, right, stop_words)

    return similarity_matrix

# NLTK's English stopword list, passed to the similarity functions so that
# these words are not counted.
stop_words = stopwords.words('english')

In [None]:
### n sample set and combine
sample_text = df['text'].sample(10).values.tolist()
sample_text = [re.sub("[.]", "", x) for x in sample_text]
combined_text = '. '.join(sample_text)
splitted_text = split_sentence(combined_text)
print(sentence_similarity(splitted_text[0], splitted_text[1], stop_words))

### get similarity matrix for each sentences
sentence_similarity_martix = build_similarity_matrix(splitted_text, stop_words)
scores_df = pd.DataFrame(sentence_similarity_martix)
scores_df = scores_df.sum().to_frame().reset_index(drop=True).sort_values(0, ascending=False)
scores = scores_df[0].to_dict()

### sort sentences according to similarity score
ranked_sentence = sorted(
    ((scores[i], s) for i, s in enumerate(splitted_text)),
    reverse=True
)

### extract the actual sentences
ranked_sentences_df = pd.DataFrame(ranked_sentence, columns=['similarity_score', 'content'])
ranked_sentences_df['content'] = [' '.join(x) for x in ranked_sentences_df['content']]

### select the top n most representative sentence
top_n = 5
summarize_text = ranked_sentences_df.head(top_n)['content'].values.tolist()
summarize_text = ". ".join(summarize_text)
summarize_text

0.0


'According to reports, Google is developing a technology that will let publishers create visual-oriented media content like Snapchat\'s Discover feature Google\'s new feature will be a mix of photos, videos, and text, the reports added Google is reportedly in talks with CNN, The Washington Post, Time, and Vox Media, among others regarding their participation in the feature . Actress Kajol, while speaking about pay parity in Bollywood, said, "The pay should be according to genre, how the box office is and box office success" "Pay equality is coming up to par To call it a \'trend\' would be strange (as it is) something that should be a natural fact of life," she added. Chhattisgarh\'s Bastar, which is among the worst Maoist-affected regions, got its first Business Process Outsourcing (BPO) centre run by a private firm Four hundred tribal youths will be employed, at a monthly stipend of â\x82¹4,000, and will be trained in typing, speaking English and technology Located at a government col