<a href="https://colab.research.google.com/github/Abhiram4572/Bi-LSTM_with_attention/blob/main/Bi_LSTM_With_Attention_with_testing_on_live_news_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import os
import sys
import torch
from torch.nn import functional as F
import numpy as np
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe
from torch.autograd import Variable
import torch.nn as nn
import requests
from bs4 import BeautifulSoup
import pandas as pd
from nltk.tokenize import word_tokenize
import time
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Create Placeholders and load the data

In [None]:
def tokenize(x):
    """Split raw text into word tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(x)


# TEXT describes how review text is processed: tokenized with `tokenize`,
# lower-cased, and batched batch-first with fix_length=200. Because
# include_lengths=True, each batch's `text` is a pair whose first element
# is the token-id tensor.
TEXT = data.Field(
    tokenize=tokenize,
    lower=True,
    include_lengths=True,
    batch_first=True,
    fix_length=200,
)
# LABEL holds the sentiment class of each review.
LABEL = data.LabelField()
# Downloads the IMDB archive on first use and returns the train/test splits.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 23.3MB/s]


# Build Vocabulary and Load Glove embeddings

In [None]:
# Build the text vocabulary from the training split and attach the
# 300-dimensional GloVe "6B" vectors to it; labels get their own vocabulary.
TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train_data)

# Keep handles that the model constructor needs later on.
vocab_size = len(TEXT.vocab)
word_embeddings = TEXT.vocab.vectors

print("\nLength of Vocabulary: " + str(vocab_size))
print("Vector size of Text Vocabulary: ", word_embeddings.size())

.vector_cache/glove.6B.zip: 862MB [06:27, 2.22MB/s]                           
100%|█████████▉| 399840/400000 [00:37<00:00, 10827.35it/s]


Length of Vocabulary: 114422
Vector size of Text Vocabulary:  torch.Size([114422, 300])


# Create Train, validation and test splits

In [None]:
# Carve a validation set out of the training data. No split ratio or random
# state is passed, so torchtext's defaults apply and the split is not seeded.
# NOTE(review): `train_data` is overwritten in place, so re-running this cell
# splits the already-reduced training set again.
train_data,valid_data = train_data.split()
# One iterator per split, all with batch_size=32; examples are ordered by
# token count via `sort_key` so that batches contain similarly sized texts.
train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.text), shuffle=True)

# Define Model (Bi-LSTM with Attention)

In [None]:
class AttentionModel(torch.nn.Module):
    """Bidirectional LSTM text classifier with dot-product attention.

    Token ids are embedded with frozen pre-trained vectors, encoded by a
    single-layer bidirectional LSTM, and summarised by attending over all
    time steps with the concatenated final hidden states as the query. A
    linear layer maps the attended vector to class scores (logits).

    Args:
        batch_size: Nominal batch size. Stored for reference only; the
            forward pass adapts to whatever batch it receives.
        output_size: Number of target classes.
        hidden_size: Hidden units per LSTM direction.
        vocab_size: Number of rows in the embedding table.
        embedding_length: Dimensionality of each word vector.
        weights: Pre-trained embedding matrix of shape
            ``(vocab_size, embedding_length)``.
    """

    def __init__(self,batch_size,output_size,hidden_size,vocab_size,embedding_length,weights):
        super(AttentionModel,self).__init__()
        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        self.word_embeddings = nn.Embedding(vocab_size,embedding_length)
        # nn.Embedding looks tokens up in its `weight` parameter. Assigning
        # to any other attribute name would leave the table randomly
        # initialised and trainable, silently ignoring the pre-trained vectors.
        self.word_embeddings.weight = nn.Parameter(weights,requires_grad=False)
        self.lstm = nn.LSTM(embedding_length,hidden_size,bidirectional=True)
        # Forward and backward features are concatenated, hence 2 * hidden_size.
        self.label = nn.Linear(2 * hidden_size, output_size)

    def attention_net(self,lstm_output,final_state):
        """Return the attention-weighted sum of the LSTM outputs.

        Args:
            lstm_output: ``(batch, seq_len, num_directions * hidden_size)``.
            final_state: Final hidden state as returned by ``nn.LSTM``,
                ``(num_directions, batch, hidden_size)``.

        Returns:
            Tensor of shape ``(batch, num_directions * hidden_size)``.
        """
        # Lay the per-direction final states side by side (forward first,
        # then backward) so the query matches the feature layout of
        # `lstm_output`.
        hidden = final_state.transpose(0, 1).reshape(final_state.size(1), -1)
        attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return new_hidden_state

    def forward(self,input_sentences,batch_size=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            input_sentences: LongTensor of shape ``(batch, seq_len)``.
            batch_size: Accepted for backward compatibility and ignored; the
                batch size is taken from ``input_sentences``.

        Returns:
            Tensor of shape ``(batch, output_size)`` with unnormalised scores.
        """
        i_t = self.word_embeddings(input_sentences)
        # nn.LSTM (batch_first=False) expects (seq_len, batch, features).
        i_t = i_t.permute(1,0,2)
        # Omitting (h_0, c_0) makes the LSTM start from zeros with the right
        # shape for both directions, on the same device as the input.
        output, (final_hidden_state,final_cell_state) = self.lstm(i_t)
        output = output.permute(1,0,2)
        attn_output = self.attention_net(output,final_hidden_state)
        scores = self.label(attn_output)
        return scores

# train function, evaluate function and some util functions

In [None]:
def clip_gradient(model, clip_value):
    """Clamp every existing gradient of ``model`` to ``[-clip_value, clip_value]``.

    Parameters that have no gradient are skipped. Gradients are modified
    in place and nothing is returned.
    """
    for param in model.parameters():
        if param.grad is None:
            continue
        param.grad.data.clamp_(-clip_value, clip_value)

def train_model(model,train_iter,train_flag,optimizer=None):
    """Run one pass over ``train_iter`` and report loss and accuracy.

    Args:
        model: Network to run; called as ``model(text, batch_size)``.
        train_iter: Iterable of batches exposing ``batch.text`` (a pair whose
            first element is the token-id tensor) and ``batch.label``.
        train_flag: When true, back-propagate, clip gradients and step the
            optimizer after each batch; when false, only measure.
        optimizer: Optional optimizer to reuse across calls. If omitted, a
            fresh Adam optimizer is created, which means its moment estimates
            restart on every call.

    Returns:
        ``[avg_loss, accuracy]`` where ``avg_loss`` is the mean of the
        per-batch losses and ``accuracy`` is the percentage of correctly
        classified examples. ``[0.0, 0.0]`` if the iterator yields nothing.
    """
    # Keep the model and the data on the same device; fall back to the CPU
    # when no GPU is present.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    if train_flag and optimizer is None:
        optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, model.parameters()))
    model.train()

    epoch_loss = 0.0
    correct_total = 0.0
    examples_seen = 0
    batches_seen = 0
    for batch in train_iter:
        text = batch.text[0].to(device)
        target = batch.label.long().to(device)
        n_examples = text.size(0)

        # Pass the real batch size so the final, smaller batch is processed
        # instead of being dropped.
        pred = model(text, n_examples)
        loss = loss_fn(pred,target)
        correct_ops = (torch.max(pred, 1)[1].view(target.size()) == target).float().sum()
        if train_flag:
            optimizer.zero_grad()
            loss.backward()
            clip_gradient(model,1e-1)
            optimizer.step()
        epoch_loss += loss.item()
        correct_total += correct_ops.item()
        examples_seen += n_examples
        batches_seen += 1

    if batches_seen == 0:
        return [0.0, 0.0]
    # Average only over the batches and examples that were actually seen.
    avg_loss = epoch_loss/batches_seen
    total_accuracy = 100.0 * correct_total/examples_seen
    return [avg_loss,total_accuracy]

def eval_model(model,train_iter):
    """Measure loss and accuracy of ``model`` on an iterator without training.

    Args:
        model: Network to evaluate; called as ``model(text, batch_size)``.
        train_iter: Iterable of batches to evaluate on (despite the name, any
            split may be passed). Each batch exposes ``batch.text`` (a pair
            whose first element is the token-id tensor) and ``batch.label``.

    Returns:
        ``[avg_loss, accuracy]`` where ``avg_loss`` is the mean of the
        per-batch losses and ``accuracy`` is the percentage of correctly
        classified examples. ``[0.0, 0.0]`` if the iterator yields nothing.
    """
    # Keep the model and the data on the same device; fall back to the CPU
    # when no GPU is present.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Inference mode for layers that behave differently while training.
    model.eval()

    epoch_loss = 0.0
    correct_total = 0.0
    examples_seen = 0
    batches_seen = 0
    with torch.no_grad():
        for batch in train_iter:
            text = batch.text[0].to(device)
            target = batch.label.long().to(device)
            n_examples = text.size(0)

            # Pass the real batch size so the final, smaller batch is
            # evaluated instead of being dropped.
            pred = model(text, n_examples)
            loss = loss_fn(pred,target)
            correct_ops = (torch.max(pred, 1)[1].view(target.size()) == target).float().sum()
            epoch_loss += loss.item()
            correct_total += correct_ops.item()
            examples_seen += n_examples
            batches_seen += 1

    if batches_seen == 0:
        return [0.0, 0.0]
    # Average only over the batches and examples that were actually seen.
    avg_loss = epoch_loss/batches_seen
    total_accuracy = 100.0 * correct_total/examples_seen
    return [avg_loss,total_accuracy]

# Hyperparameters

In [None]:
# Batch size handed to AttentionModel; the data iterators are also built
# with batch_size=32.
batch_size = 32
# Number of classes the classifier scores.
output_size = 2
# Hidden units of the LSTM.
hidden_size = 256
# Word-vector dimensionality; matches the 300-d GloVe vectors loaded above.
embedding_length = 300

# Initiate the Model

In [None]:
# Build the classifier from the hyperparameters and the GloVe matrix
# attached to the text vocabulary.
model = AttentionModel(
    batch_size=batch_size,
    output_size=output_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
    embedding_length=embedding_length,
    weights=word_embeddings,
)
# Loss used by the training and evaluation helpers.
loss_fn = F.cross_entropy

# Train Model

In [None]:
# Per-epoch average training loss, kept for later inspection.
train_loss = []
total_start = time.time()
# Train for a fixed 20 epochs; the validation iterator is not consulted here.
for epoch in range(20):
    start = time.time()
    # train_model returns [avg_loss, accuracy] for this pass over train_iter.
    train_info = train_model(model,train_iter,True)
    train_loss.append(train_info[0])
    end = time.time()
    print("Epoch "+str(epoch))
    print("Train Loss "+ str(train_info[0]) + " Train Accuracy "+ str(train_info[1]))
    print("Time taken for epoch is "+str(end-start))
    print('----------\n')
total_end = time.time()
print("Total training time "+str(total_end-total_start))

Epoch 0
Train Loss 0.6605933528713596 Train Accuracy 60.87180073126142
Time taken for epoch is 26.591073274612427
----------

Epoch 1
Train Loss 0.4668132898413067 Train Accuracy 78.09643510054845
Time taken for epoch is 16.03441095352173
----------

Epoch 2
Train Loss 0.29593820993871967 Train Accuracy 87.88276965265082
Time taken for epoch is 16.075244188308716
----------

Epoch 3
Train Loss 0.1764943404072109 Train Accuracy 93.24154478976234
Time taken for epoch is 16.037344932556152
----------

Epoch 4
Train Loss 0.09119091867988545 Train Accuracy 96.66361974405851
Time taken for epoch is 16.144171476364136
----------

Epoch 5
Train Loss 0.043636184749516985 Train Accuracy 98.40607861060329
Time taken for epoch is 16.2226881980896
----------

Epoch 6
Train Loss 0.022735922652630008 Train Accuracy 99.154478976234
Time taken for epoch is 16.267990112304688
----------

Epoch 7
Train Loss 0.013542690105331675 Train Accuracy 99.45726691042047
Time taken for epoch is 16.31441378593445
--

# Evaluate the model on test data

In [None]:
# eval_model returns [avg_loss, accuracy]; unpack both for the test split.
test_loss, test_acc = eval_model(model, test_iter)

In [None]:
# Report the test-set loss followed by the test-set accuracy (in percent).
print(test_loss)
print(test_acc)

1.6271028598047592
80.41480179028133


# Crawl live news from livemint.com

In [None]:
# Fetch the livemint.com front page and collect its headline elements.
r1 = requests.get('https://www.livemint.com/')
coverpage = r1.content

soup1 = BeautifulSoup(coverpage, 'html.parser')

coverpage_news = soup1.find_all('h2', class_='headline')

print(coverpage_news)

# Upper bound on the number of front-page headlines to scrape.
number_of_articles = 40

# Parallel lists for links, titles and article bodies. An entry is added to
# all three only after an article was scraped completely, so the lists always
# have equal length and can be turned into a DataFrame.
news_contents = []
list_links = []
list_titles = []

# Never index past the headlines that are actually on the page.
for n in range(min(number_of_articles, len(coverpage_news))):
    print(n)
    try:
        anchor = coverpage_news[n].find('a')

        # Relative links are resolved against the site root.
        link = anchor['href']
        if not link.startswith('https://www.livemint.com'):
            link = 'https://www.livemint.com' + link

        title = anchor.get_text()

        # The article text is split across <p> tags inside the main area.
        article = requests.get(link)
        soup_article = BeautifulSoup(article.content, 'html.parser')
        body = soup_article.find_all('div', class_='mainArea')
        paragraphs = body[0].find_all('p')

        # Joined once per article; an article without paragraphs yields an
        # empty string rather than the previous article's text.
        final_article = " ".join(p.get_text() for p in paragraphs)
    except (TypeError, KeyError, IndexError, requests.RequestException):
        # Headline without a usable link, article without a main area, or a
        # failed download: report the index and skip the article.
        print(n)
        continue

    list_links.append(link)
    list_titles.append(title)
    news_contents.append(final_article)

df = pd.DataFrame({'Links': list_links,'Heading': list_titles,'Content': news_contents,})

print(df)

[<h2 class="headline" data-mob="Sales to normalise by Jan-end, getting orders from Jio Mart: Kishore Biyani" data-reg="Sales to normalise by Jan-end, getting orders from Jio Mart: Kishore Biyani" data-title="Sales to normalise by Jan-end, getting orders from Jio Mart: Kishore Biyani" id="headline_11610266201545"><a href="https://www.livemint.com/companies/people/sales-to-normalise-by-jan-end-getting-orders-from-jio-mart-kishore-biyani-11610266201545.html" onclick="event.preventDefault();">Sales to normalise by Jan-end, getting orders from Jio Mart: Kishore Biyani</a></h2>, <h2 class="headline"><a href="/technology/tech-news/the-danger-of-exaggerating-china-s-technological-prowess-11610212952365.html">The danger of exaggerating China’s technological prowess</a></h2>, <h2 class="headline"><a href="/news/world/what-we-already-know-about-investing-in-2021-11610212587732.html">What we already know about investing in 2021</a></h2>, <h2 class="headline"><a href="/news/world/microsoft-designer

# Util functions - get score, add scores for multiple sentences

In [None]:
def get_score(sentence):
    """Score one sentence with the trained model.

    The sentence is preprocessed with ``TEXT`` (tokenised and lower-cased),
    mapped to vocabulary ids and run through ``model`` as a batch of one.

    Args:
        sentence: Raw sentence text.

    Returns:
        A two-element list ``[scores[0][0], scores[0][1]]`` of softmax
        probabilities (0-dim tensors detached from the autograd graph).
        ``get_max_score`` treats index 0 as negative and index 1 as positive.
    """
    tokens = TEXT.preprocess(sentence)
    token_ids = [[TEXT.vocab.stoi[token] for token in tokens]]

    # Build the input on whatever device the model lives on instead of
    # assuming a GPU.
    device = next(model.parameters()).device
    test_tensor = torch.tensor(token_ids, dtype=torch.long, device=device)

    model.eval()
    # Inference only: no autograd graph is kept alive by the returned scores.
    with torch.no_grad():
        output = model(test_tensor,1)
        scores = F.softmax(output,1)
    return [scores[0][0], scores[0][1]]

def get_max_score(list_of_scores):
    """Pick the overall sentiment from per-sentence ``[negative, positive]`` scores.

    The negative and positive scores are summed separately across all
    entries. "Negative" is returned only when the negative total is strictly
    larger; ties and empty input yield "Positive".
    """
    negative_total = sum(pair[0] for pair in list_of_scores)
    positive_total = sum(pair[1] for pair in list_of_scores)
    return "Negative" if negative_total > positive_total else "Positive"

def get_element_with_max_frequency(ip):
    """Map the most frequent element of ``ip`` to a sentiment label.

    Each distinct value in ``ip`` is counted. If the value with the highest
    count equals 1 the result is "Positive", otherwise "Negative". An empty
    ``ip`` raises ``IndexError``.
    """
    tallies = [
        [candidate, sum(1 for item in ip if item == candidate)]
        for candidate in set(ip)
    ]
    # Stable descending sort by count: the first entry is the winner.
    tallies.sort(key=lambda pair: pair[-1], reverse=True)
    winner = tallies[0][0]
    return "Positive" if winner == 1 else "Negative"

# Run on the news collected and store results

In [None]:
# Score every non-empty, period-delimited sentence of each article once and
# keep the [sentence, [negative_score, positive_score]] pairs.
df['sentences'] = df['Content'].apply(lambda x:[[y,get_score(y)] for y in x.split('.') if y.strip() != ''])
# get_score returns class probabilities rather than 0/1 labels, so a majority
# vote that compares against 1 cannot work. The document sentiment is instead
# the class with the larger summed probability, computed from the scores
# already stored above so that no sentence is scored twice.
df['Document_sentiment'] = df['sentences'].apply(lambda scored:get_max_score([score for _, score in scored]))
df.to_csv('results.csv')

In [None]:
# Display the results frame, including the per-sentence scores and the
# document-level sentiment column.
df

Unnamed: 0,Links,Heading,Content,sentences,Document_sentiment
0,https://www.livemint.com/companies/people/sale...,"Sales to normalise by Jan-end, getting orders ...",\nNEW DELHI :\nFuture Group expects normal sal...,[[\nNEW DELHI :\nFuture Group expects normal s...,Negative
1,https://www.livemint.com/technology/tech-news/...,The danger of exaggerating China’s technologic...,\nThe US-China relationship will be the great ...,[[\nThe US-China relationship will be the grea...,Negative
2,https://www.livemint.com/news/world/what-we-al...,What we already know about investing in 2021,\nFor decades I’ve warned investors about the ...,[[\nFor decades I’ve warned investors about th...,Negative
3,https://www.livemint.com/news/world/microsoft-...,Microsoft designer fell out of bed and found a...,\nFalling out of bed wrecked August de los Rey...,[[\nFalling out of bed wrecked August de los R...,Negative
4,https://www.livemint.com/news/world/where-to-t...,Where to travel after Covid? 25 inspiring escapes,\nLinda Lau is looking forward to being jet-la...,[[\nLinda Lau is looking forward to being jet-...,Positive
5,https://www.livemint.com/news/world/what-kids-...,What kids can learn from losing,"\nEveryone wants what’s best for their kid, bu...","[[\nEveryone wants what’s best for their kid, ...",Negative
6,https://www.livemint.com/news/india/ahead-of-c...,"Ahead of Covid vaccination drive, Centre guide...",\nNew Delhi: With Covid-19 vaccination drive s...,[[\nNew Delhi: With Covid-19 vaccination drive...,Negative
7,https://www.livemint.com/news/india/still-not-...,Still not filed your Income Tax Returns? Exten...,\nThe extended deadline for individual taxpaye...,[[\nThe extended deadline for individual taxpa...,Positive
8,https://www.livemint.com/industry/media/bollyw...,Bollywood looking to beat the blues in 2021,\nBollywood may have had a tough year with 202...,[[\nBollywood may have had a tough year with 2...,Positive
9,https://www.livemint.com/money/personal-financ...,SBI revises fixed deposit (FD) rates. Latest F...,\nState Bank of India (SBI) has increased fixe...,[[\nState Bank of India (SBI) has increased fi...,Negative


Referred to multiple resources online.