<a href="https://colab.research.google.com/github/Ambika2501/IndengenousInternship/blob/main/WebAPI_SentimentClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
%%capture
# Environment setup cell; %%capture silences the installer output of every line below.
# newspaper3k is used later to scrape the full text of each news article.
!pip install newspaper3k
# spaCy plus the spacytextblob package provide the sentiment pipeline.
!pip install spacy
!pip install spacytextblob
# Fetch the TextBlob corpora and the small English spaCy model loaded by the sentiment code.
!python -m textblob.download_corpora
!python -m spacy download en_core_web_sm
# Hugging Face transformers supplies the topic-classification model and tokenizer.
!pip install transformers

In [4]:
import re
import requests
from pprint import pprint

import numpy as np
import pandas as pd

from newspaper import Article

import torch

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face checkpoint trained on the BBC News dataset. It is named once so the model
# and its tokenizer are always loaded from the same checkpoint.
MODEL_CHECKPOINT = "abhishek/autonlp-bbc-news-classification-37229289"

# Downloads (on first use) and sets up the topic-classification model and matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def _clean_article_text(raw_text):
  """Removes "Also Read:" promo lines and flattens the article body into a single line."""
  # The regex must run while the newlines are still present: '.' does not match '\n', so only
  # the rest of the promo line is dropped instead of everything up to the end of the article.
  without_promos = re.sub(r'(Also Read:.*)', '', raw_text)
  # Join the remaining lines with a space so words on either side of a line break stay apart.
  return ' '.join(line.strip() for line in without_promos.splitlines() if line.strip())

def get_news():
  """
  Queries NewsAPI for the top headlines (country=in) and scrapes the content of each linked
  webpage using the newspaper3k library. Scraping is needed because no news agency had a free
  API that also gave access to the full content of the article (needed for sentiment
  classification).

  Returns:
    (titles, article_text): two parallel lists holding the headline and the cleaned body of
    every article that could be downloaded and parsed. Articles that fail are skipped.

  Raises:
    RuntimeError: if the NewsAPI response does not report status "ok".
  """
  import os  # local import so this cell does not depend on changes to the imports cell

  # NewsAPI setup
  # NOTE(review): the results contain several outlets, so "source"/"sortBy" look like they are
  # ignored by the top-headlines endpoint (its documented filter is "sources") - confirm.
  query_params = {
      "source": "The Indian Express", # change for different press agency
      "sortBy": "top",
      # Prefer your own key via the NEWSAPI_KEY environment variable. The fallback is the shared
      # key (rate limited to 100/day, use responsibly); it should be rotated and removed.
      "apiKey": os.environ.get("NEWSAPI_KEY", "3ee5a54e40ce449e8d08ad2eb8ee26e4"),
  }
  main_url = "https://newsapi.org/v2/top-headlines?country=in"

  # get list of top articles from NewsAPI; the timeout keeps a stalled connection from hanging
  res = requests.get(main_url, params=query_params, timeout=30)
  content = res.json()
  if content.get('status') != 'ok':
    # Surface the API's own error message (bad key, rate limit, ...) instead of a KeyError.
    raise RuntimeError(f"NewsAPI request failed: {content.get('message', content)}")
  articles = content.get('articles', [])

  # Scrapes content of the news article
  titles = list()
  article_text = list()
  for article in articles:
    try:
      art = Article(article['url'], language="en")
      art.download()
      art.parse()
    except Exception as err:
      # Best-effort scraping: one unreachable or unparsable page must not abort the whole run.
      print(f"Skipping article {article.get('url')!r}: {err}")
      continue
    # Append the title only after a successful parse so both lists stay aligned.
    titles.append(article['title'])
    # simple cleanup, removes read also links. more can be done.
    article_text.append(_clean_article_text(art.text))

  return titles, article_text

# spaCy pipeline shared by every sentiment_analysis() call; built lazily on first use.
_SENTIMENT_NLP = None

def _get_sentiment_nlp():
  """Returns the spaCy pipeline with the 'spacytextblob' pipe, loading it only once."""
  global _SENTIMENT_NLP
  if _SENTIMENT_NLP is None:
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe('spacytextblob')
    _SENTIMENT_NLP = nlp
  return _SENTIMENT_NLP

def sentiment_analysis(text):
  """
  Performs sentiment analysis using textblob+SpaCy, used SpaCy instead of transformer based model
  for better performance (inference speed).

  Args:
    text: the article text to score.

  Returns:
    (polarity, subjectivity, sentiment): the two scores read from the document's blob, plus
    the polarity bucketed into 'Negative', 'Neutral' or 'Positive'.
  """
  # Reuse the cached pipeline: reloading the model for every article dominated the runtime.
  doc = _get_sentiment_nlp()(text)
  polarity, subjectivity = doc._.blob.polarity, doc._.blob.subjectivity
  # simple thresholding to bucket polarity into Negative, Neutral and Positive.
  # The final else closes the gaps at exactly -0.33 / 0.33, which previously matched no branch
  # and produced an empty label.
  if polarity < -0.33:
    sentiment = 'Negative'
  elif polarity > 0.33:
    sentiment = 'Positive'
  else:
    sentiment = 'Neutral'

  return polarity, subjectivity, sentiment

def find_topic(title):
  """
  Uses a transformer model to classify an article into a topic, trained on BBC news dataset.

  Args:
    title: a single headline string.

  Returns:
    One of 'business', 'entertainment', 'politics', 'sport' or 'tech'.
  """
  # NOTE(review): hard-coded mapping; presumably matches the checkpoint's own label order - confirm.
  id2label = {0:'business', 1:'entertainment', 2:'politics', 3:'sport', 4:'tech'}
  # truncation=True keeps an unusually long input within the model's maximum sequence length.
  inputs = tokenizer(title, return_tensors="pt", truncation=True)
  # Inference only: skip autograd bookkeeping so no gradient graph is built per headline.
  with torch.no_grad():
    outputs = model(**inputs)
  # argmax over the class dimension of the single-row logits.
  topic = id2label[torch.argmax(outputs.logits, dim=-1).item()]
  return topic



Downloading:   0%|          | 0.00/963 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/311 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
# Collect one row per headline: sentiment is scored on the scraped article body,
# the topic is predicted from the headline alone.
output = []
titles, articles = get_news()
for title, article_text in zip(titles, articles):
  polarity, subjectivity, sentiment = sentiment_analysis(article_text)
  topic = find_topic(title)
  row = (title, article_text, polarity, subjectivity, sentiment, topic)
  output.append(row)

# Assemble the rows into a table and display it as the cell's result.
column_names = ['title','article_text', 'polarity', 'subjectivity', 'sentiment', 'topic']
df = pd.DataFrame(data=output, columns=column_names)
df[:]

Unnamed: 0,title,article_text,polarity,subjectivity,sentiment,topic
0,Perseverance rover drops second sample on Mart...,NASA’s Perseverance rover has dropped an “earl...,0.1325,0.163333,Neutral,tech
1,WhatsApp introduces status report feature for ...,The meta-owned cross-platform instant messagin...,0.238504,0.5113,Neutral,tech
2,"Twitter Hails Ravichandran Ashwin, Shreyas Iye...","December 25, 2022, will now be remembered as a...",0.256608,0.567017,Neutral,sport
3,"Mann Ki Baat: Wash Hands, Wear Mask To Remain ...",PM Modi Mann Ki Baat: Prime Minister Narendra ...,0.270318,0.479206,Neutral,politics
4,How Team India left Pakistan on thin ice in ra...,Bangladesh started right from where they left ...,0.140714,0.453473,Neutral,sport
5,"""We Stand Ready..."": China On Ties With India ...",India says the two sides have agreed to mainta...,0.061458,0.228125,Neutral,entertainment
6,"Sheezan Khan, Co-Star Of Actor Tunisha Sharma,...",Sheezan Khan was arrested after the actor's mo...,0.02029,0.317029,Neutral,entertainment
7,Mcap of 10 most valued firms erodes by ₹1.68 l...,The combined market valuation of the top 10 va...,-0.030556,0.35463,Neutral,business
8,Steal deal! iPhone 11 price slips to 20499 fro...,You have a great chance to save huge amount of...,0.329071,0.502343,Neutral,tech
9,"Covid update: India logs 227 new cases, 2 deat...",Covid-19 cases in India are increasing with a ...,-0.143664,0.504959,Neutral,business
