<a href="https://colab.research.google.com/github/Dawudis/Political-Web-Scraping-Project/blob/main/Scraping_%2B_Sentiment_Analysis_%2B_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install/Import Dependencies**

In [None]:
!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
import torch

In [None]:
!pip install transformers 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Identifier of the pretrained sentiment-classification checkpoint; the
# tokenizer and the model must be loaded from the same checkpoint.
sentiment_checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)

In [None]:
!pip install newspaper3k
import newspaper
from newspaper import Article

In [None]:
!pip install -U spacy
!python -m spacy download en
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
!pip install numpy pandas
import numpy as np
import pandas as pd

# **Scrape Articles**

In [None]:
# Hub page whose linked stories will be scraped.
hub_url = "https://apnews.com/hub/joe-biden"
# memoize_articles=False turns off newspaper's article memoization for this build.
site = newspaper.build(hub_url, memoize_articles=False)
# Show the article URLs that newspaper collected for this source.
site.article_urls()

['https://apnews.com/hub/ap-top-25-college-football-poll?utm_source=apnewsnav&utm_medium=featured',
 'https://apnews.com/article/joe-biden-business-ireland-europe-economic-policy-e9d41bb030271808cd045c62bd6fb6d3',
 'https://apnews.com/article/donald-trump-joe-biden-us-supreme-court-congress-capitol-siege-13803c23a094992233df3b6880d2808b',
 'https://apnews.com/article/columbus-indigenous-peoples-day-b00777738e9d0ae411e9b6acad223371',
 'https://apnews.com/article/joe-biden-entertainment-sports-pennsylvania-washington-791612241de52339ec42be7430fc8323',
 'https://apnews.com/article/coronavirus-pandemic-joe-biden-business-health-shipbuilding-25a44711104a349b80ee7b98d399dcd5',
 'https://apnews.com/article/joe-biden-business-congress-filibusters-27ee5558714da1347b68d58952cc137b',
 'https://apnews.com/article/joe-biden-technology-business-china-russia-c9a698542ed95bfa49f9cee0e96ef9a6',
 'https://apnews.com/article/joe-biden-joe-donnelly-todd-young-south-bend-indiana-d02b3c23ac032dc3ef8aed949a1

In [None]:
# Download and parse up to the first 20 articles found on the hub page,
# keeping only the extracted body text of each one.
top_articles = []
# Iterate over a slice instead of indexing 0..19: a slice simply stops at the
# end of the list, so a hub that yields fewer than 20 articles no longer
# raises IndexError.
for article in site.articles[:20]:
    article.download()
    article.parse()
    top_articles.append(article.text)

# **Configure DataFrame and Apply Sentiment Analysis + NER**

In [None]:
# One row per scraped article. The column is built straight from the Python
# list; going through np.array first would create a fixed-width unicode array
# that pads every entry to the length of the longest article.
df = pd.DataFrame({'articles': top_articles})

In [None]:
def sentiment_score(articles):
    """Score one piece of text with the sentiment classifier.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        articles: The text to score (a single string, despite the plural name).

    Returns:
        int: 1-based index of the highest-scoring class in the model's logits.
    """
    # Truncate at the token level so over-long text is clipped rather than
    # overflowing the model's positional limit. 512 mirrors the cap the
    # notebook already applies (by characters) at the call site.
    # NOTE(review): confirm 512 against the checkpoint's max_position_embeddings.
    tokens = tokenizer.encode(
        articles, return_tensors='pt', truncation=True, max_length=512
    )
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        result = model(tokens)
    # argmax gives a 0-based class index; shift it to a 1-based score.
    return int(torch.argmax(result.logits)) + 1

In [None]:
def _score_opening(text):
    """Score only the leading 512 characters (characters, not tokens) of ``text``."""
    return sentiment_score(text[:512])


# Add one sentiment score per article.
df['sentiment'] = df['articles'].apply(_score_opening)

In [None]:
def _extract_entities(text):
    """Run the spaCy pipeline on ``text`` and return its entity spans as a list."""
    parsed = nlp(text)
    return list(parsed.ents)


# Add the list of recognised entities for each article.
df['named entities'] = df['articles'].apply(_extract_entities)

In [None]:
# Display the article text alongside its sentiment score and named entities.
df

Unnamed: 0,articles,sentiment,named entities
0,AP Top 25 Poll\n\nThe Associated Press began i...,5,"[(AP, Top, 25, Poll), (The, Associated, Press)..."
1,"FILE - In this June 7, 2017 file photo, the Or...",1,"[(FILE), (this, June, 7), (2017), (the, Organi..."
2,President Joe Biden salutes as he steps off of...,5,"[(Joe, Biden), (the, South, Lawn), (the, White..."
3,"FILE - In this Oct. 8, 2012 file photo, people...",4,"[(FILE), (this, Oct., 8, ,), (2012), (Christop..."
4,President Joe Biden boards Marine One at Delaw...,4,"[(Joe, Biden), (Marine, One), (Delaware, Air, ..."
5,"JACKSON, Miss. (AP) — Thousands recently ralli...",1,"[(JACKSON), (Miss.), (AP), (Thousands), (Missi..."
6,FILE - The U.S Capitol at sunset in Washington...,2,"[(FILE), (U.S), (Capitol), (Washington), (Sept..."
7,"FILE - In this Sept. 16, 2017, file photo, a p...",1,"[(FILE), (this, Sept., 16, ,, 2017), (Chicago)..."
8,INDIANAPOLIS (AP) — One of Indiana’s Republica...,5,"[(INDIANAPOLIS), (AP), (One), (Indiana), (Repu..."
9,"FILE - In this Friday, Oct. 11, 2019, file pho...",1,"[(FILE), (this, Friday, ,, Oct., 11, ,, 2019),..."
