<a href="https://colab.research.google.com/github/Dawudis/Political-Web-Scraping-Project/blob/main/Scraping%2BSentiment_Analysis%2BNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install/Import Dependencies**

In [None]:
!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
import torch

In [None]:
!pip install requests 
!pip install beautifulsoup4 
import requests
import re
from bs4 import BeautifulSoup


In [None]:
!pip install transformers 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [None]:
!pip install newspaper3k
import newspaper
from newspaper import Article

In [None]:
!pip install -U spacy
!python -m spacy download en
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
!pip install numpy pandas
import numpy as np
import pandas as pd

# **Scrape Articles**

In [36]:
# Build a newspaper source for the AP News "Joe Biden" hub page.
# NOTE(review): per the newspaper3k docs, memoize_articles=False disables the
# cache of previously seen articles so repeat runs return the full list — confirm.
site = newspaper.build(
    "https://apnews.com/hub/joe-biden",
    memoize_articles=False,
)
# Show the article URLs discovered for this source
site.article_urls()

['https://apnews.com/article/coronavirus-pandemic-joe-biden-business-pandemics-congress-7466507d8f33fbc94a4c5bf0f329f6a2',
 'https://apnews.com/article/joe-biden-business-congress-mitch-mcconnell-bills-e444072fb3b2f7d5a02793bf48d9ae96',
 'https://apnews.com/article/joe-biden-technology-business-janet-yellen-51dfb6aab4f6988aa5420224d4a8c7c5',
 'https://apnews.com/article/joe-biden-dzhokhar-tsarnaev-boston-courts-bombings-3a784f624dc605fda09af8ffe0188977',
 'https://apnews.com/article/joe-biden-donald-trump-politics-dana-remus-congress-a385eb9c716589a34238c1a6cc069105',
 'https://apnews.com/article/coronavirus-pandemic-joe-biden-scott-wiener-bills-health-de858f78c2e7dd2bb26a25bb49a2b591',
 'https://apnews.com/article/joe-biden-business-congress-economy-bills-f2b52bc7ff5d1f241ed12e73e382c6b8',
 'https://apnews.com/article/donald-trump-business-congress-capitol-siege-subpoenas-375f0349971f87d30ffd24fe5b8bbdf0',
 'https://apnews.com/article/joe-biden-donald-trump-lifestyle-business-environm

In [37]:
# Download and parse at most the first 20 articles of the source, keeping only
# the extracted plain text of each one.
# Iterating over a slice (instead of indexing with range(20)) avoids an
# IndexError when the source yields fewer than 20 articles.
top_articles = []
for article in site.articles[:20]:
    article.download()
    article.parse()
    top_articles.append(article.text)

# **Configure DataFrame and Apply Sentiment Analysis + NER**

In [38]:
# One row per scraped article; the text is stored in the 'articles' column.
df = pd.DataFrame(data=np.asarray(top_articles), columns=['articles'])

In [39]:
def sentiment_score(articles):
    """Return the sentiment class predicted by ``model`` for a piece of text.

    Args:
        articles: The text to score, passed to ``tokenizer.encode`` as-is.

    Returns:
        int: The index of the largest logit plus one, so the lowest class is 1.
        NOTE(review): presumably a 1-5 star rating for the nlptown checkpoint;
        confirm against the model card.
    """
    # Truncate at the token level so long inputs cannot exceed the model's
    # input size. A character-level slice by the caller does not guarantee this.
    # NOTE(review): 512 is the usual BERT position limit — confirm against the
    # checkpoint's config.
    tokens = tokenizer.encode(
        articles,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits)) + 1

In [40]:
# Score every article using only its first 512 characters.
df['sentiment'] = df['articles'].map(lambda text: sentiment_score(text[:512]))

In [41]:
# Run the spaCy pipeline on each article and keep the entity spans it reports.
df['named entities'] = df['articles'].map(lambda text: list(nlp(text).ents))

In [42]:
# Display the frame with the 'articles', 'sentiment' and 'named entities' columns.
df

Unnamed: 0,articles,sentiment,named entities
0,"FILE - In this Sept. 20, 2021, file photo, Sen...",3,"[(FILE), (this, Sept., 20, ,, 2021), (Roger, M..."
1,"In this Oct. 7, 2021, photo, Senate Minority L...",4,"[(this, Oct., 7, ,, 2021), (Senate), (Mitch, M..."
2,"FILE - In this Monday, Nov. 18, 2019 file phot...",1,"[(FILE), (this, Monday, ,, Nov., 18, ,, 2019),..."
3,"FILE - In this April 16, 2013, file photo, inv...",1,"[(FILE), (this, April, 16, ,, 2013), (second),..."
4,President Joe Biden waits to speak on the Nort...,4,"[(Joe, Biden), (the, North, Lawn), (the, White..."
5,"FILE - In this June 3, 2021, file photo, Calif...",1,"[(FILE), (this, June, 3, ,, 2021), (California..."
6,WASHINGTON (AP) — Senate Minority Leader Mitch...,1,"[(WASHINGTON), (AP), (Senate), (Mitch, McConne..."
7,President Joe Biden waits to speak on the Nort...,4,"[(Joe, Biden), (the, North, Lawn), (the, White..."
8,President Joe Biden speaks outside the White H...,4,"[(Joe, Biden), (the, White, House), (Washingto..."
9,"FILE - In this Jan. 16, 2021 file photo, Eric ...",1,"[(FILE), (this, Jan., 16), (Eric, Lander), (Wi..."
