In [121]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
import pickle
from bs4 import BeautifulSoup
from newspaper import Article
import pandas as pd
from time import time
from newspaper import Config
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
sw = (stopwords.words('english'))
from nltk.tag import pos_tag
from nltk.tokenize import RegexpTokenizer
import string
from tabulate import tabulate
import re

In [123]:
# Restore the prediction model (`model`) and the text transformer (`transModel`)
# from disk. Context managers make sure both file handles are closed again.
# NOTE(review): unpickling can execute arbitrary code -- only load these .sav
# files when they come from a trusted source.
with open('fullyTrainedModel.sav', 'rb') as model_file:
    model = pickle.load(model_file)
with open('countVectorizerModel22.sav', 'rb') as vectorizer_file:
    # presumably a fitted CountVectorizer, going by the file name -- confirm
    transModel = pickle.load(vectorizer_file)

In [124]:
# Browser-like User-Agent header value used when requesting article pages.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'

# Shared newspaper settings object; getArticle passes it to every Article.
config = Config()
config.request_timeout = 10  # presumably seconds -- confirm against newspaper docs
config.browser_user_agent = USER_AGENT

In [125]:
# Read the lemmatized-articles table, then keep only the first row for each
# distinct "lastName" value; transformAll looks article authors up in it.
smallData = pd.read_csv("lemmatizedArticles.csv")
trainAuthors = smallData.drop_duplicates(subset="lastName")

In [126]:
#Tokenize and lemmatize the articles
lemmatizer = WordNetLemmatizer()

# Raw string on purpose: "\w", "\$", "\d" and "\." are regex escapes, not Python
# string escapes, and newer Python versions warn about them in a plain literal.
# Tokens are runs of word characters, or a "$" followed by digits/dots.
_TOKEN_PATTERN = r'\w+|\$[\d\.]+'

# POS tags worth keeping: any tag starting with J, R, U or V, plus the exact
# tags NN and NNS (so proper-noun tags such as NNP are dropped).
_KEPT_TAG_PREFIXES = ('J', 'R', 'U', 'V')
_KEPT_NOUN_TAGS = ('NN', 'NNS')

# Set copy of the stopword list so each membership test is O(1).
_STOP_WORDS = frozenset(sw)


def lemmatize_text(text):
    """Tokenize ``text`` and return the lemmas of its informative words.

    Only adjectives, adverbs, interjections, verbs and non-proper nouns are
    kept: proper nouns are unlikely to indicate bias (merely the subject), and
    the remaining parts of speech are too generic to be informative.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list of str
        Lower-cased, lemmatized tokens in their original order, with English
        stopwords and punctuation-only tokens removed.
    """
    tokenizer = RegexpTokenizer(_TOKEN_PATTERN)
    lemmatized_sentence = []
    for word, tag in pos_tag(tokenizer.tokenize(text)):
        if not (tag.startswith(_KEPT_TAG_PREFIXES) or tag in _KEPT_NOUN_TAGS):
            continue
        lowered = word.lower()
        # `word not in string.punctuation` is a substring test; it drops tokens
        # such as "_" that consist purely of punctuation characters.
        if word not in string.punctuation and lowered not in _STOP_WORDS:
            lemmatized_sentence.append(lemmatizer.lemmatize(lowered))
    return lemmatized_sentence

def _site_from_link(link):
    """Return the first run of word characters after "http(s)://" and an
    optional "www.", or None when ``link`` contains no such prefix."""
    match = re.search(r'https?://(?:www\.)?(\w+)', link)
    return match.group(1) if match else None


def getArticle(link):
    """Download one article and prepare it for classification.

    Parameters
    ----------
    link : str
        URL of the article, including the http:// or https:// scheme.

    Returns
    -------
    list or None
        ``[lemmatized_text, title, authors, site]`` where ``lemmatized_text``
        is the space-joined output of ``lemmatize_text``, ``title`` and
        ``authors`` come from the parsed Article, and ``site`` is the name
        extracted from the URL (e.g. "bbc" for https://www.bbc.com/...).
        ``None`` is returned, after printing a message, when the site name
        cannot be extracted or the article cannot be downloaded/processed.
    """
    # Cheap check first: no point hitting the network for a URL we reject anyway.
    site = _site_from_link(link)
    if site is None:
        print("Error parsing site: ", link)
        return None

    try:
        atcl = Article(link, config=config)
        atcl.download()
        atcl.parse()
        lemtextString = " ".join(lemmatize_text(atcl.text))
        return [lemtextString, atcl.title, atcl.authors, site]
    except Exception as exc:
        # Best-effort by design: one bad article must not abort a whole batch.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print("Error retrieving article: ", link, "-", exc)
        return None

def translateScore(score):
    """Map a numeric bias score to its human-readable label.

    Scores -2, -1, 0, 1 and 2 map to the five bias labels from
    "Strong Left Bias" to "Strong Right Bias"; any other value yields
    "Indeterminate".
    """
    labelled_scores = (
        (-2, "Strong Left Bias"),
        (-1, "Slight Left Bias"),
        (0, "Neutral"),
        (1, "Slight Right Bias"),
        (2, "Strong Right Bias"),
    )
    # Compare with == in ascending order rather than hashing the score, so
    # any value that merely compares equal to a known score is matched.
    for known_score, label in labelled_scores:
        if score == known_score:
            return label
    return "Indeterminate"
    
def _describe_known_authors(authors):
    """Summarize which of ``authors`` have a last name present in trainAuthors.

    Returns "No" when none match, otherwise one "Yes <author> <rating>" entry
    per match, separated by "; ".
    """
    matches = []
    for aut in authors:
        ln = aut.split(" ")[-1]  # last whitespace-separated word of the name
        isin = trainAuthors[trainAuthors['lastName'] == ln]
        if len(isin) > 0:
            matches.append("Yes " + aut + " " + str(isin["Rating"].values[0]))
    return "; ".join(matches) if matches else "No"


def transformAll(links):
    """Fetch, vectorize and classify a batch of article URLs.

    Parameters
    ----------
    links : iterable of str
        Article URLs. Links that ``getArticle`` cannot process are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per successfully retrieved article with the columns
        ``URL`` (the input link), ``Rating`` (label from ``translateScore``),
        ``Title``, ``Author``, ``NewsSite`` and ``IsDone`` (whether any author's
        last name appears in ``trainAuthors``). The frame is empty, with the
        same columns, when no article could be retrieved.

    Also prints three timings: prediction time, the gap between the last fetch
    and the end of the loop, and the duration of the last fetch.
    """
    columns = ['URL', 'Rating', 'Title', 'Author', 'NewsSite', 'IsDone']
    toTransform = []
    transformedLinks = []
    transformedTitles = []
    transformedAuthors = []
    transformedSites = []
    isitin = []

    # Initialise the timers so the final print also works for an empty `links`.
    t1 = t2 = time()
    for link in links:
        t1 = time()
        retval = getArticle(link)
        t2 = time()
        if retval is None:
            continue
        lemtext, title, author, site = retval
        toTransform.append(lemtext)
        transformedTitles.append(title)
        transformedAuthors.append(author)
        transformedSites.append(site)
        transformedLinks.append(link)
        isitin.append(_describe_known_authors(author))
    t3 = time()

    if not toTransform:
        # Nothing was retrieved: avoid calling the vectorizer/model with zero
        # documents and hand back an empty table of the usual shape.
        return pd.DataFrame(columns=columns)

    bagOfWords = transModel.transform(pd.Series(toTransform))
    predicted = model.predict(bagOfWords)
    categories = [translateScore(score) for score in predicted]
    totalDF = pd.DataFrame({'URL': transformedLinks, 'Rating': categories,
                            "Title": transformedTitles, "Author": transformedAuthors,
                            "NewsSite": transformedSites, "IsDone": isitin})
    t4 = time()
    print(t4-t3,t3-t2,t2-t1)
    return totalDF


In [120]:
# Sample article URLs to classify, one per entry.
inputLinks = [
    'https://www.newsbusters.org/blogs/nb/brent-bozell/2019/10/29/bozell-graham-column-media-cant-stand-trump-winning-ever',
    'https://www.breitbart.com/politics/2021/04/25/georgia-lawmakers-pass-professional-licenses-illegal-aliens-after-lobbying-chamber-commerce/',
    'https://www.breitbart.com/the-media/2020/06/26/blue-state-blues-alexis-de-tocqueville-saw-the-cancel-culture-coming/',
    'https://www.alternet.org/2018/08/mike-pence-once-argued-lying-cheating-president-should-be-removed-office-he/',
    'https://www.foxnews.com/opinion/biden-state-of-the-union-update-truth-president-sen-ted-cruz',
    'https://www.bbc.com/news/world-us-canada-56910884',
]
result = transformAll(inputLinks)
# Print only the columns of interest from the prediction table.
print(result.filter(['Title', 'Rating', 'IsDone', "Author"]))


0.3637971878051758 1.9073486328125e-06 0.36322832107543945
                                               Title             Rating  \
0  Bozell & Graham Column: Media Can't Stand Trum...            Neutral   
1  Georgia Lawmakers Approve Professional License...  Strong Right Bias   
2  Blue State Blues: Tocqueville Saw the ‘Cancel ...  Strong Right Bias   
3  Mike Pence Once Argued a Lying, Cheating Presi...   Strong Left Bias   
4  Sen. Ted Cruz: Biden's State of the Union upda...  Slight Right Bias   
5  Biden to sell child care and free university i...   Strong Left Bias   

                               IsDone  \
0  Yes Brent Bozell 2Yes Tim Graham 2   
1                                  No   
2                                  No   
3                                  No   
4                                  No   
5                                  No   

                                              Author  
0  [Brent Bozell, Founder, Tim Graham, Executive ...  
1                

In [132]:
# Sample article URLs to classify, one per entry.
inputLinks = [
    'https://www.newsbusters.org/blogs/nb/brent-bozell/2019/10/29/bozell-graham-column-media-cant-stand-trump-winning-ever',
    'https://www.breitbart.com/politics/2021/04/25/georgia-lawmakers-pass-professional-licenses-illegal-aliens-after-lobbying-chamber-commerce/',
    'https://www.breitbart.com/the-media/2020/06/26/blue-state-blues-alexis-de-tocqueville-saw-the-cancel-culture-coming/',
    'https://www.alternet.org/2018/08/mike-pence-once-argued-lying-cheating-president-should-be-removed-office-he/',
    'https://www.foxnews.com/opinion/biden-state-of-the-union-update-truth-president-sen-ted-cruz',
    'https://www.bbc.com/news/world-us-canada-56910884',
]
result = transformAll(inputLinks)
# Print only the columns of interest from the prediction table.
print(result.filter(['Title', 'Rating', 'IsDone', "Author"]))


1.1627721786499023 1.9073486328125e-06 0.3809070587158203
                                               Title             Rating  \
0  Bozell & Graham Column: Media Can't Stand Trum...  Strong Right Bias   
1  Georgia Lawmakers Approve Professional License...  Strong Right Bias   
2  Blue State Blues: Tocqueville Saw the ‘Cancel ...  Strong Right Bias   
3  Mike Pence Once Argued a Lying, Cheating Presi...            Neutral   
4  Sen. Ted Cruz: Biden's State of the Union upda...  Strong Right Bias   
5  Biden to sell child care and free university i...   Slight Left Bias   

                               IsDone  \
0  Yes Brent Bozell 2Yes Tim Graham 2   
1                                  No   
2                                  No   
3                                  No   
4                                  No   
5                                  No   

                                              Author  
0  [Brent Bozell, Founder, Tim Graham, Executive ...  
1                 