# Fact or Fake: News Analysis

## Data Mining 334
## Alex Laughlin,

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer



#pull in data

#just the politifact data for now
df_p_real = pd.read_csv('politifact_real.csv')
df_p_real['true/false'] = True
df_p_real.head(10)

Unnamed: 0,id,news_url,title,tweet_ids,true/false
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,True
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,True
2,politifact333,https://web.archive.org/web/20080204072132/htt...,"Romney makes pitch, hoping to close deal : Ele...",,True
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,,True
4,politifact779,https://web.archive.org/web/20070820164107/htt...,"Budget of the United States Government, FY 2008",89804710374154240\t91270460595109888\t96039619...,True
5,politifact14064,http://www.politifact.com/truth-o-meter/statem...,Donald Trump exaggerates when he says China ha...,690248006399049728\t690254026663821312\t690276...,True
6,politifact14474,https://www.law.cornell.edu/constitution/amend...,25th Amendment,1262604762\t10969740933\t11182364398\t17507543...,True
7,politifact5276,http://americaneedsmitt.com/blog/2011/11/10/mi...,子供たちのコト。私のコト。,,True
8,politifact1313,https://web.archive.org/web/20090913221204/htt...,Briefing by White House Press Secretary Robert...,13511762265\t13512918230\t13513835900\t1351424...,True
9,politifact937,https://web.archive.org/web/20080623122709/htt...,A Solar Grand Plan: Scientific American,140962137332920320\t141057766704947200\t141166...,True


In [2]:
df_p_fake = pd.read_csv('politifact_fake.csv')
df_p_fake['true/false'] = False
df_p_fake.head(10)

Unnamed: 0,id,news_url,title,tweet_ids,true/false
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,False
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,False
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,False
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,False
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,False
5,politifact14404,gloria.tv/video/yRrtUtTCfPga6cq2VDJPcgQe4,Putin says: ‘Pope Francis Is Not A Man Of God’...,893290900637483009\t893290950700802048\t893290...,False
6,politifact13919,http://blogs.trendolizer.com/2015/01/new-york-...,New York Man Wanted For Infecting 240 Men And ...,,False
7,politifact14795,https://web.archive.org/web/20171027105356/htt...,Saudi Arabia to Behead 6 School Girls for Bein...,923126512458616832\t923135295070990341\t923189...,False
8,politifact14328,https://web.archive.org/web/20170702174006/htt...,Malia Obama Fired From Cushy Internship At Spa...,880455776107679747\t880457763876462598\t880461...,False
9,politifact13775,http://beforeitsnews.com/opinion-conservative/...,Target to Discontinue Sale of Holy Bible,732741826084397057\t732741823534227456\t732741...,False


In [3]:
#Join tables together
frames = [df_p_real, df_p_fake]
df = pd.concat(frames)
df.head(10)
df.shape

(1056, 5)

In [4]:
stemmer = SnowballStemmer("english")

tokenizer = nltk.RegexpTokenizer(r"\w+")

#remove special characters
df['title'] = df.apply(lambda row: re.sub('[^A-Za-z0-9 ]+', '', str(row['title'])), axis =1)

#tokenize words from title
df['tokenized_sents'] = df.apply(lambda row: tokenizer.tokenize(row['title']), axis=1)

#remove stop words from tokenized titles
df['tokens_without_stopwords'] = df['tokenized_sents'].apply(lambda x: [item for item in x if item not in stop_words])

#stem tokenized words without stopwords
df['tokens_stemmed']=df['tokens_without_stopwords'].apply(lambda x : [stemmer.stem(y) for y in x])

#lemmatize tokenized words without stopwords
lemmatizer = nltk.stem.WordNetLemmatizer()
df['tokens_lemmatized'] = df['tokens_without_stopwords'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x])
df.head(25)

Unnamed: 0,id,news_url,title,tweet_ids,true/false,tokenized_sents,tokens_without_stopwords,tokens_stemmed,tokens_lemmatized
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,True,"[National, Federation, of, Independent, Business]","[National, Federation, Independent, Business]","[nation, feder, independ, busi]","[National, Federation, Independent, Business]"
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,True,"[comments, in, Fayetteville, NC]","[comments, Fayetteville, NC]","[comment, fayettevill, nc]","[comment, Fayetteville, NC]"
2,politifact333,https://web.archive.org/web/20080204072132/htt...,Romney makes pitch hoping to close deal Elect...,,True,"[Romney, makes, pitch, hoping, to, close, deal...","[Romney, makes, pitch, hoping, close, deal, El...","[romney, make, pitch, hope, close, deal, elect...","[Romney, make, pitch, hoping, close, deal, Ele..."
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,,True,"[Democratic, Leaders, Say, House, Democrats, A...","[Democratic, Leaders, Say, House, Democrats, A...","[democrat, leader, say, hous, democrat, are, u...","[Democratic, Leaders, Say, House, Democrats, A..."
4,politifact779,https://web.archive.org/web/20070820164107/htt...,Budget of the United States Government FY 2008,89804710374154240\t91270460595109888\t96039619...,True,"[Budget, of, the, United, States, Government, ...","[Budget, United, States, Government, FY, 2008]","[budget, unit, state, govern, fy, 2008]","[Budget, United, States, Government, FY, 2008]"
5,politifact14064,http://www.politifact.com/truth-o-meter/statem...,Donald Trump exaggerates when he says China ha...,690248006399049728\t690254026663821312\t690276...,True,"[Donald, Trump, exaggerates, when, he, says, C...","[Donald, Trump, exaggerates, says, China, tota...","[donald, trump, exagger, say, china, total, co...","[Donald, Trump, exaggerates, say, China, total..."
6,politifact14474,https://www.law.cornell.edu/constitution/amend...,25th Amendment,1262604762\t10969740933\t11182364398\t17507543...,True,"[25th, Amendment]","[25th, Amendment]","[25th, amend]","[25th, Amendment]"
7,politifact5276,http://americaneedsmitt.com/blog/2011/11/10/mi...,,,True,[],[],[],[]
8,politifact1313,https://web.archive.org/web/20090913221204/htt...,Briefing by White House Press Secretary Robert...,13511762265\t13512918230\t13513835900\t1351424...,True,"[Briefing, by, White, House, Press, Secretary,...","[Briefing, White, House, Press, Secretary, Rob...","[brief, white, hous, press, secretari, robert,...","[Briefing, White, House, Press, Secretary, Rob..."
9,politifact937,https://web.archive.org/web/20080623122709/htt...,A Solar Grand Plan Scientific American,140962137332920320\t141057766704947200\t141166...,True,"[A, Solar, Grand, Plan, Scientific, American]","[A, Solar, Grand, Plan, Scientific, American]","[a, solar, grand, plan, scientif, american]","[A, Solar, Grand, Plan, Scientific, American]"


In [5]:
#create corpus from lemmatized tokens as numpy array
corpus = df['tokens_lemmatized'].to_numpy()

#create dictionary
DF = {}

#write unique words to dictionary
for i in range(len(corpus)):
    tokens = corpus[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
            
#get word frequency
for i in DF:
    DF[i] = len(DF[i])
DF

{'National': 14,
 'Federation': 1,
 'Independent': 1,
 'Business': 4,
 'comment': 3,
 'Fayetteville': 1,
 'NC': 1,
 'Romney': 8,
 'make': 2,
 'pitch': 1,
 'hoping': 1,
 'close': 2,
 'deal': 4,
 'Elections': 2,
 'The': 88,
 'Rocky': 1,
 'Mountain': 1,
 'News': 41,
 'Democratic': 16,
 'Leaders': 3,
 'Say': 4,
 'House': 21,
 'Democrats': 11,
 'Are': 12,
 'United': 18,
 'Against': 8,
 'GOP': 11,
 'Default': 1,
 'Act': 17,
 'Budget': 8,
 'States': 16,
 'Government': 6,
 'FY': 2,
 '2008': 8,
 'Donald': 25,
 'Trump': 99,
 'exaggerates': 1,
 'say': 28,
 'China': 1,
 'total': 1,
 'control': 1,
 'North': 4,
 'Korea': 2,
 '25th': 2,
 'Amendment': 4,
 'Briefing': 4,
 'White': 17,
 'Press': 10,
 'Secretary': 6,
 'Robert': 3,
 'Gibbs': 2,
 '91009': 1,
 'A': 48,
 'Solar': 2,
 'Grand': 1,
 'Plan': 9,
 'Scientific': 1,
 'American': 13,
 'Covering': 1,
 'Young': 1,
 'Adults': 1,
 'Under': 3,
 'Affordable': 4,
 'Care': 14,
 'Importance': 1,
 'Outreach': 1,
 'Medicaid': 2,
 'Expansion': 1,
 'Harry': 5,
 '

In [6]:
all_words = [x for x in DF]
print(all_words)

['National', 'Federation', 'Independent', 'Business', 'comment', 'Fayetteville', 'NC', 'Romney', 'make', 'pitch', 'hoping', 'close', 'deal', 'Elections', 'The', 'Rocky', 'Mountain', 'News', 'Democratic', 'Leaders', 'Say', 'House', 'Democrats', 'Are', 'United', 'Against', 'GOP', 'Default', 'Act', 'Budget', 'States', 'Government', 'FY', '2008', 'Donald', 'Trump', 'exaggerates', 'say', 'China', 'total', 'control', 'North', 'Korea', '25th', 'Amendment', 'Briefing', 'White', 'Press', 'Secretary', 'Robert', 'Gibbs', '91009', 'A', 'Solar', 'Grand', 'Plan', 'Scientific', 'American', 'Covering', 'Young', 'Adults', 'Under', 'Affordable', 'Care', 'Importance', 'Outreach', 'Medicaid', 'Expansion', 'Harry', 'Reid', 'Says', 'Should', 'Stop', 'Crying', 'About', 'Reconciliation', 'Political', 'TV', 'Ad', 'Archive', 'PolAd', 'Bureau', 'Labor', 'Statistics', 'Data', 'Summary', 'This', 'Week', 'Transcript', 'Holder', 'Giuliani', 'Impeachment', 'Talk', 'Becomes', 'Cynical', 'Game', 'Preventing', 'Flu', 'G

In [7]:
#create column of strings from lemmatized tokens
df['tfidfprep']=[" ".join(x) for x in df['tokens_lemmatized'].values]
labels=df[["true/false"]]

#create train and test sets
labels=df[["true/false"]]
x_train, x_test, y_train, y_test= train_test_split(df['tfidfprep'], labels, test_size=0.2, stratify=df['true/false'], random_state=7)

#initialize tfidf vectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#create tfidf of train and test sets
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

df.head()

Unnamed: 0,id,news_url,title,tweet_ids,true/false,tokenized_sents,tokens_without_stopwords,tokens_stemmed,tokens_lemmatized,tfidfprep
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,True,"[National, Federation, of, Independent, Business]","[National, Federation, Independent, Business]","[nation, feder, independ, busi]","[National, Federation, Independent, Business]",National Federation Independent Business
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,True,"[comments, in, Fayetteville, NC]","[comments, Fayetteville, NC]","[comment, fayettevill, nc]","[comment, Fayetteville, NC]",comment Fayetteville NC
2,politifact333,https://web.archive.org/web/20080204072132/htt...,Romney makes pitch hoping to close deal Elect...,,True,"[Romney, makes, pitch, hoping, to, close, deal...","[Romney, makes, pitch, hoping, close, deal, El...","[romney, make, pitch, hope, close, deal, elect...","[Romney, make, pitch, hoping, close, deal, Ele...",Romney make pitch hoping close deal Elections ...
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,,True,"[Democratic, Leaders, Say, House, Democrats, A...","[Democratic, Leaders, Say, House, Democrats, A...","[democrat, leader, say, hous, democrat, are, u...","[Democratic, Leaders, Say, House, Democrats, A...",Democratic Leaders Say House Democrats Are Uni...
4,politifact779,https://web.archive.org/web/20070820164107/htt...,Budget of the United States Government FY 2008,89804710374154240\t91270460595109888\t96039619...,True,"[Budget, of, the, United, States, Government, ...","[Budget, United, States, Government, FY, 2008]","[budget, unit, state, govern, fy, 2008]","[Budget, United, States, Government, FY, 2008]",Budget United States Government FY 2008


In [12]:
#instantiate passive aggresive classifier
pac=PassiveAggressiveClassifier(max_iter=100)

#fit classifer to tfidf of training set
pac.fit(tfidf_train,y_train.values.ravel())

#create predictions on test set
y_pred=pac.predict(tfidf_test)

#print score
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 86.79%
