# Fact or Fake: News Analysis

### Data Mining 334
### Alex Laughlin, Xandre Clementsmith

---
## Imports

In [7]:
# required for running on jupyterlabs (Xandre)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Xirailuyo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Xirailuyo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import sklearn
import math
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer

## Data Transforms

In [9]:
#pull in data

#just the politifact data for now
df_p_real = pd.read_csv('politifact_real.csv')
df_p_real['true/false'] = True
df_p_real.head(2)

Unnamed: 0,id,news_url,title,tweet_ids,true/false
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,True
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,True


In [10]:
df_p_fake = pd.read_csv('politifact_fake.csv')
df_p_fake['true/false'] = False
df_p_fake.head(2)

Unnamed: 0,id,news_url,title,tweet_ids,true/false
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,False
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,False


In [11]:
#Join tables together
frames = [df_p_real, df_p_fake]
df.shape

(1056, 5)

## Regularization

In [15]:
stemmer = SnowballStemmer("english")

tokenizer = nltk.RegexpTokenizer(r"\w+")

#remove special characters
df['title'] = df.apply(lambda row: re.sub('[^A-Za-z0-9 ]+', '', str(row['title'])), axis =1)

#tokenize words from title
df['tokenized_sents'] = df.apply(lambda row: tokenizer.tokenize(row['title']), axis=1)

#remove stop words from tokenized titles
df['tokens_without_stopwords'] = df['tokenized_sents'].apply(lambda x: [item for item in x if item not in stop_words])

#stem tokenized words without stopwords
df['tokens_stemmed']=df['tokens_without_stopwords'].apply(lambda x : [stemmer.stem(y) for y in x])

#lemmatize tokenized words without stopwords
lemmatizer = nltk.stem.WordNetLemmatizer()
df['tokens_lemmatized'] = df['tokens_without_stopwords'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x])
df.head(5)

Unnamed: 0,id,news_url,title,tweet_ids,true/false,tokenized_sents,tokens_without_stopwords,tokens_stemmed,tokens_lemmatized
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,True,"[National, Federation, of, Independent, Business]","[National, Federation, Independent, Business]","[nation, feder, independ, busi]","[National, Federation, Independent, Business]"
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,True,"[comments, in, Fayetteville, NC]","[comments, Fayetteville, NC]","[comment, fayettevill, nc]","[comment, Fayetteville, NC]"
2,politifact333,https://web.archive.org/web/20080204072132/htt...,Romney makes pitch hoping to close deal Elect...,,True,"[Romney, makes, pitch, hoping, to, close, deal...","[Romney, makes, pitch, hoping, close, deal, El...","[romney, make, pitch, hope, close, deal, elect...","[Romney, make, pitch, hoping, close, deal, Ele..."
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,,True,"[Democratic, Leaders, Say, House, Democrats, A...","[Democratic, Leaders, Say, House, Democrats, A...","[democrat, leader, say, hous, democrat, are, u...","[Democratic, Leaders, Say, House, Democrats, A..."
4,politifact779,https://web.archive.org/web/20070820164107/htt...,Budget of the United States Government FY 2008,89804710374154240\t91270460595109888\t96039619...,True,"[Budget, of, the, United, States, Government, ...","[Budget, United, States, Government, FY, 2008]","[budget, unit, state, govern, fy, 2008]","[Budget, United, States, Government, FY, 2008]"


## Feature Extraction

### Term Frequency - Inverse Document Frequency

In [44]:
# frequency matrix approx. = # of documents with word (word rarely appears twice in title)
def frequency_matrix(column):
    freq_matrix = {}

    for row in range(column.shape[0]):
        words = row
        for word in column.iloc[row,0]:
            if word in freq_matrix:
                freq_matrix[word] += 1
            else:
                freq_matrix[word] = 1

    return freq_matrix

In [86]:
# set regularized column variable to df-column of choice 
reg_col = df[['tokens_lemmatized']]

# frequency matrix df
fm = pd.DataFrame.from_dict(frequency_matrix(reg_col), orient='index')
idf = fm.copy()

# inverse document frequency df
for row in range(df.shape[0]):
    idf.iloc[row] = math.log(df.shape[0] / idf.iloc[row])
    
idf.head(5)

Unnamed: 0,0
National,4.323186
Federation,6.962243
Independent,6.962243
Business,5.575949
comment,5.863631
