# Fact or Fake: News Analysis

## Data Mining 334
## Alex Laughlin, Xandre Clementsmith, Terence Carey

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer

## Dataframe Creation and Manipulation

In [2]:
#pull in data

#read in politifact
df_p_real = pd.read_csv('politifact_real.csv')
df_p_fake = pd.read_csv('politifact_fake.csv')

df_p_real['true/false'] = True

df_p_fake['true/false'] = False

#read in gossipcop data
df_g_real = pd.read_csv('gossipcop_real.csv')
df_g_fake = pd.read_csv('gossipcop_fake.csv')

df_g_real['true/false'] = True

df_g_fake['true/false'] = False


In [3]:
#Join tables together
sets = "p"
if sets == "p":
    frames = [df_p_real, df_p_fake]
elif sets == "g":
    frames = [df_g_real, df_g_fake]
elif sets == "b":
    frames = [df_p_real, df_p_fake, df_g_real, df_g_fake]
df = pd.concat(frames)
df.head(5)
df.shape

(1056, 5)

## Tokenization

In [4]:
stemmer = SnowballStemmer("english")

tokenizer = nltk.RegexpTokenizer(r"\w{3,}")

#parse source URL and strip down to base form 
from urllib.parse import urlparse
import urllib as urllib
df['parsedURL'] = df.apply(lambda row: urlparse(str(row['news_url'])).netloc, axis=1)
df['extension'] = df.apply(lambda row: str(row['parsedURL'])[-3:], axis = 1)

#remove special characters
df['title'] = df.apply(lambda row: re.sub('[^A-Za-z ]{3,}', '', str(row['title'])), axis =1)

#tokenize words from title
df['tokenized_sents'] = df.apply(lambda row: tokenizer.tokenize(row['title']), axis=1)

#remove stop words from tokenized titles
df['tokens_without_stopwords'] = df['tokenized_sents'].apply(lambda x: [item for item in x if item not in stop_words])

#stem tokenized words without stopwords
df['tokens_stemmed']=df['tokens_without_stopwords'].apply(lambda x : [stemmer.stem(y) for y in x])

#lemmatize tokenized words without stopwords
lemmatizer = nltk.stem.WordNetLemmatizer()
df['tokens_lemmatized'] = df['tokens_without_stopwords'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x])
df.head(10)

Unnamed: 0,id,news_url,title,tweet_ids,true/false,parsedURL,extension,tokenized_sents,tokens_without_stopwords,tokens_stemmed,tokens_lemmatized
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,True,www.nfib-sbet.org,org,"[National, Federation, Independent, Business]","[National, Federation, Independent, Business]","[nation, feder, independ, busi]","[National, Federation, Independent, Business]"
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,True,www.cq.com,com,"[comments, Fayetteville]","[comments, Fayetteville]","[comment, fayettevill]","[comment, Fayetteville]"
2,politifact333,https://web.archive.org/web/20080204072132/htt...,"Romney makes pitch, hoping to close deal : Ele...",,True,web.archive.org,org,"[Romney, makes, pitch, hoping, close, deal, El...","[Romney, makes, pitch, hoping, close, deal, El...","[romney, make, pitch, hope, close, deal, elect...","[Romney, make, pitch, hoping, close, deal, Ele..."
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,,True,web.archive.org,org,"[Democratic, Leaders, Say, House, Democrats, A...","[Democratic, Leaders, Say, House, Democrats, A...","[democrat, leader, say, hous, democrat, are, u...","[Democratic, Leaders, Say, House, Democrats, A..."
4,politifact779,https://web.archive.org/web/20070820164107/htt...,"Budget of the United States Government, FY",89804710374154240\t91270460595109888\t96039619...,True,web.archive.org,org,"[Budget, the, United, States, Government]","[Budget, United, States, Government]","[budget, unit, state, govern]","[Budget, United, States, Government]"
5,politifact14064,http://www.politifact.com/truth-o-meter/statem...,Donald Trump exaggerates when he says China ha...,690248006399049728\t690254026663821312\t690276...,True,www.politifact.com,com,"[Donald, Trump, exaggerates, when, says, China...","[Donald, Trump, exaggerates, says, China, tota...","[donald, trump, exagger, say, china, total, co...","[Donald, Trump, exaggerates, say, China, total..."
6,politifact14474,https://www.law.cornell.edu/constitution/amend...,25th Amendment,1262604762\t10969740933\t11182364398\t17507543...,True,www.law.cornell.edu,edu,"[25th, Amendment]","[25th, Amendment]","[25th, amend]","[25th, Amendment]"
7,politifact5276,http://americaneedsmitt.com/blog/2011/11/10/mi...,,,True,americaneedsmitt.com,com,[],[],[],[]
8,politifact1313,https://web.archive.org/web/20090913221204/htt...,Briefing by White House Press Secretary Robert...,13511762265\t13512918230\t13513835900\t1351424...,True,web.archive.org,org,"[Briefing, White, House, Press, Secretary, Rob...","[Briefing, White, House, Press, Secretary, Rob...","[brief, white, hous, press, secretari, robert,...","[Briefing, White, House, Press, Secretary, Rob..."
9,politifact937,https://web.archive.org/web/20080623122709/htt...,A Solar Grand Plan: Scientific American,140962137332920320\t141057766704947200\t141166...,True,web.archive.org,org,"[Solar, Grand, Plan, Scientific, American]","[Solar, Grand, Plan, Scientific, American]","[solar, grand, plan, scientif, american]","[Solar, Grand, Plan, Scientific, American]"


## Dictionary and IDF

In [5]:
# MODULAR COMPONENT:
# select token_type for machine learning algorithms
# tokens_stemmed performs best for logistic regression and decision tree
# tokens_lemmatized performs best for random forest and passive aggresive classifier

token_type = 'tokens_stemmed'

In [6]:
#create corpus from lemmatized tokens as numpy array
corpus = df[token_type].to_numpy()

#create dictionary
DF = {}

#write unique words to dictionary
for i in range(len(corpus)):
    tokens = corpus[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
            
#get word frequency
for i in DF:
    DF[i] = len(DF[i])
# print(DF)

In [7]:
all_words = [x for x in DF]
# print(all_words)

In [8]:
#create column of strings from lemmatized tokens
df['tfidfprep']=[" ".join(x) for x in df[token_type].values]
labels=df[["true/false"]]

#initialize tfidf vectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#call vectorizer on text columns
tfidf_numbers=tfidf_vectorizer.fit_transform(df['tfidfprep']) 

#convert vectorizer data to dataframe
df1 = pd.DataFrame(tfidf_numbers.toarray(), columns=tfidf_vectorizer.get_feature_names())
print("df1 SHAPE = "+str(df1.shape))
print(df1.columns)
#sources is the one hot encoding of the parsedURL
extensions = pd.get_dummies(df['extension'])
print("EXTENSIONS SHAPE = "+str(extensions.shape))
print(extensions.columns)

#drop duplicates?
#df1.drop_duplicates(inplace=True)
#sources.drop_duplicates(inplace=True)

#join vectorizer data and one hot encoded sources columns
#numberDF = pd.concat([df1, sources], axis = 1)
numberDF = extensions.join(df1, lsuffix = '_left', rsuffix = '_right')


labels=df[["true/false"]]

#create train and test sets
x_train, x_test, y_train, y_test= train_test_split(numberDF, labels, test_size=0.2, stratify=df['true/false'], random_state=7)

print(numberDF.head)


df1 SHAPE = (1056, 2517)
Index(['10k', '22m', '25th', '27ave', '2kilo', '2nd', '3rd', '3year',
       'abandon', 'abc',
       ...
       'wwii', 'year', 'yearbook', 'yellow', 'york', 'young', 'younger',
       'youngest', 'youtub', 'zakaria'],
      dtype='object', length=2517)
EXTENSIONS SHAPE = (1056, 23)
Index(['', '.be', '.co', '.is', '.me', '.pw', '.ru', '.tk', '.uk', '.us',
       'com', 'edu', 'ews', 'gov', 'ife', 'ite', 'ive', 'lub', 'mil', 'net',
       'nfo', 'one', 'org'],
      dtype='object')
Index(['', '.be', '.co', '.is', '.me', '.pw', '.ru', '.tk', '.uk', '.us',
       'com', 'edu', 'ews', 'gov', 'ife', 'ite', 'ive', 'lub', 'mil', 'net',
       'nfo', 'one', 'org'],
      dtype='object')
<bound method NDFrame.head of         .be  .co  .is  .me  .pw  .ru  .tk  .uk  .us  ...  wwii  year  \
0    0    0    0    0    0    0    0    0    0    0  ...   0.0   0.0   
0    1    0    0    0    0    0    0    0    0    0  ...   0.0   0.0   
1    0    0    0    0    0    0    0    

## Machine Learning

### Logistic Regression (Baseline)

In [9]:
lr = LogisticRegression(random_state=0)

lr.fit(x_train, y_train.values.ravel())

lr_y_pred = lr.predict(x_test)


print(classification_report(y_test,lr_y_pred))
print(confusion_matrix(y_test,lr_y_pred))

              precision    recall  f1-score   support

       False       0.93      0.49      0.65        87
        True       0.73      0.98      0.84       125

    accuracy                           0.78       212
   macro avg       0.83      0.74      0.74       212
weighted avg       0.82      0.78      0.76       212

[[ 43  44]
 [  3 122]]


### Decision Tree

In [10]:
dt = DecisionTreeClassifier(random_state=0)

dt.fit(x_train, y_train.values.ravel())

dt_y_pred = dt.predict(x_test)
dt_features = dt.feature_importances_

print(classification_report(y_test,dt_y_pred))
print(confusion_matrix(y_test,dt_y_pred))

              precision    recall  f1-score   support

       False       0.67      0.77      0.72        87
        True       0.82      0.74      0.78       125

    accuracy                           0.75       212
   macro avg       0.75      0.75      0.75       212
weighted avg       0.76      0.75      0.75       212

[[67 20]
 [33 92]]


### RandomForest

In [11]:
rf = RandomForestClassifier(random_state=0)

rf.fit(x_train, y_train.values.ravel())

rf_y_pred = rf.predict(x_test)
rf_features = rf.feature_importances_

print(classification_report(y_test,rf_y_pred))
print(confusion_matrix(y_test,rf_y_pred))

              precision    recall  f1-score   support

       False       0.79      0.64      0.71        87
        True       0.78      0.88      0.83       125

    accuracy                           0.78       212
   macro avg       0.78      0.76      0.77       212
weighted avg       0.78      0.78      0.78       212

[[ 56  31]
 [ 15 110]]


### Passive Aggresive Classifier

In [12]:
#instantiate passive aggresive classifier
pac=PassiveAggressiveClassifier(max_iter=100, random_state=0)

#fit classifer to tfidf of training set
pac.fit(x_train,y_train.values.ravel())

#create predictions on test set
pac_y_pred = pac.predict(x_test)

#print score
print(classification_report(y_test,pac_y_pred))
print(confusion_matrix(y_test,pac_y_pred))

              precision    recall  f1-score   support

       False       0.77      0.91      0.84        87
        True       0.93      0.82      0.87       125

    accuracy                           0.85       212
   macro avg       0.85      0.86      0.85       212
weighted avg       0.86      0.85      0.85       212

[[ 79   8]
 [ 23 102]]
