# Fact or Fake: News Analysis

### Data Mining 334
### Alex Laughlin, Xandre Clementsmith

---
## Imports

In [170]:
# required for running on jupyterlabs (Xandre)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Xirailuyo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Xirailuyo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [226]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import sklearn
import math
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer

## Data Transforms

In [227]:
#pull in data

#just the politifact data for now
df_p_real = pd.read_csv('politifact_real.csv')
df_p_real['true/false'] = True
df_p_real.head(2)

Unnamed: 0,id,news_url,title,tweet_ids,true/false
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,True
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,True


In [228]:
df_p_fake = pd.read_csv('politifact_fake.csv')
df_p_fake['true/false'] = False
df_p_fake.head(2)

Unnamed: 0,id,news_url,title,tweet_ids,true/false
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,False
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,False


In [229]:
#Join tables together
frames = [df_p_real, df_p_fake]
df = pd.concat(frames)
df = df.reset_index(drop=True)

## Regularization

In [230]:
stemmer = SnowballStemmer("english")

tokenizer = nltk.RegexpTokenizer(r"\w+")

#remove special characters
df['title'] = df.apply(lambda row: re.sub('[^A-Za-z0-9 ]+', '', str(row['title'])), axis =1)

#tokenize words from title
df['tokenized_sents'] = df.apply(lambda row: tokenizer.tokenize(row['title']), axis=1)

#remove stop words from tokenized titles
df['tokens_without_stopwords'] = df['tokenized_sents'].apply(lambda x: [item for item in x if item not in stop_words])

#stem tokenized words without stopwords
df['tokens_stemmed']=df['tokens_without_stopwords'].apply(lambda x : [stemmer.stem(y) for y in x])

#lemmatize tokenized words without stopwords
lemmatizer = nltk.stem.WordNetLemmatizer()
df['tokens_lemmatized'] = df['tokens_without_stopwords'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x])
df.head(5)

Unnamed: 0,id,news_url,title,tweet_ids,true/false,tokenized_sents,tokens_without_stopwords,tokens_stemmed,tokens_lemmatized
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,True,"[National, Federation, of, Independent, Business]","[National, Federation, Independent, Business]","[nation, feder, independ, busi]","[National, Federation, Independent, Business]"
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,True,"[comments, in, Fayetteville, NC]","[comments, Fayetteville, NC]","[comment, fayettevill, nc]","[comment, Fayetteville, NC]"
2,politifact333,https://web.archive.org/web/20080204072132/htt...,Romney makes pitch hoping to close deal Elect...,,True,"[Romney, makes, pitch, hoping, to, close, deal...","[Romney, makes, pitch, hoping, close, deal, El...","[romney, make, pitch, hope, close, deal, elect...","[Romney, make, pitch, hoping, close, deal, Ele..."
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,,True,"[Democratic, Leaders, Say, House, Democrats, A...","[Democratic, Leaders, Say, House, Democrats, A...","[democrat, leader, say, hous, democrat, are, u...","[Democratic, Leaders, Say, House, Democrats, A..."
4,politifact779,https://web.archive.org/web/20070820164107/htt...,Budget of the United States Government FY 2008,89804710374154240\t91270460595109888\t96039619...,True,"[Budget, of, the, United, States, Government, ...","[Budget, United, States, Government, FY, 2008]","[budget, unit, state, govern, fy, 2008]","[Budget, United, States, Government, FY, 2008]"


## Feature Extraction

### Term Frequency - Inverse Document Frequency

In [231]:
# frequency matrix approx. = # of documents with word (word rarely appears twice in title)
def frequency_matrix(column):
    freq_matrix = {}

    for row in range(column.shape[0]):
        words = row
        for word in column.iloc[row,0]:
            if word in freq_matrix:
                freq_matrix[word] += 1
            else:
                freq_matrix[word] = 1

    return freq_matrix

In [232]:
# set regularized column variable to df-column of choice 
colvar = 'tokens_lemmatized'
reg_col = df[[colvar]]

# frequency matrix df
fm = pd.DataFrame.from_dict(frequency_matrix(reg_col), orient='index')
idf = fm.copy()

# inverse document frequency df
for row in range(df.shape[0]):
    idf.iloc[row] = math.log(df.shape[0] / idf.iloc[row])
    
idf.head(5)

Unnamed: 0,0
National,4.323186
Federation,6.962243
Independent,6.962243
Business,5.575949
comment,5.863631


In [233]:
# BUG HERE
# Shouldn't change colvar (column variable) column, only idf, due to deep copy. However, it is changing colvar
df['idf'] = reg_col.copy(deep=True)
for row in range(df[['idf']].shape[0]):
    for item in range(len(df.loc[row,'idf'])):
        token = df.loc[row,'idf'][item]
        df.loc[row,'idf'][item] = idf.loc[token][0]

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


## Machine Learning

In [235]:
# PLACEHOLDER needs to be replaced with a predictor dataframe
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:,"PLACEHOLDER"], df[[r'true/false']], test_size=0.33, random_state=42)

### Linear Regression

In [236]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)
lr.score(X_train, y_train)
lr.coef_
lr.intercept_

ValueError: Expected 2D array, got 1D array instead:
array=[ 2.77371881  5.5759491   3.44067064  3.58715782  4.24263529  3.56742777
  8.63805887  5.22191491  2.125       6.11704687  2.10033006  3.81712799
  2.33366529  3.2199697   9.75307281  1.79927823  2.52531637         nan
  5.67536985  6.86168382  3.24875962  5.63361411  2.75811776  3.52573001
  3.16973135  2.61443357  2.74794392  4.47200233  2.46701375  2.08062117
  3.3587946   5.83453861  4.58987942  4.23340766  5.2854096   4.04604191
  3.00642857  3.81552522  5.72804232  5.77000808  3.08969876  3.36994602
  4.09522227  1.59475617  6.26909628  5.31592612  3.08462745  1.
  3.72438037  3.05630526  3.61377562  4.58987942  2.56917238  5.06927795
  5.29100394  2.78426852  5.50016923  1.          2.63784848  3.
  3.46018933  5.95590554  3.63683011  4.93331856  1.75        2.59735482
  3.80065671  1.875       5.44513707  6.95260383  2.92522797  3.63260977
  5.10567933  5.5759491   3.67440214  2.22581164  4.2795729   5.96366604
  2.54192492  4.68647875  2.61861664  2.77272624  1.33333333  7.00194655
  5.37321655  2.5         4.84351839  2.7607247   3.53808372  1.66294569
  5.45784358  5.66168382  9.07252329  9.11693501  6.54635516  2.60849644
  3.82203383  6.61566987  5.56309023  3.00698629  5.54993805  3.56979503
  2.00455551  4.86994305  3.14108705  3.91772103  3.15995336  5.99982142
  6.3410168   5.92252269  4.22546759  3.23939377  2.18361702  2.9845991
  4.93331856 12.52268299  1.3311218   6.06636373  5.45942667  2.26517027
  5.09482155  6.26909628  4.81077073  3.17696539  5.5759491   2.87054083
  3.40718584  3.23875521  2.38835106  6.96224346  5.31435044  3.8044848
  5.02667529  4.90372125  5.40787263  8.99632193  3.085242    2.37455741
  4.93864885  2.36673774  2.81933091  2.91612573  5.10842671  8.00340079
  5.26164281  3.          2.7260603   2.16129002  2.          4.09522227
  5.73157449  2.43822468  7.9359957   6.70945807  2.15510257  3.97685745
  5.75680472  4.92807224  2.32783786  2.15384615  2.32164367  2.41699958
  4.87002369  5.21606438  5.53629785  2.72070048  3.28659314  4.32086212
  4.35106712  3.77112839  3.7165042   2.26265818  2.31569951  3.97353996
  2.52531637  1.94377664  2.24717487  1.49305485  1.73870868  6.63728282
  2.49247453  4.28496265  5.14001111  4.32008179  2.66666667  3.69754313
  3.97644551  5.9142565   2.99303895  6.14337455         nan 12.31801228
  2.23972941  4.01508962  5.39141256  4.24263529  1.99521588  4.48938972
  5.18968104  4.09522227  2.79896297  3.17921892  3.0069932   7.97025805
  2.26357051  3.          9.42971562  6.25394013  4.19198303  4.5605241
  2.03906894  3.73219918  2.61814837  6.96224346  5.5759491   5.36571097
  1.99325976  5.90233565  2.72349467  2.4         3.32655542  3.28285195
  2.94140096  3.16900908  2.48414459  7.48636312  6.12919234  3.93159485
  4.11504565  2.31486199  4.78873585  3.25332854  3.125       4.67108443
  2.75527154  5.41515679  4.88085639  2.11297654  3.77670212  2.73080946
  6.08923303  1.          5.49170604  3.05418954  3.80563307  5.3779117
  1.81940366  3.05364663  3.664343    1.          6.6072568   5.30738425
  4.08543679  7.06109724  6.03935959  2.48287694  2.52802576  2.58017431
  4.58214437  6.42576416  6.7311944   2.77257163  5.4250309   5.81591893
  5.          5.38182293  4.41615067  5.839089    7.09018704  4.78388905
  7.59403436  4.91562347  4.86994305  8.65550716  2.87076382  4.90396825
  5.88659296  4.15892628  2.69563039  6.78895667  3.58079653  1.75
  6.13122758  4.09522227  4.43501071  6.75131101  4.02092594  4.84793027
  2.36799273  1.          6.25        3.46207613  2.54678872  3.26010755
  6.00644925  5.65091871  5.99272494  1.          2.99348476  3.32601602
  1.          6.61566987  4.53782274  3.49352657  3.65542916  3.04939414
  1.8266487   2.05078962  6.17445715  5.95466759  4.84793027  4.09522227
  2.24460857  2.70927299  5.97971249  5.70532136  3.79911439  3.60294392
  5.25577408  2.94140096 10.80445591  3.0337111  10.79773252  9.43297876
  3.43247615  4.35897707  4.35157207  6.00021688  3.16816303  6.06636373
  2.38730168  5.76773715  6.60389157  3.6083762   6.96224346  6.04937383
  4.78088612  3.14118473  3.3871539   3.25803571  3.7949688   3.5876298
  3.44106204  3.61314164  5.89307693  5.93865732  3.34240862  2.84932353
  3.42303209  4.09522227  6.96224346  6.26909628  3.19799302  2.50429111
  2.93540786  4.42463312  2.38487199  1.          8.02568118  4.8245331
  5.06487773  5.50296501  3.60699484  2.20021703  3.69264298  3.44742813
  3.25718575  2.59151502  4.5506574   2.94374113  2.90728113  6.16773001
  3.67238715  2.99390555  5.37015836  8.03207323  3.77789479  3.5149745
  4.31965164  5.42993329  6.06512801  3.76580688  2.75139775  5.82638742
  1.93355421  1.4         2.34034868  3.04883726  5.23281755  5.99011634
  4.65965837  2.67278473  8.54083396  5.41956362  3.26698308  9.54318125
  2.34730755  5.30656292  4.92302678  4.34573529  3.8586497   2.72349467
  4.4652903   4.8         7.10293312  3.06727407  4.19198303  4.56609114
  3.17696539  4.69915353  3.74545247  3.19390019  3.54670433  6.26838882
  6.97487383  2.93483212  4.93205431  5.81095092  1.8340968   1.90863704
  2.65342956  5.64516222  6.31360675  3.14938297  3.56718206  3.00106004
  2.46588478  3.55290434  1.28571429  3.02198666  2.5214494   2.69629627
  5.72683286  5.73372008  2.54644092  4.52466054  5.45022737 12.60724263
  2.89941118  5.72683286  3.05418954  2.25584389  4.12440552  2.23205431
  5.96366604  4.68755892  6.23965052  5.046509    2.55912166  5.53638395
  3.70420064  3.1912303   1.62696652  1.          4.71428571  2.3139327
  6.04524997  1.66196082  3.58079653  3.73928179  3.05643291  3.45819208
  2.93865764  4.97182285  4.84597324  1.57142857  3.75        2.46478162
 10.37769925  9.75166877  2.31123602  2.19365107  2.70643718  4.9487263
  3.54284713  3.20304425  4.50671005  2.87485133  4.18118023  3.61984773
  5.55388528  3.78764564  5.58508964  2.63709529  4.0657394   1.
  3.50060783  2.14221347  1.91794738  4.253834    6.6072568   2.36955721
  4.58643414  4.86220409  4.54946273  3.17438223  1.          4.
  2.66666667  3.81070959  6.19936801  4.68908284  1.25925404  3.60781897
  3.34332726  1.66666667  4.14144634  6.11156207  9.86137053  2.95300378
  4.38335675  3.70241441  3.49840767  3.61748654  2.08079653  4.47465262
  4.00318627  4.80054647  2.67837749  3.07480792  2.70708614 11.09795435
  4.5605296   3.42671955  2.85714286  2.26916183  2.70693464  5.44477168
  5.50837159  3.06542512  2.10440647  5.04126415  2.18991352  5.35121192
  3.31845648  5.71979014  1.38630876  2.59915794  3.06819097  3.00022074
  1.48379465  6.52648771  5.49654433  2.34891605  4.43423656  2.35300378
  3.4134333   2.69101629  4.11206545  3.85519605  4.70303297  2.31011695
  1.48321563  5.18925937  5.43576324  1.5         1.80718717  3.21541258
  3.8586497   1.57142857 19.97656038  4.09522227  4.88138382  3.17253526
  1.90783156  1.63231861  3.01600944  1.78571429  3.65718014  4.19699364
  7.17769933  1.48379465  4.57312346  8.74755606  2.51387254  1.75
  3.00816666  2.91290143  5.27895813  4.24263529  3.39836593  3.55895051
  1.          5.88955174  5.45316551  5.76773715  1.47397894  4.64597597
  4.29808698  4.37262176  3.17640278  2.18021132  5.90261406  4.45142627
  4.21606449  3.23939377  2.292621    4.59268418  5.5956203   2.66666667
  6.46039445  5.13536382  2.51898762  2.14398728  3.46780891  6.85478273
  5.5759491   5.74321867  5.24760221  5.71330616  6.1251424   4.50386885
  2.10772871  3.75845788  4.85483835  5.84128408  8.747416    9.2379879
  5.97443253  2.2630039   2.83037964  3.60989507  2.27747377  4.9067885
  3.67811131  2.87965543  5.79951879  2.01268408  5.5759491   3.87025428
  2.19244869  1.95088112  4.29166155  6.16773001  1.16666667  5.44513707
  4.09522227  3.16850783  3.9845769   3.75298344  2.66928869  2.77488577
  9.04414379  1.33333333  6.1947151   2.          3.46222964  6.59603937
  1.66979503  3.69450146  3.80563307  3.26648305  4.86994305  5.420889
  5.2854096   5.39120104  5.          3.17272624  4.09522227  3.40718584
  3.58252459  3.78619965  5.82732681  3.91658852  2.57946314  2.93266709
  2.8332988   1.          6.61259271  4.97647528  5.16139208  3.50027187
  3.42544819  2.52440223  5.76773715  7.00109997  5.85635765  1.80852003
  3.73801848  6.10772871  5.13276986  9.92828045  2.95056039  4.62779987
  4.1859546   5.69132359  3.39112877  2.52776058  3.72497523  4.01011401
  6.17929505  6.96224346  3.87848522  2.93253065  2.66666667  3.68682964
 12.10953287  3.63683011  1.5         3.84541616  1.49321335  5.09330905
         nan  2.39398728  6.09580949  4.60690834  3.51382181].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.