In [1]:
import numpy as np
import pandas as pd
import xgboost
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the news dataset from the CSV shipped in the local data directory.
news_csv_path = 'data/fake_or_real_news.csv'
df = pd.read_csv(news_csv_path)
# Preview the first five rows (the bare expression renders as the cell output).
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label,title_vectors
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,[ 1.1533764e-02 4.2144405e-03 1.9692603e-02 ...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,[ 0.11267698 0.02518966 -0.00212591 0.021095...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,[ 0.04253004 0.04300297 0.01848392 0.048672...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,[ 0.10801624 0.11583211 0.02874823 0.061732...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,[ 1.69016439e-02 7.13498285e-03 -7.81233795e-...


In [3]:
# Parse the stringified embeddings in `title_vectors` into a dense float matrix.
# Each cell is text of the form "[ v0 v1 ... ]": drop the enclosing brackets,
# split on whitespace and convert the tokens to floats.
title_v = [
    np.array(raw_vector[1:-1].split(), dtype=float)
    for raw_vector in df['title_vectors']
]

# Stack the per-title vectors into one (n_titles, n_dims) array; np.vstack
# raises if the rows do not all have the same length.
title_vec = np.vstack(title_v)
print(title_vec.shape)

# Encode the target as a real integer array: 'REAL' -> 1, anything else -> 0.
# A numeric dtype (rather than dtype=object) can be passed to estimators and
# metrics directly; existing `.astype(int)` calls on it remain valid.
label = np.asarray((df['label'] == 'REAL').astype(int))

(6335, 300)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for evaluation; the fixed seed makes the
# partition identical on every run.
seed = 42
test_size = 0.33
split_parts = train_test_split(
    title_vec,
    label,
    test_size=test_size,
    random_state=seed,
)
X_train, X_test, y_train, y_test = split_parts
print(X_test.shape)

(2091, 300)


In [5]:
# Baseline model: fit a logistic regression on the training split and report
# its metrics on the held-out test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

# Convert the targets to int once instead of on every call below.
y_train_int = y_train.astype(int)
y_test_int = y_test.astype(int)

model = LogisticRegression()
model = model.fit(X_train, y_train_int)

y_pred = model.predict(X_test)

# Peek at the first few predictions as a sanity check.
print(y_pred[0:10])
# Confusion matrix on the TEST set (rows: true class, columns: predicted class).
print(confusion_matrix(y_true=y_test_int, y_pred=y_pred))
# Mean accuracy on the TEST set (not the training set).
score = model.score(X_test, y_test_int)

# Precision / recall / F1 for the positive class (label 1, i.e. 'REAL').
p = precision_score(y_test_int, y_pred, average='binary')
r = recall_score(y_test_int, y_pred, average='binary')
f1score = f1_score(y_test_int, y_pred, average='binary')
print('accuracy:',score)
print('precision:',p)
print('recall:',r)
print('f1score:',f1score)

[0 0 0 1 0 0 0 0 1 1]
[[837 234]
 [245 775]]
accuracy: 0.770923003348
precision: 0.768087215064
recall: 0.759803921569
f1score: 0.763923114835


In [6]:
# Candidate classifiers to compare on the title vectors.
# `names` and `classifiers` are parallel lists: names[k] labels classifiers[k],
# so keep them in the same order when adding or removing an entry.
names = ["Nearest Neighbors", "Linear SVM",
         "Decision Tree", "Random Forest", "AdaBoost",
         "Naive Bayes" ]
# Estimators that draw random numbers while fitting are given
# random_state=seed (the seed already used for the train/test split) so the
# reported scores are reproducible from run to run.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5, random_state=seed),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=seed),
    AdaBoostClassifier(random_state=seed),
    GaussianNB()]

In [7]:
# Fit each candidate classifier on the training split, then report its
# accuracy and F1 score on the held-out test split.
train_targets = y_train.astype(int)
test_targets = y_test.astype(int)
for name, clf in zip(names, classifiers):
    clf.fit(X_train, train_targets)
    y_pred = clf.predict(X_test)
    score = clf.score(X_test, test_targets)
    f1score = f1_score(test_targets, y_pred, average='binary')
    print(name,'accuracy :',score,'        f1 score :',f1score)

Nearest Neighbors accuracy : 0.713534194165         f1 score : 0.701246882793
Linear SVM accuracy : 0.735054997609         f1 score : 0.723276723277
Decision Tree accuracy : 0.724055475849         f1 score : 0.715342871238
Random Forest accuracy : 0.713534194165         f1 score : 0.708515815085
AdaBoost accuracy : 0.744619799139         f1 score : 0.739002932551
Naive Bayes accuracy : 0.709708273553         f1 score : 0.713273500236


In [8]:
# Sweep the random forest's max_depth from 1 to 14 and print the test-set F1
# score for each depth.
# The forest is seeded with the same `seed` as the train/test split so that
# differences between depths come from the depth itself, not from a different
# random draw on every fit, and so the sweep is reproducible.
y_train_int = y_train.astype(int)
y_test_int = y_test.astype(int)
for depth in range(1, 15):
    clf = RandomForestClassifier(max_depth=depth, n_estimators=10,
                                 max_features=2, random_state=seed)
    clf.fit(X_train, y_train_int)
    y_pred = clf.predict(X_test)
    f1score = f1_score(y_test_int, y_pred, average='binary')
    print('max_depth ',depth,': ',f1score)

max_depth  1 :  0.695815702868
max_depth  2 :  0.680623174294
max_depth  3 :  0.710825506673
max_depth  4 :  0.700350175088
max_depth  5 :  0.70137524558
max_depth  6 :  0.722032242306
max_depth  7 :  0.717348927875
max_depth  8 :  0.73385518591
max_depth  9 :  0.732779677577
max_depth  10 :  0.728871519297
max_depth  11 :  0.725742574257
max_depth  12 :  0.730562347188
max_depth  13 :  0.722085587801
max_depth  14 :  0.718503937008


In [3]:
import numpy as np

# Wrap a plain Python integer in a zero-dimensional NumPy array and print it.
w = np.array(10)
print(w)

10
