In [1]:
from collections import defaultdict, Counter
import json
from random import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
class nnEnsemble(object):
    """Neural network ensemble (stacking).

    Passes the class probabilities produced by the input classifiers to a new
    multi-layer perceptron, which uses them as its input features.

    The base classifiers are NOT fitted by this class: ``fit`` only trains the
    meta-classifier, so every entry of ``classes`` must already be fitted and
    must expose ``predict_proba``.

    Parameters
    ----------
    classes : list
        Fitted base classifiers, stored as ``self.pieces``. Their probability
        outputs are concatenated in this order.
    random_state : int or None, optional
        Forwarded to the meta ``MLPClassifier`` so that runs can be made
        reproducible. The default ``None`` keeps the previous behaviour.
    """
    def __init__(self, classes, random_state=None):
        self.pieces = classes
        self.classifier = MLPClassifier(early_stopping=True, verbose=True,
                                        random_state=random_state)

    def transform(self, X):
        """Build the meta-features for every sample (row) of ``X``.

        Returns a list with one entry per sample; each entry is a flat list
        holding the ``predict_proba`` output of every base classifier for that
        sample, concatenated in the order of ``self.pieces``.
        """
        # One batched predict_proba call per base classifier, rather than one
        # call per sample per classifier: the per-call overhead otherwise
        # dominates on large matrices.
        per_piece = [p.predict_proba(X) for p in self.pieces]
        # zip(*...) walks the per-classifier outputs in lock-step, sample by
        # sample; the inner comprehension flattens them into one feature row.
        return [
            [value for probs in sample_probs for value in probs]
            for sample_probs in zip(*per_piece)
        ]

    def fit(self, X, y):
        """Train the meta-classifier on the base classifiers' outputs for ``X``.

        Returns ``self`` so calls can be chained, as scikit-learn estimators do.
        """
        feats = self.transform(X)
        self.classifier.fit(feats, y)
        return self

    def predict(self, X):
        """Return the meta-classifier's predicted label for each sample of ``X``."""
        feats = self.transform(X)
        return self.classifier.predict(feats)

    def predict_proba(self, X):
        """Return the meta-classifier's class probabilities for each sample of ``X``."""
        feats = self.transform(X)
        return self.classifier.predict_proba(feats)


In [3]:
# Load the fake-news corpus: the file holds one JSON object per line and only
# each article's "text" field is kept.
fakenews = []

# `with` closes the file handle even if a line fails to parse.
with open("data/fakenews.json","r") as readfile:
    for line in readfile:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        article = json.loads(line)
        fakenews.append(article["text"])

In [4]:
# Load the real-news corpus (one JSON object per line), dropping every article
# whose info.site_categories contains "sports".
realnews = []
# `with` closes the file handle even if a line fails to parse.
with open("data/realnews.json","r") as readfile:
    for line in readfile:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        article = json.loads(line)
        if "sports" not in article["info"]["site_categories"]:
            realnews.append(article["text"])

In [5]:
# Hold out the trailing 10% of each corpus as the test set and keep the leading
# 90% for training. Each right-hand side is fully evaluated before any name is
# rebound, so both slices are taken from the original, un-truncated list.
# NOTE: this cell rebinds realnews/fakenews to their own prefixes, so running
# it a second time shrinks the training sets (and replaces the test sets) again.
realtest, realnews = realnews[int(len(realnews) * .9):], realnews[:int(len(realnews) * .9)]
faketest, fakenews = fakenews[int(len(fakenews) * .9):], fakenews[:int(len(fakenews) * .9)]

In [6]:
# Training documents: fake articles first, then real ones; `labels` is aligned
# with `allnews` position by position.
allnews = fakenews + realnews
labels = ["fake"] * len(fakenews)
labels.extend(["real"] * len(realnews))

# Test documents and labels, built with the same fake-then-real ordering.
alltest = faketest + realtest
testlabels = ["fake"] * len(faketest)
testlabels.extend(["real"] * len(realtest))

In [7]:
# TF-IDF text vectorizer shared by every model below; max_features=7500 caps
# the size of the vocabulary it will build.
vectorizer = TfidfVectorizer(max_features = 7500)

In [8]:
# Fit the vectorizer on the training articles only, then reuse the fitted
# vectorizer to transform the held-out test articles.
vectors = vectorizer.fit_transform(allnews)
test_vec = vectorizer.transform(alltest)

In [9]:
# Base model 1: logistic regression with default settings, trained on the
# TF-IDF training matrix.
LGR = LogisticRegression()
LGR.fit(vectors,labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Base model 2: random forest with default settings, trained on the same
# TF-IDF training matrix.
# NOTE(review): no random_state is passed, so the fitted forest (and the scores
# reported further down) can differ from run to run.
RFC = RandomForestClassifier()
RFC.fit(vectors,labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
# Base model 3: multinomial naive Bayes with default settings, trained on the
# same TF-IDF training matrix.
MNB = MultinomialNB()
MNB.fit(vectors,labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Base model 4: multi-layer perceptron on the same TF-IDF training matrix.
# early_stopping=True stops training once the validation score stops improving;
# verbose=True prints the per-iteration loss and validation score.
# NOTE(review): no random_state is passed, so results can differ between runs.
MLP = MLPClassifier(early_stopping=True,verbose=True)
MLP.fit(vectors,labels)

Iteration 1, loss = 0.34206160
Validation score: 0.933348
Iteration 2, loss = 0.13662356
Validation score: 0.954965
Iteration 3, loss = 0.08991464
Validation score: 0.958118
Iteration 4, loss = 0.06684574
Validation score: 0.960820
Iteration 5, loss = 0.05231547
Validation score: 0.964648
Iteration 6, loss = 0.04229928
Validation score: 0.961495
Iteration 7, loss = 0.03425563
Validation score: 0.964197
Iteration 8, loss = 0.02829708
Validation score: 0.961495
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [13]:
# Stack the four already-fitted base models under the MLP meta-classifier
# defined by nnEnsemble.
# NOTE(review): the meta-classifier is fit on the same `vectors`/`labels` the
# base models were trained on, so it learns from their in-sample probabilities.
# That may overstate how reliable the base outputs are; consider fitting it on
# held-out predictions instead.
ENS = nnEnsemble([LGR,MNB,MLP,RFC])
ENS.fit(vectors,labels)

Iteration 1, loss = 0.16443316
Validation score: 0.996397
Iteration 2, loss = 0.01623750
Validation score: 0.997073
Iteration 3, loss = 0.00828204
Validation score: 0.998424
Iteration 4, loss = 0.00537648
Validation score: 0.998649
Iteration 5, loss = 0.00393575
Validation score: 0.999324
Iteration 6, loss = 0.00313396
Validation score: 0.999324
Iteration 7, loss = 0.00267255
Validation score: 0.999324
Iteration 8, loss = 0.00237590
Validation score: 0.999324
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


In [14]:
def predictor(classifier,text):
    """Classify a single raw article string.

    The text is vectorized with the module-level ``vectorizer``, the resulting
    row is handed to ``classifier.predict``, and the first (only) predicted
    label is returned.
    """
    row = vectorizer.transform([text])[0]
    predictions = classifier.predict(row)
    return predictions[0]

In [27]:

# Score every model on the held-out test set, treating "fake" as the positive
# class. Prints: accuracy precision recall f1 (one line per model).
for c in [LGR,MNB,MLP,RFC,ENS]:
    tp = fp = tn = fn = 0
    # Predict the whole pre-vectorized test matrix in one call instead of
    # re-vectorizing and predicting each article separately for every model.
    # test_vec rows and testlabels entries are aligned (fake first, then real).
    for truth, pred in zip(testlabels, c.predict(test_vec)):
        if truth == "fake":
            if pred == "fake":
                tp += 1
            else:
                fn += 1
        else:
            if pred == "real":
                tn += 1
            else:
                fp += 1

    # Guard every ratio so a model that never predicts "fake" (or an empty
    # test set) reports 0.0 instead of raising ZeroDivisionError.
    total = tp + tn + fp + fn
    precision = tp * 1.0 / (tp + fp) if (tp + fp) else 0.0
    recall = tp * 1.0 / (tp + fn) if (tp + fn) else 0.0
    acc = (tp + tn) * 1.0 / total if total else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    # %-formatting emits the same text as the old print statement and also
    # runs under Python 3.
    print("%s %s %s %s" % (acc, precision, recall, f1))

0.9584515606 0.962732919255 0.952868852459 0.957775489186
0.892582083502 0.89397689769 0.888114754098 0.891036184211
0.966558573166 0.974155898291 0.957786885246 0.965902045877
0.951155249291 0.924681344148 0.981147540984 0.952077947902
0.976489663559 0.971208434712 0.981557377049 0.976355483082


In [28]:
# Hand-entered article excerpt used below as a one-off input to the ensemble.
# NOTE(review): presumably chosen as a "fake" example, going by the variable
# name; the notebook does not record where the text came from.
manual_fake_str = """Shock from the suicide-by-poisoning death of Bosnian Croat commander Slobodan Praljak at The Hague on Wednesday continues to reverberate as observers try to figure out how the military leader brought a bottle of poison into the courtroom. Speaking to Sputnik, one of his lawyers said she was shocked by what happened.
The UN confirmed Wednesday that Praljak died in hospital after drinking what is thought to be a small bottle of poison in the courtroom, just hours after a judge handed down a 20-year jail sentence during an appeal hearing at the International Criminal Tribunal for the former Yugoslavia (ICTY). Praljak was charged for war crimes committed during the Bosnian War. The moment when he took the poison was captured on court cameras.

Natasa Favo Ivanovic, one of Praljak's lawyers, offered insight into the details of Wednesday's deadly incident via a telephone interview with Sputnik Serbia.

"""

In [29]:
# Parenthesized form prints the same text on Python 2 and also runs on Python 3.
print(manual_fake_str)

Shock from the suicide-by-poisoning death of Bosnian Croat commander Slobodan Praljak at The Hague on Wednesday continues to reverberate as observers try to figure out how the military leader brought a bottle of poison into the courtroom. Speaking to Sputnik, one of his lawyers said she was shocked by what happened.
The UN confirmed Wednesday that Praljak died in hospital after drinking what is thought to be a small bottle of poison in the courtroom, just hours after a judge handed down a 20-year jail sentence during an appeal hearing at the International Criminal Tribunal for the former Yugoslavia (ICTY). Praljak was charged for war crimes committed during the Bosnian War. The moment when he took the poison was captured on court cameras.

Natasa Favo Ivanovic, one of Praljak's lawyers, offered insight into the details of Wednesday's deadly incident via a telephone interview with Sputnik Serbia.




In [30]:
# Vectorize the single sample with the already-fitted TF-IDF vectorizer.
inputs = vectorizer.transform([manual_fake_str])

In [31]:
# Show the ensemble's class probabilities and its predicted label for the
# sample. Parenthesized print works on both Python 2 and Python 3.
print(ENS.predict_proba(inputs))
print(ENS.predict(inputs))

[[ 0.8907798  0.1092202]]
['fake']


In [32]:

# Score every model on the training set itself, treating "fake" as the positive
# class. Prints: accuracy, precision, recall, f1 (one line per model).
for c in [LGR,MNB,MLP,RFC,ENS]:
    tp = fp = tn = fn = 0
    # Predict the whole pre-vectorized training matrix in one call instead of
    # re-vectorizing and predicting each article separately for every model.
    # `vectors` rows and `labels` entries are aligned (fake first, then real).
    for truth, pred in zip(labels, c.predict(vectors)):
        if truth == "fake":
            if pred == "fake":
                tp += 1
            else:
                fn += 1
        else:
            if pred == "real":
                tn += 1
            else:
                fp += 1

    # Guard every ratio so a model that never predicts "fake" (or an empty
    # data set) reports 0.0 instead of raising ZeroDivisionError.
    total = tp + tn + fp + fn
    precision = tp * 1.0 / (tp + fp) if (tp + fp) else 0.0
    recall = tp * 1.0 / (tp + fn) if (tp + fn) else 0.0
    acc = (tp + tn) * 1.0 / total if total else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    # " \t" reproduces the old print statement's output exactly (a space, then
    # the tab, with no space after it) and also runs under Python 3.
    print("%s \t%s \t%s \t%s" % (acc, precision, recall, f1))

0.962212313651 	0.957461205341 	0.966530054645 	0.961974256708
0.861910552628 	0.820430804114 	0.922723132969 	0.86857559261
0.99031662388 	0.991732139594 	0.988661202186 	0.990194289884
0.997793091024 	0.995781930334 	0.999772313297 	0.997773132158
0.999279376661 	0.999316877676 	0.999225865209 	0.99927136937
