In [1]:
import random
import numpy as np
import math

In [None]:
#prepare datasets
#designate 0 = fake, 1 = real for our labels

fake_data = "clean_fake.txt"
real_data = "clean_real.txt"


def _read_tokenized_lines(path):
    """Return one list of whitespace-separated tokens per line of `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    reading finishes instead of being left open for the garbage collector.
    """
    with open(path) as handle:
        return [line.rstrip('\n').split() for line in handle]


def _append_split(examples, label):
    """Distribute `examples` over the train/valid/test lists by position.

    Indices below 70% of the list go to train, indices below 85% go to
    validation, and the rest go to test. `label` is appended to the matching
    label list for every example.
    """
    total = len(examples)
    for idx, example in enumerate(examples):
        if idx < 0.7*total:
            bucket, labels = train_set, train_label
        elif idx < 0.85*total:
            bucket, labels = valid_set, valid_label
        else:
            bucket, labels = test_set, test_label
        bucket.append(example)
        labels.append(label)


fake = _read_tokenized_lines(fake_data)
real = _read_tokenized_lines(real_data)

# Re-seed before each shuffle so each list gets a reproducible permutation
# that does not depend on the other list.
random.seed(1)
random.shuffle(fake)
random.seed(1)
random.shuffle(real)

train_set, valid_set, test_set = [], [], []
train_label, valid_label, test_label = [], [], []

# Fake examples are appended first, then real ones, within every split.
_append_split(fake, 0)
_append_split(real, 1)
        
        
#
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS  
def remove_stop(some_set, stop_words=None):
    """Strip stop words from every example of `some_set`, in place.

    Parameters
    ----------
    some_set : list of list of str
        Tokenized examples; each inner list is rewritten in place.
    stop_words : container of str, optional
        Words to drop. When omitted, the ENGLISH_STOP_WORDS set imported
        from scikit-learn is used.

    Returns
    -------
    The same `some_set` object that was passed in.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    for example in some_set:
        # Rebuild the list via slice assignment instead of calling remove()
        # while iterating: removing during iteration shifts the remaining
        # items left, so the word right after each removed one was skipped
        # and repeated/adjacent stop words were left behind.
        example[:] = [word for word in example if word not in stop_words]
    return some_set
                
# train_set = remove_stop(train_set)
# valid_set = remove_stop(valid_set)
# test_set = remove_stop(test_set)

        
#build the word frequency table from the training split only
#word_freq[word] = (count of label-0 examples containing word, count of label-1 examples containing word)
word_freq = {}
for tokens, label in zip(train_set, train_label):
    # set() makes each word count at most once per example
    for token in set(tokens):
        zeros, ones = word_freq.get(token, (0, 0))
        if label == 0:
            word_freq[token] = (zeros + 1, ones)
        elif label == 1:
            word_freq[token] = (zeros, ones + 1)
                
                
#give every distinct word (across train, validation and test) its own integer index,
#numbered from 0 in order of first appearance
word_dict = {}
all_sets = train_set + valid_set + test_set
for sentence in all_sets:
    for word in sentence:
        # len(word_dict) is evaluated before insertion, so a new word gets the next free index
        word_dict.setdefault(word, len(word_dict))

#convert each set to np matrices and vectors
def convert_to_mat(my_set, my_label, word_dict):
    """Turn tokenized examples and labels into numpy arrays.

    my_set    -- list of token lists, one per example
    my_label  -- list of labels, one per example
    word_dict -- mapping word -> column index; every word in my_set must be a key

    Returns (mat_set, vec_label):
      mat_set   -- array of shape (len(my_set), len(word_dict)) holding 1 where
                   the example contains the word and 0 elsewhere
      vec_label -- the labels as a column vector of shape (len(my_label), 1)
    """
    n_rows = len(my_set)
    n_cols = len(word_dict)
    mat_set = np.zeros((n_rows, n_cols))
    vec_label = np.asarray(my_label).reshape((len(my_label), 1))

    for row, sentence in enumerate(my_set):
        for token in sentence:
            # presence indicator: repeated words still just set the cell to 1
            mat_set[row, word_dict[token]] = 1

    return mat_set, vec_label
        

In [None]:
# Vectorize each split with the shared vocabulary so the columns line up.
x_train, y_train = convert_to_mat(train_set, train_label, word_dict)
x_valid, y_valid = convert_to_mat(valid_set, valid_label, word_dict)
x_test, y_test = convert_to_mat(test_set, test_label, word_dict)

# Single-argument print(...) prints the same text under Python 2 and Python 3;
# the bare `print x` statement form is a SyntaxError on Python 3.
for arr in (x_train, y_train, x_valid, y_valid, x_test, y_test):
    print(arr.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Random forest baseline. random_state is pinned (as in the other model cells)
# so that re-running the cell reproduces the same scores.
clf = RandomForestClassifier(n_estimators=13, random_state=0)
# y_train is an (n, 1) column vector; flatten it to the 1-D shape that
# scikit-learn estimators expect for a single-output target.
clf.fit(x_train, y_train.ravel())

# %-formatting inside a single-argument print(...) gives the same text on
# Python 2 and Python 3 (the `print a, b` statement form is Python 2 only).
y_pred = clf.predict(x_valid)
print('validation  %s' % accuracy_score(y_valid, y_pred))

y_pred = clf.predict(x_test)
print('test  %s' % accuracy_score(y_test, y_pred))

0.80204081632653057

0.80981595092024539

In [17]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes over the word-presence features; alpha is the smoothing strength.
clf = MultinomialNB(alpha=0.65)
# y_train is an (n, 1) column vector; flatten it to the 1-D shape that
# scikit-learn estimators expect for a single-output target.
clf.fit(x_train, y_train.ravel())

# %-formatting inside a single-argument print(...) gives the same text on
# Python 2 and Python 3 (the `print a, b` statement form is Python 2 only).
y_pred = clf.predict(x_valid)
print('validation  %s' % accuracy_score(y_valid, y_pred))

y_pred = clf.predict(x_test)
print('test  %s' % accuracy_score(y_test, y_pred))


validation  0.818367346939
test  0.844580777096


In [18]:
from sklearn.neural_network import MLPClassifier
# Two-hidden-layer perceptron (100 and 30 units) trained with L-BFGS; seeded for reproducibility.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 30), random_state=1)
# y_train is an (n, 1) column vector; flatten it to the 1-D shape that
# scikit-learn estimators expect for a single-output target.
clf.fit(x_train, y_train.ravel())

# %-formatting inside a single-argument print(...) gives the same text on
# Python 2 and Python 3 (the `print a, b` statement form is Python 2 only).
y_pred = clf.predict(x_valid)
print('validation  %s' % accuracy_score(y_valid, y_pred))

y_pred = clf.predict(x_test)
print('test  %s' % accuracy_score(y_test, y_pred))


validation  0.795918367347
test  0.838445807771


In [8]:
from sklearn.ensemble import ExtraTreesClassifier
# Extremely randomized trees: 20 unpruned trees, seeded for reproducibility.
clf = ExtraTreesClassifier(n_estimators=20, max_depth=None,min_samples_split=2, random_state=0)
# y_train is an (n, 1) column vector; flatten it to the 1-D shape that
# scikit-learn estimators expect for a single-output target.
# NOTE(review): the warning fragment recorded under this cell looks like the
# column-vector-y warning this avoids -- confirm on re-run.
clf.fit(x_train, y_train.ravel())

# %-formatting inside a single-argument print(...) gives the same text on
# Python 2 and Python 3 (the `print a, b` statement form is Python 2 only).
y_pred = clf.predict(x_valid)
print('validation  %s' % accuracy_score(y_valid, y_pred))

y_pred = clf.predict(x_test)
print('test  %s' % accuracy_score(y_test, y_pred))


  This is separate from the ipykernel package so we can avoid doing imports until


validation  0.812244897959
test  0.826175869121


In [19]:
from sklearn.linear_model import PassiveAggressiveClassifier
# `n_iter` was deprecated in scikit-learn 0.19 and later removed; `max_iter`
# is its replacement. tol=None disables early stopping so exactly 30 passes
# are made, matching the old n_iter=30 behaviour.
# NOTE(review): requires scikit-learn >= 0.19 -- confirm the installed version.
clf = PassiveAggressiveClassifier(max_iter=30, tol=None, random_state=0)
# y_train is an (n, 1) column vector; flatten it to the 1-D shape that
# scikit-learn estimators expect for a single-output target.
clf.fit(x_train, y_train.ravel())

# %-formatting inside a single-argument print(...) gives the same text on
# Python 2 and Python 3 (the `print a, b` statement form is Python 2 only).
y_pred = clf.predict(x_valid)
print('validation  %s' % accuracy_score(y_valid, y_pred))

y_pred = clf.predict(x_test)
print('test  %s' % accuracy_score(y_test, y_pred))



validation  0.795918367347
test  0.811860940695


In [None]:
from sklearn import svm
# Support vector classifier with the library's default kernel and hyper-parameters.
clf = svm.SVC()
# y_train is an (n, 1) column vector; flatten it to the 1-D shape that
# scikit-learn estimators expect for a single-output target.
clf.fit(x_train, y_train.ravel())

# %-formatting inside a single-argument print(...) gives the same text on
# Python 2 and Python 3 (the `print a, b` statement form is Python 2 only).
y_pred = clf.predict(x_valid)
print('validation  %s' % accuracy_score(y_valid, y_pred))

y_pred = clf.predict(x_test)
print('test  %s' % accuracy_score(y_test, y_pred))