In [1]:
import pandas as pd
from matplotlib.pylab import plt
import numpy as np
import gensim


In [2]:
df = pd.read_csv("../processed_datasets/WELFake_Dataset_processed.csv")

In [3]:
df.head()

Unnamed: 0,title,text,label,all,length,stem
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,law enforcement high alert threats cops whites...,4250,law enforc high alert threat cop white $number...
1,,Did they post their votes for Hillary already?,1,post votes hillary already?,39,post vote hillari already?
2,UNBELIEVABLE! OBAMAS ATTORNEY GENERAL SAYS MOS...,"Now, most of the demonstrators gathered last n...",1,unbelievable! obamas attorney general charlott...,295,unbelievable! obama attorney general charlott ...
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"bobby jindal, raised hindu, story christian co...",6736,"bobbi jindal, rais hindu, stori christian conv..."
4,SATAN $NUMBER$: Russia unvelis an image of its...,"The RS-28 Sarmat missile, dubbed Satan $NUMBER...",1,satan $number$: russia unvelis image terrifyin...,1733,satan $number$: russia unv imag terrifi supern...


In [4]:
df = df.fillna(' ')

In [5]:
import gensim.downloader as api


Pre-trained glove vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased.

In [6]:
glove = api.load('glove-twitter-50')

In [7]:
def sent_2vec(sentence):
    vector_size = glove.vector_size
    glove_res = np.zeros(vector_size)
    # print(wv_res)
    cpt = 1
    for w in sentence:
        if w in glove:
            cpt += 1
            glove_res += glove[w]
    glove_res = glove_res/cpt
    return glove_res

In [8]:
import nltk

In [9]:
df['tokens'] = df.apply(lambda row: nltk.word_tokenize(row['all']), axis=1)

In [10]:
df['vec'] = df['tokens'].apply(sent_2vec)

In [11]:
X = df['vec'].to_list()
y = df['label'].to_list()

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [13]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
def cross_validate_train(X,y,num_folds,clf):
    
    if clf == 'SVC':
        clf = SVC()
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf']
        }
    elif clf == 'NB':
        clf = GaussianNB()
        param_grid = {  
            'var_smoothing': np.logspace(0,-9, num=100)
        }
    elif clf == 'PAC':
        clf = PassiveAggressiveClassifier()
        param_grid = {'C': [0.1, 1, 10]}

    elif clf == 'random_forest':
        clf = RandomForestClassifier()
        param_grid = {
            'n_estimators': [100, 200, 300]
        }
    else:
        clf = LogisticRegression(max_iter = 2000)
        param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
    
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=num_folds, n_jobs=-1, verbose=1)
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return best_params, best_score
    

In [14]:

def search_models(k):
    params,score = cross_validate_train(X_train,y_train,k,'SVC')
    print("Best svc for k =",k,params,score)
    best_params_pac,best_score_pac = cross_validate_train(X_train,y_train,k,'PAC')
    print("Best pac for k =",k,best_params_pac,best_score_pac)
    best_params_log,best_score_log = cross_validate_train(X_train,y_train,k,'log')
    print("Best log for k =",k,best_params_log,best_score_log)
    best_params_rf,best_score_rf = cross_validate_train(X_train,y_train,k,'random_forest')
    print("Best rf for k =",k,best_params_rf,best_score_rf)


In [17]:
ix = [5,6,7]
for i in ix:
    search_models(i)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best svc for k = 5 {'C': 100, 'kernel': 'rbf'} 0.8806923924548713
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best pac for k = 5 {'C': 0.1} 0.8025160263933359
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best log for k = 5 {'C': 10} 0.8211287660270706
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best rf for k = 5 {'n_estimators': 300} 0.8672489957030336
Fitting 6 folds for each of 8 candidates, totalling 48 fits
Best svc for k = 6 {'C': 100, 'kernel': 'rbf'} 0.88159400940476
Fitting 6 folds for each of 3 candidates, totalling 18 fits
Best pac for k = 6 {'C': 0.1} 0.7629077965049634
Fitting 6 folds for each of 5 candidates, totalling 30 fits
Best log for k = 6 {'C': 10} 0.8212691397850613
Fitting 6 folds for each of 3 candidates, totalling 18 fits
Best rf for k = 6 {'n_estimators': 300} 0.8691524343727428
Fitting 7 folds for each of 8 candidates, totalling 56 fits
Best svc for k = 7

In [18]:
import pickle
#Creating each classifier
svm_classifier = SVC(C = 100,kernel = 'rbf')
log_classifier = LogisticRegression(C = 100,max_iter = 2000)
rf_classifier = RandomForestClassifier(n_estimators = 300)
pac_classifier = PassiveAggressiveClassifier(C = 10)

clf_list = [svm_classifier,log_classifier,rf_classifier,pac_classifier]
clf_names = ["SVM","Logistic Regression","Random Forest","Passive Agressive Classifier"]
for i in range(len(clf_list)):
    clf_list[i].fit(X_train,y_train)
    filename = "embedding_clfs/" + clf_names[i] + ".sav"
    pickle.dump(clf_list[i], open(filename, 'wb'))
    print("done")

done
done
done
done


In [17]:
from sklearn import metrics


#Training the best models
def evaluate(model_name,model,test_set,test_labels):
    predicted = model.predict(test_set)
    print(model_name + "Accuracy:",metrics.accuracy_score(test_labels, predicted))
    print(model_name +"Precision:",metrics.precision_score(test_labels, predicted))
    print(model_name +"Recall:",metrics.recall_score(test_labels, predicted))
    print(model_name +"Matthews Coefficient:",metrics.matthews_corrcoef(test_labels, predicted))

In [20]:
print(type(X_test))
for i in range(len(clf_list)):
    evaluate(clf_names[i],clf_list[i],X_test,y_test)

<class 'list'>
SVMAccuracy: 0.8842855998076768
SVMPrecision: 0.8852891024647341
SVMRecall: 0.890257209664848
SVMMatthews Coefficient: 0.7683631520224086
Logistic RegressionAccuracy: 0.8271496113470631
Logistic RegressionPrecision: 0.833072590738423
Logistic RegressionRecall: 0.8300857365549493
Logistic RegressionMatthews Coefficient: 0.6540658536329983
Random ForestAccuracy: 0.8792371183588429
Random ForestPrecision: 0.8660501193317423
Random ForestRecall: 0.9050662509742791
Random ForestMatthews Coefficient: 0.75878522232777
Passive Agressive ClassifierAccuracy: 0.8158506290568155
Passive Agressive ClassifierPrecision: 0.7948717948717948
Passive Agressive ClassifierRecall: 0.865003897116134
Passive Agressive ClassifierMatthews Coefficient: 0.6330937781554473


In [23]:
for i in ix:
    best_nb_score,best_nb_params = cross_validate_train(X_train,y_train,i,'NB')
    print(best_nb_score,best_nb_params)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'var_smoothing': 0.0004328761281083057} 0.7078917509796707
Fitting 6 folds for each of 100 candidates, totalling 600 fits
{'var_smoothing': 1e-05} 0.7077716206137601
Fitting 7 folds for each of 100 candidates, totalling 700 fits
{'var_smoothing': 0.0002848035868435802} 0.7079518901797741


In [28]:
model = GaussianNB(var_smoothing = 0.0003)
model.fit(X_train,y_train)
filename = "embedding_clfs/naive_bayes.sav"
pickle.dump(model, open(filename, 'wb'))
evaluate("Naive Bayes",model,X_test,y_test)

Naive BayesAccuracy: 0.7111948072762241
Naive BayesPrecision: 0.7583164859400845
Naive BayesRecall: 0.6431800467653936
Naive BayesMatthews Coefficient: 0.4296909884731936


Testing on new data

In [15]:
from sklearn.ensemble import VotingClassifier

svm_classifier = SVC(C = 100,kernel = 'rbf')
log_classifier = LogisticRegression(C = 10,max_iter = 2000)
rf_classifier = RandomForestClassifier(n_estimators = 300)
pac_classifier = PassiveAggressiveClassifier(C = 0.1)
nb_classifier = GaussianNB(var_smoothing = 0.0003)
ens_list = [('SVM',svm_classifier),('RF',rf_classifier)]
vot_hard = VotingClassifier(estimators = ens_list, voting ='hard')
vot_hard.fit(X_train, y_train)


VotingClassifier(estimators=[('SVM', SVC(C=100)),
                             ('RF', RandomForestClassifier(n_estimators=300))])

In [19]:
evaluate("Hard voting classifier",vot_hard,X_test,y_test)

Hard voting classifierAccuracy: 0.8717845981248498
Hard voting classifierPrecision: 0.9059180576631259
Hard voting classifierRecall: 0.8375681995323461
Hard voting classifierMatthews Coefficient: 0.7461673581613508


In [18]:
import pickle
filename = "embedding_clfs/voting.sav"
pickle.dump(vot_hard, open(filename, 'wb'))