In [None]:
# Standard-library and third-party dependencies.
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Append the parent directory to the import search path so that the
# project-local module imported below can be resolved from this folder.
import sys
sys.path.append( '../' )
import importlib

# Project-local module. Reloading it re-executes its source, so edits made to
# the module are picked up without restarting the kernel.
import measures
importlib.reload(measures)

In [21]:
# Both feature tables live side by side in ../csvs; they differ only in
# whether the file name carries the "_with_stopwords" suffix.
csv_dir = os.path.join('..', 'csvs')
file_path_with_stopwords = os.path.join(csv_dir, 'syntatic_word_max_measures_with_stopwords.csv')
file_path_without_stopwords = os.path.join(csv_dir, 'syntatic_word_max_measures.csv')

# Read each table into its own frame.
df_with_stopwords, df_without_stopwords = (
    pd.read_csv(csv_path)
    for csv_path in (file_path_with_stopwords, file_path_without_stopwords)
)

# Peek at the first rows to sanity-check the column layout.
df_with_stopwords.head(3)

Unnamed: 0,news_id,betweenness,closeness,eigenvector,katz,pagerank,hubs,authorities,clustering,average_clustering,correlation,transitivity,density,label
0,1,0.022053,0.179012,0.455842,0.202861,0.047915,0.084525,0.210602,0.5,0.021807,-0.278426,0.01581,0.018809,fake
1,1,0.012491,0.324142,0.723488,0.336205,0.09046,0.045897,0.349439,0.333333,0.023288,-0.252631,0.034682,0.016645,true
2,10,0.03472,0.225211,0.597723,0.238283,0.060163,0.047667,0.228952,0.333333,0.016034,-0.254378,0.017562,0.012847,fake


In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Fixed seed handed to every estimator that draws random numbers while
# fitting, so that re-running the notebook reproduces the same accuracies.
SEED = 42

NB = GaussianNB()
DT = DecisionTreeClassifier(random_state=SEED)
RF = RandomForestClassifier(random_state=SEED)
KNN = KNeighborsClassifier()
SVM = SVC()
MLP = MLPClassifier(max_iter=1000, random_state=SEED)

# Single source of truth for the (short name -> estimator) pairing; the two
# parallel lists consumed by later cells are derived from it so they cannot
# drift out of sync. Dict insertion order fixes the evaluation order.
classifiers = {
    'NB': NB,
    'DT': DT,
    'RF': RF,
    'KNN': KNN,
    'SVM': SVM,
    'MLP': MLP,
}

algs = list(classifiers.values())
algs_names = list(classifiers.keys())

In [None]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score

# Column-oriented accumulators ("measure" / "algorithm" / "accuracy" lists),
# one per corpus variant; they are turned into DataFrames in a later cell.
scores_single_with_stopwords = defaultdict(list)
scores_single_without_stopwords = defaultdict(list)

# Positional slice of the feature columns.
# NOTE(review): presumably this spans betweenness..density, skipping the id
# column in front and the label at the end -- confirm against the CSV header.
# NOTE(review): `measures` is also the name of the module imported in the
# first cell; this assignment rebinds it to a column Index.
measures = df_with_stopwords.columns[1:13]

# Pair every accumulator with the frame it is scored on. The order
# (with stopwords first, then without) fixes the sequence of CV runs.
single_targets = (
    (scores_single_with_stopwords, df_with_stopwords),
    (scores_single_without_stopwords, df_without_stopwords),
)

for alg, alg_name in tqdm(zip(algs, algs_names), total=len(algs), position=0, leave=True, desc="Single measure"):
    for measure in tqdm(measures, total=len(measures), position=1, leave=True, desc=alg_name):
        for score_log, corpus_df in single_targets:
            score_log["measure"].append(measure)
            score_log["algorithm"].append(alg_name)
            # The estimator needs a 2-D input, so the single feature column
            # is reshaped to (n_samples, 1) before 5-fold cross-validation.
            single_feature = corpus_df[measure].values.reshape(-1, 1)
            fold_accuracies = cross_val_score(alg, single_feature, corpus_df['label'], cv=5, scoring='accuracy')
            score_log["accuracy"].append(fold_accuracies.mean())

In [24]:
# Materialise the per-variant score logs as flat frames.
df_single_with_stopwords = pd.DataFrame(scores_single_with_stopwords)
df_single_without_stopwords = pd.DataFrame(scores_single_without_stopwords)

# Join the two variants side by side on (measure, algorithm); the suffixes
# keep the two accuracy columns apart. The key pair then becomes the index.
single_keys = ['measure', 'algorithm']
single_wide = (
    df_single_with_stopwords
    .merge(
        df_single_without_stopwords,
        on=single_keys,
        suffixes=('_with_stopwords', '_without_stopwords'),
    )
    .set_index(single_keys)
)

# Re-label the two accuracy columns with a two-level header
# (variant, metric) and rank the rows by the with-stopwords accuracy.
cols = pd.MultiIndex.from_product([["with stopwords", "without stopwords"], ['accuracy']])
df_single = pd.DataFrame(
    single_wide[["accuracy_with_stopwords", "accuracy_without_stopwords"]].values,
    columns=cols,
    index=single_wide.index,
).sort_values(by=('with stopwords', 'accuracy'), ascending=False)

In [25]:
save_path = os.path.join('..', 'results', 'compare_models_single_syntatic_word_max.csv')

# Make sure the destination folder exists before writing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Flatten the (measure, algorithm) index into columns placed on the second
# header level, and write that flattened copy without the positional index.
# The flattened frame gets its own name and `df_single` is left untouched, so
# this cell can be re-run safely: calling reset_index a second time on an
# already-flattened frame would add a spurious `index` column to the CSV.
df_single_export = df_single.reset_index(col_level=1)
df_single_export.to_csv(save_path, index=False)

In [None]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score

# Column-oriented accumulators ("algorithm" / "accuracy" lists), one per
# corpus variant; they are turned into DataFrames in a later cell.
scores_all_with_stopwords = defaultdict(list)
scores_all_without_stopwords = defaultdict(list)

# Positional slice of the feature columns.
# NOTE(review): presumably this spans betweenness..density, skipping the id
# column in front and the label at the end -- confirm against the CSV header.
# NOTE(review): `measures` is also the name of the module imported in the
# first cell; this assignment rebinds it to a column Index.
measures = df_with_stopwords.columns[1:13]

# Pair every accumulator with the frame it is scored on. The order
# (with stopwords first, then without) fixes the sequence of CV runs.
all_targets = (
    (scores_all_with_stopwords, df_with_stopwords),
    (scores_all_without_stopwords, df_without_stopwords),
)

for alg, alg_name in tqdm(zip(algs, algs_names), total=len(algs), position=0, leave=True, desc="All measures"):
    for score_log, corpus_df in all_targets:
        score_log["algorithm"].append(alg_name)
        # 5-fold cross-validated accuracy using every selected column at once.
        fold_accuracies = cross_val_score(alg, corpus_df[measures], corpus_df['label'], cv=5, scoring='accuracy')
        score_log["accuracy"].append(fold_accuracies.mean())

In [29]:
# Materialise the per-variant score logs as flat frames.
df_all_with_stopwords = pd.DataFrame(scores_all_with_stopwords)
df_all_without_stopwords = pd.DataFrame(scores_all_without_stopwords)

# Join the two variants side by side on the algorithm name; the suffixes keep
# the two accuracy columns apart. The algorithm then becomes the index.
all_wide = (
    df_all_with_stopwords
    .merge(
        df_all_without_stopwords,
        on='algorithm',
        suffixes=('_with_stopwords', '_without_stopwords'),
    )
    .set_index('algorithm')
)

# Re-label the two accuracy columns with a two-level header
# (variant, metric) and rank the rows by the with-stopwords accuracy.
cols = pd.MultiIndex.from_product([["with stopwords", "without stopwords"], ['accuracy']])
df_all = pd.DataFrame(
    all_wide[["accuracy_with_stopwords", "accuracy_without_stopwords"]].values,
    columns=cols,
    index=all_wide.index,
).sort_values(by=('with stopwords', 'accuracy'), ascending=False)

In [30]:
save_path = os.path.join('..', 'results', 'compare_models_all_syntatic_word_max.csv')

# Make sure the destination folder exists before writing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Flatten the algorithm index into a column placed on the second header
# level, and write that flattened copy without the positional index.
# The flattened frame gets its own name and `df_all` is left untouched, so
# this cell can be re-run safely: calling reset_index a second time on an
# already-flattened frame would add a spurious `index` column to the CSV.
df_all_export = df_all.reset_index(col_level=1)
df_all_export.to_csv(save_path, index=False)

In [31]:
read_path_mean = os.path.join('..', 'results', 'compare_models_single_syntatic_word_mean.csv')
read_path_max = os.path.join('..', 'results', 'compare_models_single_syntatic_word_max.csv')


def _load_single_scores(csv_path):
    """Read a saved single-measure comparison table into a tidy frame.

    The file is parsed with two header rows and its first two columns as the
    row index. The index levels are named ('measure', 'algorithm') and the
    parsed column labels are replaced by the two-level header
    ('with stopwords' | 'without stopwords') x 'accuracy'.
    """
    parsed = pd.read_csv(csv_path, header=[0, 1], index_col=[0, 1])
    parsed.index.names = ['measure', 'algorithm']
    tidy_columns = pd.MultiIndex.from_product([["with stopwords", "without stopwords"], ['accuracy']])
    return pd.DataFrame(parsed.values, columns=tidy_columns, index=parsed.index)


# Same loading routine for the "mean" and the "max" result files.
df_single_mean = _load_single_scores(read_path_mean)
df_single_max = _load_single_scores(read_path_max)

In [32]:
# For every measure (index level 0) find the row label holding the column-wise
# maximum, then keep the labels found for the SECOND column only.
# NOTE(review): with the header built when the results were loaded, the second
# column is the without-stopwords accuracy, so the "best" algorithm per
# measure is chosen by that column alone -- confirm this is intended.
best_mean_rows = df_single_mean.groupby(level=0).idxmax().iloc[:, 1].values

# Show the selected rows as percentages, highlighting each column's maximum.
(
    df_single_mean.loc[best_mean_rows]
    .style
    .highlight_max(props='color:lightgreen;', axis=0)
    .format("{:.2%}")
    .set_caption("Single measure - mean")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,with stopwords,without stopwords
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,accuracy
measure,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2
authorities,NB,49.86%,50.74%
average_clustering,SVM,56.68%,54.42%
betweenness,SVM,51.89%,55.96%
closeness,SVM,51.40%,52.46%
clustering,SVM,56.68%,54.42%
correlation,KNN,51.76%,50.97%
density,DT,50.25%,51.49%
eigenvector,SVM,53.40%,53.15%
hubs,NB,49.86%,50.74%
katz,KNN,49.94%,51.53%


In [33]:
# For every measure (index level 0) find the row label holding the column-wise
# maximum, then keep the labels found for the SECOND column only.
# NOTE(review): with the header built when the results were loaded, the second
# column is the without-stopwords accuracy, so the "best" algorithm per
# measure is chosen by that column alone -- confirm this is intended.
best_max_rows = df_single_max.groupby(level=0).idxmax().iloc[:, 1].values

# Show the selected rows as percentages, highlighting each column's maximum.
(
    df_single_max.loc[best_max_rows]
    .style
    .highlight_max(props='color:lightgreen;', axis=0)
    .format("{:.2%}")
    .set_caption("Single measure - max")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,with stopwords,without stopwords
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,accuracy
measure,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2
authorities,MLP,54.01%,53.71%
average_clustering,SVM,56.68%,54.42%
betweenness,SVM,53.33%,56.74%
closeness,SVM,56.28%,54.19%
clustering,SVM,51.68%,52.85%
correlation,KNN,51.76%,50.97%
density,RF,50.10%,51.58%
eigenvector,SVM,53.43%,51.90%
hubs,MLP,52.72%,53.51%
katz,SVM,56.01%,53.21%


In [8]:
# The all-measures comparison tables for the "mean" and "max" variants sit
# next to each other in ../results.
results_dir = os.path.join('..', 'results')
read_path_mean = os.path.join(results_dir, 'compare_models_all_syntatic_word_mean.csv')
read_path_max = os.path.join(results_dir, 'compare_models_all_syntatic_word_max.csv')

# Both files are parsed the same way: two header rows, first column as index.
all_read_options = {'header': [0, 1], 'index_col': 0}
df_all_mean = pd.read_csv(read_path_mean, **all_read_options)
df_all_max = pd.read_csv(read_path_max, **all_read_options)

In [34]:
# Render the all-measures table as percentages with each column's best
# score highlighted.
(
    df_all_mean
    .style
    .format("{:.2%}")
    .highlight_max(props='color:lightgreen;')
    .set_caption("All measures - mean")
)

Unnamed: 0_level_0,with stopwords,without stopwords
algorithm,accuracy,accuracy
SVM,58.96%,54.96%
MLP,58.79%,56.96%
RF,57.37%,57.19%
NB,56.19%,51.43%
DT,54.22%,53.14%
KNN,54.10%,52.28%


In [35]:
# Render the all-measures table as percentages with each column's best
# score highlighted.
(
    df_all_max
    .style
    .format("{:.2%}")
    .highlight_max(props='color:lightgreen;')
    .set_caption("All measures - max")
)

Unnamed: 0_level_0,with stopwords,without stopwords
algorithm,accuracy,accuracy
MLP,60.51%,60.35%
SVM,59.90%,57.93%
RF,59.39%,59.24%
NB,58.99%,56.82%
KNN,54.54%,53.68%
DT,54.10%,54.47%
