In [None]:
# Core dependencies: filesystem paths, tabular data, numerics and notebook
# progress bars.
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Add the parent directory to the module search path so that the `measures`
# module imported below can be resolved (presumably it lives one level above
# this notebook -- confirm against the repository layout).
import sys
sys.path.append( '../' )
import importlib

# Import `measures` and immediately reload it so that edits made to the module
# on disk are picked up without restarting the kernel.
import measures
importlib.reload(measures)

In [2]:
# Both input tables live in the sibling `csvs` directory.
csv_dir = os.path.join('..', 'csvs')
file_path_with_stopwords = os.path.join(csv_dir, 'syntatic_max_measures_with_stopwords.csv')
file_path_without_stopwords = os.path.join(csv_dir, 'syntatic_max_measures.csv')

# One frame per preprocessing variant (stopwords kept / stopwords removed).
df_with_stopwords = pd.read_csv(file_path_with_stopwords)
df_without_stopwords = pd.read_csv(file_path_without_stopwords)

# Quick look at the first rows to confirm the layout.
df_with_stopwords.head(3)

Unnamed: 0,news_id,betweenness,closeness,eigenvector,katz,pagerank,hubs,authorities,density,label
0,1,0.014513,0.011814,0.702893,0.07936,0.012486,0.682282,0.075809,0.006289,fake
1,1,0.007044,0.011043,0.447059,0.07814,0.010969,0.714286,0.119048,0.006098,true
2,10,0.007227,0.007061,0.691866,0.061571,0.006828,1.0,0.0625,0.003788,fake


In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Shared seed for every estimator that draws random numbers while fitting.
# Without it the tree, forest and MLP scores change on every run, which makes
# the comparison tables below impossible to reproduce.
SEED = 42

NB = GaussianNB()
DT = DecisionTreeClassifier(random_state=SEED)
RF = RandomForestClassifier(random_state=SEED)
KNN = KNeighborsClassifier()
SVM = SVC()
MLP = MLPClassifier(max_iter=1000, random_state=SEED)

# Parallel lists: algs[i] is reported under the label algs_names[i].
algs = [NB, DT, RF, KNN, SVM, MLP]
algs_names = ['NB', 'DT', 'RF', 'KNN', 'SVM', 'MLP']

In [None]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score

# Column-wise accumulators: one entry per (measure, algorithm) pair. They are
# converted to DataFrames once the loops finish.
scores_single_with_stopwords = defaultdict(list)
scores_single_without_stopwords = defaultdict(list)

# Feature columns are picked by position (columns 1 through 8); presumably
# these are the eight graph measures sitting between `news_id` and `label` --
# verify against the CSV header if the file layout changes.
# Deliberately NOT named `measures`: that name is the module imported at the
# top of the notebook, and rebinding it would make the module unusable.
measure_cols = df_with_stopwords.columns[1:9]

# Each dataset is paired with the accumulator that receives its scores, so the
# evaluation logic is written once instead of being copy-pasted per dataset.
single_runs = [
    (df_with_stopwords, scores_single_with_stopwords),
    (df_without_stopwords, scores_single_without_stopwords),
]

for alg, alg_name in tqdm(zip(algs, algs_names), total=len(algs), position=0, leave=True, desc="Single measure"):
    for measure in tqdm(measure_cols, total=len(measure_cols), position=1, leave=True, desc=alg_name):
        for frame, bucket in single_runs:
            # Mean 5-fold accuracy using this single measure as the only feature
            # (double brackets keep the 2-D shape scikit-learn expects).
            fold_scores = cross_val_score(alg, frame[[measure]], frame['label'], cv=5, scoring='accuracy')
            bucket["measure"].append(measure)
            bucket["algorithm"].append(alg_name)
            bucket["accuracy"].append(fold_scores.mean())

In [5]:
# Turn the accumulated score lists into one frame per dataset variant.
merge_keys = ['measure', 'algorithm']
df_single_with_stopwords = pd.DataFrame(scores_single_with_stopwords)
df_single_without_stopwords = pd.DataFrame(scores_single_without_stopwords)

# Line the two variants up side by side, keyed by (measure, algorithm).
single_merged = df_single_with_stopwords.merge(
    df_single_without_stopwords,
    on=merge_keys,
    suffixes=('_with_stopwords', '_without_stopwords'),
).set_index(merge_keys)

# Re-label the two accuracy columns with a (variant, metric) column hierarchy
# and rank rows by the accuracy obtained with stopwords.
cols = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']])
accuracy_values = single_merged[["accuracy_with_stopwords", "accuracy_without_stopwords"]].values
df_single = pd.DataFrame(accuracy_values, columns=cols, index=single_merged.index).sort_values(
    by=('with stopwords', 'accuracy'), ascending=False
)

In [6]:
save_path = os.path.join('..', 'results', 'compare_models_single_syntatic_max.csv')

# Create the output directory on a fresh checkout instead of failing in to_csv.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Move the (measure, algorithm) index into ordinary columns on the second
# header level so the CSV carries them without writing a separate index.
# The flattened copy gets its own name: rebinding `df_single` here would make
# a second run of this cell add a spurious `index` column to the file.
df_single_export = df_single.reset_index(col_level=1)

df_single_export.to_csv(save_path, index=False)

In [None]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score

# Column-wise accumulators: one entry per algorithm, converted to DataFrames
# once the loop finishes.
scores_all_with_stopwords = defaultdict(list)
scores_all_without_stopwords = defaultdict(list)

# Feature columns are picked by position (columns 1 through 8); presumably
# these are the eight graph measures sitting between `news_id` and `label` --
# verify against the CSV header if the file layout changes.
# Deliberately NOT named `measures`: that name is the module imported at the
# top of the notebook, and rebinding it would make the module unusable.
measure_cols = df_with_stopwords.columns[1:9]

# Each dataset is paired with the accumulator that receives its scores, so the
# evaluation logic is written once instead of being copy-pasted per dataset.
all_runs = [
    (df_with_stopwords, scores_all_with_stopwords),
    (df_without_stopwords, scores_all_without_stopwords),
]

for alg, alg_name in tqdm(zip(algs, algs_names), total=len(algs), position=0, leave=True, desc="All measures"):
    for frame, bucket in all_runs:
        # Mean 5-fold accuracy using every measure column as a feature.
        fold_scores = cross_val_score(alg, frame[measure_cols], frame['label'], cv=5, scoring='accuracy')
        bucket["algorithm"].append(alg_name)
        bucket["accuracy"].append(fold_scores.mean())

In [8]:
# Turn the accumulated score lists into one frame per dataset variant.
df_all_with_stopwords = pd.DataFrame(scores_all_with_stopwords)
df_all_without_stopwords = pd.DataFrame(scores_all_without_stopwords)

# Line the two variants up side by side, keyed by algorithm.
all_merged = df_all_with_stopwords.merge(
    df_all_without_stopwords,
    on='algorithm',
    suffixes=('_with_stopwords', '_without_stopwords'),
).set_index('algorithm')

# Re-label the two accuracy columns with a (variant, metric) column hierarchy
# and rank algorithms by the accuracy obtained with stopwords.
cols = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']])
all_accuracy_values = all_merged[["accuracy_with_stopwords", "accuracy_without_stopwords"]].values
df_all = pd.DataFrame(all_accuracy_values, columns=cols, index=all_merged.index).sort_values(
    by=('with stopwords', 'accuracy'), ascending=False
)

In [9]:
save_path = os.path.join('..', 'results', 'compare_models_all_syntatic_max.csv')

# Create the output directory on a fresh checkout instead of failing in to_csv.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Move the algorithm index into an ordinary column on the second header level
# so the CSV carries it without writing a separate index.
# The flattened copy gets its own name: rebinding `df_all` here would make a
# second run of this cell add a spurious `index` column to the file.
df_all_export = df_all.reset_index(col_level=1)

df_all_export.to_csv(save_path, index=False)

In [10]:
read_path_mean = os.path.join('..', 'results', 'compare_models_single_syntatic_mean.csv')
read_path_max = os.path.join('..', 'results', 'compare_models_single_syntatic_max.csv')


def _read_single_results(csv_path):
    """Load a single-measure results CSV and restore its labelled layout.

    The file is read with a two-row header and its first two columns as the
    index. The index levels are then named ('measure', 'algorithm') and the
    value columns are re-labelled as (variant, 'accuracy') pairs.
    """
    raw = pd.read_csv(csv_path, header=[0, 1], index_col=[0, 1])
    raw.index.names = ['measure', 'algorithm']
    labelled_cols = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']])
    return pd.DataFrame(raw.values, columns=labelled_cols, index=raw.index)


df_single_mean = _read_single_results(read_path_mean)
df_single_max = _read_single_results(read_path_max)

In [19]:
# For every measure, take the (measure, algorithm) row with the highest value
# in the column at position 1 (the "without stopwords" accuracy).
best_rows_mean = df_single_mean.groupby(level=0).idxmax().iloc[:, 1].to_numpy()

# Show those winners as percentages, highlighting the best value per column.
(
    df_single_mean.loc[best_rows_mean]
    .style
    .highlight_max(props='color:lightgreen;', axis=0)
    .format("{:.2%}")
    .set_caption("Single measure - mean")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,with stopwords,without stopwords
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,accuracy
measure,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2
authorities,RF,49.64%,52.22%
betweenness,SVM,52.96%,57.31%
closeness,SVM,51.07%,52.29%
density,DT,50.22%,52.17%
eigenvector,SVM,51.78%,53.28%
hubs,RF,49.76%,52.40%
katz,KNN,51.57%,52.25%
pagerank,DT,50.06%,52.17%


In [17]:
# For every measure, take the (measure, algorithm) row with the highest value
# in the column at position 1 (the "without stopwords" accuracy).
best_rows_max = df_single_max.groupby(level=0).idxmax().iloc[:, 1].to_numpy()

# Show those winners as percentages, highlighting the best value per column.
(
    df_single_max.loc[best_rows_max]
    .style
    .highlight_max(props='color:lightgreen;', axis=0)
    .format("{:.2%}")
    .set_caption("Single measure - max")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,with stopwords,without stopwords
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,accuracy
measure,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2
authorities,RF,48.69%,51.71%
betweenness,SVM,55.06%,58.69%
closeness,DT,50.67%,52.37%
density,RF,49.97%,52.29%
eigenvector,KNN,51.10%,50.81%
hubs,RF,50.61%,51.61%
katz,KNN,50.62%,52.64%
pagerank,MLP,49.93%,52.26%


In [13]:
# Result files for the "all measures" experiments (mean and max variants).
results_dir = os.path.join('..', 'results')
read_path_mean = os.path.join(results_dir, 'compare_models_all_syntatic_mean.csv')
read_path_max = os.path.join(results_dir, 'compare_models_all_syntatic_max.csv')

# Both files share one layout: a two-row header and the first column as index.
all_csv_options = dict(header=[0, 1], index_col=0)
df_all_mean = pd.read_csv(read_path_mean, **all_csv_options)
df_all_max = pd.read_csv(read_path_max, **all_csv_options)

In [14]:
# Render accuracies as percentages and highlight the best score per column.
(
    df_all_mean.style
    .format("{:.2%}")
    .highlight_max(props='color:lightgreen;')
    .set_caption("All measures - mean")
)

Unnamed: 0_level_0,with stopwords,without stopwords
algorithm,accuracy,accuracy
RF,51.68%,55.00%
SVM,51.31%,54.54%
DT,51.26%,52.26%
MLP,50.90%,55.26%
NB,50.75%,50.58%
KNN,49.39%,52.94%


In [15]:
# Render accuracies as percentages and highlight the best score per column.
(
    df_all_max.style
    .format("{:.2%}")
    .highlight_max(props='color:lightgreen;')
    .set_caption("All measures - max")
)

Unnamed: 0_level_0,with stopwords,without stopwords
algorithm,accuracy,accuracy
MLP,53.97%,57.74%
RF,53.37%,55.31%
DT,50.85%,52.00%
KNN,50.35%,52.04%
NB,49.97%,51.04%
SVM,49.69%,49.90%
