In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import sys
sys.path.append( '../' )
import importlib

import measures
importlib.reload(measures)

<module 'measures' from 'd:\\Projetos\\TCC\\Part_2.1-Complex_Networks\\syntactic\\measures.py'>

In [2]:
file_path_with_stopwords = os.path.join('..', 'csvs', 'syntatic_word_mean_measures_with_stopwords.csv')
file_path_without_stopwords = os.path.join('..', 'csvs', 'syntatic_word_mean_measures.csv')

df_with_stopwords = pd.read_csv(file_path_with_stopwords)
df_without_stopwords = pd.read_csv(file_path_without_stopwords)

df_with_stopwords.head(3)

Unnamed: 0,news_id,betweenness,closeness,eigenvector,katz,pagerank,hubs,authorities,clustering,average_clustering,correlation,transitivity,density,label
0,1,0.002859,0.030342,0.056997,0.102508,0.010753,0.010753,0.010753,0.021084,0.021084,-0.238608,0.015873,0.017064,fake
1,1,0.001572,0.025936,0.015334,0.098473,0.010101,0.010101,0.010101,0.009144,0.009144,-0.277525,0.011834,0.015976,true
2,10,0.001142,0.020293,0.01469,0.079517,0.006623,0.006623,0.006623,0.011587,0.011587,-0.223655,0.016129,0.011082,fake


In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

NB = GaussianNB()
DT = DecisionTreeClassifier()
RF = RandomForestClassifier()
KNN = KNeighborsClassifier()
SVM = SVC()
MLP = MLPClassifier(max_iter=1000)

algs = [NB, DT, RF, KNN, SVM, MLP]
algs_names = ['NB', 'DT', 'RF', 'KNN', 'SVM', 'MLP']

In [4]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score

scores_single_with_stopwords = defaultdict(list)
scores_single_without_stopwords = defaultdict(list)
measures = df_with_stopwords.columns[1:13]

for alg, alg_name in tqdm(zip(algs, algs_names), total=len(algs), position=0, leave=True, desc="Single measure"):
    for measure in tqdm(measures, total=len(measures), position=1, leave=True, desc=alg_name):
        scores_single_with_stopwords["measure"].append(measure)
        scores_single_with_stopwords["algorithm"].append(alg_name)
        scores_single_with_stopwords["accuracy"].append(cross_val_score(alg, df_with_stopwords[measure].values.reshape(-1, 1), df_with_stopwords['label'], cv=5, scoring='accuracy').mean())

        scores_single_without_stopwords["measure"].append(measure)
        scores_single_without_stopwords["algorithm"].append(alg_name)
        scores_single_without_stopwords["accuracy"].append(cross_val_score(alg, df_without_stopwords[measure].values.reshape(-1, 1), df_without_stopwords['label'], cv=5, scoring='accuracy').mean())

Single measure:   0%|          | 0/6 [00:00<?, ?it/s]

NB:   0%|          | 0/12 [00:00<?, ?it/s]

DT:   0%|          | 0/12 [00:00<?, ?it/s]

RF:   0%|          | 0/12 [00:00<?, ?it/s]

KNN:   0%|          | 0/12 [00:00<?, ?it/s]

SVM:   0%|          | 0/12 [00:00<?, ?it/s]

MLP:   0%|          | 0/12 [00:00<?, ?it/s]

In [5]:
df_single_with_stopwords = pd.DataFrame(scores_single_with_stopwords)
df_single_without_stopwords = pd.DataFrame(scores_single_without_stopwords)

df_single = pd.merge(df_single_with_stopwords, df_single_without_stopwords, on=['measure', 'algorithm'], suffixes=('_with_stopwords', '_without_stopwords')) 
df_single = df_single.set_index(['measure', 'algorithm'])

cols = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']])
df_single = pd.DataFrame(df_single[["accuracy_with_stopwords", "accuracy_without_stopwords"]].values, columns=cols, index=df_single.index)
df_single = df_single.sort_values(by=('with stopwords', 'accuracy'), ascending=False)

In [6]:
save_path = os.path.join('..', 'results', 'compare_models_single_syntatic_word_mean.csv')

df_single = df_single.reset_index(col_level=1)

df_single.to_csv(save_path, index=False)

In [7]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score

scores_all_with_stopwords = defaultdict(list)
scores_all_without_stopwords = defaultdict(list)
measures = df_with_stopwords.columns[1:13]

for alg, alg_name in tqdm(zip(algs, algs_names), total=len(algs), position=0, leave=True, desc="All measures"):
    scores_all_with_stopwords["algorithm"].append(alg_name)
    scores_all_with_stopwords["accuracy"].append(cross_val_score(alg, df_with_stopwords[measures], df_with_stopwords['label'], cv=5, scoring='accuracy').mean())

    scores_all_without_stopwords["algorithm"].append(alg_name)
    scores_all_without_stopwords["accuracy"].append(cross_val_score(alg, df_without_stopwords[measures], df_without_stopwords['label'], cv=5, scoring='accuracy').mean())

All measures:   0%|          | 0/6 [00:00<?, ?it/s]

In [8]:
df_all_with_stopwords = pd.DataFrame(scores_all_with_stopwords)
df_all_without_stopwords = pd.DataFrame(scores_all_without_stopwords)

df_all = pd.merge(df_all_with_stopwords, df_all_without_stopwords, on='algorithm', suffixes=('_with_stopwords', '_without_stopwords'))
df_all = df_all.set_index('algorithm')

cols = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']])
df_all = pd.DataFrame(df_all[["accuracy_with_stopwords", "accuracy_without_stopwords"]].values, columns=cols, index=df_all.index)
df_all = df_all.sort_values(by=('with stopwords', 'accuracy'), ascending=False)

In [9]:
save_path = os.path.join('..', 'results', 'compare_models_all_syntatic_word_mean.csv')

df_all = df_all.reset_index(col_level=1)

df_all.to_csv(save_path, index=False)

In [13]:
read_path_mean = os.path.join('..', 'results', 'compare_models_single_syntatic_word_mean.csv')
read_path_max = os.path.join('..', 'results', 'compare_models_single_syntatic_word_max.csv')

df_single_mean = pd.read_csv(read_path_mean, header=[0,1], index_col=[0,1])
df_single_mean.index.names = ['measure', 'algorithm']
df_single_mean = pd.DataFrame(df_single.values, columns=pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']]), index=df_single_mean.index)

df_single_max = pd.read_csv(read_path_max, header=[0,1], index_col=[0,1])
df_single_max.index.names = ['measure', 'algorithm']
df_single_max = pd.DataFrame(df_single.values, columns=pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']]), index=df_single_max.index)

In [16]:
df_single_mean.loc[df_single.groupby(level=0).idxmax().values[:,1]]\
            .style.highlight_max(props='color:lightgreen;', axis=0).format("{:.2%}")\
            .set_caption("Single measure - mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,with stopwords,without stopwords
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,accuracy
measure,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2
authorities,RF,50.06%,51.17%
average_clustering,MLP,56.88%,53.40%
betweenness,MLP,50.69%,55.51%
closeness,SVM,51.82%,52.24%
clustering,NB,55.83%,53.07%
correlation,RF,49.76%,50.71%
density,RF,51.56%,52.83%
eigenvector,SVM,52.83%,53.68%
hubs,RF,50.19%,50.85%
katz,KNN,47.69%,51.17%


In [17]:
df_single_max.loc[df_single.groupby(level=0).idxmax().values[:,1]]\
            .style.highlight_max(props='color:lightgreen;', axis=0).format("{:.2%}")\
            .set_caption("Single measure - max")

Unnamed: 0_level_0,Unnamed: 1_level_0,with stopwords,without stopwords
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,accuracy
measure,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2
authorities,RF,50.35%,51.90%
average_clustering,MLP,53.86%,50.74%
betweenness,MLP,49.99%,50.08%
closeness,SVM,56.58%,53.00%
clustering,NB,50.74%,49.38%
correlation,RF,49.10%,49.79%
density,RF,50.04%,50.74%
eigenvector,SVM,52.46%,50.19%
hubs,RF,49.93%,50.18%
katz,KNN,51.56%,52.83%


In [22]:
read_path_mean = os.path.join('..', 'results', 'compare_models_all_syntatic_word_mean.csv')
read_path_max = os.path.join('..', 'results', 'compare_models_all_syntatic_word_max.csv')

df_all_mean = pd.read_csv(read_path_mean, header=[0,1], index_col=0)
df_all_max  = pd.read_csv(read_path_max, header=[0,1], index_col=0)

In [23]:
df_all_mean.style.format("{:.2%}").highlight_max(props='color:lightgreen;')\
                .set_caption("All measures - mean")

Unnamed: 0_level_0,with stopwords,without stopwords
algorithm,accuracy,accuracy
SVM,58.96%,54.96%
MLP,58.79%,56.96%
RF,57.37%,57.19%
NB,56.19%,51.43%
DT,54.22%,53.14%
KNN,54.10%,52.28%


In [24]:
df_all_max.style.format("{:.2%}").highlight_max(props='color:lightgreen;')\
                .set_caption("All measures - max")

Unnamed: 0_level_0,with stopwords,without stopwords
algorithm,accuracy,accuracy
MLP,60.51%,60.35%
SVM,59.90%,57.93%
RF,59.39%,59.24%
NB,58.99%,56.82%
KNN,54.54%,53.68%
DT,54.10%,54.47%
