In [3]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import sys
# Put the parent directory on the import path; `measures_common` is loaded from there.
sys.path.append( '../' )
import importlib

import measures_common
# Re-execute the module so edits made to it on disk are picked up without restarting the kernel.
# NOTE(review): `measures_common` is not referenced by any cell visible here — confirm it is still needed.
importlib.reload(measures_common)

<module 'measures_common' from 'd:\\Projetos\\TCC\\Part_2.1-Complex_Networks\\adjacency\\..\\measures_common.py'>

In [4]:
# Read the per-text network measures twice: once computed on texts that keep
# their stopwords and once on texts with stopwords removed.
csv_dir = os.path.join('..', 'csvs')
file_path_with_stopwords = os.path.join(csv_dir, 'adjacency_max_measures_with_stopwords.csv')
file_path_without_stopwords = os.path.join(csv_dir, 'adjacency_max_measures.csv')

df_with_stopwords, df_without_stopwords = (
    pd.read_csv(csv_path)
    for csv_path in (file_path_with_stopwords, file_path_without_stopwords)
)

# Quick look at the first rows of the with-stopwords table.
df_with_stopwords.head(3)

Unnamed: 0,text,label,betweenness,closeness,eigenvector,katz,pagerank,hubs,authorities,clustering,average_clustering,correlation,transitivity,density
0,autor bestseller cita 5 expressoes indicam int...,fake,0.234984,0.160355,0.298449,0.113623,0.026214,0.688351,0.184588,0.5,0.016841,-0.041276,0.015,0.008579
1,juiz df confirma indicios expresidente petista...,fake,0.563737,0.098277,0.346027,0.149402,0.046685,0.809017,0.236068,0.5,0.023574,-0.082239,0.047619,0.01592
2,senhora 60 anos corre atras eduardo cunha aero...,fake,0.576355,0.194726,0.356792,0.215475,0.088245,0.767592,0.302776,0.166667,0.013889,0.010157,0.05,0.041379


In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Seed shared by every estimator whose fitting is randomised, so that a
# "Restart Kernel -> Run All" reproduces the same accuracy tables.
SEED = 42

NB = GaussianNB()
DT = DecisionTreeClassifier(random_state=SEED)
RF = RandomForestClassifier(random_state=SEED)
KNN = KNeighborsClassifier()
SVM = SVC()
# max_iter is raised to 1000 to give the network more epochs to converge.
MLP = MLPClassifier(max_iter=1000, random_state=SEED)

# Parallel lists consumed together via zip() by the evaluation cells.
algs = [NB, DT, RF, KNN, SVM, MLP]
algs_names = ['NB', 'DT', 'RF', 'KNN', 'SVM', 'MLP']

# zip() silently truncates to the shorter list, so fail loudly if they ever drift apart.
assert len(algs) == len(algs_names), "algs and algs_names must have the same length"

In [None]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score

# Column-wise accumulators ("measure", "algorithm", "accuracy"), one per dataset variant.
scores_single_with_stopwords = defaultdict(list)
scores_single_without_stopwords = defaultdict(list)

# Every column from position 2 onwards is treated as a feature.
# NOTE(review): assumes the first two columns are `text` and `label` — confirm against the CSV.
measures = df_with_stopwords.columns[2:]

# (data frame, accumulator) pairs; each (algorithm, measure) is scored on them in this order.
single_runs = (
    (df_with_stopwords, scores_single_with_stopwords),
    (df_without_stopwords, scores_single_without_stopwords),
)

for alg, alg_name in tqdm(zip(algs, algs_names), total=len(algs), position=0, leave=True, desc="Single measure"):
    for measure in tqdm(measures, total=len(measures), position=1, leave=True, desc=alg_name):
        for frame, scores in single_runs:
            scores["measure"].append(measure)
            scores["algorithm"].append(alg_name)

            # A single measure is reshaped to an (n_samples, 1) feature matrix.
            feature_column = frame[measure].values.reshape(-1, 1)
            fold_accuracies = cross_val_score(alg, feature_column, frame['label'], cv=5, scoring='accuracy')

            # Store the mean accuracy over the 5 folds.
            scores["accuracy"].append(fold_accuracies.mean())

In [7]:
# Turn both accumulators into frames and line them up on (measure, algorithm).
df_single_with_stopwords = pd.DataFrame(scores_single_with_stopwords)
df_single_without_stopwords = pd.DataFrame(scores_single_without_stopwords)

merged_single = df_single_with_stopwords.merge(
    df_single_without_stopwords,
    on=['measure', 'algorithm'],
    suffixes=('_with_stopwords', '_without_stopwords'),
).set_index(['measure', 'algorithm'])

# Two-level header: dataset variant on top, metric name underneath.
cols = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']])
accuracy_columns = ["accuracy_with_stopwords", "accuracy_without_stopwords"]

# Rebuild the table under the new header and rank by the with-stopwords accuracy, best first.
df_single = pd.DataFrame(
    merged_single[accuracy_columns].values,
    columns=cols,
    index=merged_single.index,
).sort_values(by=('with stopwords', 'accuracy'), ascending=False)

In [8]:
save_path = os.path.join('..', 'results', 'compare_models_single_adjacency_max.csv')

# Make sure the output folder exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Move (measure, algorithm) from the index into columns placed on the second header level.
# The guard keeps the cell idempotent: without it a second run would reset the already-flat
# RangeIndex and write a spurious extra `index` column to the CSV.
if list(df_single.index.names) == ['measure', 'algorithm']:
    df_single = df_single.reset_index(col_level=1)

df_single.to_csv(save_path, index=False)

In [None]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score

# Column-wise accumulators ("algorithm", "accuracy"), one per dataset variant.
scores_all_with_stopwords = defaultdict(list)
scores_all_without_stopwords = defaultdict(list)

# Every column from position 2 onwards is treated as a feature.
# NOTE(review): assumes the first two columns are `text` and `label` — confirm against the CSV.
measures = df_with_stopwords.columns[2:]

# (data frame, accumulator) pairs; each algorithm is scored on them in this order.
all_runs = (
    (df_with_stopwords, scores_all_with_stopwords),
    (df_without_stopwords, scores_all_without_stopwords),
)

for alg, alg_name in tqdm(zip(algs, algs_names), total=len(algs), position=0, leave=True, desc="All measures"):
    for frame, scores in all_runs:
        scores["algorithm"].append(alg_name)

        # All measures are used together as the feature matrix; keep the mean over the 5 folds.
        fold_accuracies = cross_val_score(alg, frame[measures], frame['label'], cv=5, scoring='accuracy')
        scores["accuracy"].append(fold_accuracies.mean())

In [10]:
# Turn both accumulators into frames and line them up on the algorithm name.
df_all_with_stopwords = pd.DataFrame(scores_all_with_stopwords)
df_all_without_stopwords = pd.DataFrame(scores_all_without_stopwords)

merged_all = df_all_with_stopwords.merge(
    df_all_without_stopwords,
    on='algorithm',
    suffixes=('_with_stopwords', '_without_stopwords'),
).set_index('algorithm')

# Two-level header: dataset variant on top, metric name underneath.
cols = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']])
accuracy_columns = ["accuracy_with_stopwords", "accuracy_without_stopwords"]

# Rebuild the table under the new header and rank by the with-stopwords accuracy, best first.
df_all = pd.DataFrame(
    merged_all[accuracy_columns].values,
    columns=cols,
    index=merged_all.index,
).sort_values(by=('with stopwords', 'accuracy'), ascending=False)

In [11]:
save_path = os.path.join('..', 'results', 'compare_models_all_adjacency_max.csv')

# Make sure the output folder exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Move `algorithm` from the index into a column placed on the second header level.
# The guard keeps the cell idempotent: without it a second run would reset the already-flat
# RangeIndex and write a spurious extra `index` column to the CSV.
if df_all.index.name == 'algorithm':
    df_all = df_all.reset_index(col_level=1)

df_all.to_csv(save_path, index=False)

In [12]:
results_dir = os.path.join('..', 'results')
# NOTE(review): the `_mean` file is not written by any cell visible here — presumably a sibling notebook produces it.
read_path_mean = os.path.join(results_dir, 'compare_models_single_adjacency_mean.csv')
read_path_max = os.path.join(results_dir, 'compare_models_single_adjacency_max.csv')


def _read_single_results(csv_path):
    """Load a single-measure results CSV indexed by (measure, algorithm).

    The file is parsed with a two-row header and its first two columns as the
    index. The index levels are then named and the columns are replaced by a
    clean ("with stopwords" | "without stopwords") x "accuracy" header.
    """
    loaded = pd.read_csv(csv_path, header=[0,1], index_col=[0,1])
    loaded.index.names = ['measure', 'algorithm']
    clean_columns = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['accuracy']])
    return pd.DataFrame(loaded.values, columns=clean_columns, index=loaded.index)


df_single_mean = _read_single_results(read_path_mean)
df_single_max = _read_single_results(read_path_max)

In [13]:
# For each measure, idxmax gives the (measure, algorithm) label of the best row per column;
# `[:, 1]` keeps the labels from the second column ("without stopwords" accuracy).
best_rows_mean = df_single_mean.groupby(level=0).idxmax().values[:,1]
best_single_mean = df_single_mean.loc[best_rows_mean]

# Highlight the column-wise maximum and render the accuracies as percentages.
(
    best_single_mean.style
    .highlight_max(props='color:lightgreen;', axis=0)
    .format("{:.2%}")
    .set_caption("Single measure - mean")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,with stopwords,without stopwords
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,accuracy
measure,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2
authorities,SVM,50.89%,50.89%
average_clustering,KNN,50.43%,50.43%
betweenness,NB,50.97%,50.97%
closeness,SVM,54.21%,54.21%
clustering,KNN,50.43%,50.43%
correlation,MLP,53.28%,53.39%
density,RF,52.26%,52.43%
eigenvector,SVM,50.58%,50.76%
hubs,RF,50.82%,50.94%
katz,DT,51.03%,51.03%


In [19]:
# For each measure, idxmax gives the (measure, algorithm) label of the best row per column;
# `[:, 1]` keeps the labels from the second column ("without stopwords" accuracy).
best_rows_max = df_single_max.groupby(level=0).idxmax().values[:,1]
best_single_max = df_single_max.loc[best_rows_max]

# Highlight the column-wise maximum and render the accuracies as percentages.
(
    best_single_max.style
    .highlight_max(props='color:lightgreen;', axis=0)
    .format("{:.2%}")
    .set_caption("Single measure - max")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,with stopwords,without stopwords
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,accuracy
measure,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2
authorities,RF,52.94%,53.35%
average_clustering,KNN,50.43%,50.43%
betweenness,SVM,56.46%,56.46%
closeness,SVM,53.22%,53.22%
clustering,KNN,49.93%,49.93%
correlation,MLP,53.49%,53.36%
density,DT,52.26%,52.26%
eigenvector,SVM,54.12%,54.18%
hubs,DT,53.19%,53.33%
katz,SVM,53.18%,53.18%


In [15]:
all_results_dir = os.path.join('..', 'results')
# NOTE(review): the `_mean` file is not written by any cell visible here — presumably a sibling notebook produces it.
read_path_mean = os.path.join(all_results_dir, 'compare_models_all_adjacency_mean.csv')
read_path_max = os.path.join(all_results_dir, 'compare_models_all_adjacency_max.csv')

# Both files are parsed with a two-row header and their first column as the index.
df_all_mean, df_all_max = (
    pd.read_csv(csv_path, header=[0,1], index_col=0)
    for csv_path in (read_path_mean, read_path_max)
)

In [16]:
# Render accuracies as percentages and highlight the best value of each column.
(
    df_all_mean.style
    .format("{:.2%}")
    .highlight_max(props='color:lightgreen;')
    .set_caption("All measures - mean")
)

Unnamed: 0_level_0,with stopwords,without stopwords
algorithm,accuracy,accuracy
MLP,58.62%,58.69%
SVM,57.31%,57.35%
RF,56.69%,57.07%
NB,55.11%,55.12%
KNN,54.40%,54.37%
DT,53.00%,53.07%


In [17]:
# Render accuracies as percentages and highlight the best value of each column.
(
    df_all_max.style
    .format("{:.2%}")
    .highlight_max(props='color:lightgreen;')
    .set_caption("All measures - max")
)

Unnamed: 0_level_0,with stopwords,without stopwords
algorithm,accuracy,accuracy
MLP,60.43%,59.72%
RF,59.96%,60.00%
KNN,54.28%,54.46%
SVM,53.79%,58.79%
DT,53.53%,53.96%
NB,51.50%,56.12%
