In [1]:
import networkx as nx
from tqdm.notebook import tqdm
import os
import statistics
import numpy as np
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
import math

import sys
# Put the current working directory on the import path so that the local
# `measures` module (measures.py) can be imported from this notebook.
sys.path.append( '.' )
import importlib

import measures
# Re-execute the module so that edits made to measures.py after the first
# import are picked up without restarting the kernel.
importlib.reload(measures)

<module 'measures' from 'd:\\Projetos\\TCC\\Part_2.1-Complex_Networks\\syntactic\\measures.py'>

In [2]:
def get_measures(start_path):
    """Count, per measure, how often the true text scores higher than the fake one.

    Every entry of the ``fake`` sub-directory of ``start_path`` is handed to
    ``get_compare_measurements``, which increments the tally of each measure
    for which the true graph beats the fake graph.

    Parameters
    ----------
    start_path : str
        Corpus root. Sub-paths are built by plain string concatenation
        (``start_path + 'fake'``), so it must end with a path separator.

    Returns
    -------
    tuple of (defaultdict, defaultdict)
        ``(measures_total, measures_percentage)``. The first maps a measure
        name to the number of files where true > fake. The second maps the
        same names to that count divided by the number of entries in the
        ``fake`` directory, rounded to two decimals (a fraction in [0, 1],
        not a 0-100 percentage). Measures that were never incremented are
        absent from both, but read as 0 because both are ``defaultdict(int)``.
    """
    # List the directory exactly once: the same listing drives the loop and
    # the denominator, instead of hitting the filesystem again for every key.
    fake_files = os.listdir(start_path + 'fake')

    measures_total = defaultdict(int)
    for file in tqdm(fake_files):
        get_compare_measurements(file, measures_total, start_path)

    # No division by zero here: with an empty listing nothing was tallied,
    # so this loop body never runs.
    measures_percentage = defaultdict(int)
    for key, wins in measures_total.items():
        measures_percentage[key] = round(wins / len(fake_files), 2)

    return measures_total, measures_percentage

def get_compare_measurements(file, measures_total, start_path):
    """Compare one fake/true file pair and tally the measures where true wins.

    The file named ``file`` is loaded from both ``start_path + 'fake/'`` and
    ``start_path + 'true/'``, turned into graphs and measured through the
    ``measures`` module. Any measure whose value is a dict is replaced, in
    place, by the maximum of that dict's values. For every measure of the
    fake graph where the true graph's value is strictly greater,
    ``measures_total[measure]`` is incremented by one.

    Parameters
    ----------
    file : str
        File name present in both the ``fake`` and ``true`` directories.
    measures_total : dict-like
        Running tally, mutated in place; must support ``+= 1`` on unseen keys
        (e.g. ``defaultdict(int)``) or already contain them.
    start_path : str
        Corpus root, ending with a path separator.

    Returns
    -------
    None
    """
    # Both graphs are built before either is measured.
    fake_graph, true_graph = [
        measures.generate_graph_udpipe_word(start_path + label + '/' + file)
        for label in ('fake', 'true')
    ]
    fake_scores, true_scores = [
        measures.get_measures_from_graph_word(graph)
        for graph in (fake_graph, true_graph)
    ]

    # Reduce dict-valued measures to a single number (their largest value).
    # Snapshot the items first because the dict is rewritten while looping.
    for scores in (fake_scores, true_scores):
        for name, score in list(scores.items()):
            if isinstance(score, dict):
                scores[name] = max(score.values())

    # Only the fake graph's measure names are considered; a strict ">" means
    # ties do not count for the true side.
    for name, fake_score in fake_scores.items():
        if true_scores[name] > fake_score:
            measures_total[name] += 1

def return_list_in_order(measures):
    """Return the values of ``measures`` in the fixed reporting order.

    The order is: centrality measures (betweenness, closeness, eigenvector,
    katz, pagerank, hubs, authorities), then clustering-related measures
    (clustering, average_clustering, correlation, transitivity), then density.

    Parameters
    ----------
    measures : mapping
        Maps measure names to values. Each key is read with ``measures[key]``,
        so a plain dict raises ``KeyError`` for a missing name while a
        ``defaultdict`` supplies (and stores) its default.

    Returns
    -------
    list
        Twelve values, one per measure, in the order above.
    """
    ordered_keys = (
        'betweenness',
        'closeness',
        'eigenvector',
        'katz',
        'pagerank',
        'hubs',
        'authorities',
        'clustering',
        'average_clustering',
        'correlation',
        'transitivity',
        'density',
    )
    return [measures[key] for key in ordered_keys]

def return_keys_in_order():
    """Return the measure names in the fixed reporting order.

    The names are listed in the same order in which ``return_list_in_order``
    reads its values, so the two results line up element by element.

    Returns
    -------
    list of str
        A new list on every call; callers may mutate it freely.
    """
    return [
        # centrality measures
        'betweenness',
        'closeness',
        'eigenvector',
        'katz',
        'pagerank',
        'hubs',
        'authorities',
        # clustering-related measures
        'clustering',
        'average_clustering',
        'correlation',
        'transitivity',
        # whole-graph measure
        'density',
    ]


In [3]:
# Run the true-vs-fake comparison on both corpus variants. Each call walks
# every file of the corpus (see the progress bars below), so this cell is slow.
(measures_without_stopwords,
 measures_percentage_without_stopwords) = get_measures('data/udpipe_tuples/')

(measures_with_stopwords,
 measures_percentage_with_stopwords) = get_measures('data/udpipe_tuples_with_stopwords/')

  0%|          | 0/3600 [00:00<?, ?it/s]



  0%|          | 0/3600 [00:00<?, ?it/s]

In [4]:
# Two-level header: corpus variant on top, then the raw count of files where
# true > fake and the matching fraction.
cols = pd.MultiIndex.from_product(
    [["with stopwords", "without stopwords"], ['true>fake', '%']]
)

# One row per tally, listed in the same order as the columns produced by
# `from_product` above; the transpose turns them into one row per measure.
# Going through a single NumPy array promotes the integer counts to float.
df_metrics = pd.DataFrame(
    np.array([
        return_list_in_order(tally)
        for tally in (
            measures_with_stopwords,
            measures_percentage_with_stopwords,
            measures_without_stopwords,
            measures_percentage_without_stopwords,
        )
    ]).T,
    columns=cols,
    index=return_keys_in_order(),
)

In [6]:
# Export the comparison table with the measure names as a regular 'Measure'
# column. The reshaped frame is written straight to disk instead of being
# assigned back to `df_metrics`: rebinding made this cell non-idempotent,
# because a second run would reset the already-reset index and push the
# fresh RangeIndex into the exported table as an extra column.
(
    df_metrics
    .reset_index(col_level=1)  # place the 'index' label on the second header row
    .rename(columns={'index': 'Measure'})
    .to_csv("../results/compare_metrics_syntatic_word_greater_max.csv", index=False)
)

In [8]:
# Load a saved comparison table (two header rows, measure names in the first
# column) and display it with the column-wise minimum and maximum highlighted.
# NOTE(review): the export cell above writes "..._greater_max.csv"; nothing
# visible in this notebook writes this "..._greater_mean.csv" file. Presumably
# it comes from an earlier run that aggregated with the mean - confirm.
df_metrics = pd.read_csv(
    "../results/compare_metrics_syntatic_word_greater_mean.csv",
    header=[0, 1],
    index_col=0,
)
(
    df_metrics.style
    .highlight_min(axis=0, props='color:lightgreen;')
    .highlight_max(axis=0, props='color:lightgreen;')
    .format(precision=2)
)

Unnamed: 0_level_0,with stopwords,with stopwords,without stopwords,without stopwords
Measure,true>fake,%,true>fake,%
betweenness,1741.0,0.48,1456.0,0.4
closeness,1948.0,0.54,1684.0,0.47
eigenvector,1626.0,0.45,1670.0,0.46
katz,1797.0,0.5,1611.0,0.45
pagerank,1890.0,0.53,1596.0,0.44
hubs,1872.0,0.52,1572.0,0.44
authorities,1885.0,0.52,1581.0,0.44
clustering,2151.0,0.6,1856.0,0.52
average_clustering,2151.0,0.6,1856.0,0.52
correlation,1799.0,0.5,1741.0,0.48


In [9]:
# Reload the table written by the export cell (dict-valued measures reduced
# with max) and display it with the column-wise minimum and maximum of every
# column highlighted, rounded to two decimals.
df_metrics = pd.read_csv(
    "../results/compare_metrics_syntatic_word_greater_max.csv",
    header=[0, 1],
    index_col=0,
)
(
    df_metrics.style
    .highlight_min(axis=0, props='color:lightgreen;')
    .highlight_max(axis=0, props='color:lightgreen;')
    .format(precision=2)
)

Unnamed: 0_level_0,with stopwords,with stopwords,without stopwords,without stopwords
Measure,true>fake,%,true>fake,%
betweenness,1697.0,0.47,1475.0,0.41
closeness,2293.0,0.64,1555.0,0.43
eigenvector,2068.0,0.57,1883.0,0.52
katz,2390.0,0.66,1400.0,0.39
pagerank,2211.0,0.61,1549.0,0.43
hubs,1332.0,0.37,1947.0,0.54
authorities,1947.0,0.54,1530.0,0.42
clustering,1036.0,0.29,1234.0,0.34
average_clustering,2151.0,0.6,1856.0,0.52
correlation,1799.0,0.5,1741.0,0.48
