In [13]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm

import sys
# measures_common.py sits one directory above this notebook, so the parent
# directory must be on the module search path before it can be imported.
sys.path.append( '../' )
import importlib

import measures_common
# Re-execute the module each time this cell runs so that edits made to
# measures_common.py are picked up without restarting the kernel.
importlib.reload(measures_common)

<module 'measures_common' from 'd:\\Projetos\\TCC\\Part_2.1-Complex_Networks\\adjacency\\..\\measures_common.py'>

In [14]:
def compare_measures(measures, fake_text, true_text, strict=False):
    """Tally, per measure, whether the true text beats the fake text.

    Both texts are passed to ``measures_common.get_measures_from_text``. For
    every key of the fake text's result, ``measures[key]`` is incremented when
    the true text's value wins the comparison.

    Args:
        measures: Mutable mapping of measure name -> win count, updated in
            place. It must accept ``+= 1`` on keys it has not seen yet (this
            notebook passes a ``defaultdict(int)``).
        fake_text: Text taken from the ``Fake`` column of a pair.
        true_text: Text taken from the ``True`` column of the same pair.
        strict: When ``True`` ties do not count (``true > fake``). The default
            ``False`` keeps the original ``true >= fake`` comparison.

    Returns:
        None. The result is the mutation of ``measures``.
    """
    fake_measures = measures_common.get_measures_from_text(fake_text)
    true_measures = measures_common.get_measures_from_text(true_text)

    # Only keys present in the fake text's result are compared; the true
    # text's result is expected to contain every one of them.
    for key in fake_measures.keys():
        true_value = true_measures[key]
        fake_value = fake_measures[key]
        true_wins = true_value > fake_value if strict else true_value >= fake_value
        if true_wins:
            measures[key] += 1


def get_comparison_measures_df(df, strict=False):
    """Run ``compare_measures`` over every Fake/True pair of a dataframe.

    Args:
        df: DataFrame with a ``Fake`` and a ``True`` column; each row is one
            pair of texts to compare.
        strict: Forwarded to ``compare_measures``. ``False`` (default) counts
            ``true >= fake``; ``True`` counts ``true > fake`` only.

    Returns:
        tuple: ``(measures, measures_percentage)``. ``measures`` is a
        ``defaultdict(int)`` mapping each measure to the number of rows in
        which the true text won. ``measures_percentage`` is a
        ``defaultdict(float)`` holding that count divided by ``len(df)`` and
        rounded to two decimals (a 0-1 fraction, despite the name). A measure
        the true text never won is absent from both mappings and reads as
        zero through the defaultdict.
    """
    measures = defaultdict(int)
    measures_percentage = defaultdict(float)

    # Iterate the two text columns directly: the row index is not needed and
    # this avoids building a Series per row as iterrows() does.
    text_pairs = zip(df['Fake'], df['True'])
    for fake_text, true_text in tqdm(text_pairs, total=len(df), unit='rows', mininterval=1,
                                     desc='Comparing measures from dataframe'):
        compare_measures(measures, fake_text, true_text, strict=strict)

    # With an empty dataframe ``measures`` is empty too, so no division by
    # zero can happen here.
    total_rows = len(df)
    for key, count in measures.items():
        measures_percentage[key] = round(count / total_rows, 2)

    return measures, measures_percentage

def return_list_in_order(measures):
    """Return the values of ``measures`` in the fixed reporting order.

    Args:
        measures: Mapping indexed by measure name. Every key listed below is
            looked up with ``measures[key]``, so a ``defaultdict`` will create
            missing entries while a plain ``dict`` raises ``KeyError``.

    Returns:
        list: The twelve looked-up values, in the order of ``ordered_keys``.
    """
    ordered_keys = (
        'betweenness',
        'closeness',
        'eigenvector',
        'katz',
        'pagerank',
        'hubs',
        'authorities',
        'clustering',
        'average_clustering',
        'correlation',
        'transitivity',
        'density',
    )
    return [measures[name] for name in ordered_keys]

def return_keys_in_order():
    """Return the measure names in the fixed reporting order.

    Returns:
        list: A new list of the twelve measure-name strings on every call.
    """
    return [
        'betweenness',
        'closeness',
        'eigenvector',
        'katz',
        'pagerank',
        'hubs',
        'authorities',
        'clustering',
        'average_clustering',
        'correlation',
        'transitivity',
        'density',
    ]

In [15]:
# Both CSVs are read from the same folder, two directory levels above this notebook.
csv_dir = os.path.join('..', '..', 'data', 'csvs')
file_path_with_stopwords = os.path.join(csv_dir, 'fake_vs_true_pre_norm_with_stopwords.csv')
file_path_without_stopwords = os.path.join(csv_dir, 'fake_vs_true_pre_norm.csv')

# One dataframe per variant of the dataset.
df_with_stopwords = pd.read_csv(file_path_with_stopwords)
df_without_stopwords = pd.read_csv(file_path_without_stopwords)

# Preview the first three rows as the cell output.
df_with_stopwords.head(3)

Unnamed: 0,Fake,True
0,katia abreu diz que vai colocar sua expulsao e...,o podemos decidiu expulsar o deputado federal ...
1,dr ray peita bolsonaro chamao de conservador f...,bolsonaro e um liberal completo diz presidente...
2,reinaldo azevedo desmascarado pela policia fed...,ministro do stf libera andrea neves de prisao ...


In [16]:
# For each dataset variant, collect the per-measure win counts and the matching
# fraction of rows, as returned by get_comparison_measures_df.
measures_with_stopwords, measures_percentage_with_stopwords = get_comparison_measures_df(df_with_stopwords)
measures_without_stopwords, measures_percentage_without_stopwords = get_comparison_measures_df(df_without_stopwords)

Comparing measures from dataframe:   0%|          | 0/3600 [00:00<?, ?rows/s]

Comparing measures from dataframe:   0%|          | 0/3600 [00:00<?, ?rows/s]

In [17]:
# Column layout: a (count, fraction) pair for each dataset variant.
# NOTE(review): the count column is labelled 'true>fake' although compare_measures
# compares with >= by default; the label is left untouched so the CSV header
# stays the same.
cols = pd.MultiIndex.from_product([["with stopwords","without stopwords"], ['true>fake', '%']])

# Build the frame column by column rather than through a single np.array:
# stacking the integer counts with the float fractions in one array upcasts
# the counts to float (1872 -> 1872.0).
column_values = [
    return_list_in_order(measures_with_stopwords),
    return_list_in_order(measures_percentage_with_stopwords),
    return_list_in_order(measures_without_stopwords),
    return_list_in_order(measures_percentage_without_stopwords),
]
df_metrics = pd.DataFrame(dict(zip(cols, column_values)), index=return_keys_in_order())
# Make the two-level header explicit, independent of how tuple keys are inferred.
df_metrics.columns = cols

In [18]:
# Move the measure names from the index into a 'Measure' column for the export.
# The result is bound to a new name instead of overwriting df_metrics, so
# re-running this cell does not call reset_index on an already-reset frame
# (which would push the row numbers into the CSV as an extra column).
df_metrics_export = df_metrics.reset_index(col_level=1).rename(columns={'index':'Measure'})
df_metrics_export.to_csv("../results/compare_metrics_adjacency_greater_or_equal.csv", index=False)

In [19]:
# Load the results file whose name ends in "_greater"; the two header rows
# rebuild the column MultiIndex and the first column becomes the index.
# NOTE(review): no cell visible in this notebook writes this file - presumably
# it was produced by an earlier run with a different comparison; confirm it
# exists before running.
df_metrics = pd.read_csv(
    "../results/compare_metrics_adjacency_greater.csv",
    header=[0, 1],
    index_col=0,
)

# Highlight the minimum and the maximum of every column and show two decimals.
(
    df_metrics.style
    .highlight_min(axis=0, props='color:lightgreen;')
    .highlight_max(axis=0, props='color:lightgreen;')
    .format(precision=2)
)

Unnamed: 0_level_0,with stopwords,with stopwords,without stopwords,without stopwords
Measure,true>fake,%,true>fake,%
betweenness,1872.0,0.52,1871.0,0.52
closeness,1917.0,0.53,1654.0,0.46
eigenvector,1878.0,0.52,1866.0,0.52
katz,1834.0,0.51,1618.0,0.45
pagerank,1796.0,0.5,1561.0,0.43
hubs,1816.0,0.5,1577.0,0.44
authorities,1826.0,0.51,1574.0,0.44
clustering,1778.0,0.49,1495.0,0.42
average_clustering,1778.0,0.49,1495.0,0.42
correlation,1584.0,0.44,1946.0,0.54


In [20]:
# Load the "_greater_or_equal" results file; the two header rows rebuild the
# column MultiIndex and the first column becomes the index.
df_metrics = pd.read_csv(
    "../results/compare_metrics_adjacency_greater_or_equal.csv",
    header=[0, 1],
    index_col=0,
)

# Highlight the minimum and the maximum of every column and show two decimals.
(
    df_metrics.style
    .highlight_min(axis=0, props='color:lightgreen;')
    .highlight_max(axis=0, props='color:lightgreen;')
    .format(precision=2)
)

Unnamed: 0_level_0,with stopwords,with stopwords,without stopwords,without stopwords
Measure,true>fake,%,true>fake,%
betweenness,1882.0,0.52,1901.0,0.53
closeness,1917.0,0.53,1661.0,0.46
eigenvector,1877.0,0.52,1866.0,0.52
katz,1834.0,0.51,1625.0,0.45
pagerank,1900.0,0.53,1660.0,0.46
hubs,1889.0,0.52,1644.0,0.46
authorities,1881.0,0.52,1633.0,0.45
clustering,1874.0,0.52,2131.0,0.59
average_clustering,1874.0,0.52,2131.0,0.59
correlation,1584.0,0.44,1946.0,0.54
