In [26]:
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import kstest, norm, expon, gamma, lognorm, beta
import statistics
import seaborn as sns
from scipy.stats import f_oneway
import scikit_posthocs as sp
from scipy.stats import kruskal
import statsmodels.stats.multicomp as mc
from scipy.stats import mode
from scipy.stats import kstest



## Importing the data
- The imported data is the Liar dataset after processing with the faKy library.
- All the features (readability, information complexity, VADER scores, NER, and POS tags) are computed in a different notebook.
- We stored the results in a new data frame and import that data frame here for efficiency.
- We classify the qualitative labels as true (0), false (1), and in between (2).
- We also define which column of the data frame stores the labels, so that the significance of the features can be computed per label.
- Finally, we divide the data into three subsets (true, false, and in between) corresponding to the qualitative labels 0, 1, and 2.



In [27]:
# Location of the pre-computed Liar feature table. Set the LIAR_COMPUTED_CSV
# environment variable to run the notebook on another machine; the default is
# the original author's local path.
LIAR_COMPUTED_CSV = os.environ.get(
    'LIAR_COMPUTED_CSV',
    '/Users/sandrobarreshamers/Thesis_IS_fake_news/ThesisData/Liar_computed_final_version.csv',
)
Liar_computed = pd.read_csv(LIAR_COMPUTED_CSV)

In [28]:
# Preview the first two rows as a quick sanity check of the load.
Liar_computed.head(2)

Unnamed: 0.1,Unnamed: 0,json_id,claim,object,binary label,readability,compressed_size,vader_neg,vader_neu,vader_pos,...,pos_NOUN,pos_NUM,pos_PART,pos_PRON,pos_PROPN,pos_PUNCT,pos_SCONJ,pos_SYM,pos_VERB,pos_X
0,0,10540.json,half-true,When did the decline of coal start? It started...,2,71.815,11444,0.0,0.902,0.098,...,5.0,0.0,1.0,1.0,4.0,4.0,0.0,0.0,4.0,0.0
1,1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",2,71.781579,9089,0.107,0.687,0.206,...,2.0,0.0,1.0,0.0,7.0,3.0,0.0,0.0,3.0,0.0


In [29]:
# List the column labels available in the loaded frame.
Liar_computed.columns

Index(['Unnamed: 0', 'json_id', 'claim', 'object', 'binary label',
       'readability', 'compressed_size', 'vader_neg', 'vader_neu', 'vader_pos',
       'vader_compound', 'tot_ner_count', 'ner_counts', 'input_vector_ner',
       'NER_CARDINAL', 'NER_DATE', 'NER_EVENT', 'NER_FAC', 'NER_GPE',
       'NER_LANGUAGE', 'NER_LAW', 'NER_LOC', 'NER_MONEY', 'NER_NORP',
       'NER_ORDINAL', 'NER_ORG', 'NER_PERCENT', 'NER_PERSON', 'NER_PRODUCT',
       'NER_QUANTITY', 'NER_TIME', 'NER_WORK_OF_ART', 'pos counts',
       'input_vector_pos', 'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX',
       'pos_CCONJ', 'pos_DET', 'pos_INTJ', 'pos_NOUN', 'pos_NUM', 'pos_PART',
       'pos_PRON', 'pos_PROPN', 'pos_PUNCT', 'pos_SCONJ', 'pos_SYM',
       'pos_VERB', 'pos_X'],
      dtype='object')

In [30]:
from faKy import values_by_label, dunn_table

# Numeric label codes and the column that stores them.
labels = [0, 1, 2]
df_label = 'binary label'

# One sub-frame per label code: 0, 1 and 2 respectively.
label_column = Liar_computed['binary label']
df_Liar_true = Liar_computed[label_column == 0]
df_Liar_false = Liar_computed[label_column == 1]
df_Liar_between = Liar_computed[label_column == 2]

# Report the number of rows in each subset.
for subset in (df_Liar_true, df_Liar_false, df_Liar_between):
    print(len(subset))

1676
2833
5730


# Kruskal Dunn tests
- In this section, we test each feature for differences between the labels; we first perform the Kruskal-Wallis test (KST) to test for significance.
- Subsequently, we perform the post-hoc Dunn test to determine between which qualitative labels the feature differs significantly.
- The Dunn results are converted to a Dunn table through the faKy function dunn_table.
- All the Dunn tables are converted to LaTeX format through the to_latex function and used in the research paper.

### Readability

In [43]:
# Kruskal-Wallis test on 'readability' across the label groups.
# values_by_label comes from faKy; presumably it returns one sample per label.
label_fke = values_by_label(Liar_computed, 'readability', labels, df_label)
stat, p = kruskal(*label_fke)

# NOTE(review): scipy's kruskal reports the H statistic; "F-statistic" below is
# only the printed label, kept as-is to match the recorded output.
print(
    'Kruskal results:',
    f'F-statistic: {stat:.3f}',
    f'p-value: {p:.3e}',
    sep='\n',
)

Kruskal results:
F-statistic: 49.863
p-value: 1.487e-11


In [44]:
# Post-hoc Dunn test (Bonferroni-adjusted) on the readability groups.
dunn_results = sp.posthoc_dunn(label_fke, p_adjust='bonferroni')
df_dunn_R = dunn_table(dunn_results)

print('Dunn results for the FKE-readability')
# Display the stored table rather than calling dunn_table a second time.
df_dunn_R

Dunn results for the FKE-readability


group,1,1,2,2,3,3
metric,value,reject,value,reject,value,reject
1,1.0,False,0.0,True,0.012227,True
2,0.0,True,1.0,False,0.0,True
3,0.012227,True,0.0,True,1.0,False


In [46]:
#latex_table = df_dunn_R.to_latex(index=True, float_format=lambda x: "%.2f" % x)
#print(latex_table)

### Information Complexity

In [47]:
# Kruskal-Wallis test on 'compressed_size' across the label groups.
label_com = values_by_label(Liar_computed, 'compressed_size', labels, df_label)
stat, p = kruskal(*label_com)

# NOTE(review): scipy's kruskal reports the H statistic; "F-statistic" below is
# only the printed label, kept as-is to match the recorded output.
print(
    'Kruskal results:',
    f'F-statistic: {stat:.3f}',
    f'p-value: {p:.3e}',
    sep='\n',
)


Kruskal results:
F-statistic: 79.409
p-value: 5.710e-18


In [48]:
# Post-hoc Dunn test (Bonferroni-adjusted) on the compressed-size groups,
# converted to a table with faKy's dunn_table.
dunn_results = sp.posthoc_dunn(label_com, p_adjust='bonferroni')
df_dunn_IC = dunn_table(dunn_results)

print('Dunn results for the Information Complexity')
df_dunn_IC


Dunn results for the Information Complexity


group,1,1,2,2,3,3
metric,value,reject,value,reject,value,reject
1,1.0,False,5e-05,True,0.028695,True
2,5e-05,True,1.0,False,0.0,True
3,0.028695,True,0.0,True,1.0,False


In [49]:
#latex_table_DIC = df_dunn_IC.to_latex(index=False, float_format=lambda x: "%.2f" % x)
#print(latex_table_DIC)

### VADER labels
- Since we have multiple VADER scores, we need to compute the KST and the Dunn test for each score separately.
- The first code snippet computes the KST for every VADER score and then iterates over the scores and their results simultaneously, printing the formatted values of the score name, stat, and p in each iteration.
- The Dunn table requires a few extra steps.
- The code performs the Dunn test for each score, converts the result to a Dunn table, and stores that DataFrame under the score's key in the df_dict dictionary. It then prints the results for each iteration of the loop.

In [50]:
vader_labels = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']

# Kruskal-Wallis result per VADER score, stored as [statistic, p-value]
# in the same order as vader_labels.
computed_labels = []
for vader in vader_labels:
    label_vader = values_by_label(Liar_computed, vader, labels, df_label)
    stat, p = kruskal(*label_vader)  # unpack the per-label samples
    computed_labels.append([stat, p])

# Pair each score name with its result; no positional index is needed.
for vader, (stat, p) in zip(vader_labels, computed_labels):
    print(f'{vader} : fstat {stat}; p {p}')

vader_neg : fstat 10.409693080422452; p 0.005489892854085175
vader_neu : fstat 10.17546297354846; p 0.00617200529376983
vader_pos : fstat 4.323321766239063; p 0.11513373846730762
vader_compound : fstat 1.999867736891534; p 0.36790377041511385


In [18]:
# Dunn table per VADER score, keyed by score name (filled in vader_labels order).
df_dict = {}

for vader in vader_labels:
    # Post-hoc Dunn test (Bonferroni-adjusted) on the groups for this score.
    label_vader = values_by_label(Liar_computed, vader, labels, df_label)
    dunn_results = sp.posthoc_dunn(label_vader, p_adjust='bonferroni')
    result_dunn_test = dunn_table(dunn_results)
    df_dict[vader] = result_dunn_test

    print(f'Dunn results: {vader}', result_dunn_test, sep='\n')


Dunn results: vader_neg
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False       1.0  False  0.022119   True
2            1.0  False       1.0  False  0.041474   True
3       0.022119   True  0.041474   True       1.0  False
Dunn results: vader_neu
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.060342  False  0.004317   True
2       0.060342  False       1.0  False       1.0  False
3       0.004317   True       1.0  False       1.0  False
Dunn results: vader_pos
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.144254  False  0.197013  False
2       0.144254  False       1.0  False       1.0  False
3       0.197013  False       1.0  False       1.0  False
Dunn results: vader_compound
group      1                2

In [51]:
#vader_results = pd.concat([df_dict["vader_neg"], df_dict["vader_neu"], df_dict["vader_pos"], df_dict["vader_compound"]], axis=1)
#vader_results.columns = vader_results.columns.set_names(['Dunn results', 'metric'])
#vader_results = vader_results.reset_index()
#print(vader_results)

In [52]:
#latex_table_vader = vader_results.to_latex(index=True, float_format=lambda x: "%.2f" % x)
#print(latex_table_vader)

### NER

In [53]:
# Kruskal-Wallis test on 'tot_ner_count' across the label groups.
label_ner_sum = values_by_label(Liar_computed, 'tot_ner_count', labels, df_label)
stat, p = kruskal(*label_ner_sum)

# NOTE(review): scipy's kruskal reports the H statistic; "F-statistic" below is
# only the printed label, kept as-is to match the recorded output.
print(
    'NER count results:',
    f'F-statistic: {stat:.3f}',
    f'p-value: {p:.3e}',
    sep='\n',
)

NER count results:
F-statistic: 42.182
p-value: 6.924e-10


In [54]:
# Post-hoc Dunn test (Bonferroni-adjusted) on the total NER count groups,
# converted to a table with faKy's dunn_table.
dunn_results = sp.posthoc_dunn(label_ner_sum, p_adjust='bonferroni')
df_dunn_NER = dunn_table(dunn_results)

print('Dunn results for the total NER count', df_dunn_NER, sep='\n')

Dunn results for the total NER count
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.001408   True  0.413233  False
2       0.001408   True       1.0  False       0.0   True
3       0.413233  False       0.0   True       1.0  False


In [55]:
#latex_table = df_dunn_NER.to_latex(index=True, float_format=lambda x: "%.2f" % x)
#print(latex_table)

### NER counts tag
- Similar to the VADER scores (VSS), we need to compute the KST and the Dunn test for each NER tag separately

In [56]:
ner_labels = [
    'NER_CARDINAL', 'NER_DATE', 'NER_EVENT', 'NER_FAC', 'NER_GPE', 'NER_LANGUAGE',
    'NER_LAW', 'NER_LOC', 'NER_MONEY', 'NER_NORP', 'NER_ORDINAL', 'NER_ORG',
    'NER_PERCENT', 'NER_PERSON', 'NER_PRODUCT', 'NER_QUANTITY', 'NER_TIME',
    'NER_WORK_OF_ART',
]

# Kruskal-Wallis result per NER tag, stored as [statistic, p-value]
# in the same order as ner_labels.
computed_ner_labels = []
for ner in ner_labels:
    label_ner = values_by_label(Liar_computed, ner, labels, df_label)
    stat, p = kruskal(*label_ner)  # unpack the per-label samples
    computed_ner_labels.append([stat, p])

# Pair each tag with its result. The loop variable is named `ner` (not `pos`)
# and the unused positional index is dropped.
for ner, (stat, p) in zip(ner_labels, computed_ner_labels):
    print(f'{ner} : fstat {stat}; p {p}')

NER_CARDINAL : fstat 49.83110092286668; p 1.5111720797740373e-11
NER_DATE : fstat 51.61532051517984; p 6.192636087610597e-12
NER_EVENT : fstat 0.17192881583577338; p 0.9176268908905847
NER_FAC : fstat 1.62191142550578; p 0.44443311279487685
NER_GPE : fstat 4.891785484232781; p 0.08664874537809603
NER_LANGUAGE : fstat 0.2765099377408568; p 0.8708766170086077
NER_LAW : fstat 5.044810889786552; p 0.08026629818911608
NER_LOC : fstat 0.4509002696889834; p 0.7981568596715132
NER_MONEY : fstat 35.30762339462919; p 2.1530147012843794e-08
NER_NORP : fstat 0.7534040417699982; p 0.6861204930154862
NER_ORDINAL : fstat 13.269417872165102; p 0.0013139611361827818
NER_ORG : fstat 15.497830751288278; p 0.00043120998788284286
NER_PERCENT : fstat 51.19931270863278; p 7.624484865681605e-12
NER_PERSON : fstat 82.52962410037611; p 1.1992778976434277e-18
NER_PRODUCT : fstat 1.096845519071406; p 0.5778605169801413
NER_QUANTITY : fstat 3.320164395658514; p 0.19012335173241685
NER_TIME : fstat 3.71035263653848

In [66]:
# Dunn table per NER tag, keyed by tag name. The dictionary is created here so
# the cell does not depend on a definition left over from an earlier session.
df_dict_ner = {}

for ner in ner_labels:
    # Post-hoc Dunn test (Bonferroni-adjusted) on the groups for this tag.
    label_ner = values_by_label(Liar_computed, ner, labels, df_label)
    dunn_results = sp.posthoc_dunn(label_ner, p_adjust='bonferroni')
    result_dunn_test_ner = dunn_table(dunn_results)

    df_dict_ner[ner] = result_dunn_test_ner

    print(f'Dunn results: {ner}')
    print(result_dunn_test_ner)


Dunn results: NER_CARDINAL
group      1            2            3       
metric value reject value reject value reject
1        1.0  False   0.0   True   1.0  False
2        0.0   True   1.0  False   0.0   True
3        1.0  False   0.0   True   1.0  False
Dunn results: NER_DATE
group          1            2                3       
metric     value reject value reject     value reject
1            1.0  False   0.0   True  0.020809   True
2            0.0   True   1.0  False       0.0   True
3       0.020809   True   0.0   True       1.0  False
Dunn results: NER_EVENT
group      1            2            3       
metric value reject value reject value reject
1        1.0  False   1.0  False   1.0  False
2        1.0  False   1.0  False   1.0  False
3        1.0  False   1.0  False   1.0  False
Dunn results: NER_FAC
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.644083  False  0.852657  False


In [58]:
#df_list_ner = []

#for label in df_dict_ner.keys():
#   df_list_ner.append(df_dict_ner[label])

#result_ner_tag = pd.concat(df_list_ner, axis=0)

#result_ner_tag.columns = result_ner_tag.columns.set_names(['Dunn results', 'metric'])
#result_ner_tag = result_ner_tag.reset_index()

#print(result_ner_tag)


### Significant features:
- CARDINAL p: 0.0
- DATE p: 0.0
- ORDINAL p: 0.0
- ORG p: 0.0
- PERCENT p: 0.0
- PERSON p: 0.0


In [59]:
#latex_table_ner_tags = result_ner_tag.to_latex(index=True, float_format=lambda x: "%.2f" % x)
#print(latex_table_ner_tags)

### Pos labels
- Similar to VSS and NER we compute the KST and Dunn test for all the POS tags

In [60]:
pos_labels = [
    'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX', 'pos_CCONJ', 'pos_DET', 'pos_INTJ',
    'pos_NOUN', 'pos_NUM', 'pos_PART', 'pos_PRON', 'pos_PROPN', 'pos_PUNCT',
    'pos_SCONJ', 'pos_SYM', 'pos_VERB', 'pos_X',
]

# Kruskal-Wallis result per POS tag, stored as [statistic, p-value]
# in the same order as pos_labels.
computed_pos_labels = []
for pos in pos_labels:
    label_pos = values_by_label(Liar_computed, pos, labels, df_label)
    stat, p = kruskal(*label_pos)  # unpack the per-label samples
    computed_pos_labels.append([stat, p])

# Pair each tag with its result; no positional index is needed.
for pos, (stat, p) in zip(pos_labels, computed_pos_labels):
    print(f'{pos} : fstat {stat}; p {p}')


pos_ADJ : fstat 75.63363133229193; p 3.7702107990073795e-17
pos_ADP : fstat 52.88463642780589; p 3.2828203050086716e-12
pos_ADV : fstat 15.343928130378757; p 0.000465702255291164
pos_AUX : fstat 10.672755272313356; p 0.004813274601537372
pos_CCONJ : fstat 30.78256967100686; p 2.0684733341776858e-07
pos_DET : fstat 17.70000951293461; p 0.0001433810542873743
pos_INTJ : fstat 1.6205299951697043; p 0.44474019552820465
pos_NOUN : fstat 74.42562883952174; p 6.897314694451533e-17
pos_NUM : fstat 157.83110536457033; p 5.33839781071815e-35
pos_PART : fstat 15.798490526270482; p 0.0003710234598959158
pos_PRON : fstat 6.770194727289172; p 0.03387434405551494
pos_PROPN : fstat 49.15895156684152; p 2.1147999875958165e-11
pos_PUNCT : fstat 39.48929409787203; p 2.6607786034061635e-09
pos_SCONJ : fstat 50.565192976835576; p 1.0469064051354758e-11
pos_SYM : fstat 27.912685893597782; p 8.686348924034588e-07
pos_VERB : fstat 45.05819719363287; p 1.6433754586485928e-10
pos_X : fstat 4.769992151936845; p 0

In [64]:
# Dunn table per POS tag, keyed by tag name. The dictionary is created here so
# the cell does not depend on a definition left over from an earlier session.
df_dict_pos = {}

for pos in pos_labels:
    # Post-hoc Dunn test (Bonferroni-adjusted) on the groups for this tag.
    label_pos = values_by_label(Liar_computed, pos, labels, df_label)
    dunn_results = sp.posthoc_dunn(label_pos, p_adjust='bonferroni')
    result_dunn_test_pos = dunn_table(dunn_results)
    df_dict_pos[pos] = result_dunn_test_pos

    print(f'Dunn results: {pos}')
    print(result_dunn_test_pos)
    
    

Dunn results: pos_ADJ
group          1            2                3       
metric     value reject value reject     value reject
1            1.0  False   0.0   True  0.895962  False
2            0.0   True   1.0  False       0.0   True
3       0.895962  False   0.0   True       1.0  False
Dunn results: pos_ADP
group      1            2            3       
metric value reject value reject value reject
1        1.0  False   0.0   True   1.0  False
2        0.0   True   1.0  False   0.0   True
3        1.0  False   0.0   True   1.0  False
Dunn results: pos_ADV
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.003806   True       1.0  False
2       0.003806   True       1.0  False  0.001214   True
3            1.0  False  0.001214   True       1.0  False
Dunn results: pos_AUX
group          1                2                3       
metric     value reject     value reject     value reject
1     

### Significant POS features
- ADJ p: 0.00
- ADP p: 0.00
- AUX p: 0.01
- CCONJ p: 0.05
- DET p: 0.00
- NOUN p: 0.00
- NUM p: 0.00
- PART p: 0.00
- PUNCT p: 0.00
- SCONJ p: 0.00
- VERB p: 0.00

## Aggregate results
- This section computes the aggregate results for the continuous and discrete values
- We define a function get_stats that computes the relevant statistics


In [37]:
def get_stats(df, column_name):
    """Summarise one column of ``df`` as a single-row DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Name of the column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``avg`` (mean), ``max``, ``most common``
        (the first value returned by ``Series.mode()``) and ``std``.
        ``most common`` is NaN when the column has no mode, i.e. when it is
        empty or contains only missing values.
    """
    column = df[column_name]

    # Series.mode() returns an empty Series for an empty / all-NaN column, so
    # indexing it with [0] would raise; fall back to NaN instead. The local is
    # named `most_common` to avoid shadowing the `mode` imported from scipy.
    modes = column.mode()
    most_common = modes.iloc[0] if not modes.empty else np.nan

    return pd.DataFrame({
        'avg': [column.mean()],
        'max': [column.max()],
        'most common': [most_common],
        'std': [column.std()],
    })

### Continuous values

In [38]:
# Continuous features, sliced out of each per-label frame.
stat_columns = ['compressed_size', 'readability']
df_Liar_true_agg, df_Liar_false_agg, df_Liar_between_agg = (
    frame[stat_columns]
    for frame in (df_Liar_true, df_Liar_false, df_Liar_between)
)

In [39]:
stat_columns = ['compressed_size', 'readability']
labels = [0, 1, 2]

# Build one summary row per (feature, label) combination.
rows = []
for feature in stat_columns:
    for label in labels:
        # Rows of this label, restricted to the feature being summarised.
        df = Liar_computed.loc[Liar_computed['binary label'] == label, [feature]]
        stats = get_stats(df, feature)
        summary = stats.iloc[0]  # get_stats returns a single-row frame
        rows.append({
            'feature': feature,
            'label': label,
            'avg': summary['avg'],
            'max': summary['max'],
            'most common': summary['most common'],
            'std': summary['std'],
        })

df_stats = pd.DataFrame(rows)



In [41]:
# Show the first rows of the per-feature, per-label summary.
df_stats.head()

Unnamed: 0,feature,label,avg,max,most common,std
0,compressed_size,0,8555.408711,149838.0,5219.0,4779.221222
1,compressed_size,1,8130.946347,197588.0,5999.0,4929.493438
2,compressed_size,2,8776.261955,294079.0,6366.0,5112.226317
3,readability,0,60.759599,127.215714,74.27,21.520138
4,readability,1,56.33427,124.155,60.705,22.948483


In [69]:
#latex_table = df_stats.to_latex(index=False, float_format=lambda x: "%.2f" % x)
#print(latex_table)


### Discrete values

- total sum of the counts
- average count (sum of the counts / total)
- maximum count for an individual claim

In [43]:
# Count-valued features, sliced out of each per-label frame.
count_features = [
    'tot_ner_count',
    'NER_CARDINAL', 'NER_DATE', 'NER_ORDINAL', 'NER_ORG', 'NER_PERCENT', 'NER_PERSON',
    'pos_ADJ', 'pos_ADP', 'pos_AUX', 'pos_CCONJ', 'pos_DET', 'pos_NOUN', 'pos_NUM',
    'pos_PART', 'pos_PUNCT', 'pos_SCONJ', 'pos_VERB',
]
df_Liar_true_count, df_Liar_false_count, df_Liar_between_count = (
    frame[count_features]
    for frame in (df_Liar_true, df_Liar_false, df_Liar_between)
)



In [44]:


# Count features for each label subset, in the order true, false, in-between.
per_label = [
    frame[count_features]
    for frame in (df_Liar_true, df_Liar_false, df_Liar_between)
]

# Each aggregate has one row per feature and one column (0, 1, 2) per subset.
max_counts = pd.concat([part.max(axis=0) for part in per_label], axis=1)
avg = pd.concat([part.mean(axis=0) for part in per_label], axis=1)
std = pd.concat([part.std(axis=0) for part in per_label], axis=1)

feature_count_stats = pd.DataFrame(index=count_features, columns=['Max Counts', 'avg', 'std'])

# Every cell holds a (true, false, in-between) tuple for that feature.
subset_columns = (0, 1, 2)
feature_count_stats['Max Counts'] = list(zip(*(max_counts[col].astype(int) for col in subset_columns)))
feature_count_stats['avg'] = list(zip(*(avg[col].round(2) for col in subset_columns)))
feature_count_stats['std'] = list(zip(*(std[col].round(2) for col in subset_columns)))

feature_count_stats.index.name = 'Feature'

#print(feature_count_stats)


In [46]:
# Show the first rows of the per-feature count summary.
feature_count_stats.head()

Unnamed: 0_level_0,Max Counts,avg,std
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tot_ner_count,"(42, 59, 69)","(2.28, 2.12, 2.33)","(1.76, 1.78, 1.74)"
NER_CARDINAL,"(5, 5, 6)","(0.29, 0.19, 0.28)","(0.6, 0.48, 0.58)"
NER_DATE,"(5, 4, 4)","(0.32, 0.21, 0.28)","(0.62, 0.48, 0.57)"
NER_ORDINAL,"(2, 2, 3)","(0.05, 0.03, 0.04)","(0.24, 0.18, 0.21)"
NER_ORG,"(7, 12, 16)","(0.32, 0.36, 0.32)","(0.63, 0.64, 0.63)"


In [68]:
#latex_table = feature_count_stats.to_latex(index=True, float_format=lambda x: "%.2f" % x)
#print(latex_table)

### End of Notebook