In [1]:
#basic libraries
import os
import pandas as pd 
import numpy as np
import requests
import json
import glob
import pickle

#Visualization and ML libraries
import sklearn
import matplotlib.pyplot as plt

from scipy.stats import norm
import statistics
import seaborn as sns
from scipy.stats import f_oneway
import scikit_posthocs as sp
from scipy.stats import kruskal
import statsmodels.stats.multicomp as mc

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import RandomOverSampler


In [10]:
# Load the pre-computed LIAR feature table used by every test below.
# The location can be overridden with the LIAR_COMPUTED_CSV environment variable so the
# notebook is runnable on other machines; the default is the original local path.
LIAR_COMPUTED_CSV = os.environ.get(
    'LIAR_COMPUTED_CSV',
    '/Users/sandrobarreshamers/Thesis_IS_fake_news/ThesisData/Liar_computed_v2.csv',
)
Liar_computed = pd.read_csv(LIAR_COMPUTED_CSV)


In [19]:
# Project helpers used by the statistical tests in the cells below.
from faKy import values_by_label, compute_statistics, dunn_table

# Passed to values_by_label() as its last argument; presumably the name of the
# label column in Liar_computed -- TODO confirm against faKy.
df_label = 'binary label'
# Label values handed to values_by_label(); three values despite the column being
# called "binary" -- NOTE(review): confirm what 0/1/2 stand for.
labels = [0, 1, 2]

### Compressed

In [20]:
# Kruskal-Wallis H-test on 'compressed_size' across the label groups.
# values_by_label() presumably returns one collection of values per entry in `labels`.
label_com = values_by_label(Liar_computed, 'compressed_size', labels, df_label)
stat, p = kruskal(*label_com)  # each group is passed as its own positional sample

# kruskal() returns the H statistic (not an ANOVA F), so it is reported as such.
print('Kruskal results:')
print(f'H-statistic: {stat:.3f}')
print(f'p-value: {p:.3e}')


Kruskal results:
F-statistic: 79.488
p-value: 5.488e-18


In [21]:
# Dunn post-hoc test on the 'compressed_size' groups with Bonferroni-adjusted p-values.
dunn_results = sp.posthoc_dunn(
    label_com,
    p_adjust='bonferroni',
)

print('Dunn results for the information complexity')
# Kept as the cell's last expression so the notebook renders the returned table.
dunn_table(dunn_results)


Dunn results for the information complexity


group,1,1,2,2,3,3
metric,value,reject,value,reject,value,reject
1,1.0,False,4.9e-05,True,0.028887,True
2,4.9e-05,True,1.0,False,0.0,True
3,0.028887,True,0.0,True,1.0,False


### Readability

In [14]:
# Kruskal-Wallis H-test on 'readability' across the label groups.
# values_by_label() presumably returns one collection of values per entry in `labels`.
label_fke = values_by_label(Liar_computed, 'readability', labels, df_label)
stat, p = kruskal(*label_fke)  # each group is passed as its own positional sample

# kruskal() returns the H statistic (not an ANOVA F), so it is reported as such.
print('Kruskal results:')
print(f'H-statistic: {stat:.3f}')
print(f'p-value: {p:.3e}')



Kruskal results:
F-statistic: 49.863
p-value: 1.487e-11


In [22]:
# Dunn post-hoc test on the 'readability' groups with Bonferroni-adjusted p-values.
dunn_results = sp.posthoc_dunn(
    label_fke,
    p_adjust='bonferroni',
)

print('Dunn results for the FKE-readability')
# Kept as the cell's last expression so the notebook renders the returned table.
dunn_table(dunn_results)

Dunn results for the FKE-readability


group,1,1,2,2,3,3
metric,value,reject,value,reject,value,reject
1,1.0,False,0.0,True,0.012227,True
2,0.0,True,1.0,False,0.0,True
3,0.012227,True,0.0,True,1.0,False


### VADER labels

In [29]:
# VADER sentiment columns to test.
vader_labels = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']

# Kruskal-Wallis H-test per VADER column; results are stored as [H, p] pairs
# in the same order as vader_labels.
computed_labels = []
for vader in vader_labels:
    label_vader = values_by_label(Liar_computed, vader, labels, df_label)
    stat, p = kruskal(*label_vader)  # each group is passed as its own positional sample
    computed_labels.append([stat, p])

# Pair every result with its column name; no positional index is needed.
# kruskal() yields the H statistic, so it is labelled 'H' rather than 'fstat'.
for (stat, p), vader in zip(computed_labels, vader_labels):
    print(f'{vader} : H {stat}; p {p}')

vader_neg : fstat 10.409693080422452; p 0.005489892854085175
vader_neu : fstat 10.17546297354846; p 0.00617200529376983
vader_pos : fstat 4.323321766239063; p 0.11513373846730762
vader_compound : fstat 1.999867736891534; p 0.36790377041511385


In [30]:
# Dunn post-hoc test (Bonferroni-adjusted) for every VADER column; one table is printed per column.
for vader in vader_labels:
    label_vader = values_by_label(Liar_computed, vader, labels, df_label)
    dunn_results = sp.posthoc_dunn(label_vader, p_adjust='bonferroni')
    a = dunn_table(dunn_results)
    # Header line followed by the table, each on its own line.
    print(f'Dunn results: {vader}', a, sep='\n')


Dunn results: vader_neg
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False       1.0  False  0.022119   True
2            1.0  False       1.0  False  0.041474   True
3       0.022119   True  0.041474   True       1.0  False
Dunn results: vader_neu
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.060342  False  0.004317   True
2       0.060342  False       1.0  False       1.0  False
3       0.004317   True       1.0  False       1.0  False
Dunn results: vader_pos
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.144254  False  0.197013  False
2       0.144254  False       1.0  False       1.0  False
3       0.197013  False       1.0  False       1.0  False
Dunn results: vader_compound
group      1                2

### NER

In [16]:
# Named-entity count columns to test.
ner_labels = [
    'NER_CARDINAL', 'NER_DATE', 'NER_EVENT', 'NER_FAC', 'NER_GPE', 'NER_LANGUAGE',
    'NER_LAW', 'NER_LOC', 'NER_MONEY', 'NER_NORP', 'NER_ORDINAL', 'NER_ORG',
    'NER_PERCENT', 'NER_PERSON', 'NER_PRODUCT', 'NER_QUANTITY', 'NER_TIME',
    'NER_WORK_OF_ART',
]

# Kruskal-Wallis H-test per NER column; results are stored as [H, p] pairs
# in the same order as ner_labels.
computed_ner_labels = []
for ner in ner_labels:
    label_ner = values_by_label(Liar_computed, ner, labels, df_label)
    stat, p = kruskal(*label_ner)  # each group is passed as its own positional sample
    computed_ner_labels.append([stat, p])

# Pair every result with its column name; no positional index is needed.
# kruskal() yields the H statistic, so it is labelled 'H' rather than 'fstat'.
for (stat, p), ner in zip(computed_ner_labels, ner_labels):
    print(f'{ner} : H {stat}; p {p}')

NER_CARDINAL : fstat 49.83110092286668; p 1.5111720797740373e-11
NER_DATE : fstat 51.61532051517984; p 6.192636087610597e-12
NER_EVENT : fstat 0.17192881583577338; p 0.9176268908905847
NER_FAC : fstat 1.62191142550578; p 0.44443311279487685
NER_GPE : fstat 4.891785484232781; p 0.08664874537809603
NER_LANGUAGE : fstat 0.2765099377408568; p 0.8708766170086077
NER_LAW : fstat 5.044810889786552; p 0.08026629818911608
NER_LOC : fstat 0.4509002696889834; p 0.7981568596715132
NER_MONEY : fstat 35.30762339462919; p 2.1530147012843794e-08
NER_NORP : fstat 0.7534040417699982; p 0.6861204930154862
NER_ORDINAL : fstat 13.269417872165102; p 0.0013139611361827818
NER_ORG : fstat 15.497830751288278; p 0.00043120998788284286
NER_PERCENT : fstat 51.19931270863278; p 7.624484865681605e-12
NER_PERSON : fstat 82.52962410037611; p 1.1992778976434277e-18
NER_PRODUCT : fstat 1.096845519071406; p 0.5778605169801413
NER_QUANTITY : fstat 3.320164395658514; p 0.19012335173241685
NER_TIME : fstat 3.71035263653848

In [25]:
# Dunn post-hoc test (Bonferroni-adjusted) for every NER column; one table is printed per column.
for ner in ner_labels:
    label_ner = values_by_label(Liar_computed, ner, labels, df_label)
    dunn_results = sp.posthoc_dunn(label_ner, p_adjust='bonferroni')
    a = dunn_table(dunn_results)
    # Header line followed by the table, each on its own line.
    print(f'Dunn results: {ner}', a, sep='\n')

Dunn results: NER_CARDINAL
group      1            2            3       
metric value reject value reject value reject
1        1.0  False   0.0   True   1.0  False
2        0.0   True   1.0  False   0.0   True
3        1.0  False   0.0   True   1.0  False
Dunn results: NER_DATE
group          1            2                3       
metric     value reject value reject     value reject
1            1.0  False   0.0   True  0.020809   True
2            0.0   True   1.0  False       0.0   True
3       0.020809   True   0.0   True       1.0  False
Dunn results: NER_EVENT
group      1            2            3       
metric value reject value reject value reject
1        1.0  False   1.0  False   1.0  False
2        1.0  False   1.0  False   1.0  False
3        1.0  False   1.0  False   1.0  False
Dunn results: NER_FAC
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.644083  False  0.852657  False


### POS labels

In [17]:
# Part-of-speech count columns to test.
pos_labels = [
    'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX', 'pos_CCONJ', 'pos_DET', 'pos_INTJ',
    'pos_NOUN', 'pos_NUM', 'pos_PART', 'pos_PRON', 'pos_PROPN', 'pos_PUNCT',
    'pos_SCONJ', 'pos_SYM', 'pos_VERB', 'pos_X',
]

# Kruskal-Wallis H-test per POS column; results are stored as [H, p] pairs
# in the same order as pos_labels.
computed_pos_labels = []
for pos in pos_labels:
    label_pos = values_by_label(Liar_computed, pos, labels, df_label)
    stat, p = kruskal(*label_pos)  # each group is passed as its own positional sample
    computed_pos_labels.append([stat, p])

# Pair every result with its column name; no positional index is needed.
# kruskal() yields the H statistic, so it is labelled 'H' rather than 'fstat'.
for (stat, p), pos in zip(computed_pos_labels, pos_labels):
    print(f'{pos} : H {stat}; p {p}')


pos_ADJ : fstat 75.63363133229193; p 3.7702107990073795e-17
pos_ADP : fstat 52.88463642780589; p 3.2828203050086716e-12
pos_ADV : fstat 15.343928130378757; p 0.000465702255291164
pos_AUX : fstat 10.672755272313356; p 0.004813274601537372
pos_CCONJ : fstat 30.78256967100686; p 2.0684733341776858e-07
pos_DET : fstat 17.70000951293461; p 0.0001433810542873743
pos_INTJ : fstat 1.6205299951697043; p 0.44474019552820465
pos_NOUN : fstat 74.42562883952174; p 6.897314694451533e-17
pos_NUM : fstat 157.83110536457033; p 5.33839781071815e-35
pos_PART : fstat 15.798490526270482; p 0.0003710234598959158
pos_PRON : fstat 6.770194727289172; p 0.03387434405551494
pos_PROPN : fstat 49.15895156684152; p 2.1147999875958165e-11
pos_PUNCT : fstat 39.48929409787203; p 2.6607786034061635e-09
pos_SCONJ : fstat 50.565192976835576; p 1.0469064051354758e-11
pos_SYM : fstat 27.912685893597782; p 8.686348924034588e-07
pos_VERB : fstat 45.05819719363287; p 1.6433754586485928e-10
pos_X : fstat 4.769992151936845; p 0

In [28]:
# Dunn post-hoc test (Bonferroni-adjusted) for every POS column; one table is printed per column.
for pos in pos_labels:
    label_pos = values_by_label(Liar_computed, pos, labels, df_label)
    dunn_results = sp.posthoc_dunn(label_pos, p_adjust='bonferroni')
    a = dunn_table(dunn_results)
    # Header line followed by the table, each on its own line.
    print(f'Dunn results: {pos}', a, sep='\n')

Dunn results: pos_ADJ
group          1            2                3       
metric     value reject value reject     value reject
1            1.0  False   0.0   True  0.895962  False
2            0.0   True   1.0  False       0.0   True
3       0.895962  False   0.0   True       1.0  False
Dunn results: pos_ADP
group      1            2            3       
metric value reject value reject value reject
1        1.0  False   0.0   True   1.0  False
2        0.0   True   1.0  False   0.0   True
3        1.0  False   0.0   True   1.0  False
Dunn results: pos_ADV
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.003806   True       1.0  False
2       0.003806   True       1.0  False  0.001214   True
3            1.0  False  0.001214   True       1.0  False
Dunn results: pos_AUX
group          1                2                3       
metric     value reject     value reject     value reject
1     