In [1]:
#basic libraries
import os
import pandas as pd 
import numpy as np
import requests
import json
import glob
import pickle

#Visiualization and ML libraries
import sklearn
import matplotlib.pyplot as plt

from scipy.stats import norm
import statistics
import seaborn as sns
from scipy.stats import f_oneway
import scikit_posthocs as sp
from scipy.stats import kruskal
import statsmodels.stats.multicomp as mc
from scipy.stats import mode


from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import RandomOverSampler


In [2]:
Liar_computed = pd.read_csv('/Users/sandrobarreshamers/Thesis_IS_fake_news/ThesisData/Liar_computed_v2.csv')


In [3]:
Liar_computed.columns

Index(['Unnamed: 0', 'json_id', 'claim', 'object', 'binary label',
       'compressed_size', 'readability', 'vader_neg', 'vader_neu', 'vader_pos',
       'vader_compound', 'tot_ner_count', 'ner_counts', 'input_vector_ner',
       'NER_CARDINAL', 'NER_DATE', 'NER_EVENT', 'NER_FAC', 'NER_GPE',
       'NER_LANGUAGE', 'NER_LAW', 'NER_LOC', 'NER_MONEY', 'NER_NORP',
       'NER_ORDINAL', 'NER_ORG', 'NER_PERCENT', 'NER_PERSON', 'NER_PRODUCT',
       'NER_QUANTITY', 'NER_TIME', 'NER_WORK_OF_ART', 'pos counts',
       'input_vector_pos', 'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX',
       'pos_CCONJ', 'pos_DET', 'pos_INTJ', 'pos_NOUN', 'pos_NUM', 'pos_PART',
       'pos_PRON', 'pos_PROPN', 'pos_PUNCT', 'pos_SCONJ', 'pos_SYM',
       'pos_VERB', 'pos_X'],
      dtype='object')

In [4]:
from faKy import values_by_label, dunn_table
labels = [0, 1, 2]
df_label = 'binary label'

df_Liar_true = Liar_computed[(Liar_computed['binary label'] == 0)]
df_Liar_false = Liar_computed[(Liar_computed['binary label'] == 1)]
df_Liar_between = Liar_computed[(Liar_computed['binary label'] == 2)]

print(len(df_Liar_true))
print(len(df_Liar_false))
print(len(df_Liar_between))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sandrobarreshamers/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


1676
2833
5730


### Test for parametic

In [5]:
quality_attributes = ['compressed_size', 'readability', 'vader_neg', 'vader_neu', 'vader_pos', 
                      'vader_compound']
qa_dict = {}
for qa in quality_attributes:
    qa_dict[qa] = pd.to_numeric(Liar_computed[qa], errors='coerce').dropna().tolist()



In [6]:
from scipy.stats import kstest

test_result = []
for qa in quality_attributes:
    stat, p = kstest(qa_dict[qa], 'norm')
    test_result.append({'QA': qa, 'non-parametric': p < 0.05, 'p-value': p})

kolmogorov_smirnof_results = pd.DataFrame(test_result)

print(kolmogorov_smirnof_results)



                QA  non-parametric  p-value
0  compressed_size            True      0.0
1      readability            True      0.0
2        vader_neg            True      0.0
3        vader_neu            True      0.0
4        vader_pos            True      0.0
5   vader_compound            True      0.0


In [7]:
latex_table = kolmogorov_smirnof_results.to_latex(index=False, float_format=lambda x: "%.2f" % x)
print(latex_table)

\begin{tabular}{llr}
\toprule
             QA &  non-parametric &  p-value \\
\midrule
compressed\_size &            True &     0.00 \\
    readability &            True &     0.00 \\
      vader\_neg &            True &     0.00 \\
      vader\_neu &            True &     0.00 \\
      vader\_pos &            True &     0.00 \\
 vader\_compound &            True &     0.00 \\
\bottomrule
\end{tabular}



  latex_table = kolmogorov_smirnof_results.to_latex(index=False, float_format=lambda x: "%.2f" % x)


# Kruskal Dunn tests

### Compressed

In [8]:
label_com = values_by_label(Liar_computed, 'compressed_size',labels, df_label)
stat, p = kruskal(*label_com) # unpack the elements 

print('Kruskal results:')
print(f'F-statistic: {stat:.3f}')
print(f'p-value: {p:.3e}')


Kruskal results:
F-statistic: 79.488
p-value: 5.488e-18


In [10]:
dunn_results = sp.posthoc_dunn(label_com, p_adjust='bonferroni')
print('Dunn results for the information complexity')
df_dunn_IC = dunn_table(dunn_results)
print(df_dunn_IC)


Dunn results for the information complexity
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.000049   True  0.028887   True
2       0.000049   True       1.0  False       0.0   True
3       0.028887   True       0.0   True       1.0  False


In [11]:
latex_table_DIC = df_dunn_IC.to_latex(index=False, float_format=lambda x: "%.2f" % x)
print(latex_table_DIC)

\begin{tabular}{llllll}
\toprule
    1 & \multicolumn{2}{l}{2} & \multicolumn{2}{l}{3} \\
value & reject & value & reject & value & reject \\
\midrule
 1.00 &  False &  0.00 &   True &  0.03 &   True \\
 0.00 &   True &  1.00 &  False &  0.00 &   True \\
 0.03 &   True &  0.00 &   True &  1.00 &  False \\
\bottomrule
\end{tabular}



  latex_table_DIC = df_dunn_IC.to_latex(index=False, float_format=lambda x: "%.2f" % x)


### Readability

In [12]:
label_fke = values_by_label(Liar_computed, 'readability',labels,df_label)
stat, p = kruskal(*label_fke) # unpack the elements 

print('Kruskal results:')
print(f'F-statistic: {stat:.3f}')
print(f'p-value: {p:.3e}')



Kruskal results:
F-statistic: 49.863
p-value: 1.487e-11


In [13]:
dunn_results = sp.posthoc_dunn(label_fke, p_adjust='bonferroni')
df_dunn_R = dunn_table(dunn_results)

print('Dunn results for the FKE-readability')
dunn_table(dunn_results)

Dunn results for the FKE-readability


group,1,1,2,2,3,3
metric,value,reject,value,reject,value,reject
1,1.0,False,0.0,True,0.012227,True
2,0.0,True,1.0,False,0.0,True
3,0.012227,True,0.0,True,1.0,False


In [14]:
latex_table = df_dunn_R.to_latex(index=True, float_format=lambda x: "%.2f" % x)
print(latex_table)

\begin{tabular}{lllllll}
\toprule
group & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} & \multicolumn{2}{l}{3} \\
metric & value & reject & value & reject & value & reject \\
\midrule
1 &  1.00 &  False &  0.00 &   True &  0.01 &   True \\
2 &  0.00 &   True &  1.00 &  False &  0.00 &   True \\
3 &  0.01 &   True &  0.00 &   True &  1.00 &  False \\
\bottomrule
\end{tabular}



  latex_table = df_dunn_R.to_latex(index=True, float_format=lambda x: "%.2f" % x)


### VADER labels

In [15]:
vader_labels = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']

computed_labels = []
for vader in vader_labels:
    label_vader = values_by_label(Liar_computed, vader,labels,df_label)
    stat, p = kruskal(*label_vader) # unpack the elements
    computed_labels.append([stat, p])

for i, (stat, p), vader in zip(range(len(computed_labels)), computed_labels, vader_labels):
    print(f'{vader} : fstat {stat}; p {p}')

vader_neg : fstat 10.409693080422452; p 0.005489892854085175
vader_neu : fstat 10.17546297354846; p 0.00617200529376983
vader_pos : fstat 4.323321766239063; p 0.11513373846730762
vader_compound : fstat 1.999867736891534; p 0.36790377041511385


In [16]:
df_dict = {
    "vader_neg": pd.DataFrame(),
    "vader_neu": pd.DataFrame(),
    "vader_pos": pd.DataFrame(),
    "vader_compound": pd.DataFrame()
}

for vader in vader_labels:
    label_vader = values_by_label(Liar_computed, vader, labels, df_label)
    dunn_results = sp.posthoc_dunn(label_vader, p_adjust='bonferroni')
    result_dunn_test = dunn_table(dunn_results)
    
    df_dict[vader] = result_dunn_test
        
    print(f'Dunn results: {vader}')
    print(result_dunn_test)


Dunn results: vader_neg
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False       1.0  False  0.022119   True
2            1.0  False       1.0  False  0.041474   True
3       0.022119   True  0.041474   True       1.0  False
Dunn results: vader_neu
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.060342  False  0.004317   True
2       0.060342  False       1.0  False       1.0  False
3       0.004317   True       1.0  False       1.0  False
Dunn results: vader_pos
group          1                2                3       
metric     value reject     value reject     value reject
1            1.0  False  0.144254  False  0.197013  False
2       0.144254  False       1.0  False       1.0  False
3       0.197013  False       1.0  False       1.0  False
Dunn results: vader_compound
group      1                2

In [46]:
vader_results = pd.concat([df_dict["vader_neg"], df_dict["vader_neu"], df_dict["vader_pos"], df_dict["vader_compound"]], axis=1)

vader_results.columns = vader_results.columns.set_names(['Dunn results', 'metric'])

vader_results = vader_results.reset_index()

print(vader_results)

Dunn results index         1                2                3         \
metric                 value reject     value reject     value reject   
0                1       1.0  False       1.0  False  0.022119   True   
1                2       1.0  False       1.0  False  0.041474   True   
2                3  0.022119   True  0.041474   True       1.0  False   

Dunn results         1                2  ...                          3  \
metric           value reject     value  ...     value reject     value   
0                  1.0  False  0.060342  ...  0.144254  False  0.197013   
1             0.060342  False       1.0  ...       1.0  False       1.0   
2             0.004317   True       1.0  ...       1.0  False       1.0   

Dunn results            1                2                3         
metric       reject value reject     value reject     value reject  
0             False   1.0  False       1.0  False       1.0  False  
1             False   1.0  False       1.0  False  

  vader_results = vader_results.reset_index()


In [48]:
latex_table_vader = vader_results.to_latex(index=True, float_format=lambda x: "%.2f" % x)
#print(latex_table_vader)

  latex_table_vader = vader_results.to_latex(index=True, float_format=lambda x: "%.2f" % x)


### NER

In [26]:
label_ner_sum = values_by_label(Liar_computed, 'tot_ner_count',labels,df_label)
stat, p = kruskal(*label_ner_sum) 

print('NER count results:')
print(f'F-statistic: {stat:.3f}')
print(f'p-value: {p:.3e}')

dunn_results = sp.posthoc_dunn(label_ner_sum, p_adjust='bonferroni')


print('Dunn results for the FKE-readability')
df_dunn_NER = dunn_table(dunn_results)

NER count results:
F-statistic: 42.182
p-value: 6.924e-10
Dunn results for the FKE-readability


In [27]:
latex_table = df_dunn_NER.to_latex(index=True, float_format=lambda x: "%.2f" % x)
print(latex_table)

\begin{tabular}{lllllll}
\toprule
group & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} & \multicolumn{2}{l}{3} \\
metric & value & reject & value & reject & value & reject \\
\midrule
1 &  1.00 &  False &  0.00 &   True &  0.41 &  False \\
2 &  0.00 &   True &  1.00 &  False &  0.00 &   True \\
3 &  0.41 &  False &  0.00 &   True &  1.00 &  False \\
\bottomrule
\end{tabular}



  latex_table = df_dunn_NER.to_latex(index=True, float_format=lambda x: "%.2f" % x)


### NER counts tag

In [28]:
ner_labels = [ 'NER_CARDINAL',
       'NER_DATE', 'NER_EVENT', 'NER_FAC', 'NER_GPE', 'NER_LANGUAGE',
       'NER_LAW', 'NER_LOC', 'NER_MONEY', 'NER_NORP', 'NER_ORDINAL', 'NER_ORG',
       'NER_PERCENT', 'NER_PERSON', 'NER_PRODUCT', 'NER_QUANTITY', 'NER_TIME',
       'NER_WORK_OF_ART']
computed_ner_labels = []
for ner in ner_labels:
    label_ner = values_by_label(Liar_computed, ner, labels, df_label)
    stat, p = kruskal(*label_ner) # unpack the elements 
    computed_ner_labels.append([stat, p])

for i, (stat, p), pos in zip(range(len(computed_ner_labels)), computed_ner_labels, ner_labels):
       print(f'{pos} : fstat {stat}; p {p}')

NER_CARDINAL : fstat 49.83110092286668; p 1.5111720797740373e-11
NER_DATE : fstat 51.61532051517984; p 6.192636087610597e-12
NER_EVENT : fstat 0.17192881583577338; p 0.9176268908905847
NER_FAC : fstat 1.62191142550578; p 0.44443311279487685
NER_GPE : fstat 4.891785484232781; p 0.08664874537809603
NER_LANGUAGE : fstat 0.2765099377408568; p 0.8708766170086077
NER_LAW : fstat 5.044810889786552; p 0.08026629818911608
NER_LOC : fstat 0.4509002696889834; p 0.7981568596715132
NER_MONEY : fstat 35.30762339462919; p 2.1530147012843794e-08
NER_NORP : fstat 0.7534040417699982; p 0.6861204930154862
NER_ORDINAL : fstat 13.269417872165102; p 0.0013139611361827818
NER_ORG : fstat 15.497830751288278; p 0.00043120998788284286
NER_PERCENT : fstat 51.19931270863278; p 7.624484865681605e-12
NER_PERSON : fstat 82.52962410037611; p 1.1992778976434277e-18
NER_PRODUCT : fstat 1.096845519071406; p 0.5778605169801413
NER_QUANTITY : fstat 3.320164395658514; p 0.19012335173241685
NER_TIME : fstat 3.71035263653848

In [29]:
df_dict_ner = {}
for label in ner_labels:
    df_dict_ner[label] = pd.DataFrame()


for ner in ner_labels:
    label_ner = values_by_label(Liar_computed, ner, labels, df_label)
    dunn_results = sp.posthoc_dunn(label_ner, p_adjust='bonferroni')
    result_dunn_test_ner = dunn_table(dunn_results)
    
    df_dict_ner[ner] = result_dunn_test_ner
        
    #print(f'Dunn results: {ner}')
    #print(result_dunn_test_ner)

    


In [30]:
df_list_ner = []

for label in df_dict_ner.keys():
    df_list_ner.append(df_dict_ner[label])

result_ner_tag = pd.concat(df_list_ner, axis=0)

result_ner_tag.columns = result_ner_tag.columns.set_names(['Dunn results', 'metric'])
result_ner_tag = result_ner_tag.reset_index()

print(result_ner_tag)


Dunn results index         1                2                3       
metric                 value reject     value reject     value reject
0                1       1.0  False       0.0   True       1.0  False
1                2       0.0   True       1.0  False       0.0   True
2                3       1.0  False       0.0   True       1.0  False
3                1       1.0  False       0.0   True  0.020809   True
4                2       0.0   True       1.0  False       0.0   True
5                3  0.020809   True       0.0   True       1.0  False
6                1       1.0  False       1.0  False       1.0  False
7                2       1.0  False       1.0  False       1.0  False
8                3       1.0  False       1.0  False       1.0  False
9                1       1.0  False  0.644083  False  0.852657  False
10               2  0.644083  False       1.0  False       1.0  False
11               3  0.852657  False       1.0  False       1.0  False
12               1  

### Significant features:
- CARDINAL p 0.0
- DATE  p 0.0
- ORDINAL p 0.0
- ORG p 0.0
- PERCENT 0.0
- PERSON 0.0


In [31]:
latex_table_ner_tags = result_ner_tag.to_latex(index=True, float_format=lambda x: "%.2f" % x)
#print(latex_table_ner_tags)

  latex_table_ner_tags = result_ner_tag.to_latex(index=True, float_format=lambda x: "%.2f" % x)


### Pos labels

In [32]:
pos_labels = [ 'pos_ADJ',
       'pos_ADP', 'pos_ADV', 'pos_AUX', 'pos_CCONJ', 'pos_DET', 'pos_INTJ',
       'pos_NOUN', 'pos_NUM', 'pos_PART', 'pos_PRON', 'pos_PROPN', 'pos_PUNCT',
       'pos_SCONJ', 'pos_SYM', 'pos_VERB', 'pos_X']

computed_pos_labels = []
for pos in pos_labels:
    label_pos = values_by_label(Liar_computed, pos, labels, df_label)
    stat, p = kruskal(*label_pos) # unpack the elements
    computed_pos_labels.append([stat, p])

for i, (stat, p), pos in zip(range(len(computed_pos_labels)), computed_pos_labels, pos_labels):
       print(f'{pos} : fstat {stat}; p {p}')


pos_ADJ : fstat 75.63363133229193; p 3.7702107990073795e-17
pos_ADP : fstat 52.88463642780589; p 3.2828203050086716e-12
pos_ADV : fstat 15.343928130378757; p 0.000465702255291164
pos_AUX : fstat 10.672755272313356; p 0.004813274601537372
pos_CCONJ : fstat 30.78256967100686; p 2.0684733341776858e-07
pos_DET : fstat 17.70000951293461; p 0.0001433810542873743
pos_INTJ : fstat 1.6205299951697043; p 0.44474019552820465
pos_NOUN : fstat 74.42562883952174; p 6.897314694451533e-17
pos_NUM : fstat 157.83110536457033; p 5.33839781071815e-35
pos_PART : fstat 15.798490526270482; p 0.0003710234598959158
pos_PRON : fstat 6.770194727289172; p 0.03387434405551494
pos_PROPN : fstat 49.15895156684152; p 2.1147999875958165e-11
pos_PUNCT : fstat 39.48929409787203; p 2.6607786034061635e-09
pos_SCONJ : fstat 50.565192976835576; p 1.0469064051354758e-11
pos_SYM : fstat 27.912685893597782; p 8.686348924034588e-07
pos_VERB : fstat 45.05819719363287; p 1.6433754586485928e-10
pos_X : fstat 4.769992151936845; p 0

In [33]:
for pos in pos_labels:
    label_pos = values_by_label(Liar_computed, pos,labels,df_label)
    dunn_results = sp.posthoc_dunn(label_pos, p_adjust='bonferroni')
    a = dunn_table(dunn_results)
    latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
    print(f'Dunn results: {pos}')
    print(latex_table_pos)

Dunn results: pos_ADJ
\begin{tabular}{lllllll}
\toprule
group & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} & \multicolumn{2}{l}{3} \\
metric & value & reject & value & reject & value & reject \\
\midrule
1 &  1.00 &  False &  0.00 &   True &  0.90 &  False \\
2 &  0.00 &   True &  1.00 &  False &  0.00 &   True \\
3 &  0.90 &  False &  0.00 &   True &  1.00 &  False \\
\bottomrule
\end{tabular}

Dunn results: pos_ADP
\begin{tabular}{lllllll}
\toprule
group & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} & \multicolumn{2}{l}{3} \\
metric & value & reject & value & reject & value & reject \\
\midrule
1 &  1.00 &  False &  0.00 &   True &  1.00 &  False \\
2 &  0.00 &   True &  1.00 &  False &  0.00 &   True \\
3 &  1.00 &  False &  0.00 &   True &  1.00 &  False \\
\bottomrule
\end{tabular}



  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)


Dunn results: pos_ADV
\begin{tabular}{lllllll}
\toprule
group & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} & \multicolumn{2}{l}{3} \\
metric & value & reject & value & reject & value & reject \\
\midrule
1 &  1.00 &  False &  0.00 &   True &  1.00 &  False \\
2 &  0.00 &   True &  1.00 &  False &  0.00 &   True \\
3 &  1.00 &  False &  0.00 &   True &  1.00 &  False \\
\bottomrule
\end{tabular}

Dunn results: pos_AUX
\begin{tabular}{lllllll}
\toprule
group & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} & \multicolumn{2}{l}{3} \\
metric & value & reject & value & reject & value & reject \\
\midrule
1 &  1.00 &  False &  0.01 &   True &  0.01 &   True \\
2 &  0.01 &   True &  1.00 &  False &  1.00 &  False \\
3 &  0.01 &   True &  1.00 &  False &  1.00 &  False \\
\bottomrule
\end{tabular}

Dunn results: pos_CCONJ
\begin{tabular}{lllllll}
\toprule
group & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} & \multicolumn{2}{l}{3} \\
metric & value & reject & value & reject & value & reject

  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)
  latex_table_pos = a.to_latex(index=True, float_format=lambda x: "%.2f" % x)


### Significant POS features
- ADJ p: 0.00
- ADP p: 0.00
- AUX p: 0.01
- CCONJ p: 0.05
- DET p: 0.00
- NOUN p: 0.00
- NUM p:0.00
- PART p:0.00
- PUNCT p:0.00
- SCONJ p: 0.00
- VERB p:0 .00

## aggregate results


In [36]:
def get_stats(df, column_name):
    avg = df[column_name].mean()
    max_val = df[column_name].max()
    mode = df[column_name].mode()[0]
    std = df[column_name].std()
    return pd.DataFrame({'avg': [avg], 'max': [max_val], 'most common': [mode], 'std': [std]})

In [37]:
stat_columns = ['compressed_size', 'readability']
df_Liar_true_agg = df_Liar_true[stat_columns]
df_Liar_false_agg = df_Liar_false[stat_columns]
df_Liar_between_agg = df_Liar_between[stat_columns]

In [38]:
stat_columns = ['compressed_size', 'readability']
labels = [0, 1, 2]

rows = []
for feature in stat_columns:
    for label in labels:
        df = Liar_computed[Liar_computed['binary label'] == label][[feature]]
        stats = get_stats(df, feature)
        rows.append({'feature': feature, 'label': label, 'avg': stats.iloc[0]['avg'], 'max': stats.iloc[0]['max'], 
                     'most common': stats.iloc[0]['most common'],'std': stats.iloc[0]['std']})
        
df_stats = pd.DataFrame(rows)



In [39]:
print(df_stats)

           feature  label          avg            max  most common  \
0  compressed_size      0  8555.133055  149837.000000  5592.000000   
1  compressed_size      1  8130.459583  197589.000000  5964.000000   
2  compressed_size      2  8775.910471  294076.000000  5951.000000   
3      readability      0    60.759599     127.215714    74.270000   
4      readability      1    56.334270     124.155000    60.705000   
5      readability      2    59.047204     151.000000    56.978462   

           std  
0  4779.259754  
1  4929.506473  
2  5112.168963  
3    21.520138  
4    22.948483  
5    22.439190  


In [40]:
# Assuming you have a dataframe called stats_df
latex_table = df_stats.to_latex(index=False, float_format=lambda x: "%.2f" % x)
print(latex_table)


\begin{tabular}{lrrrrr}
\toprule
        feature &  label &     avg &       max &  most common &     std \\
\midrule
compressed\_size &      0 & 8555.13 & 149837.00 &      5592.00 & 4779.26 \\
compressed\_size &      1 & 8130.46 & 197589.00 &      5964.00 & 4929.51 \\
compressed\_size &      2 & 8775.91 & 294076.00 &      5951.00 & 5112.17 \\
    readability &      0 &   60.76 &    127.22 &        74.27 &   21.52 \\
    readability &      1 &   56.33 &    124.16 &        60.71 &   22.95 \\
    readability &      2 &   59.05 &    151.00 &        56.98 &   22.44 \\
\bottomrule
\end{tabular}



  latex_table = df_stats.to_latex(index=False, float_format=lambda x: "%.2f" % x)


### Discrete values

- total sum of the counts
- average sum of count /total
- max number of count for a individual

In [41]:
count_features = ['tot_ner_count','NER_CARDINAL', 'NER_DATE','NER_ORDINAL', 'NER_ORG','NER_PERCENT', 'NER_PERSON', 
                  'pos_ADJ','pos_ADP','pos_AUX', 'pos_CCONJ', 'pos_DET', 'pos_NOUN','pos_NUM','pos_PART',
                    'pos_PUNCT', 'pos_SCONJ','pos_VERB', ]
df_Liar_true_count = df_Liar_true[count_features]
df_Liar_false_count = df_Liar_false[count_features]
df_Liar_between_count = df_Liar_between[count_features]



In [42]:


max_counts = pd.concat([df_Liar_true[count_features].max(axis=0), df_Liar_false[count_features].max(axis=0),
                        df_Liar_between[count_features].max(axis=0)], axis=1)
avg = pd.concat([df_Liar_true[count_features].mean(axis=0), df_Liar_false[count_features].mean(axis=0),
                        df_Liar_between[count_features].mean(axis=0)], axis=1)

std = pd.concat([df_Liar_true[count_features].std(axis=0), df_Liar_false[count_features].std(axis=0),
                 df_Liar_between[count_features].std(axis=0)], axis=1)
feature_count_stats = pd.DataFrame(index=count_features, columns=['Max Counts','avg','std'])


feature_count_stats['std'] = list(zip(std[0].round(2), std[1].round(2), std[2].round(2)))


feature_count_stats['Max Counts'] = list(zip(max_counts[0].astype(int), max_counts[1].astype(int), max_counts[2].astype(int)))
feature_count_stats['avg'] = list(zip(avg[0].round(2), avg[1].round(2), avg[2].round(2)))


feature_count_stats.index.name = 'Feature'

print(feature_count_stats)


                  Max Counts                 avg                 std
Feature                                                             
tot_ner_count   (42, 59, 69)  (2.28, 2.12, 2.33)  (1.76, 1.78, 1.74)
NER_CARDINAL       (5, 5, 6)  (0.29, 0.19, 0.28)   (0.6, 0.48, 0.58)
NER_DATE           (5, 4, 4)  (0.32, 0.21, 0.28)  (0.62, 0.48, 0.57)
NER_ORDINAL        (2, 2, 3)  (0.05, 0.03, 0.04)  (0.24, 0.18, 0.21)
NER_ORG          (7, 12, 16)  (0.32, 0.36, 0.32)  (0.63, 0.64, 0.63)
NER_PERCENT        (4, 3, 6)  (0.16, 0.08, 0.14)   (0.47, 0.3, 0.41)
NER_PERSON       (11, 9, 11)  (0.34, 0.51, 0.45)  (0.66, 0.71, 0.68)
pos_ADJ         (21, 32, 45)  (1.34, 1.07, 1.31)   (1.34, 1.23, 1.4)
pos_ADP         (20, 27, 31)  (2.19, 1.95, 2.18)  (1.55, 1.54, 1.59)
pos_AUX           (8, 8, 11)   (1.0, 0.92, 0.92)  (0.99, 0.96, 0.95)
pos_CCONJ          (5, 4, 7)   (0.35, 0.3, 0.38)  (0.62, 0.58, 0.64)
pos_DET         (23, 33, 39)   (2.01, 1.85, 2.0)  (1.59, 1.57, 1.64)
pos_NOUN       (59, 76, 129)  (4.2

In [43]:
latex_table = feature_count_stats.to_latex(index=True, float_format=lambda x: "%.2f" % x)
print(latex_table)

\begin{tabular}{llll}
\toprule
{} &     Max Counts &                 avg &                 std \\
Feature       &                &                     &                     \\
\midrule
tot\_ner\_count &   (42, 59, 69) &  (2.28, 2.12, 2.33) &  (1.76, 1.78, 1.74) \\
NER\_CARDINAL  &      (5, 5, 6) &  (0.29, 0.19, 0.28) &   (0.6, 0.48, 0.58) \\
NER\_DATE      &      (5, 4, 4) &  (0.32, 0.21, 0.28) &  (0.62, 0.48, 0.57) \\
NER\_ORDINAL   &      (2, 2, 3) &  (0.05, 0.03, 0.04) &  (0.24, 0.18, 0.21) \\
NER\_ORG       &    (7, 12, 16) &  (0.32, 0.36, 0.32) &  (0.63, 0.64, 0.63) \\
NER\_PERCENT   &      (4, 3, 6) &  (0.16, 0.08, 0.14) &   (0.47, 0.3, 0.41) \\
NER\_PERSON    &    (11, 9, 11) &  (0.34, 0.51, 0.45) &  (0.66, 0.71, 0.68) \\
pos\_ADJ       &   (21, 32, 45) &  (1.34, 1.07, 1.31) &   (1.34, 1.23, 1.4) \\
pos\_ADP       &   (20, 27, 31) &  (2.19, 1.95, 2.18) &  (1.55, 1.54, 1.59) \\
pos\_AUX       &     (8, 8, 11) &   (1.0, 0.92, 0.92) &  (0.99, 0.96, 0.95) \\
pos\_CCONJ     &      (5

  latex_table = feature_count_stats.to_latex(index=True, float_format=lambda x: "%.2f" % x)
