In [1]:
import pandas as pd 
import numpy as np

import sklearn
import matplotlib.pyplot as plt

from scipy.stats import norm
import statistics
import seaborn as sns
from scipy.stats import f_oneway
import scikit_posthocs as sp
from scipy.stats import kruskal
import statsmodels.stats.multicomp as mc

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score



# Importing data
- This notebook will classify the different claims with the help of the significant features.
- We import the computed Liar data frame and conduct several classification algorithms

In [16]:
train_data = pd.read_csv('/Users/sandrobarreshamers/Thesis_IS_fake_news/ThesisData/Liar_computed_final_version.csv')


In [18]:
train_data.columns

Index(['Unnamed: 0', 'json_id', 'claim', 'object', 'binary label',
       'readability', 'compressed_size', 'vader_neg', 'vader_neu', 'vader_pos',
       'vader_compound', 'tot_ner_count', 'ner_counts', 'input_vector_ner',
       'NER_CARDINAL', 'NER_DATE', 'NER_EVENT', 'NER_FAC', 'NER_GPE',
       'NER_LANGUAGE', 'NER_LAW', 'NER_LOC', 'NER_MONEY', 'NER_NORP',
       'NER_ORDINAL', 'NER_ORG', 'NER_PERCENT', 'NER_PERSON', 'NER_PRODUCT',
       'NER_QUANTITY', 'NER_TIME', 'NER_WORK_OF_ART', 'pos counts',
       'input_vector_pos', 'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX',
       'pos_CCONJ', 'pos_DET', 'pos_INTJ', 'pos_NOUN', 'pos_NUM', 'pos_PART',
       'pos_PRON', 'pos_PROPN', 'pos_PUNCT', 'pos_SCONJ', 'pos_SYM',
       'pos_VERB', 'pos_X'],
      dtype='object')

In [19]:
test_data = pd.read_csv('test_data.csv')


In [20]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,id,binary_label,statement,readability,compressed_size,vader_neg,vader_neu,vader_pos,vader_compound,...,pos_NOUN,pos_NUM,pos_PART,pos_PRON,pos_PROPN,pos_PUNCT,pos_SCONJ,pos_SYM,pos_VERB,pos_X
0,0,11972.json,0,Building a wall on the U.S.-Mexico border will...,49.542727,5197,0.0,1.0,0.0,0.0,...,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,0.0
1,1,11685.json,1,Wisconsin is on pace to double the number of l...,81.855,5548,0.0,0.894,0.106,0.0772,...,4.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,2,11096.json,1,Says John McCain has done nothing to help the ...,103.625,4816,0.201,0.799,0.0,-0.3089,...,1.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,3.0,0.0
3,3,5209.json,2,Suzanne Bonamici supports a plan that will cut...,43.963077,6020,0.127,0.602,0.271,0.34,...,3.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,3.0,0.0
4,4,9524.json,1,When asked by a reporter whether hes at the ce...,62.625,10724,0.225,0.683,0.092,-0.5994,...,5.0,0.0,1.0,1.0,3.0,2.0,1.0,0.0,4.0,0.0


In [21]:
test_data.columns

Index(['Unnamed: 0', 'id', 'binary_label', 'statement', 'readability',
       'compressed_size', 'vader_neg', 'vader_neu', 'vader_pos',
       'vader_compound', 'tot_ner_count', 'ner_counts', 'input_vector_ner',
       'NER_CARDINAL', 'NER_DATE', 'NER_EVENT', 'NER_FAC', 'NER_GPE',
       'NER_LANGUAGE', 'NER_LAW', 'NER_LOC', 'NER_MONEY', 'NER_NORP',
       'NER_ORDINAL', 'NER_ORG', 'NER_PERCENT', 'NER_PERSON', 'NER_PRODUCT',
       'NER_QUANTITY', 'NER_TIME', 'NER_WORK_OF_ART', 'pos counts',
       'input_vector_pos', 'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX',
       'pos_CCONJ', 'pos_DET', 'pos_INTJ', 'pos_NOUN', 'pos_NUM', 'pos_PART',
       'pos_PRON', 'pos_PROPN', 'pos_PUNCT', 'pos_SCONJ', 'pos_SYM',
       'pos_VERB', 'pos_X'],
      dtype='object')

## Classification pre-face
- in this code blocks, we prepare the data and set up the experiment to later classify the False and True label
- Based on the previously conducted KST and Dunn tests, we now train the different models on the significant features
- We divide the data in train and test data
- we instantiate the K-fold cross validation for both the train and test data with k=5

In [7]:
no_feature_selection = ['readability', 'compressed_size', 'vader_neg', 'vader_neu', 'vader_pos',
       'vader_compound', 'tot_ner_count',
       'NER_CARDINAL', 'NER_DATE', 'NER_EVENT', 'NER_FAC', 'NER_GPE',
       'NER_LANGUAGE', 'NER_LAW', 'NER_LOC', 'NER_MONEY', 'NER_NORP',
       'NER_ORDINAL', 'NER_ORG', 'NER_PERCENT', 'NER_PERSON', 'NER_PRODUCT',
       'NER_QUANTITY', 'NER_TIME', 'NER_WORK_OF_ART', 'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX',
       'pos_CCONJ', 'pos_DET', 'pos_INTJ', 'pos_NOUN', 'pos_NUM', 'pos_PART',
       'pos_PRON', 'pos_PROPN', 'pos_PUNCT', 'pos_SCONJ', 'pos_SYM',
       'pos_VERB', 'pos_X']

all_significant_features = ['readability', 'compressed_size', 'vader_neg', 'vader_neu', 'tot_ner_count','NER_CARDINAL', 'NER_DATE', 'NER_MONEY', 'NER_ORDINAL', 'NER_ORG', 'NER_PERCENT', 'NER_PERSON','NER_WORK_OF_ART',
                            'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX', 'pos_CCONJ', 'pos_DET', 'pos_NOUN', 'pos_NUM', 'pos_PART', 'pos_PROPN', 'pos_PUNCT', 'pos_SCONJ', 'pos_SYM', 'pos_VERB']


three_way_significant_features= ['readability', 'compressed_size','NER_CARDINAL','NER_PERSON','pos_AUX','pos_PART','pos_VERB']


## Classification Algorithms
- In the following code blocks, we perform the three ML classification algorithms discussed in the research paper; the three models are:
  - Naive Bias
  - Random Forest
  - Gradient Booster 
  - Dummy classifier
  
- The three models are trained and evaluated on the three feature inputs
  - No feature selection
  - Two-way significant feature selection
  - Three-way significant feature selection

## Naive Bayes

### Naive Bayes no feature selection

In [23]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

# Define the feature sets and their names
feature_selections = [
    (no_feature_selection, 'no_feature_selection'),
    (all_significant_features, 'all_significant_features'),
    (three_way_significant_features, 'three_way_significant_features')
]

NB_CMs = {'cm_NB_all': None, 'cm_NB_s': None, 'cm_NB_three': None}
NB_k_fold_cross_validations_train = []
NB_k_fold_cross_validations_test = []

# Define a custom scoring function for F1 score
scorer = make_scorer(f1_score, average='macro')

for i, (features, feature_name) in enumerate(feature_selections):
    X_train_selected = train_data[features]
    y_train = train_data['binarylabel']
    
    X_test_selected = test_data[features]
    y_test = test_data['binary_label']
    
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train_selected, y_train)

    nb_model = GaussianNB()
    nb_model.fit(X_train_resampled, y_train_resampled)

    y_pred_train = nb_model.predict(X_train_resampled)
    y_pred_test = nb_model.predict(X_test_selected)

    cm_NB_train = confusion_matrix(y_train_resampled, y_pred_train)
    cm_NB_test = confusion_matrix(y_test, y_pred_test)
    NB_CMs[list(NB_CMs.keys())[i]] = {'train': cm_NB_train, 'test': cm_NB_test}

    kfold_train = KFold(n_splits=5, shuffle=True, random_state=42)
    scores_train = cross_val_score(nb_model, X_train_resampled, y_train_resampled, cv=kfold_train, scoring=scorer)
    print(f"F1 score k-fold cross-validation on training data for {feature_name}: {scores_train.mean():.2f}, cv:(+/- {scores_train.std()/scores_train.mean():.2f})")
    NB_k_fold_cross_validations_train.append(scores_train)

    kfold_test = KFold(n_splits=5, shuffle=True, random_state=42)
    scores_test = cross_val_score(nb_model, X_test_selected, y_test, cv=kfold_test, scoring=scorer)
    print(f"F1 score k-fold cross-validation on test data for {feature_name}: {scores_test.mean():.2f}, cv:(+/- {scores_test.std()/scores_test.mean():.2f})")
    NB_k_fold_cross_validations_test.append(scores_test)


F1 score k-fold cross-validation on training data for no_feature_selection: 0.37, cv:(+/- 0.05)
F1 score k-fold cross-validation on test data for no_feature_selection: 0.34, cv:(+/- 0.06)
F1 score k-fold cross-validation on training data for all_significant_features: 0.37, cv:(+/- 0.04)
F1 score k-fold cross-validation on test data for all_significant_features: 0.33, cv:(+/- 0.11)
F1 score k-fold cross-validation on training data for three_way_significant_features: 0.36, cv:(+/- 0.05)
F1 score k-fold cross-validation on test data for three_way_significant_features: 0.27, cv:(+/- 0.02)


## Random forrest

In [26]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import pandas as pd


RF_CMs = {'cm_RF_all': None, 'cm_RF_s': None, 'cm_RF_three': None}
RF_k_fold_cross_validations_train = []
RF_k_fold_cross_validations_test = []

for i, (features, feature_name) in enumerate(feature_selections):
    X_train_selected = train_data[features]
    y_train = train_data['binary label']
    
    X_test_selected = test_data[features]
    y_test = test_data['binary_label']
    
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train_selected, y_train)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_resampled, y_train_resampled)

    y_pred_train = rf_model.predict(X_train_resampled)
    y_pred_test = rf_model.predict(X_test_selected)

    cm_RF_train = confusion_matrix(y_train_resampled, y_pred_train)
    cm_RF_test = confusion_matrix(y_test, y_pred_test)
    RF_CMs[list(RF_CMs.keys())[i]] = {'train': cm_RF_train, 'test': cm_RF_test}

    kfold_train = KFold(n_splits=5, shuffle=True, random_state=42)
    scores_train = cross_val_score(rf_model, X_train_resampled, y_train_resampled, cv=kfold_train, scoring=scorer)
    print(f"F1 score k-fold cross-validation on training data for {feature_name}: {scores_train.mean():.2f}, cv:(+/- {scores_train.std()/scores_train.mean():.2f})")
    RF_k_fold_cross_validations_train.append(scores_train)

    kfold_test = KFold(n_splits=5, shuffle=True, random_state=42)
    scores_test = cross_val_score(rf_model, X_test_selected, y_test, cv=kfold_test, scoring=scorer)
    print(f"F1 score k-fold cross-validation on test data for {feature_name}: {scores_test.mean():.2f}, cv:(+/- {scores_test.std()/scores_test.mean():.2f})")
    RF_k_fold_cross_validations_test.append(scores_test)


F1 score k-fold cross-validation on training data for no_feature_selection: 0.84, cv:(+/- 0.01)
F1 score k-fold cross-validation on test data for no_feature_selection: 0.28, cv:(+/- 0.08)
F1 score k-fold cross-validation on training data for all_significant_features: 0.84, cv:(+/- 0.01)
F1 score k-fold cross-validation on test data for all_significant_features: 0.29, cv:(+/- 0.06)
F1 score k-fold cross-validation on training data for three_way_significant_features: 0.77, cv:(+/- 0.01)
F1 score k-fold cross-validation on test data for three_way_significant_features: 0.32, cv:(+/- 0.06)


## Gradient Booster

In [28]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import pandas as pd


GB_CMs = {'cm_GB_all': None, 'cm_GB_s': None, 'cm_GB_three': None}
GB_k_fold_cross_validations_train = []
GB_k_fold_cross_validations_test = []


for i, (features, feature_name) in enumerate(feature_selections):
    X_train_selected = train_data[features]
    y_train = train_data['binary label']
    
    X_test_selected = test_data[features]
    y_test = test_data['binary_label']
    
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train_selected, y_train)

    gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.08, max_depth=2, subsample=0.5, random_state=42)
    gb_model.fit(X_train_resampled, y_train_resampled)

    y_pred_train = gb_model.predict(X_train_resampled)
    y_pred_test = gb_model.predict(X_test_selected)

    cm_GB_train = confusion_matrix(y_train_resampled, y_pred_train)
    cm_GB_test = confusion_matrix(y_test, y_pred_test)
    GB_CMs[list(GB_CMs.keys())[i]] = {'train': cm_GB_train, 'test': cm_GB_test}

    kfold_train = KFold(n_splits=5, shuffle=True, random_state=42)
    scores_train = cross_val_score(gb_model, X_train_resampled, y_train_resampled, cv=kfold_train, scoring=scorer)
    print(f"F1 score k-fold cross-validation on training data for {feature_name}: {scores_train.mean():.2f}, cv:(+/- {scores_train.std()/scores_train.mean():.2f})")
    GB_k_fold_cross_validations_train.append(scores_train)

    kfold_test = KFold(n_splits=5, shuffle=True, random_state=42)
    scores_test = cross_val_score(gb_model, X_test_selected, y_test, cv=kfold_test, scoring=scorer)
    print(f"F1 score k-fold cross-validation on test data for {feature_name}: {scores_test.mean():.2f}, cv:(+/- {scores_test.std()/scores_test.mean():.2f})")
    GB_k_fold_cross_validations_test.append(scores_test)


F1 score k-fold cross-validation on training data for no_feature_selection: 0.44, cv:(+/- 0.02)
F1 score k-fold cross-validation on test data for no_feature_selection: 0.30, cv:(+/- 0.10)
F1 score k-fold cross-validation on training data for all_significant_features: 0.44, cv:(+/- 0.02)
F1 score k-fold cross-validation on test data for all_significant_features: 0.31, cv:(+/- 0.13)
F1 score k-fold cross-validation on training data for three_way_significant_features: 0.42, cv:(+/- 0.02)
F1 score k-fold cross-validation on test data for three_way_significant_features: 0.29, cv:(+/- 0.07)


## Baseline

In [29]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="uniform")
dummy_clf.fit(X_train, y_train)
y_pred_uniform = dummy_clf.predict(X_test)
accuracy_stratified = accuracy_score(y_test, y_pred_uniform)


In [31]:
# Train
scores_Dummy_train = cross_val_score(dummy_clf, X_train_resampled, y_train_resampled, cv=kfold_train, scoring='f1_macro')
print("f1 score k fold cross training data:%0.2f (+/- %0.2f)" % (scores_Dummy_train.mean(), scores_Dummy_train.std()))

#Test
scores_Dummy_test = cross_val_score(dummy_clf, X_test, y_test, cv=kfold_train, scoring='f1_macro')
print("f1 score k fold cross for the test data: %0.2f (+/- %0.2f)" % (scores_Dummy_test.mean(), scores_Dummy_test.std()))

f1 score k fold cross training data:0.33 (+/- 0.00)
f1 score k fold cross for the test data: 0.32 (+/- 0.01)


# Evaluation
- In this code block, the different classification ML models are evaluated and compared against each other
- Subsequently, the evaluation is displayed in a table and later printed in latex format
- this latex format is used in the research paper

In [38]:
eval_df = pd.DataFrame({
    'Model': model_names,
    'Feature Selection': feature_selection,
    'F1 Score Test (%)': [
        f"{np.mean(NB_k_fold_cross_validations_test[0]) * 100:.2f} (+/- {(NB_k_fold_cross_validations_test[0].std()/NB_k_fold_cross_validations_test[0].mean()) * 100:.2f})",
        f"{np.mean(NB_k_fold_cross_validations_test[1]) * 100:.2f} (+/- {(NB_k_fold_cross_validations_test[1].std()/NB_k_fold_cross_validations_test[1].mean()) * 100:.2f})",
        f"{np.mean(NB_k_fold_cross_validations_test[2]) * 100:.2f} (+/- {(NB_k_fold_cross_validations_test[2].std()/NB_k_fold_cross_validations_test[2].mean()) * 100:.2f})",
        f"{np.mean(RF_k_fold_cross_validations_test[0]) * 100:.2f} (+/- {(RF_k_fold_cross_validations_test[0].std()/RF_k_fold_cross_validations_test[0].mean()) * 100:.2f})",
        f"{np.mean(RF_k_fold_cross_validations_test[1]) * 100:.2f} (+/- {(RF_k_fold_cross_validations_test[1].std()/RF_k_fold_cross_validations_test[1].mean()) * 100:.2f})",
        f"{np.mean(RF_k_fold_cross_validations_test[2]) * 100:.2f} (+/- {(RF_k_fold_cross_validations_test[2].std()/RF_k_fold_cross_validations_test[2].mean()) * 100:.2f})",
        f"{np.mean(GB_k_fold_cross_validations_test[0]) * 100:.2f} (+/- {(GB_k_fold_cross_validations_test[0].std()/GB_k_fold_cross_validations_test[0].mean()) * 100:.2f})",
        f"{np.mean(GB_k_fold_cross_validations_test[1]) * 100:.2f} (+/- {(GB_k_fold_cross_validations_test[1].std()/GB_k_fold_cross_validations_test[1].mean()) * 100:.2f})",
        f"{np.mean(GB_k_fold_cross_validations_test[2]) * 100:.2f} (+/- {(GB_k_fold_cross_validations_test[2].std()/GB_k_fold_cross_validations_test[2].mean()) * 100:.2f})",
        f"{np.mean(scores_Dummy_test) * 100:.2f} (+/- {(scores_Dummy_test.std()/scores_Dummy_test.mean()) * 100:.2f})"
    ],
    'Relative Improvement (%)': [
        round(Relative_Improvement_NB_nf, 2),
        round(Relative_Improvement_NB_f, 2),
        round(Relative_Improvement_NB_three_f, 2),
        round(Relative_Improvement_RF_nf, 2),
        round(Relative_Improvement_RF_f, 2),
        round(Relative_Improvement_RF_three_f, 2),
        round(Relative_Improvement_GB_nf, 2),
        round(Relative_Improvement_GB_f, 2),
        round(Relative_Improvement_GB_three_f, 2),
        round(Relative_Improvement_DC, 2)
    ]
})

eval_df.set_index('Model', inplace=True)
print(eval_df)


                              Feature Selection  F1 Score Test (%)  \
Model                                                                
Naive Bayes                          Unselected   34.11 (+/- 6.44)   
Naive Bayes                 two-way significant  33.06 (+/- 11.07)   
Naive Bayes               three-way significant   27.35 (+/- 2.48)   
Random Forrest                       Unselected   27.59 (+/- 7.77)   
Random Forrest              two-way significant   29.40 (+/- 6.09)   
Random Forrest            three-way significant   31.72 (+/- 6.49)   
Gradient Booster                     Unselected   30.48 (+/- 9.72)   
Gradient Booster            two-way significant  30.94 (+/- 12.86)   
Gradient Booster          three-way significant   28.92 (+/- 6.98)   
Dummy Classifier Uniform          Nonapplicable   31.88 (+/- 2.33)   

                          Relative Improvement (%)  
Model                                               
Naive Bayes                                   7.00  


In [40]:
latex_table_ML_eval = eval_df.to_latex(index=True, float_format=lambda x: "%.2f" % x)
print(latex_table_ML_eval)

\begin{tabular}{lllr}
\toprule
{} &      Feature Selection &  F1 Score Test (\%) &  Relative Improvement (\%) \\
Model                    &                        &                    &                           \\
\midrule
Naive Bayes              &             Unselected &   34.11 (+/- 6.44) &                      7.00 \\
Naive Bayes              &    two-way significant &  33.06 (+/- 11.07) &                      3.71 \\
Naive Bayes              &  three-way significant &   27.35 (+/- 2.48) &                    -14.19 \\
Random Forrest           &             Unselected &   27.59 (+/- 7.77) &                    -13.44 \\
Random Forrest           &    two-way significant &   29.40 (+/- 6.09) &                     -7.76 \\
Random Forrest           &  three-way significant &   31.72 (+/- 6.49) &                     -0.47 \\
Gradient Booster         &             Unselected &   30.48 (+/- 9.72) &                     -4.37 \\
Gradient Booster         &    two-way significant &  30.94 (+/

  latex_table_ML_eval = eval_df.to_latex(index=True, float_format=lambda x: "%.2f" % x)


### End of notebook