In [7]:

import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn import model_selection as cv
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate
import sklearn.neighbors as nb
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    VotingClassifier,    
    GradientBoostingClassifier
)


import sklearn
import sklearn.datasets as ds
import sklearn.model_selection as cv
import sklearn.neighbors as nb
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    GridSearchCV,
    validation_curve,
    StratifiedShuffleSplit
)
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
)
from sklearn.svm import (
    LinearSVC,
    SVC,
)
from sklearn.ensemble import (
    ExtraTreesClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import (
    mutual_info_classif,
    SelectKBest
)

from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.contingency_tables import mcnemar

In [8]:
import numpy as np
from statsmodels.stats.contingency_tables import mcnemar

def get_contingency_table(Y, ground_truth, model_1, model_2):
    """Build the 2x2 contingency table used by McNemar's test.

    Parameters
    ----------
    Y : pandas.DataFrame
        One row per sample. Must contain the three columns named below.
    ground_truth : column label
        Column holding the true labels.
    model_1, model_2 : column labels
        Columns holding the predictions of the two models being compared.

    Returns
    -------
    numpy.ndarray of shape (2, 2)
        ``[[both correct,        only model_1 correct],
           [only model_2 correct, both wrong         ]]``

    ``Y`` is only read, never modified.
    """
    # Compare whole columns at once instead of applying a Python lambda to
    # every row: same counts, far less overhead, and an empty frame yields
    # an all-zero 2x2 table instead of a malformed result.
    model_1_correct = (Y[ground_truth] == Y[model_1]).to_numpy()
    model_2_correct = (Y[ground_truth] == Y[model_2]).to_numpy()

    return np.array([
        [np.sum(model_1_correct & model_2_correct), np.sum(model_1_correct & ~model_2_correct)],
        [np.sum(~model_1_correct & model_2_correct), np.sum(~model_1_correct & ~model_2_correct)],
    ])

def mcnemar_test(contingency_table, significance=0.05):
    """Run McNemar's test on a 2x2 contingency table and print the verdict.

    Parameters
    ----------
    contingency_table : 2x2 array-like
        Indexed as ``table[0][1]`` (only model_1 correct) and
        ``table[1][0]`` (only model_2 correct), i.e. the layout produced by
        ``get_contingency_table``.
    significance : float, default 0.05
        The null hypothesis is rejected when the p-value is less than or
        equal to this level.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    print("Contingency Table")
    print(contingency_table)

    # statsmodels' mcnemar is called with exact=False and correction=True.
    p_value = mcnemar(contingency_table, exact=False, correction=True).pvalue
    print("P value:", p_value)

    # Written as `not (p <= alpha)` so that a NaN p-value lands in this branch.
    if not (p_value <= significance):
        print("Accept Null Hypothesis")
        print("Conclusion: Models do not have statistically different error rates")
        return

    print("Reject Null Hypothesis")
    print("Conclusion: Models have statistically different error rates")

    # The discordant cells decide which model wins.
    only_model_1_right = contingency_table[0][1]
    only_model_2_right = contingency_table[1][0]
    if only_model_1_right > only_model_2_right:
        verdict = "model_1 is better"
    elif only_model_1_right < only_model_2_right:
        verdict = "model_2 is better"
    else:
        verdict = "Are equally good"
    print(verdict)

In [9]:
# Load the three CSV variants of the smoking data
# (preprocessed / normalized / standardized, going by the file names).
smoking_dataset = pd.read_csv("Dataset/Smoking_preprocessed.csv")
normalized_dataset = pd.read_csv("Dataset/Normalized_Smoking.csv")
standardized_dataset = pd.read_csv("Dataset/Standardized_Smoking.csv")

# Features: every column except the last one. Target: the "smoking" column.
# NOTE(review): this presumes "smoking" is the last column of each file - confirm.
X, y = smoking_dataset.iloc[:, :-1], smoking_dataset["smoking"]
X_norm, y_norm = normalized_dataset.iloc[:, :-1], normalized_dataset["smoking"]
X_standar, y_standar = standardized_dataset.iloc[:, :-1], standardized_dataset["smoking"]

# 70/30 hold-out split, same seed for every variant.
# NOTE(review): the comparison cells pair y_test with predictions made on
# X_standar_test / X_norm_test, which presumes the three files hold the same
# rows in the same order - confirm.
X_train, X_test, y_train, y_test = cv.train_test_split(
    X, y, test_size=0.3, random_state=1
)
X_norm_train, X_norm_test, y_norm_train, y_norm_test = cv.train_test_split(
    X_norm, y_norm, test_size=0.3, random_state=1
)
X_standar_train, X_standar_test, y_standar_train, y_standar_test = cv.train_test_split(
    X_standar, y_standar, test_size=0.3, random_state=1
)

## Training models

### Decision Trees

In [10]:
# Hyper-parameters for the single decision tree.
# NOTE(review): presumably the winner of an earlier grid search - confirm.
best_params = {'max_depth': 42, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2}

cdt = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=best_params["min_samples_split"],
    min_impurity_decrease=best_params["min_impurity_decrease"],
    min_samples_leaf=best_params["min_samples_leaf"],
    max_depth=best_params["max_depth"],
    # scikit-learn permutes the features at every split even with the
    # "best" splitter, so without a seed the fitted tree (and every McNemar
    # table built from predDt) can change between runs. Seed 1 matches the
    # seed used by the other models in this notebook.
    random_state=1,
)
clf = cdt.fit(X_train, y_train)
predDt = clf.predict(X_test)

### Bagging

In [16]:
# Randomly drop majority-class rows from the standardized training split.
undersampler = RandomUnderSampler(sampling_strategy="majority", random_state=1)
X_standar_train_Bg, y_standar_train_Bg = undersampler.fit_resample(
    X_standar_train, y_standar_train
)

# Base learner: same tree settings as the stand-alone decision tree.
dt = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=42,
    min_samples_split=2,
    min_samples_leaf=1,
    min_impurity_decrease=0.0,
)

# 50 bagged trees, each fitted on 35% of the features.
model = BaggingClassifier(estimator=dt, n_estimators=50, max_features=0.35, random_state=1)

clf = model.fit(X_standar_train_Bg, y_standar_train_Bg)
predsBg = clf.predict(X_standar_test)

### Random Forest

In [18]:
# Randomly drop majority-class rows from the standardized training split.
undersampler = RandomUnderSampler(sampling_strategy="majority", random_state=1)
X_standar_train_RF, y_standar_train_RF = undersampler.fit_resample(
    X_standar_train, y_standar_train
)

# 50 trees, 35% of the features considered at each split, fixed seed.
random_forest_classifier = RandomForestClassifier(n_estimators=50, max_features=0.35, random_state=1)

clf = random_forest_classifier.fit(X_standar_train_RF, y_standar_train_RF)
predsRF = clf.predict(X_standar_test)

### SVM

In [15]:
# Compensate for class imbalance by undersampling the majority class.
sampler = RandomUnderSampler(sampling_strategy="majority", random_state=1)
X_standar_train_SVM, y_standar_train_SVM = sampler.fit_resample(
    X_standar_train, y_standar_train
)

# NOTE: the variable is named `knc` although it holds a linear-kernel SVC;
# the name is kept unchanged because it is a notebook-global.
knc = SVC(kernel="linear", C=1.0).fit(X_standar_train_SVM, y_standar_train_SVM)
predsSVM = knc.predict(X_standar_test)

### KNN

In [19]:

# k-NN with 27 neighbours and distance-weighted votes, fitted on the
# X_norm_* split (no undersampling in this cell).
knc = nb.KNeighborsClassifier(weights="distance", n_neighbors=27).fit(X_norm_train, y_norm_train)
predsKNN = knc.predict(X_norm_test)

### Naive-Bayes

In [28]:
# Undersampler for the Naive-Bayes training data: resamples only the majority
# class, with a fixed seed so the selection is repeatable.
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=1)

def filterp(th,ProbClass1):
    """Turn class-1 probabilities into hard 0/1 predictions.

    Parameters
    ----------
    th : float
        Decision threshold. A sample is labelled 1 only when its
        probability is strictly greater than ``th``.
    ProbClass1 : array-like
        Probabilities of belonging to class 1. Any array-like is accepted
        (NumPy array, list, pandas Series with any index).

    Returns
    -------
    numpy.ndarray of float64
        1.0 where the probability exceeds ``th``, 0.0 elsewhere.
    """
    # np.asarray drops any pandas index, so the comparison is purely
    # positional and also works for plain Python sequences.
    probabilities = np.asarray(ProbClass1)
    return (probabilities > th).astype(float)

# Balance the standardized training split before fitting.
X_under_train, y_under_train = undersampler.fit_resample(X_standar_train, y_standar_train)

clf = GaussianNB().fit(X_under_train, y_under_train)

# Class probabilities for every row of the test set.
probs = clf.predict_proba(X_standar_test)

# Hard predictions from the second probability column, using the threshold
# found with 10-fold cross-validation.
predsNB = filterp(0.009984, probs[:, 1])

## Tests

### Decision Trees vs Bagging

In [27]:
# Ground truth next to both models' test-set predictions (model_1 = decision tree, model_2 = bagging).
Y = y_test.to_frame().assign(model_1=predDt, model_2=predsBg)

mcnemar_test(get_contingency_table(Y, "smoking", "model_1", "model_2"))

Contingency Table
[[11035  1924]
 [ 1874  1816]]
P value: 0.4265586928944306
Accept Null Hypothesis
Conclusion: Models do not have statistically different error rates


### Decision Trees vs Random Forest

In [22]:
# Ground truth next to both models' test-set predictions (model_1 = decision tree, model_2 = random forest).
Y = y_test.to_frame().assign(model_1=predDt, model_2=predsRF)

mcnemar_test(get_contingency_table(Y, "smoking", "model_1", "model_2"))

Contingency Table
[[11225  1734]
 [ 1869  1821]]
P value: 0.025588316017032677
Reject Null Hypothesis
Conclusion: Models have statistically different error rates
model_2 is better


### Bagging vs Random Forest 

In [23]:
# Ground truth next to both models' test-set predictions (model_1 = bagging, model_2 = random forest).
Y = y_test.to_frame().assign(model_1=predsBg, model_2=predsRF)

mcnemar_test(get_contingency_table(Y, "smoking", "model_1", "model_2"))

Contingency Table
[[12179   730]
 [  915  2825]]
P value: 5.715598308361687e-06
Reject Null Hypothesis
Conclusion: Models have statistically different error rates
model_2 is better


### SVM vs Random Forest

In [24]:
# Ground truth next to both models' test-set predictions (model_1 = SVM, model_2 = random forest).
Y = y_test.to_frame().assign(model_1=predsSVM, model_2=predsRF)

mcnemar_test(get_contingency_table(Y, "smoking", "model_1", "model_2"))

Contingency Table
[[11184   445]
 [ 1910  3110]]
P value: 6.240888016339758e-200
Reject Null Hypothesis
Conclusion: Models have statistically different error rates
model_2 is better


### Naive-Bayes vs Random Forest

In [29]:
# Ground truth next to both models' test-set predictions (model_1 = Naive-Bayes, model_2 = random forest).
Y = y_test.to_frame().assign(model_1=predsNB, model_2=predsRF)

mcnemar_test(get_contingency_table(Y, "smoking", "model_1", "model_2"))

Contingency Table
[[11138   491]
 [ 1956  3064]]
P value: 1.7128950776428786e-192
Reject Null Hypothesis
Conclusion: Models have statistically different error rates
model_2 is better


### KNN vs Random Forest

In [30]:
# Ground truth next to both models' test-set predictions (model_1 = KNN, model_2 = random forest).
Y = y_test.to_frame().assign(model_1=predsKNN, model_2=predsRF)

mcnemar_test(get_contingency_table(Y, "smoking", "model_1", "model_2"))

Contingency Table
[[11763  1550]
 [ 1331  2005]]
P value: 4.876153060712708e-05
Reject Null Hypothesis
Conclusion: Models have statistically different error rates
model_1 is better
