In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance, distance_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, LeaveOneOut
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
import matplotlib.pyplot as plt
from collections import Counter
from copy import deepcopy
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from threading import Thread
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Creation usefull instruments

In [24]:
class Result:
    
    def __init__(self, model_name, model_parameters, model_score, data_for_graphics):
        self.__name = model_name
        self.__parameters = model_parameters
        self.__score = model_score
        if len(data_for_graphics) == 3:
            self.__X_test = data_for_graphics[0]
            self.__y_test = data_for_graphics[1]
            self.__y_hat = data_for_graphics[2]
        else:
            self.__X_test = None
            self.__y_test = None
            self.__y_hat = None

        
    def get_name(self):
        return self.__name
    
    def get_parameters(self):
        return self.__parameters
    
    def get_score(self):
        return self.__score

    def show_result_graphics(self):
        X_test = self.__X_test
        y_test = self.__y_test
        y_hat = self.__y_hat
        if  X_test is not None and y_test is not None and y_hat is not None:
            plt.subplot(1, 2, 1)
            plt.title("y_hat")
            plt.scatter(np.linspace(0, len(y_hat), len(y_hat)), y_hat, c='orange')
            plt.subplot(1, 2, 2)
            plt.title("y_test")
            plt.scatter(np.linspace(0, len(y_test), len(y_test)), y_test)
    
    def classification_report(self):
        print(self.__name)
        print(classification_report(self.__y_hat, self.__y_test))
    
class Results:
    
    def __init__(self):
        self.__results = {}
        
    def create_id(self):
        new_key = 0
        keys = self.__results.keys()
        if keys:
            new_key = max(self.__results.keys()) + 1
        return new_key
    
    def add_result(self, model_name, model_parameters, model_score, data_for_graphics):
        self.__results.update({model_name: Result(model_name, model_parameters, model_score, data_for_graphics)})
        
    def get_scores(self, show_model_parameters=False):
        results = {}
        result_dataframe_index = self.__results.keys()
        result_dataframe_columns = ["accuracy", "precision", "recall", "f_score"]
        result_df = pd.DataFrame(index=result_dataframe_index, columns=result_dataframe_columns)

        for key, result_object in self.__results.items():
            model_scores = result_object.get_score()
            result_df.loc[key]["accuracy"] = model_scores.get("accuracy")
            result_df.loc[key]["precision"] = model_scores.get("precision")
            result_df.loc[key]["recall"] = model_scores.get("recall")
            result_df.loc[key]["f_score"] = model_scores.get("f_score")
            result_df.loc[key]["parameters"] = result_object.get_parameters()
 
        return result_df
    
    def get_results(self):
        return self.__results
    
    def get_concrete_result(self, model_name):
        return self.__results.get(model_name)

## Data loading from scv files
1. all_data - data of morning and afternoon datasets
2. morning_data - data of morning dataset
3. afternoon_data - data of afternoon dataset

In [25]:
separator = ','
all_data = pd.read_csv("all_data_preprocessed_filtering.csv", sep=separator)
morning_data = pd.read_csv("morning_data.csv", sep=separator)
afternoon_data =pd.read_csv("afternoon_data.csv", sep=separator)
datasets = [all_data, morning_data, afternoon_data]

## Class balancing

### Oversampling

In [26]:
df = deepcopy(all_data)
df_major = df[df.respiratory_disease == 1]
df_minor = df[df.respiratory_disease == 0]
df_minor_upsampled = resample(df_minor, replace=True, n_samples=df_major.shape[0])
df_oversampled = pd.concat([df_major, df_minor_upsampled])

### Undersampling

In [27]:
df = deepcopy(all_data)
df_major = df[df.respiratory_disease == 1]
df_minor = df[df.respiratory_disease == 0]
df_major_downsampled = resample(df_major, replace=True, n_samples=df_minor.shape[0])
df_undersampled = pd.concat([df_minor, df_major_downsampled])

## Function for process

In [28]:
# function for creation data for model fitting process
def create_models_data(results_collector, df, models_data, test_size=0.3):
    models = []
    for model_data in models_data:
        dataframe_info = [results_collector, df, test_size]
        dataframe_info.extend(model_data)
        models.append(dataframe_info)
    return models

# classification report in dict view
def classification_scores(y_hat, y_test):
    scores = {}
    scores["accuracy"] = accuracy_score(y_hat, y_test)
    scores["precision"] = precision_score(y_hat, y_test)
    scores["recall"] = recall_score(y_hat, y_test)
    scores["f_score"] = f1_score(y_hat, y_test)
    return scores

In [29]:
def basic_model(r_collector, df, test_size, model_object, model_saving_name):
    results = r_collector
    def model_function(df, test_size, model_object, model_saving_name):
        data = deepcopy(df)
        y = data['respiratory_disease']
        X = data.drop(['respiratory_disease'], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)  
        model_object.fit(X_train, y_train)
        y_hat = model_object.predict(X_test)
        model_score = classification_scores(y_hat, y_test)
        data_for_graphics = [X_test, y_test, y_hat]
        results.add_result(model_saving_name, model_object.get_params(), model_score, data_for_graphics)

    model_thread = Thread(target=model_function, args=[df, test_size, model_object, model_saving_name])
    model_thread.start()

def GSCV_model(r_collector, df, test_size, model_object, model_saving_name, parameters):
    def model_function(df, test_size, model_object, model_saving_name, parameters):
        results = r_collector
        data = deepcopy(df)
        y = data['respiratory_disease']
        X = data.drop(['respiratory_disease'], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)  
        classifier = GridSearchCV(model_object, parameters)
        classifier.fit(X_train, y_train)
        model_object = classifier.best_estimator_
        model_object.fit(X_train, y_train)
        y_hat = model_object.predict(X_test)
        model_score = classification_scores(y_hat, y_test)
        data_for_graphics = [X_test, y_test, y_hat]
        results.add_result(model_saving_name, model_object.get_params(), model_score, data_for_graphics)

    model_thread = Thread(target=model_function, args=[df, test_size, model_object, model_saving_name, parameters])
    model_thread.start()
        
def KFOLD_model(r_collector, df, test_size, model_object, model_saving_name, n_splits=2, shuffle=False, random_state=None):
    def model_function(df, test_size, model_object, model_saving_name, n_splits, shuffle=shuffle, random_state=None):
        results = r_collector
        data = deepcopy(df)
        y = data['respiratory_disease']
        X = data.drop(['respiratory_disease'], axis=1)
        accuracies = []
        recalls = []
        precisions = []
        f_scores = []
        kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model_object.fit(X_train, y_train)
            y_hat = model_object.predict(X_test)
            model_score = classification_scores(y_hat, y_test)
            accuracies.append(model_score.get("accuracy"))
            recalls.append(model_score.get("recall"))
            precisions.append(model_score.get("precision"))
            f_scores.append(model_score.get("f_score"))

        model_score = {"accuracy": np.mean(accuracies) / len(accuracies),
                       "precision": np.mean(precisions) / len(precisions),
                       "recall": np.mean(recalls) / len(recalls),
                       "f_score": np.mean(f_scores) / len(f_scores)}     
            
        data_for_graphics = []
        results.add_result(model_saving_name, model_object.get_params(), model_score, data_for_graphics)

    model_thread = Thread(target=model_function, args= [df, test_size, model_object, model_saving_name, n_splits, shuffle, random_state])
    model_thread.start()

## Models list

In [58]:
LGR_basic = [LogisticRegression(), "LogisticRegression_basic"]
DTC_basic = [DecisionTreeClassifier(), "DecisionTreeClassifier_basic"]
SVC_basic = [SVC(), "SVC_basic"]
KNC_basic = [KNeighborsClassifier(), "KNeighborsClassifier_basic"]
RFC_basic = [RandomForestClassifier(), "RandomForestClassifier_basic"]
LSVC_basic = [LinearSVC(), "LinearSVC_basic"]

basic_models_data = [LGR_basic, DTC_basic, SVC_basic, KNC_basic, RFC_basic, LSVC_basic]


parameters = {'penalty': ('l2',), 'C':[1, 10], 'solver': ('newton-cg', 'lbfgs', 'liblinear')}
LGR_gscv = [LogisticRegression(), "LogisticRegression_GSCV", parameters]
            
parameters = {'criterion': ('gini', 'entropy'), 'max_depth': (2, 5, 10), 'max_features': (3, 6, 9)}
DTC_gscv = [DecisionTreeClassifier(), "DecisionTreeClassifier_GSCV", parameters]

parameters = {'kernel':('rbf', 'sigmoid'), 'C':[1, 10]}
SVC_gscv = [SVC(), "SVC_GSCV", parameters]

parameters = {'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'), 'n_neighbors': (5, 10, 15, 50)}
KNC_gscv = [KNeighborsClassifier(), "KNeighborsClassifier_GSCV", parameters]

parameters = {'criterion': ('gini', 'entropy'), 'max_depth': (2, 5, 10), 'n_estimators': (50, 100, 200)}
RFC_gscv = [RandomForestClassifier(), "RandomForestClassifier_GSCV", parameters]


parameters = {'penalty':('l2',), 'C':[1, 10]}
LSVC_gscv = [LinearSVC(), "LinearSVC_GSCV", parameters]

gscv_models_data = [LGR_gscv, DTC_gscv, SVC_gscv, KNC_gscv, RFC_gscv, LSVC_gscv]

LGR_kfolds = [LogisticRegression(), "LogisticRegression_KFold", 2, True, 123]
DTC_kfolds = [DecisionTreeClassifier(), "DecisionTreeClassifier_KFold", 2, True, 123]
SVC_kfolds = [SVC(), "SVC_KFold", 2, True, 123]
KNC_kfolds = [KNeighborsClassifier(), "KNeighborsClassifier_KFold", 2, True, 123]
RFC_kfolds = [RandomForestClassifier(), "RandomForestClassifier_KFold", 2, True, 123]
LSCV_kfolds = [LinearSVC(), "LinearSVC_KFold", 2, True, 123]

kfold_model_data = [LGR_kfolds, DTC_kfolds, SVC_kfolds, KNC_kfolds, RFC_kfolds, LSCV_kfolds]

### Imbalanced classes

In [59]:
results_imbalanced = Results()  # object for saving results
basic_models = create_models_data(results_imbalanced, all_data, models_data=basic_models_data, test_size=0.3)
gscv_models = create_models_data(results_imbalanced, all_data, models_data=gscv_models_data, test_size=0.3)
kfolds_models = create_models_data(results_imbalanced, all_data, models_data=kfold_model_data, test_size=0.3)

In [61]:
for model in basic_models:
    basic_model(*model)
    
for model in gscv_models:
    GSCV_model(*model)
    
for model in kfolds_models:
    KFOLD_model(*model)

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


In [62]:
results_imbalanced.get_scores().sort_values(by="f_score")

Unnamed: 0,accuracy,precision,recall,f_score
LinearSVC_KFold,0.245488,0.245576,0.47952,0.247927
DecisionTreeClassifier_KFold,0.450792,0.474607,0.472151,0.473374
LogisticRegression_KFold,0.461777,0.499741,0.461926,0.48009
RandomForestClassifier_KFold,0.464406,0.495595,0.467725,0.481257
SVC_KFold,0.465122,0.5,0.464837,0.481777
KNeighborsClassifier_KFold,0.466316,0.498446,0.467217,0.482326
DecisionTreeClassifier_basic,0.872611,0.931217,0.927944,0.929577
LogisticRegression_GSCV,0.91242,0.998252,0.9136,0.954052
DecisionTreeClassifier_GSCV,0.91242,1.0,0.91242,0.954205
SVC_GSCV,0.923567,0.996528,0.925806,0.959866


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


### Balanced classes (oversampled)

In [44]:
results_oversampled = Results()  # object for saving results
basic_models = create_models_data(results_oversampled, df_oversampled, models_data=basic_models_data, test_size=0.3)
gscv_models = create_models_data(results_oversampled, df_oversampled, models_data=gscv_models_data, test_size=0.3)
kfolds_models = create_models_data(results_oversampled, df_oversampled, models_data=kfold_model_data, test_size=0.3)

In [46]:
for model in basic_models:
    basic_model(*model)
    
for model in gscv_models:
    GSCV_model(*model)
    
for model in kfolds_models:
    KFOLD_model(*model)

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
Exception in thread Thread-142:
Traceback (most recent call last):
  File "C:\Users\Gleb\Anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "C:\Users\Gleb\Anaconda3\lib\threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-29-8bce9be7db57>", line 25, in model_function
    classifier.fit(X_train, y_train)
  File "C:\Users\Gleb\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 638, in fit
    cv.split(X, y, groups)))
  File "C:\Users\Gleb\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\Gleb\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\Gleb\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.p

In [48]:
results_oversampled.get_scores().sort_values(by="f_score")

Unnamed: 0,accuracy,precision,recall,f_score
LogisticRegression_KFold,0.32487,0.328452,0.324765,0.32626
LinearSVC_KFold,0.278756,0.478624,0.267778,0.342546
KNeighborsClassifier_KFold,0.450389,0.416316,0.481996,0.446754
DecisionTreeClassifier_KFold,0.479793,0.459629,0.5,0.478938
RandomForestClassifier_KFold,0.488472,0.476953,0.5,0.488203
SVC_KFold,0.491839,0.483677,0.5,0.491703
LogisticRegression_basic,0.646805,0.680203,0.646302,0.662819
LinearSVC_basic,0.625216,0.965217,0.572755,0.718912
DecisionTreeClassifier_GSCV,0.911054,0.840678,0.982178,0.905936
KNeighborsClassifier_GSCV,0.93696,0.879661,0.996161,0.934293


### Balanced classes (undersampled)

In [63]:
results_undersampled = Results()  # object for saving results
basic_models = create_models_data(results_undersampled, df_undersampled, models_data=basic_models_data, test_size=0.3)
gscv_models = create_models_data(results_undersampled, df_undersampled, models_data=gscv_models_data, test_size=0.3)
kfolds_models = create_models_data(results_undersampled, df_undersampled, models_data=kfold_model_data, test_size=0.3)

In [64]:
for model in basic_models:
    basic_model(*model)
    
for model in gscv_models:
    GSCV_model(*model)
    
for model in kfolds_models:
    KFOLD_model(*model)

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


In [65]:
results_undersampled.get_scores().sort_values(by="f_score")

Unnamed: 0,accuracy,precision,recall,f_score
LinearSVC_GSCV,0.469388,0.0377358,0.666667,0.0714286
LinearSVC_KFold,0.269939,0.111073,0.400862,0.131923
RandomForestClassifier_KFold,0.331288,0.28252,0.351251,0.310592
DecisionTreeClassifier_KFold,0.31135,0.321853,0.307762,0.314194
LogisticRegression_KFold,0.332822,0.328139,0.335996,0.331761
SVC_KFold,0.338957,0.337888,0.351997,0.335126
KNeighborsClassifier_KFold,0.351227,0.346733,0.353529,0.349685
SVC_GSCV,0.591837,0.345455,0.826087,0.487179
LogisticRegression_basic,0.561224,0.510638,0.545455,0.527473
SVC_basic,0.612245,0.46,0.676471,0.547619


### Results review

Best results shows random forest, so we will research this model closer.
Also we will pay attention on decision tree, because this model is the easiestfor interpretation

## Random Forest

In [78]:
df = deepcopy(df_undersampled)
y = df['respiratory_disease']
X = df.drop(['respiratory_disease'], axis=1)
model_object = RandomForestClassifier()

In [88]:
parameters = {"criterion": ('gini', 'entropy'), "n_estimators": np.linspace(50, 400, 8, dtype=int),
              "max_depth": np.linspace(1, 9, 9, dtype=int),
              "max_features": ('auto', 'sqrt', 'log2'), "n_jobs": (-1,)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  
classifier = GridSearchCV(model_object, parameters)
classifier.fit(X_train, y_train)
print(classifier.best_estimator_)
print(classifier.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.7412280701754386


In [89]:
model_object = classifier.best_estimator_
model_object.fit(X_train, y_train)
y_hat = model_object.predict(X_test)
model_score = classification_scores(y_hat, y_test)
model_score

{'accuracy': 0.6326530612244898,
 'f_score': 0.6470588235294118,
 'precision': 0.6226415094339622,
 'recall': 0.673469387755102}

In [90]:
accuracies = []
recalls = []
precisions = []
f_scores = []



kf = KFold(n_splits=3, shuffle=True, random_state=123)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_object.fit(X_train, y_train)
    y_hat = model_object.predict(X_test)
    model_score = classification_scores(y_hat, y_test)
    accuracies.append(model_score.get("accuracy"))
    recalls.append(model_score.get("recall"))
    precisions.append(model_score.get("precision"))
    f_scores.append(model_score.get("f_score"))

model_score = {"accuracy": np.mean(accuracies) / len(accuracies),
               "precision": np.mean(precisions) / len(precisions),
               "recall": np.mean(recalls) / len(recalls),
               "f_score": np.mean(f_scores) / len(f_scores)}     

model_score


{'accuracy': 0.2310567448182127,
 'f_score': 0.23014102937594616,
 'precision': 0.22946921085218955,
 'recall': 0.23339300632533713}