In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance, distance_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, LeaveOneOut
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample
import matplotlib.pyplot as plt
from collections import Counter
from copy import deepcopy
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from threading import Thread
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Creation usefull instruments

In [2]:
class Result:
    
    def __init__(self, model_name, model_parameters, model_score, data_for_graphics):
        self.__name = model_name
        self.__parameters = model_parameters
        self.__score = model_score
        if len(data_for_graphics) == 3:
            self.__X_test = data_for_graphics[0]
            self.__y_test = data_for_graphics[1]
            self.__y_hat = data_for_graphics[2]
        else:
            self.__X_test = None
            self.__y_test = None
            self.__y_hat = None

        
    def get_name(self):
        return self.__name
    
    def get_parameters(self):
        return self.__parameters
    
    def get_score(self):
        return self.__score

    def show_result_graphics(self):
        X_test = self.__X_test
        y_test = self.__y_test
        y_hat = self.__y_hat
        if  X_test is not None and y_test is not None and y_hat is not None:
            plt.subplot(1, 2, 1)
            plt.title("y_hat")
            plt.scatter(np.linspace(0, len(y_hat), len(y_hat)), y_hat, c='orange')
            plt.subplot(1, 2, 2)
            plt.title("y_test")
            plt.scatter(np.linspace(0, len(y_test), len(y_test)), y_test)
    
    def classification_report(self):
        print(self.__name)
        print(classification_report(self.__y_hat, self.__y_test))
    
class Results:
    
    def __init__(self):
        self.__results = {}
        
    def create_id(self):
        new_key = 0
        keys = self.__results.keys()
        if keys:
            new_key = max(self.__results.keys()) + 1
        return new_key
    
    def add_result(self, model_name, model_parameters, model_score, data_for_graphics):
        self.__results.update({model_name: Result(model_name, model_parameters, model_score, data_for_graphics)})
        
    def get_scores(self, show_model_parameters=False):
        results = {}
        result_dataframe_index = self.__results.keys()
        result_dataframe_columns = ["accuracy", "precision", "recall", "f_score", "specific"]
        result_df = pd.DataFrame(index=result_dataframe_index, columns=result_dataframe_columns)

        for key, result_object in self.__results.items():
            model_scores = result_object.get_score()
            result_df.loc[key]["accuracy"] = model_scores.get("accuracy")
            result_df.loc[key]["precision"] = model_scores.get("precision")
            result_df.loc[key]["recall"] = model_scores.get("recall")
            result_df.loc[key]["f_score"] = model_scores.get("f_score")
            result_df.loc[key]["specific"] = model_scores.get("specific")
            result_df.loc[key]["parameters"] = result_object.get_parameters()
 
        return result_df
    
    def get_results(self):
        return self.__results
    
    def get_concrete_result(self, model_name):
        return self.__results.get(model_name)

## Data loading from scv files
1. all_data - data of morning and afternoon datasets
2. morning_data - data of morning dataset
3. afternoon_data - data of afternoon dataset

In [3]:
separator = ','
all_data = pd.read_csv("all_data_preprocessed_filtering.csv", sep=separator)
datasets = [all_data]

## Class balancing

### Oversampling

In [4]:
df = deepcopy(all_data)
df_major = df[df.respiratory_disease == 1]
df_minor = df[df.respiratory_disease == 0]
df_minor_upsampled = resample(df_minor, replace=True, n_samples=df_major.shape[0])
df_oversampled = pd.concat([df_major, df_minor_upsampled])

### Undersampling

In [5]:
df = deepcopy(all_data)
df_major = df[df.respiratory_disease == 1]
df_minor = df[df.respiratory_disease == 0]
df_major_downsampled = resample(df_major, replace=True, n_samples=df_minor.shape[0])
df_undersampled = pd.concat([df_minor, df_major_downsampled])

## Function for process

In [44]:
# function for creation data for model fitting process
def create_models_data(results_collector, df, models_data, test_size=0.3):
    models = []
    for model_data in models_data:
        dataframe_info = [results_collector, df, test_size]
        dataframe_info.extend(model_data)
        models.append(dataframe_info)
    return models

def specific(y_hat, y_test):
    print(y_test)
    y_test_zero = 0
    y_test_one = 0
    y_hat_zero = 0
    y_hat_one = 0
    for _y_test in y_test:
        if _y_test == 0:
            y_test_zero += 1
        if _y_test == 1:
            y_test_one += 1
    for _y_hat in y_hat:
        if _y_hat == 0:
            y_hat_zero += 1
        if _y_hat == 1:
            y_hat_one += 1
    
    
    right_classified = 0
    healthy_patiants = 0
    for index in range(len(y_test)):
        if y_test.iloc[index] == 0:
            healthy_patiants += 1
            if y_hat[index] == 0:
                right_classified += 1
    return right_classified / healthy_patiants

# classification report in dict view
def classification_scores(y_hat, y_test):
    scores = {}
    scores["accuracy"] = accuracy_score(y_hat, y_test)
    scores["precision"] = precision_score(y_hat, y_test)
    scores["recall"] = recall_score(y_hat, y_test)
    scores["f_score"] = f1_score(y_hat, y_test)
    scores["specific"] = specific(y_hat, y_test)
    return scores

In [52]:
def basic_model(r_collector, df, test_size, model_object, model_saving_name):
    results = r_collector
    def model_function(df, test_size, model_object, model_saving_name):
        data = deepcopy(df)
        y = data['respiratory_disease']
        X = data.drop(['respiratory_disease'], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)  
        model_object.fit(X_train, y_train)
        y_hat = model_object.predict(X_test)
        model_score = classification_scores(y_hat, y_test)
        data_for_graphics = [X_test, y_test, y_hat]
        results.add_result(model_saving_name, model_object.get_params(), model_score, data_for_graphics)

    model_thread = Thread(target=model_function, args=[df, test_size, model_object, model_saving_name])
    model_thread.start()

def GSCV_model(r_collector, df, test_size, model_object, model_saving_name, parameters):
    def model_function(df, test_size, model_object, model_saving_name, parameters):
        results = r_collector
        data = deepcopy(df)
        y = data['respiratory_disease']
        X = data.drop(['respiratory_disease'], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)  
        classifier = GridSearchCV(model_object, parameters)
        classifier.fit(X_train, y_train)
        model_object = classifier.best_estimator_
        model_object.fit(X_train, y_train)
        y_hat = model_object.predict(X_test)
        model_score = classification_scores(y_hat, y_test)
        data_for_graphics = [X_test, y_test, y_hat]
        results.add_result(model_saving_name, model_object.get_params(), model_score, data_for_graphics)

    model_thread = Thread(target=model_function, args=[df, test_size, model_object, model_saving_name, parameters])
    model_thread.start()
        
def KFOLD_model(r_collector, df, test_size, model_object, model_saving_name, n_splits=2, shuffle=False, random_state=None):
    def model_function(df, test_size, model_object, model_saving_name, n_splits, shuffle=shuffle, random_state=None):
        results = r_collector
        data = deepcopy(df)
        y = data['respiratory_disease']
        X = data.drop(['respiratory_disease'], axis=1)
        accuracies = []
        recalls = []
        precisions = []
        f_scores = []
        specifics = []
        kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model_object.fit(X_train, y_train)
            y_hat = model_object.predict(X_test)
            model_score = classification_scores(y_hat, y_test)
            accuracies.append(model_score.get("accuracy"))
            recalls.append(model_score.get("recall"))
            precisions.append(model_score.get("precision"))
            f_scores.append(model_score.get("f_score"))
            specifics.append(model_score.get("specific"))
        model_score = {"accuracy": np.mean(accuracies),
                       "precision": np.mean(precisions),
                       "recall": np.mean(recalls),
                       "f_score": np.mean(f_scores),
                        "specific": np.mean(specifics)}     
            
        data_for_graphics = []
        results.add_result(model_saving_name, model_object.get_params(), model_score, data_for_graphics)

    model_thread = Thread(target=model_function, args=[df, test_size, model_object, model_saving_name, n_splits, shuffle, random_state])
    model_thread.start()

## Models list

In [53]:
LGR_basic = [LogisticRegression(), "LogisticRegression_basic"]
DTC_basic = [DecisionTreeClassifier(), "DecisionTreeClassifier_basic"]
SVC_basic = [SVC(), "SVC_basic"]
KNC_basic = [KNeighborsClassifier(), "KNeighborsClassifier_basic"]
RFC_basic = [RandomForestClassifier(), "RandomForestClassifier_basic"]
LSVC_basic = [LinearSVC(), "LinearSVC_basic"]

basic_models_data = [LGR_basic, DTC_basic, SVC_basic, KNC_basic, RFC_basic, LSVC_basic]


parameters = {'penalty': ('l2',), 'C':[1, 10], 'solver': ('newton-cg', 'lbfgs', 'liblinear')}
LGR_gscv = [LogisticRegression(), "LogisticRegression_GSCV", parameters]
            
parameters = {'criterion': ('gini', 'entropy'), 'max_depth': (2, 5, 10), 'max_features': (3, 6, 9)}
DTC_gscv = [DecisionTreeClassifier(), "DecisionTreeClassifier_GSCV", parameters]

parameters = {'kernel':('rbf', 'sigmoid'), 'C':[1, 10]}
SVC_gscv = [SVC(), "SVC_GSCV", parameters]

parameters = {'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'), 'n_neighbors': (5, 10, 15, 50)}
KNC_gscv = [KNeighborsClassifier(), "KNeighborsClassifier_GSCV", parameters]

parameters = {'criterion': ('gini', 'entropy'), 'max_depth': (2, 5, 10), 'n_estimators': (50, 100, 200)}
RFC_gscv = [RandomForestClassifier(), "RandomForestClassifier_GSCV", parameters]


parameters = {'penalty':('l2',), 'C':[1, 10]}
LSVC_gscv = [LinearSVC(), "LinearSVC_GSCV", parameters]

gscv_models_data = [LGR_gscv, DTC_gscv, SVC_gscv, KNC_gscv, RFC_gscv, LSVC_gscv]

LGR_kfolds = [LogisticRegression(), "LogisticRegression_KFold", 2, True, 123]
DTC_kfolds = [DecisionTreeClassifier(), "DecisionTreeClassifier_KFold", 2, True, 123]
SVC_kfolds = [SVC(), "SVC_KFold", 2, True, 123]
KNC_kfolds = [KNeighborsClassifier(), "KNeighborsClassifier_KFold", 2, True, 123]
RFC_kfolds = [RandomForestClassifier(), "RandomForestClassifier_KFold", 2, True, 123]
LSCV_kfolds = [LinearSVC(), "LinearSVC_KFold", 2, True, 123]

kfold_model_data = [LGR_kfolds, DTC_kfolds, SVC_kfolds, KNC_kfolds, RFC_kfolds, LSCV_kfolds]

### Imbalanced classes

In [54]:
results_imbalanced = Results()  # object for saving results
basic_models = create_models_data(results_imbalanced, all_data, models_data=basic_models_data, test_size=0.3)
gscv_models = create_models_data(results_imbalanced, all_data, models_data=gscv_models_data, test_size=0.3)
kfolds_models = create_models_data(results_imbalanced, all_data, models_data=kfold_model_data, test_size=0.3)

In [55]:
for model in basic_models:
    basic_model(*model)
    
for model in gscv_models:
    GSCV_model(*model)
    
for model in kfolds_models:
    KFOLD_model(*model)

In [56]:
results_imbalanced.get_scores().sort_values(by="f_score")

651     1
1324    0
1885    1
460     1
1538    0
1916    1
561     1
574     1
1835    1
487     1
177     1
52      1
2061    1
1796    1
1997    1
1551    0
84      1
749     1
995     0
519     1
1293    1
603     1
687     1
682     1
1321    1
1482    1
1953    1
1252    1
795     1
1744    1
       ..
780     1
1109    1
1159    1
1804    1
1536    0
33      1
1204    1
1248    0
88      1
1718    1
1974    1
1117    1
1302    1
1783    1
1340    0
347     1
1052    1
781     1
563     1
747     1
1048    1
319     1
1453    1
1525    1
1671    1
2010    1
1675    1
465     1
1746    1
1984    1
Name: respiratory_disease, Length: 628, dtype: int64


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


Unnamed: 0,accuracy,precision,recall,f_score,specific
LogisticRegression_basic,0.921975,1,0.92185,0.959336,0.02


154     1
1297    1
2047    1
1417    1
1590    1
497     1
104     1
310     1
1594    1
488     1
1355    1
911     1
1479    1
1937    1
971     1
146     1
1454    1
1814    1
1595    1
1614    1
1029    1
2017    1
1358    1
980     1
838     1
74      1
434     1
564     1
445     0
351     1
       ..
1601    1
449     0
46      1
1824    1
644     1
704     1
1333    1
979     1
1288    1
754     1
1196    1
537     1
931     1
1049    1
1317    1
1432    1
1761    1
42      1
1374    1
1787    1
1313    1
1675    1
393     1
1853    0
1976    1
1334    1
332     1
1638    1
957     1
1085    1
Name: respiratory_disease, Length: 628, dtype: int64
0       0
2       0
5       1
7       1
12      0
13      1
15      1
18      1
19      1
20      0
21      1
28      1
30      1
31      1
33      1
35      1
36      1
37      1
40      1
42      1
43      1
45      1
48      1
50      1
54      1
55      1
57      1
61      1
63      0
64      0
       ..
2035    1
2037    1
2041   

0       0
2       0
5       1
7       1
12      0
13      1
15      1
18      1
19      1
20      0
21      1
28      1
30      1
31      1
33      1
35      1
36      1
37      1
40      1
42      1
43      1
45      1
48      1
50      1
54      1
55      1
57      1
61      1
63      0
64      0
       ..
2035    1
2037    1
2041    1
2042    1
2047    1
2048    1
2051    1
2052    1
2053    1
2054    1
2056    1
2057    1
2058    1
2059    1
2063    1
2064    1
2065    1
2074    1
2075    1
2076    1
2077    1
2078    1
2080    1
2083    1
2085    1
2086    1
2088    1
2089    1
2090    1
2091    1
Name: respiratory_disease, Length: 1047, dtype: int64
309     1
649     1
1045    1
598     1
1829    1
2047    1
1285    1
599     1
914     1
1130    1
517     1
980     1
1977    1
879     0
131     1
1657    1
857     1
402     1
1379    1
1344    0
124     1
332     1
435     1
464     1
1875    1
530     1
381     1
429     1
501     1
1229    1
       ..
101     1
1669    1
1699  

  **self._backend_args)


916     1
2021    1
2035    1
536     1
1822    1
1808    1
1073    1
942     1
1592    1
800     1
1898    1
806     0
671     1
1007    1
664     1
1976    1
826     1
1531    1
60      1
876     1
1109    1
1193    1
474     1
461     1
1533    1
642     1
1715    1
1166    1
1144    1
1778    1
       ..
362     1
1624    0
1000    1
757     1
274     1
1075    1
1679    1
882     1
2028    1
501     1
936     0
503     1
1933    1
994     0
971     1
46      1
1131    0
1150    1
880     0
1805    0
456     1
383     1
1167    1
308     1
928     1
941     1
1957    1
1168    1
1426    1
612     1
Name: respiratory_disease, Length: 628, dtype: int64


### Balanced classes (oversampled)

In [57]:
results_oversampled = Results()  # object for saving results
basic_models = create_models_data(results_oversampled, df_oversampled, models_data=basic_models_data, test_size=0.3)
gscv_models = create_models_data(results_oversampled, df_oversampled, models_data=gscv_models_data, test_size=0.3)
kfolds_models = create_models_data(results_oversampled, df_oversampled, models_data=kfold_model_data, test_size=0.3)

  **self._backend_args)


1710    1
1784    1
683     1
1843    1
1842    1
90      1
494     0
1467    1
1370    0
1664    0
1452    1
1480    1
1556    1
459     0
314     1
746     1
785     1
655     1
1624    0
774     1
410     1
988     1
1590    1
1255    1
68      0
1682    1
1904    1
1527    1
208     0
1805    0
       ..
793     1
1489    1
2000    1
2076    1
566     1
581     1
1305    1
1322    1
1321    1
15      1
645     1
1846    1
1025    1
1866    1
851     1
204     1
46      1
512     1
2074    1
1491    1
593     1
908     0
592     1
690     1
603     1
249     1
2071    1
511     1
315     1
2037    1
Name: respiratory_disease, Length: 628, dtype: int64
149     1
2057    1
1122    1
37      1
830     1
1164    1
1295    1
951     1
1956    1
1001    1
1757    1
1503    0
717     1
1430    1
914     1
173     1
973     1
222     1
1125    1
1764    1
1720    1
1445    0
109     1
1450    1
966     1
177     1
1087    1
759     1
1016    1
1161    1
       ..
140     1
135     1
833    

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


In [58]:
for model in basic_models:
    basic_model(*model)
    
for model in gscv_models:
    GSCV_model(*model)
    
for model in kfolds_models:
    KFOLD_model(*model)

1102    1
1018    1
590     1
688     1
1644    0
2076    1
14      1
1203    1
1298    1
171     1
1700    1
1778    1
864     1
1513    1
88      1
1368    0
1731    1
1539    0
979     1
1137    1
342     1
1205    1
327     1
121     1
1165    1
720     1
2025    1
97      1
265     1
985     1
       ..
877     1
1040    1
159     1
661     1
2013    1
1935    1
110     1
2020    1
1491    1
1650    1
469     1
808     0
820     1
1581    1
1660    1
1719    1
2074    1
802     1
1705    1
1138    1
49      1
1134    0
344     1
648     1
1723    1
935     0
1230    1
2031    1
1986    1
1431    1
Name: respiratory_disease, Length: 628, dtype: int64


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


2043    1
1408    0
1644    0
2027    1
215     1
36      1
1970    1
1013    0
2049    1
1280    1
140     1
1739    0
856     0
833     0
491     0
1385    0
1154    0
1829    1
210     0
597     1
1740    0
922     1
1373    0
1041    1
1810    0
1032    0
1336    0
948     0
900     1
1659    0
       ..
1624    0
936     0
299     0
1844    0
585     0
764     0
764     0
128     1
1158    1
1188    1
1178    1
1503    0
565     1
1431    1
152     0
387     1
233     1
1624    0
1862    0
1324    0
1805    0
763     0
333     0
1609    1
1368    0
1888    1
1058    1
1324    0
1009    1
90      1
Name: respiratory_disease, Length: 1158, dtype: int64
7       1
8       1
10      1
16      1
17      1
18      1
21      1
24      1
27      1
30      1
31      1
32      1
34      1
36      1
37      1
39      1
41      1
42      1
48      1
49      1
51      1
54      1
56      1
58      1
62      1
66      1
69      1
71      1
72      1
73      1
       ..
1429    0
1269    0
210   

  **self._backend_args)


3       1
4       1
5       1
6       1
9       1
13      1
14      1
15      1
19      1
22      1
23      1
25      1
26      1
28      1
29      1
33      1
35      1
38      1
40      1
43      1
44      1
45      1
46      1
47      1
50      1
52      1
53      1
55      1
57      1
59      1
       ..
333     0
1880    0
995     0
1347    0
1259    0
378     0
1664    0
1       0
1269    0
428     0
585     0
1303    0
1324    0
1079    0
764     0
12      0
1740    0
1344    0
428     0
1013    0
210     0
312     0
210     0
1350    0
1371    0
331     0
492     0
1132    0
1625    0
281     0
Name: respiratory_disease, Length: 1930, dtype: int64
3       1
4       1
5       1
6       1
9       1
13      1
14      1
15      1
19      1
22      1
23      1
25      1
26      1
28      1
29      1
33      1
35      1
38      1
40      1
43      1
44      1
45      1
46      1
47      1
50      1
52      1
53      1
55      1
57      1
59      1
       ..
333     0
1880    0
995   



7       1
8       1
10      1
16      1
17      1
18      1
21      1
24      1
27      1
30      1
31      1
32      1
34      1
36      1
37      1
39      1
41      1
42      1
48      1
49      1
51      1
54      1
56      1
58      1
62      1
66      1
69      1
71      1
72      1
73      1
       ..
1429    0
1269    0
210     0
1713    0
1032    0
1800    0
833     0
1445    0
1303    0
1391    0
707     0
85      0
333     0
312     0
210     0
1659    0
1315    0
449     0
208     0
331     0
1503    0
1445    0
833     0
1061    0
208     0
1554    0
378     0
856     0
152     0
507     0
Name: respiratory_disease, Length: 1930, dtype: int64


  **self._backend_args)


3       1
4       1
5       1
6       1
9       1
13      1
14      1
15      1
19      1
22      1
23      1
25      1
26      1
28      1
29      1
33      1
35      1
38      1
40      1
43      1
44      1
45      1
46      1
47      1
50      1
52      1
53      1
55      1
57      1
59      1
       ..
333     0
1880    0
995     0
1347    0
1259    0
378     0
1664    0
1       0
1269    0
428     0
585     0
1303    0
1324    0
1079    0
764     0
12      0
1740    0
1344    0
428     0
1013    0
210     0
312     0
210     0
1350    0
1371    0
331     0
492     0
1132    0
1625    0
281     0
Name: respiratory_disease, Length: 1930, dtype: int64


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


1315    0
333     0
1192    1
1486    1
479     0
2019    1
1813    1
908     0
853     1
1739    0
1119    0
270     1
81      0
74      1
1625    0
200     1
1154    0
1408    0
1488    0
1429    0
1488    0
1469    1
1107    1
1148    1
1253    1
1810    0
1856    1
210     0
1505    1
1324    0
       ..
1054    0
681     1
1097    1
449     0
1246    1
67      0
1798    1
1861    1
1993    1
1727    0
820     1
954     1
1368    0
1224    1
561     1
90      1
1356    0
2007    1
843     1
468     0
217     1
2010    1
299     0
214     1
1130    1
1371    0
1713    0
1544    1
1146    1
1313    1
Name: respiratory_disease, Length: 1158, dtype: int64
1853    0
1766    0
587     1
1541    1
383     1
1429    0
20      0
151     0
436     1
1324    0
507     0
537     1
995     0
7       1
1884    0
1347    0
815     1
893     0
68      0
935     0
1466    1
27      1
1981    1
2057    1
1013    0
1818    1
1495    1
1079    0
1236    0
1279    1
       ..
742     1
1872    1
940   

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


1303    0
783     1
713     1
2030    1
1643    0
1625    0
458     1
475     0
510     1
1574    1
1325    0
63      0
1000    1
382     1
313     1
310     1
677     1
724     1
585     0
1431    1
1054    0
1186    1
321     1
159     1
559     1
831     1
449     0
20      0
1133    0
464     1
       ..
592     1
626     1
1062    1
400     0
2004    1
428     0
1546    1
836     1
819     1
833     0
68      0
2068    1
297     1
1423    1
856     0
448     1
1902    1
1895    1
1368    0
852     1
1477    0
281     0
1740    0
284     0
151     0
165     1
20      0
1657    1
1259    0
1505    1
Name: respiratory_disease, Length: 1158, dtype: int64
1689    1
63      0
1399    1
599     1
312     0
1138    1
2044    1
579     1
1939    1
1777    0
375     1
16      1
1862    0
253     1
1654    1
1647    0
940     1
2008    1
1315    0
492     0
1236    0
11      0
462     0
1079    0
1347    0
281     0
1985    1
67      0
1169    1
1503    0
       ..
1858    1
901     1
1960  

In [29]:
results_oversampled.get_scores().sort_values(by="f_score")

Zeros in test:  971 Zeros in hat:  918 Ones in test:  959 Ones in hat:  1012 right classified:  615 healthy patients:  971 specific:  0.6333676622039135
Zeros in test:  971 Zeros in hat:  1043 Ones in test:  959 Ones in hat:  887 right classified:  960 healthy patients:  971 specific:  0.9886714727085479
Zeros in test:  971 Zeros in hat:  1124 Ones in test:  959 Ones in hat:  806 right classified:  957 healthy patients:  971 specific:  0.9855818743563337


Unnamed: 0,accuracy,precision,recall,f_score,specific
LinearSVC_basic,0.542314,0.1019,0.855072,0.182099,0.982729
LogisticRegression_basic,0.646805,0.63606,0.666084,0.650726,0.658318
LogisticRegression_KFold,0.663731,0.67467,0.660569,0.667368,0.652972
KNeighborsClassifier_KFold,0.905959,0.829511,0.979252,0.89817,0.982363
KNeighborsClassifier_basic,0.949914,0.898601,1.0,0.946593,1.0
DecisionTreeClassifier_KFold,0.954922,0.915531,0.993799,0.953057,0.994336
DecisionTreeClassifier_basic,0.96114,0.921875,1.0,0.95935,1.0
SVC_basic,0.993092,0.986486,1.0,0.993197,1.0


Zeros in test:  971 Zeros in hat:  213 Ones in test:  959 Ones in hat:  1717 right classified:  193 healthy patients:  971 specific:  0.19876416065911431
Zeros in test:  578 Zeros in hat:  587 Ones in test:  580 Ones in hat:  571 right classified:  578 healthy patients:  578 specific:  1.0
Zeros in test:  959 Zeros in hat:  993 Ones in test:  971 Ones in hat:  937 right classified:  948 healthy patients:  959 specific:  0.9885297184567258
Zeros in test:  971 Zeros in hat:  982 Ones in test:  959 Ones in hat:  948 right classified:  953 healthy patients:  971 specific:  0.9814624098867147


  **self._backend_args)


Zeros in test:  971 Zeros in hat:  990 Ones in test:  959 Ones in hat:  940 right classified:  960 healthy patients:  971 specific:  0.9886714727085479
Zeros in test:  42 Zeros in hat:  35 Ones in test:  586 Ones in hat:  593 right classified:  13 healthy patients:  42 specific:  0.30952380952380953


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


Zeros in test:  572 Zeros in hat:  65 Ones in test:  586 Ones in hat:  1093 right classified:  58 healthy patients:  572 specific:  0.10139860139860139


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


Zeros in test:  581 Zeros in hat:  624 Ones in test:  577 Ones in hat:  534 right classified:  560 healthy patients:  581 specific:  0.963855421686747


  **self._backend_args)


Zeros in test:  50 Zeros in hat:  11 Ones in test:  578 Ones in hat:  617 right classified:  11 healthy patients:  50 specific:  0.22


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


Zeros in test:  50 Zeros in hat:  0 Ones in test:  578 Ones in hat:  628 right classified:  0 healthy patients:  50 specific:  0.0
Zeros in test:  590 Zeros in hat:  596 Ones in test:  568 Ones in hat:  562 right classified:  590 healthy patients:  590 specific:  1.0
Zeros in test:  570 Zeros in hat:  536 Ones in test:  588 Ones in hat:  622 right classified:  350 healthy patients:  570 specific:  0.6140350877192983
Zeros in test:  601 Zeros in hat:  652 Ones in test:  557 Ones in hat:  506 right classified:  596 healthy patients:  601 specific:  0.9916805324459235
Zeros in test:  45 Zeros in hat:  7 Ones in test:  583 Ones in hat:  621 right classified:  7 healthy patients:  45 specific:  0.15555555555555556
Zeros in test:  574 Zeros in hat:  592 Ones in test:  584 Ones in hat:  566 right classified:  574 healthy patients:  574 specific:  1.0


### Balanced classes (undersampled)

In [16]:
results_undersampled = Results()  # object for saving results
basic_models = create_models_data(results_undersampled, df_undersampled, models_data=basic_models_data, test_size=0.3)
gscv_models = create_models_data(results_undersampled, df_undersampled, models_data=gscv_models_data, test_size=0.3)
kfolds_models = create_models_data(results_undersampled, df_undersampled, models_data=kfold_model_data, test_size=0.3)

In [17]:
for model in basic_models:
    basic_model(*model)
    
for model in gscv_models:
    GSCV_model(*model)
    
for model in kfolds_models:
    KFOLD_model(*model)

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


In [21]:
results_undersampled.get_scores().sort_values(by="f_score")

Unnamed: 0,accuracy,precision,recall,f_score
LinearSVC_basic,0.55102,0.148936,0.636364,0.241379
SVC_basic,0.520408,0.307692,0.592593,0.405063
LinearSVC_KFold,0.521472,0.54878,0.694444,0.421245
DecisionTreeClassifier_GSCV,0.489796,0.407407,0.55,0.468085
RandomForestClassifier_basic,0.571429,0.565217,0.541667,0.553191
RandomForestClassifier_GSCV,0.622449,0.490566,0.722222,0.58427
DecisionTreeClassifier_basic,0.642857,0.555556,0.625,0.588235
KNeighborsClassifier_basic,0.612245,0.583333,0.608696,0.595745
RandomForestClassifier_KFold,0.644172,0.527477,0.689269,0.596977
LinearSVC_GSCV,0.591837,0.697674,0.526316,0.6


### Results review

Best results shows random forest, so we will research this model closer.
Also we will pay attention on decision tree, because this model is the easiestfor interpretation

## Random Forest

### On undersampled data

In [19]:
df = deepcopy(df_undersampled)
y = df['respiratory_disease']
X = df.drop(['respiratory_disease'], axis=1)
model_object = RandomForestClassifier()

In [20]:
parameters = {"criterion": ('gini', 'entropy'), "n_estimators": np.linspace(50, 400, 8, dtype=int),
              "max_depth": np.linspace(1, 9, 9, dtype=int),
              "max_features": ('auto', 'sqrt', 'log2'), "n_jobs": (-1,)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  
classifier = GridSearchCV(model_object, parameters)
classifier.fit(X_train, y_train)
print(classifier.best_estimator_)
print(classifier.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.7280701754385965


In [22]:
model_object = classifier.best_estimator_
model_object.fit(X_train, y_train)
y_hat = model_object.predict(X_test)
model_score = classification_scores(y_hat, y_test)
model_score

{'accuracy': 0.6632653061224489,
 'f_score': 0.6857142857142857,
 'precision': 0.6923076923076923,
 'recall': 0.6792452830188679}

In [23]:
accuracies = []
recalls = []
precisions = []
f_scores = []



kf = KFold(n_splits=3, shuffle=True, random_state=123)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_object.fit(X_train, y_train)
    y_hat = model_object.predict(X_test)
    model_score = classification_scores(y_hat, y_test)
    accuracies.append(model_score.get("accuracy"))
    recalls.append(model_score.get("recall"))
    precisions.append(model_score.get("precision"))
    f_scores.append(model_score.get("f_score"))

model_score = {"accuracy": np.mean(accuracies),
               "precision": np.mean(precisions),
               "recall": np.mean(recalls),
               "f_score": np.mean(f_scores)}     

model_score


{'accuracy': 0.6961150753199683,
 'f_score': 0.6953542280738264,
 'precision': 0.6976359338061465,
 'recall': 0.7044653813356626}

### On oversampled data

In [24]:
df = deepcopy(df_oversampled)
y = df['respiratory_disease']
X = df.drop(['respiratory_disease'], axis=1)
model_object = RandomForestClassifier()

In [25]:
parameters = {"criterion": ('gini', 'entropy'), "n_estimators": np.linspace(50, 400, 8, dtype=int),
              "max_depth": np.linspace(1, 9, 9, dtype=int),
              "max_features": ('auto', 'sqrt', 'log2'), "n_jobs": (-1,)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  
classifier = GridSearchCV(model_object, parameters)
classifier.fit(X_train, y_train)
print(classifier.best_estimator_)
print(classifier.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.9811250925240562


In [26]:
model_object = classifier.best_estimator_
model_object.fit(X_train, y_train)
y_hat = model_object.predict(X_test)
model_score = classification_scores(y_hat, y_test)
model_score

{'accuracy': 0.9792746113989638,
 'f_score': 0.977818853974122,
 'precision': 0.9566003616636528,
 'recall': 1.0}

In [27]:
accuracies = []
recalls = []
precisions = []
f_scores = []

kf = KFold(n_splits=3, shuffle=True, random_state=123)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_object.fit(X_train, y_train)
    y_hat = model_object.predict(X_test)
    model_score = classification_scores(y_hat, y_test)
    accuracies.append(model_score.get("accuracy"))
    recalls.append(model_score.get("recall"))
    precisions.append(model_score.get("precision"))
    f_scores.append(model_score.get("f_score"))

model_score = {"accuracy": np.mean(accuracies),
               "precision": np.mean(precisions),
               "recall": np.mean(recalls),
               "f_score": np.mean(f_scores)}     

model_score

{'accuracy': 0.9805713553769543,
 'f_score': 0.9802498634304101,
 'precision': 0.9612912708144288,
 'recall': 1.0}

In [31]:
def specific(y_hat, y_test):
    y_test_zero = 0
    y_test_one = 0
    y_hat_zero = 0
    y_hat_one = 0
    for _y_test in y_test:
        if _y_test == 0:
            y_test_zero += 1
        if _y_test == 1:
            y_test_one += 1
    for _y_hat in y_hat:
        if _y_hat == 0:
            y_hat_zero += 1
        if _y_hat == 1:
            y_hat_one += 1
    
    
    right_classified = 0
    healthy_patiants = 0
    for index in range(len(y_test)):
        if y_test.iloc[index] == 0:
            healthy_patiants += 1
            if y_hat[index] == 0:
                right_classified += 1
                
    print("Zeros in test: ", y_test_zero, "Zeros in hat: ", y_hat_zero, "Ones in test: ", y_test_one, "Ones in hat: ", y_hat_one,
         "right classified: ", right_classified, "healthy patients: ", healthy_patiants, "specific: ", right_classified / healthy_patiants)
    return right_classified / healthy_patiants

In [35]:
y_test = pd.Series([0, 1, 0, 1, 0, 1, 0])
y_hat = [0, 1, 1, 1, 0, 1, 0]

In [50]:
specific(y_hat, y_test)

0    0
1    1
2    0
3    1
4    0
5    1
6    0
dtype: int64


0.75

In [51]:
y_test.

SyntaxError: invalid syntax (<ipython-input-51-c8e312ac0987>, line 1)