In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance, distance_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from collections import Counter
from copy import deepcopy
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Creation some help functions

In [2]:
class Color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

    
# function allows to print colorful and bold text
def pretty_print(text="", text_headers="", color="", bold=False):
    bold_part = Color.BOLD if bold else ""
    color_part = Color.__dict__.get(color.upper(), "")
    print(bold_part + color_part + text_headers + Color.END + text)
    


## Data loading from scv files
1. all_data - data of morning and afternoon datasets
2. morning_data - data of morning dataset
3. afternoon_data - data of afternoon dataset

In [3]:
separator = ','
all_data = pd.read_csv("all_data.csv", sep=separator)
morning_data = pd.read_csv("morning_data.csv", sep=separator)
afternoon_data =pd.read_csv("afternoon_data.csv", sep=separator)
datasets = [all_data, morning_data, afternoon_data]

In [4]:
np.argwhere(np.sum(all_data.isnull()) != 0)

array([[15],
       [16],
       [21],
       [22],
       [23],
       [32],
       [33],
       [37],
       [38]])

In [5]:
all_data.columns

Index(['cid', 'patient_id', 'date1', 'age', 'gender', 'weight', 'height',
       'cardiostimulator', 'smoking', 'diseasediabetes', 'diseasehypertonia',
       'respiratory_disease', 'p_a', 'p_da', 'p_t', 'p_left_slopes',
       'p_right_slopes', 'q_a', 'q_b_t', 'q_e_t', 'r_a', 'r_left_slopes',
       'r_right_slopes', 'r_b_t', 'r_e_t', 's_a', 's_da', 's_b_t', 's_e_t',
       't_a', 't_da', 't_t', 't_left_slopes', 't_right_slopes', 'interval_pq',
       'komplex_qrs', 'segment_st', 'interval_qt', 'zubets_p', 'pulse', 'sdnn',
       'skewness', 'amo', 'swai', 'mo', 'drr', 'rrnn', 'pnn50', 'si', 'sati',
       'rmi', 'kurtosis', 'cv', 'rmssd', 'nn50', 'lf', 'lfhf', 'hfp', 'ulf',
       'tp', 'vlf', 'vlfp', 'lfp', 'br', 'ulfp', 'ic', 'hf', 'tpfull', 'lfw',
       'lfhfw', 'hfpw', 'ulfw', 'tpw', 'vlfw', 'vlfpw', 'lfpw', 'brw', 'ulfpw',
       'icw', 'hfw', 'tpfullw'],
      dtype='object')

## Data engineering

## Description of all_data

In [14]:
research_data(all_data, "red", True)

[1m[91mFirst 5 strings of dataset[0m
      cid  patient_id  age  gender  weight  height  cardiostimulator  smoking  \
0  110110       10177   65       1      81     178                 0        1   
1  110114       10178   43       0      60     157                 0        0   
2  110116       10180   73       0      92     156                 0        0   
3  110177       10185   61       1      55     176                 0        1   
4  110183       10187   63       1      65     178                 0        1   

   diseasediabetes  diseasehypertonia  ...        vlfw     vlfpw      lfpw  \
0                1                  0  ...   25.077679  0.148477  0.355572   
1                0                  0  ...    8.028726  0.142290  0.103802   
2                0                  1  ...   30.364277  0.105113  0.121068   
3                0                  0  ...  743.100100  0.578435  0.301847   
4                0                  1  ...   42.380817  0.338698  0.270588   

    

## Description of morning_data

In [15]:
research_data(morning_data, "red", True)

[1m[91mFirst 5 strings of dataset[0m
      cid  patient_id                date1  age  gender  weight  height  \
0  110177       10185  2019-07-02 08:16:18   61       1      55     176   
1  110183       10187  2019-07-02 09:46:21   63       1      65     178   
2  110255       10185  2019-07-03 07:21:29   61       1      55     176   
3  110257       10187  2019-07-03 07:47:32   63       1      65     178   
4  110259       10200  2019-07-03 08:13:23   27       0      75     160   

   cardiostimulator  smoking  diseasediabetes  ...       ulfw          tpw  \
0                 0        1                0  ...  11.325395  1284.673100   
1                 0        1                0  ...   0.487702   125.128555   
2                 0        1                0  ...   0.663814   801.920530   
3                 0        1                0  ...   2.142578  1226.863200   
4                 0        0                0  ...   3.280306  3770.700700   

         vlfw     vlfpw      lfpw       

## Description of afternoon_data

In [13]:
research_data(afternoon_data, "red", True)

[1m[91mFirst 5 strings of dataset[0m
   patient_id     cid             date1  age  gender  weight  height  \
0       11470  117354  19.09.2019 12:39   89       0      54     154   
1       11470  117734  23.09.2019 12:13   89       0      54     154   
2       11470  117481  20.09.2019 13:04   89       0      54     154   
3       11470  117892  24.09.2019 13:23   89       0      54     154   
4       11470  117354  19.09.2019 12:39   89       0      54     154   

   cardiostimulator  smoking  diseasediabetes      ...                   tpw  \
0                 0        0                1      ...        149,7740000000   
1                 0        0                1      ...        223,9548000000   
2                 0        0                1      ...        224,9972500000   
3                 0        0                1      ...        206,7881500000   
4                 0        0                1      ...        149,7740000000   

            vlfw         vlfpw          lfpw  

# Data Preprocessing

In [6]:
# convert string data to appropriate format
def string_preprocessing(string):
        
    def change_double_dot_floats(element):
        counter = Counter(list(element))
        if counter["."] > 1:
            second_dot_index = element.rfind(".")
            element_list = list(element)
            element_list[second_dot_index] = ""
            element = ''.join(element_list)
        return element  
    
    if not pd.isnull(string):
        string = string.replace(",", ".")
        string = change_double_dot_floats(string)
        string = int(string) if string in ["0", "1"] else float(string)
    return string
        
def appropriate_strings(df_without_none, string_with_nan, distance_metrics):
    most_appropriate_indexes = {}
    for distance_metric in distance_metrics:
        string_distances = distance_metric(string_with_nan.values.reshape(1, -1), df_without_none) 
        string_distances = string_distances.reshape(string_distances.shape[1],)
        min_distance_string_index = np.where(string_distances == min(string_distances))[0][0]
        most_appropriate_indexes.update({distance_metric.__name__: min_distance_string_index})
    counter = Counter(most_appropriate_indexes.values())
    return {"appropriate_indexes": most_appropriate_indexes, "best_index": counter.most_common(1)[0][0]}  

In [7]:
def replace_all_nan_by_values_from_closest_vector(df, distance_metrics):
    # creation of dataset without nan values
    df["index"] = df.index  # create column which will duplicate indexes to drop and restore rows with nan
    
    columns_with_nan = [column for column in df.columns if np.sum(df[column].isnull()) != 0]  # find columns with nan
    df_without_none = df.drop(df[columns_with_nan], axis=1)  # create dataset without columns with nan
        
    # finding list of all possible nan indexes and dict of column with nan and its indexes
    all_nan_indexes = set()
    for column_with_nan in columns_with_nan:
        nan_indexes = set(sum(np.argwhere(pd.isnull(df[column_with_nan])).tolist(), []))
        all_nan_indexes = all_nan_indexes.union(nan_indexes)
    all_nan_indexes = list(all_nan_indexes)  # list of indexes of strings with nan
    
    strings_with_nan = {}  # dict with strings with nan to restore them after finding the closest one
    for nan_index in all_nan_indexes:
        strings_with_nan.update({nan_index: df_without_none.loc[nan_index]})     
    df_without_none.drop(df_without_none.index[all_nan_indexes], inplace=True, axis=0)    
    df_without_none.reset_index(inplace=True, drop=True)  # reset indexes in a row

    # change nan values of original dataset
    for nan_index in all_nan_indexes:
        prepared_string = strings_with_nan[nan_index]  # take string with nan values to find the closest one
        best_string_index = appropriate_strings(df_without_none, prepared_string, distance_metrics)["best_index"]
        string_with_nan = df.loc[nan_index]  # string in which nan values will be found and replaced 
        real_best_index = int(df_without_none.loc[best_string_index]["index"])  # index from original dataset 
        closest_string = df.loc[real_best_index]  # the closest string from dataset without nan strings
        for column_with_nan in columns_with_nan:
            if pd.isnull(string_with_nan[column_with_nan]):
                df.loc[nan_index, column_with_nan] = closest_string[column_with_nan]  # set value

In [8]:
def dataset_preprocessing(df, distance_metrics=[euclidean_distances]):
    date_column = df.date1
    df.drop(["date1"], inplace=True, axis=1)
#     for column in df.columns:
#         if type(df[column][0]) == str:
#             try:
#                 df[column] = df[column].apply(lambda x: string_preprocessing(x))
#             except:
#                 print(df[column])
    
    replace_all_nan_by_values_from_closest_vector(df, distance_metrics)
    df["date1"] = date_column

# Preprocessing of all_data

In [9]:
dataset_preprocessing(all_data)

# Researching of data and feature selection

In [10]:
df = deepcopy(all_data)

In [11]:
np.argwhere(np.sum(df.isnull()) != 0)

array([], shape=(0, 1), dtype=int64)

In [64]:
df_new = SelectKBest(mutual_info_classif, k=5).fit_transform(new_df, y)

In [58]:
negative_columns = set()
for column in df.columns: 
    for index in df.index:
        if type(df[column][index]) != str and df[column][index] < 0:
            negative_columns.add(column)
            
negative_columns

{'interval_pq',
 'komplex_qrs',
 'kurtosis',
 'p_a',
 'p_da',
 'p_left_slopes',
 'p_right_slopes',
 'q_a',
 'r_a',
 'r_left_slopes',
 'r_right_slopes',
 's_a',
 's_da',
 'segment_st',
 'skewness',
 't_a',
 't_da',
 't_left_slopes',
 't_right_slopes'}

In [None]:
S

# Models

In [11]:
y = df["respiratory_disease"]
df.drop(["respiratory_disease", "date1"], inplace=True, axis=1)
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3)

In [13]:
def use_model(X_train, X_test, y_train, y_test, model, parameters):
    classifier = GridSearchCV(model, parameters)
    classifier.fit(X_train, y_train)
    sorted(clf.cv_results_.keys())

### SVM

In [None]:

use_model(X_train, X_test, y_train, y_test, SVM(), parameters)

### Logistic Regression

In [None]:
parameters = {'kernel':('rbf',), 'C':[1, 10]}
classifier = GridSearchCV(SVC(), parameters)
classifier.fit(X_train, y_train)



### Decision Tree

### RandomForest

In [70]:
RFC = RandomForestClassifier()

In [71]:
RFC.fit(X_train, y_train)
y_hat = RFC.predict(X_test)



In [73]:
f1_score(y_test, y_hat)

0.9976830398517145