# Covid-19 Mortality Risk Prediction based on Patient Blood Reports

## Lets Import important libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Lets Load our dataset

In [2]:
data=pd.read_excel("train_file1591182717.xlsx")
data=data.rename(columns={'Admission':'Admission time','Discharge':'Discharge time'})
#tes=pd.read_excel("test_file.xlsx")

In [3]:
data.head()

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
0,1,2020-01-31 01:09:00,73,1,2020-01-30,2020-02-17,0,,,,...,,,,,,,,,,
1,1,2020-01-31 01:25:00,73,1,2020-01-30,2020-02-17,0,,136.0,,...,31.9,,,,,0.12,,,,
2,1,2020-01-31 01:44:00,73,1,2020-01-30,2020-02-17,0,,,103.1,...,,,43.1,,137.7,,,16.0,46.6,130.0
3,1,2020-01-31 01:45:00,73,1,2020-01-30,2020-02-17,0,,,,...,,,,,,,,,,
4,1,2020-01-31 01:56:00,73,1,2020-01-30,2020-02-17,0,19.9,,,...,,,,,,,,,,


## Data Preprocessing..

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6120 entries, 0 to 6119
Data columns (total 81 columns):
 #   Column                                                         Non-Null Count  Dtype         
---  ------                                                         --------------  -----         
 0   PATIENT_ID                                                     6120 non-null   int64         
 1   RE_DATE                                                        6106 non-null   datetime64[ns]
 2   age                                                            6120 non-null   int64         
 3   gender                                                         6120 non-null   int64         
 4   Admission time                                                 6120 non-null   datetime64[ns]
 5   Discharge time                                                 6120 non-null   datetime64[ns]
 6   outcome                                                        6120 non-null   int64         
 7

In [5]:
data.isnull().sum()

PATIENT_ID                          0
RE_DATE                            14
age                                 0
gender                              0
Admission time                      0
                                 ... 
thrombocytocrit                  5258
ESR                              5737
glutamic-pyruvic transaminase    5189
eGFR                             5184
creatinine                       5184
Length: 81, dtype: int64

So from the above we can see that one of the main filter colums is RE_DATE and it has 14 null values and it is date time series, so we can't replace it with -1, so replaced the value with deafult date '2020-02-06'.

In [6]:
import datetime
data['RE_DATE']=data['RE_DATE'].fillna(pd.Timestamp('2020-02-06'))
data['RE_DATE']=data['RE_DATE'].dt.normalize()
data['RE_DATE']=(data['RE_DATE'] - datetime.datetime(1970,1,1)).dt.total_seconds()
#data['RE_DATE']=pd.to_datetime(data['RE_DATE'], utc=False)
data['Admission time']=data['Admission time'].dt.normalize()
data['Admission time']=(data['Admission time'] - datetime.datetime(1970,1,1)).dt.total_seconds()
#data['Admission time']=pd.to_datetime(data['Admission time'], utc=False)
data['Discharge time']=data['Discharge time'].dt.normalize()
data['Discharge time']=(data['Discharge time'] - datetime.datetime(1970,1,1)).dt.total_seconds()

In [7]:
dataf=data.groupby(['PATIENT_ID','RE_DATE','age','gender','Admission time','Discharge time','outcome']).sum(min_count=1)

In [8]:
dataf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,Prothrombin time,procalcitonin,eosinophils(%),Interleukin 2 receptor,Alkaline phosphatase,albumin,basophil(%),...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
1,1.580429e+09,73,1,1.580342e+09,1.581898e+09,0,19.9,136.0,103.1,13.9,0.09,0.6,,46.0,33.3,0.3,...,31.9,,43.1,0.09,137.7,0.12,41.0,16.0,46.6,130.0
1,1.580774e+09,73,1,1.580342e+09,1.581898e+09,0,,,,,,,,,,,...,,,,,,,,,,
1,1.580947e+09,73,1,1.580342e+09,1.581898e+09,0,,140.0,101.4,,,0.3,,54.0,33.2,0.1,...,32.1,,3.6,,142.9,0.23,,42.0,72.7,90.0
1,1.581293e+09,73,1,1.580342e+09,1.581898e+09,0,,130.0,98.5,14.1,,0.2,,57.0,32.4,0.1,...,31.7,37.9,,,139.4,0.18,,29.0,64.8,99.0
1,1.581725e+09,73,1,1.580342e+09,1.581898e+09,0,,129.0,98.1,,,1.1,,61.0,35.9,0.3,...,32.6,,2.6,,140.0,0.16,,29.0,74.7,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1.581293e+09,68,1,1.581120e+09,1.582070e+09,1,,,97.5,,,,1524.0,,,,...,,,,,135.0,,,,,
375,1.581379e+09,68,1,1.581120e+09,1.582070e+09,1,,,,,,,,,,,...,,,,,,,,,,
375,1.581552e+09,68,1,1.581120e+09,1.582070e+09,1,,,105.9,,,,,82.0,25.3,,...,,,,,139.9,,,17.0,84.3,82.0
375,1.581725e+09,68,1,1.581120e+09,1.582070e+09,1,,,,,,,,,,,...,,,,,,,,,,


In [9]:
dataf=dataf.reset_index()

In [10]:
#dataf.info()

In [11]:
dataf

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
0,1,1.580429e+09,73,1,1.580342e+09,1.581898e+09,0,19.9,136.0,103.1,...,31.9,,43.1,0.09,137.7,0.12,41.0,16.0,46.6,130.0
1,1,1.580774e+09,73,1,1.580342e+09,1.581898e+09,0,,,,...,,,,,,,,,,
2,1,1.580947e+09,73,1,1.580342e+09,1.581898e+09,0,,140.0,101.4,...,32.1,,3.6,,142.9,0.23,,42.0,72.7,90.0
3,1,1.581293e+09,73,1,1.580342e+09,1.581898e+09,0,,130.0,98.5,...,31.7,37.9,,,139.4,0.18,,29.0,64.8,99.0
4,1,1.581725e+09,73,1,1.580342e+09,1.581898e+09,0,,129.0,98.1,...,32.6,,2.6,,140.0,0.16,,29.0,74.7,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,375,1.581293e+09,68,1,1.581120e+09,1.582070e+09,1,,,97.5,...,,,,,135.0,,,,,
1714,375,1.581379e+09,68,1,1.581120e+09,1.582070e+09,1,,,,...,,,,,,,,,,
1715,375,1.581552e+09,68,1,1.581120e+09,1.582070e+09,1,,,105.9,...,,,,,139.9,,,17.0,84.3,82.0
1716,375,1.581725e+09,68,1,1.581120e+09,1.582070e+09,1,,,,...,,,,,,,,,,


In [12]:
dataf['outcome'].value_counts()

0    1056
1     662
Name: outcome, dtype: int64

In [13]:
Mdata=data.groupby(['PATIENT_ID','RE_DATE'], as_index=True,level=0).ffill().drop_duplicates(['PATIENT_ID','RE_DATE'],keep='last')

In [14]:
Mdata

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
7,1,1.580429e+09,73,1,1.580342e+09,1.581898e+09,0,,,,...,,,,,,,,,,
8,1,1.580774e+09,73,1,1.580342e+09,1.581898e+09,0,,,,...,,,,,,,,,,
10,1,1.580947e+09,73,1,1.580342e+09,1.581898e+09,0,,,101.4,...,,,3.6,,142.9,,,42.0,72.7,90.0
13,1,1.581293e+09,73,1,1.580342e+09,1.581898e+09,0,,,98.5,...,,,,,139.4,,,29.0,64.8,99.0
16,1,1.581725e+09,73,1,1.580342e+09,1.581898e+09,0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6107,375,1.581293e+09,68,1,1.581120e+09,1.582070e+09,1,,,,...,,,,,,,,,,
6109,375,1.581379e+09,68,1,1.581120e+09,1.582070e+09,1,,,,...,,,,,,,,,,
6111,375,1.581552e+09,68,1,1.581120e+09,1.582070e+09,1,,,,...,,,,,,,,,,
6112,375,1.581725e+09,68,1,1.581120e+09,1.582070e+09,1,,,,...,,,,,,,,,,


## Filling Missing values for Model 2
-Substitute all the missing data as -1.
Augment the training data by adding relevant rows to the training data. Expectation is not to have as many rows as the rows in the datasheet given, but use some criteria to group rows together

In [15]:
Mdata2=dataf.fillna(-1)

In [16]:
Mdata2

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
0,1,1.580429e+09,73,1,1.580342e+09,1.581898e+09,0,19.9,136.0,103.1,...,31.9,-1.0,43.1,0.09,137.7,0.12,41.0,16.0,46.6,130.0
1,1,1.580774e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.00,-1.0,-1.00,-1.0,-1.0,-1.0,-1.0
2,1,1.580947e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,140.0,101.4,...,32.1,-1.0,3.6,-1.00,142.9,0.23,-1.0,42.0,72.7,90.0
3,1,1.581293e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,130.0,98.5,...,31.7,37.9,-1.0,-1.00,139.4,0.18,-1.0,29.0,64.8,99.0
4,1,1.581725e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,129.0,98.1,...,32.6,-1.0,2.6,-1.00,140.0,0.16,-1.0,29.0,74.7,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,375,1.581293e+09,68,1,1.581120e+09,1.582070e+09,1,-1.0,-1.0,97.5,...,-1.0,-1.0,-1.0,-1.00,135.0,-1.00,-1.0,-1.0,-1.0,-1.0
1714,375,1.581379e+09,68,1,1.581120e+09,1.582070e+09,1,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.00,-1.0,-1.00,-1.0,-1.0,-1.0,-1.0
1715,375,1.581552e+09,68,1,1.581120e+09,1.582070e+09,1,-1.0,-1.0,105.9,...,-1.0,-1.0,-1.0,-1.00,139.9,-1.00,-1.0,17.0,84.3,82.0
1716,375,1.581725e+09,68,1,1.581120e+09,1.582070e+09,1,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.00,-1.0,-1.00,-1.0,-1.0,-1.0,-1.0


## Filling Missing Values for Model 1
-Do not fill any missing data. Substitute all the missing data as -1
Take the final data report of the patient as the input data for each patient, and fit the model. This implies that size of the training data is only 375 rows

In [17]:
Mdata1=Mdata.fillna(-1)

In [18]:
Mdata1

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
7,1,1.580429e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
8,1,1.580774e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
10,1,1.580947e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,-1.0,101.4,...,-1.0,-1.0,3.6,-1.0,142.9,-1.0,-1.0,42.0,72.7,90.0
13,1,1.581293e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,-1.0,98.5,...,-1.0,-1.0,-1.0,-1.0,139.4,-1.0,-1.0,29.0,64.8,99.0
16,1,1.581725e+09,73,1,1.580342e+09,1.581898e+09,0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6107,375,1.581293e+09,68,1,1.581120e+09,1.582070e+09,1,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6109,375,1.581379e+09,68,1,1.581120e+09,1.582070e+09,1,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6111,375,1.581552e+09,68,1,1.581120e+09,1.582070e+09,1,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6112,375,1.581725e+09,68,1,1.581120e+09,1.582070e+09,1,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## Filling Missing values for Model 3
-Try to fill the missing data by typical methods: Mean, Most Co-related value, etc, 
 Take the final data report of the patient as the input data for each patient, and fit the model. This implies that size of the   training data is only 375 rows
 
-So as the patient report depends upon age like patient with 25 years and patient with 75 years can't have same result, so imputed missing values of the patient based on there age group.

In [19]:
Mdata3=data.groupby(['PATIENT_ID','RE_DATE'], as_index=True,level=0).ffill().drop_duplicates(['PATIENT_ID','RE_DATE'],keep='last')
colstofill = list(Mdata3.columns)[7:]
#fill the median grouped by age
for col in colstofill:
    Mdata3[col] = Mdata3.groupby(['age'])[col].transform(lambda x: x.fillna(x.median()))
#fill in any remaining nulls with median from entire set
Mdata3[colstofill]=Mdata3[colstofill].fillna(Mdata3[col].median())
#sort to make sure each patientid is in order by date
Mdata3=Mdata3.sort_values(['PATIENT_ID','RE_DATE'], ascending=[True, True])
# Drop all duplicate PATIENT ID rows, except the last one
Mdata3= Mdata3.drop_duplicates(['PATIENT_ID'], keep='last').reset_index(drop=True)

In [20]:
Mdata3

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
0,1,1.581898e+09,73,1,1.580342e+09,1.581898e+09,0,75.00,75.0,105.10,...,75.00,39.1,13.20,0.100,144.55,75.000,58.0,34.0,69.50,90.0
1,2,1.581898e+09,61,1,1.580774e+09,1.582070e+09,0,31.70,136.0,100.10,...,31.40,36.0,10.80,0.080,139.40,0.300,15.0,18.0,88.50,64.0
2,3,1.580947e+09,70,2,1.579738e+09,1.581120e+09,0,2183.00,137.0,104.40,...,31.40,37.7,170.00,0.125,143.20,0.190,36.0,35.5,72.45,92.0
3,4,1.581898e+09,74,1,1.580429e+09,1.581984e+09,0,75.00,110.0,107.05,...,39.20,41.1,73.50,0.150,143.75,0.270,37.0,26.0,74.20,88.0
4,5,1.581984e+09,29,2,1.580515e+09,1.581984e+09,0,75.00,130.0,75.00,...,30.00,75.0,75.00,75.000,75.00,0.360,75.0,75.0,75.00,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,371,1.580947e+09,63,1,1.580774e+09,1.580861e+09,1,7.20,75.0,100.20,...,75.00,37.0,54.10,0.100,139.50,75.000,7.0,20.0,95.60,56.0
371,372,1.581725e+09,79,1,1.580861e+09,1.581811e+09,1,6.80,102.0,119.30,...,30.40,34.8,208.05,0.115,154.50,0.070,75.0,103.0,16.40,298.0
372,373,1.581466e+09,61,2,1.581466e+09,1.581638e+09,1,31.70,136.0,100.10,...,31.40,36.7,10.80,0.080,139.40,0.300,15.0,18.0,88.50,64.0
373,374,1.580947e+09,33,1,1.580688e+09,1.581120e+09,1,75.00,148.5,128.20,...,30.50,50.5,61.70,0.085,164.70,0.135,75.0,1508.0,69.40,118.0


# Filling missing values for Model 4
-Try to fill the missing data by typical methods: Mean, Most Co-related value, etc.
Augment the training data by adding relevant rows to the training data. Expectation is not to have as many rows as the rows in the datasheet given, but use some criteria to group rows together.

-So as the patient report depends upon age like patient with 25 years and patient with 75 years can't have same result, so imputed missing values of the patient based on there age group.

In [21]:
Mdata4=data.groupby(['PATIENT_ID','RE_DATE','age','gender','Admission time','Discharge time','outcome']).sum(min_count=1)
Mdata4=Mdata4.reset_index()
colstofill = list(Mdata4.columns)[7:]
#fill the median grouped by age
for col in colstofill:
    Mdata4[col] = Mdata4.groupby(['age'])[col].transform(lambda x: x.fillna(x.median()))
#fill in any remaining nulls with median from entire set
Mdata4[colstofill]=Mdata4[colstofill].fillna(Mdata4[col].median())

In [22]:
Mdata4

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
0,1,1.580429e+09,73,1,1.580342e+09,1.581898e+09,0,19.9,136.0,103.10,...,31.90,39.0,43.10,0.09,137.70,0.120,41.0,16.0,46.6,130.0
1,1,1.580774e+09,73,1,1.580342e+09,1.581898e+09,0,28.1,128.0,99.25,...,30.40,39.0,89.30,0.09,139.30,0.160,41.0,33.0,74.3,87.0
2,1,1.580947e+09,73,1,1.580342e+09,1.581898e+09,0,28.1,140.0,101.40,...,32.10,39.0,3.60,0.09,142.90,0.230,41.0,42.0,72.7,90.0
3,1,1.581293e+09,73,1,1.580342e+09,1.581898e+09,0,28.1,130.0,98.50,...,31.70,37.9,89.30,0.09,139.40,0.180,41.0,29.0,64.8,99.0
4,1,1.581725e+09,73,1,1.580342e+09,1.581898e+09,0,28.1,129.0,98.10,...,32.60,39.0,2.60,0.09,140.00,0.160,41.0,29.0,74.7,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,375,1.581293e+09,68,1,1.581120e+09,1.582070e+09,1,409.5,122.0,97.50,...,31.65,38.8,107.85,0.09,135.00,0.225,39.0,35.0,77.4,87.0
1714,375,1.581379e+09,68,1,1.581120e+09,1.582070e+09,1,409.5,122.0,103.30,...,31.65,38.8,107.85,0.09,141.05,0.225,39.0,35.0,77.4,87.0
1715,375,1.581552e+09,68,1,1.581120e+09,1.582070e+09,1,409.5,122.0,105.90,...,31.65,38.8,107.85,0.09,139.90,0.225,39.0,17.0,84.3,82.0
1716,375,1.581725e+09,68,1,1.581120e+09,1.582070e+09,1,409.5,122.0,103.30,...,31.65,38.8,107.85,0.09,141.05,0.225,39.0,35.0,77.4,87.0


## Now lets split and train our data to select the best ML model

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc
import xgboost as xgb
from sklearn.model_selection import train_test_split,cross_val_score

In [27]:
LR=LogisticRegression()
SV=SVC()
KNN=KNeighborsClassifier()
DT=DecisionTreeClassifier()
GB=GaussianNB()
RFC=RandomForestClassifier()
GBC=GradientBoostingClassifier()
ADC=AdaBoostClassifier()
ETC=ExtraTreesClassifier()
XGBC=xgb.XGBClassifier()

In [28]:
models = []
models.append(('KNeighborsClassifier', KNN))
models.append(('SVC', SV))
models.append(('LogisticRegression', LR))
models.append(('DecisionTreeClassifier', DT))
models.append(('GaussianNB', GB))
models.append(('RandomForestClassifier', RFC))
models.append(('GradientBoostingClassifier', GBC))
models.append(('ExtraTreesClassifier',ETC))
models.append(('AdaBoostClassifier',ADC))
models.append(('XGBoostClassifier',XGBC))

In [32]:
def matrix(x,y,clf):
    Model=[]
    score=[]
    cvs=[]
    rocscore=[]
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.70)
    pre=model.predict(x_test)
    acc=accuracy_score(y_test,pre)
    score.append(acc*100)
    sc = cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
    cvs.append(sc*100)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,pre)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    rocscore.append(roc_auc)
    return clf,acc,sc,roc_auc

In [34]:
def run_tests(Mdata):
    Model=[]
    score=[]
    cvs=[]
    rocscore=[]
    x1=Mdata.drop(['outcome'],axis=1)
    y1=Mdata['outcome']
    for name,model in models:
        Mod,sco,cv,rocsco= matrix(x1,y1,model)
        Model.append(Mod)
        score.append(sco)
        cvs.append(cv)
        rocscore.append(rocsco)
    result = pd.DataFrame({'Model': Model, 'Accuracy_score': score ,'Cross_val_score':cvs,'Roc_auc_curve':rocscore})
    print (result)

## Now lets pass the data into our alogrithms to get different models

## Model 1

In [35]:
run_tests(Mdata1)

                                               Model  Accuracy_score  \
0  KNeighborsClassifier(algorithm='auto', leaf_si...        0.999169   
1  SVC(C=1.0, break_ties=False, cache_size=200, c...        0.999169   
2  LogisticRegression(C=1.0, class_weight=None, d...        0.999169   
3  DecisionTreeClassifier(ccp_alpha=0.0, class_we...        0.999169   
4       GaussianNB(priors=None, var_smoothing=1e-09)        0.999169   
5  (DecisionTreeClassifier(ccp_alpha=0.0, class_w...        0.999169   
6  ([DecisionTreeRegressor(ccp_alpha=0.0, criteri...        0.999169   
7  (ExtraTreeClassifier(ccp_alpha=0.0, class_weig...        0.999169   
8  (DecisionTreeClassifier(ccp_alpha=0.0, class_w...        0.999169   
9  XGBClassifier(base_score=0.5, booster='gbtree'...        0.999169   

   Cross_val_score  Roc_auc_curve  
0         0.779991       0.999338  
1         0.614667       0.999338  
2         0.730994       0.999338  
3         0.962080       0.999338  
4         0.738093       0.

In [36]:
run_tests(Mdata2)

                                               Model  Accuracy_score  \
0  KNeighborsClassifier(algorithm='auto', leaf_si...        0.999169   
1  SVC(C=1.0, break_ties=False, cache_size=200, c...        0.999169   
2  LogisticRegression(C=1.0, class_weight=None, d...        0.999169   
3  DecisionTreeClassifier(ccp_alpha=0.0, class_we...        0.999169   
4       GaussianNB(priors=None, var_smoothing=1e-09)        0.999169   
5  (DecisionTreeClassifier(ccp_alpha=0.0, class_w...        0.999169   
6  ([DecisionTreeRegressor(ccp_alpha=0.0, criteri...        0.999169   
7  (ExtraTreeClassifier(ccp_alpha=0.0, class_weig...        0.999169   
8  (DecisionTreeClassifier(ccp_alpha=0.0, class_w...        0.999169   
9  XGBClassifier(base_score=0.5, booster='gbtree'...        0.999169   

   Cross_val_score  Roc_auc_curve  
0         0.766605       0.999338  
1         0.614667       0.999338  
2         0.731579       0.999338  
3         0.962080       0.999338  
4         0.819591       0.

In [None]:
#import sweetviz

In [None]:
#Report=sweetviz.analyze([dataf,"Train"],target_feat='outcome')

In [None]:
#Report.show_html('Report.html')

In [None]:
#data[data.columns[7:]].corr()['outcome'][:]

#s = c.unstack()
#so = s.sort_values(kind="quicksort")

In [None]:
#corr_matrix = dataf.corr()
#d1=corr_matrix["outcome"]
#d1.hist()

## Model 1
-Do not fill any missing data. Substitute all the missing data as -1
-Take the final data report of the patient as the input data for each patient, and fit the model. This implies that size of the   training data is only 375 rows