In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [140]:
# Load the two provided data files from the local data directory.
train_df = pd.read_csv('./data/class_train.csv')
test_df = pd.read_csv('./data/class_test.csv')

### 2.1 Data analysis

In [141]:
# Stack both files into one frame (test rows first, then train rows).
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# file keeps its own row labels, so labels repeat in the combined frame (and in
# the target Series later taken from it), which makes label-based selection
# and alignment ambiguous. Row order is unchanged.
concat_df = pd.concat([test_df, train_df], ignore_index=True)

In [142]:
# Size of the combined frame as (rows, columns); the Outcome target is still one of the columns here.
concat_df.shape

(116, 160)

In [143]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for the combined data.
concat_df.describe()

Unnamed: 0,Plasma renin activity,Aldosterone-to-renin ratio -more detailed ranges in Aldosterone/renin ratio article,Oxygen saturation,17α-Hydroxyprogesterone,Anti ds-DNA,Reticulocyte hemoglobin equivalent,Fibrinogen,Procalcitonin,SHBG -more detailed ranges in SHBG article,[H+],...,Luteinizing hormone (LH)-more detailed menstrual cycle ranges in separate diagram,Alpha 1-antitrypsin (AAT),Eosinophil cationic protein (ECP),Absolute content of carbon dioxide (CO2),Chloride (Cl),Amylase,IgM,Copper (Cu),Viscosity,Outcome
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,...,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,1397.34601,8.230417,8.045073,12184.702129,908.82203,2466.302301,7968.517631,413.760661,11916.589027,2762.817713,...,15065.529595,7.210626,2894.964683,7.344701,11288.014642,7.762595,6641.767308,7.316009,11026.80277,0.551724
std,321.323243,5.826776,6.182216,2866.106859,191.909497,583.469098,1744.304821,83.151072,2657.316862,657.354381,...,3730.556824,5.419995,716.210067,5.82697,2866.122202,5.807348,1398.213177,5.175228,2677.48972,0.499475
min,953.895798,0.006976,0.013735,8181.376094,637.176162,1606.945756,5518.843898,291.276226,7834.819418,1641.900914,...,9446.175374,0.159007,1898.15085,0.057244,7405.484356,0.178915,4644.152278,0.025516,6961.452208,0.0
25%,1202.926778,3.426605,2.770514,10339.481188,787.441129,2113.501732,6745.358858,357.447284,10264.680588,2319.567901,...,12767.476157,2.880577,2415.489452,3.118951,9392.334222,3.580492,5792.977792,3.000158,9372.633905,0.0
50%,1314.38375,6.533316,7.249626,11456.561327,877.324827,2305.793236,7654.728192,402.385246,11232.84224,2702.861677,...,14260.298054,5.645545,2762.972427,5.900408,10498.07107,7.060184,6401.50606,6.826692,10503.611056,1.0
75%,1516.568905,12.825365,11.62137,13134.871341,989.564344,2714.638853,8655.803182,452.149374,12847.353697,3030.492834,...,16302.060739,11.144046,3185.53454,10.298938,12693.954672,10.560377,7125.618139,11.410409,11980.339257,1.0
max,3032.716622,24.874929,31.516674,25987.040394,1796.244934,5397.214939,16495.702928,782.718144,24773.080464,5679.364743,...,33311.007208,24.839133,6265.097428,30.364303,24459.884594,27.231358,13458.48762,23.916894,23831.661258,1.0


In [144]:
# Distinct values of the target column, to see which labels the classifier must predict.
concat_df['Outcome'].unique()

array([0, 1], dtype=int64)

In [145]:
# Distinct column dtypes in the frame: a quick check for any non-numeric columns.
concat_df.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [146]:
# Class counts of the target over the combined (test + train) rows.
concat_df['Outcome'].value_counts()

1    64
0    52
Name: Outcome, dtype: int64

In [147]:
# Class counts in the original training file only, for comparison with the test file.
train_df['Outcome'].value_counts()

1    56
0    35
Name: Outcome, dtype: int64

In [148]:
# Class counts in the original test file only, for comparison with the training file.
test_df['Outcome'].value_counts()

0    17
1     8
Name: Outcome, dtype: int64

In [149]:
# True if at least one cell anywhere in the combined frame is missing.
concat_df.isna().to_numpy().any()

False

### Preprocessing

In [150]:
# Split the target off the feature matrix. DataFrame.pop removes the column
# from concat_df in place, so from here on concat_df holds features only.
# The membership guard keeps this cell safe to re-run: a second pop on the
# already-stripped frame would raise KeyError, so the existing y is kept instead.
if 'Outcome' in concat_df.columns:
    y = concat_df.pop('Outcome')

In [151]:
# Standardize every feature column and wrap the result in a DataFrame that
# reuses the original column names (the row index is a fresh 0..n-1 range).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so held-out rows influence the scaling statistics; also,
# scaled_df is not the frame passed to train_test_split in this notebook view.
# Confirm both are intended.
scaler = StandardScaler().fit(concat_df)
scaled_df = pd.DataFrame(
    scaler.transform(concat_df),
    columns=concat_df.columns,
)

In [157]:
# Show the standardized feature frame (pandas abbreviates long/wide frames in the output).
scaled_df

Unnamed: 0,Plasma renin activity,Aldosterone-to-renin ratio -more detailed ranges in Aldosterone/renin ratio article,Oxygen saturation,17α-Hydroxyprogesterone,Anti ds-DNA,Reticulocyte hemoglobin equivalent,Fibrinogen,Procalcitonin,SHBG -more detailed ranges in SHBG article,[H+],...,LDL cholesterol (Not valid when triglycerides >5.0 mmol/L),Luteinizing hormone (LH)-more detailed menstrual cycle ranges in separate diagram,Alpha 1-antitrypsin (AAT),Eosinophil cationic protein (ECP),Absolute content of carbon dioxide (CO2),Chloride (Cl),Amylase,IgM,Copper (Cu),Viscosity
0,-0.608446,0.888516,0.163281,-0.706935,-0.581061,-0.818857,-0.743022,-0.542813,-0.800350,-0.338222,...,-0.836165,-0.860983,0.719448,-0.851516,-1.045997,-0.559470,-0.464146,-0.659920,1.129705,-0.869063
1,-0.680225,-0.307084,0.253089,-0.757793,-0.634323,-0.852364,-0.794262,-0.588100,-0.799233,-0.482380,...,-0.790434,-0.900399,-0.075934,-0.919826,-0.336481,-0.659190,-0.024297,-0.701282,-1.388092,-0.920676
2,2.562353,0.581142,0.309802,1.866656,2.120181,2.431090,3.571763,2.870224,2.038830,3.447345,...,1.472169,1.878212,-0.623890,2.996246,0.736863,2.153884,-1.035956,2.437581,0.950992,2.254295
3,0.001253,1.514539,-0.543892,-0.139071,-0.175681,-0.019217,-0.122574,-0.175481,-0.013962,0.057796,...,-0.148486,-0.157087,1.254403,-0.146353,-0.938895,-0.164373,0.055158,-0.093652,-0.484452,-0.117207
4,-0.132761,1.355796,0.115813,-0.449307,-0.111574,-0.452023,0.124908,0.195675,-0.506542,0.346992,...,-0.458284,-0.640470,1.321461,-0.354396,0.160827,-0.269232,0.593325,-0.133522,-0.761913,-0.586766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,-0.075524,0.534999,0.200346,0.241549,0.200225,0.132268,0.026060,0.171219,0.184540,0.131879,...,-0.232362,0.325864,-0.952404,0.463821,1.649565,0.297484,-1.092748,0.050640,-0.271989,0.368775
112,-0.338438,0.237314,1.245253,-0.226854,-0.076856,-0.282913,-0.222072,-0.123460,-0.193686,-0.385776,...,-0.065411,-0.170027,-0.404426,-0.244730,0.200933,-0.421444,-0.585737,-0.160997,-1.381007,-0.203760
113,0.699561,-0.319997,-1.304119,1.002224,1.285999,0.769095,1.080090,1.374172,0.832229,1.100946,...,0.552174,1.024079,0.923025,1.278875,-0.227982,1.029520,0.189586,1.018203,3.221676,1.077897
114,-0.334533,0.899270,-0.655941,-0.132377,0.334408,-0.368975,0.144575,0.369054,-0.320910,0.002989,...,-0.208176,-0.067439,-0.722403,0.070678,0.301174,-0.289940,0.974255,0.042872,0.414661,-0.083393


In [158]:
# Disabled experiment: blank out extreme values in the standardized data.
# With the lines below active, any cell whose z-score is not strictly inside
# (-3, 3) shows as NaN in the displayed frame. The masked frame is only
# displayed, never assigned, so scaled_df itself stays unchanged.
# outliers_threshold = 3
# filter_mask = (scaled_df < outliers_threshold) & (scaled_df > -outliers_threshold)

# scaled_df[filter_mask]

Unnamed: 0,Plasma renin activity,Aldosterone-to-renin ratio -more detailed ranges in Aldosterone/renin ratio article,Oxygen saturation,17α-Hydroxyprogesterone,Anti ds-DNA,Reticulocyte hemoglobin equivalent,Fibrinogen,Procalcitonin,SHBG -more detailed ranges in SHBG article,[H+],...,LDL cholesterol (Not valid when triglycerides >5.0 mmol/L),Luteinizing hormone (LH)-more detailed menstrual cycle ranges in separate diagram,Alpha 1-antitrypsin (AAT),Eosinophil cationic protein (ECP),Absolute content of carbon dioxide (CO2),Chloride (Cl),Amylase,IgM,Copper (Cu),Viscosity
0,-0.608446,0.888516,0.163281,-0.706935,-0.581061,-0.818857,-0.743022,-0.542813,-0.800350,-0.338222,...,-0.836165,-0.860983,0.719448,-0.851516,-1.045997,-0.559470,-0.464146,-0.659920,1.129705,-0.869063
1,-0.680225,-0.307084,0.253089,-0.757793,-0.634323,-0.852364,-0.794262,-0.588100,-0.799233,-0.482380,...,-0.790434,-0.900399,-0.075934,-0.919826,-0.336481,-0.659190,-0.024297,-0.701282,-1.388092,-0.920676
2,2.562353,0.581142,0.309802,1.866656,2.120181,2.431090,,2.870224,2.038830,,...,1.472169,1.878212,-0.623890,2.996246,0.736863,2.153884,-1.035956,2.437581,0.950992,2.254295
3,0.001253,1.514539,-0.543892,-0.139071,-0.175681,-0.019217,-0.122574,-0.175481,-0.013962,0.057796,...,-0.148486,-0.157087,1.254403,-0.146353,-0.938895,-0.164373,0.055158,-0.093652,-0.484452,-0.117207
4,-0.132761,1.355796,0.115813,-0.449307,-0.111574,-0.452023,0.124908,0.195675,-0.506542,0.346992,...,-0.458284,-0.640470,1.321461,-0.354396,0.160827,-0.269232,0.593325,-0.133522,-0.761913,-0.586766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,-0.075524,0.534999,0.200346,0.241549,0.200225,0.132268,0.026060,0.171219,0.184540,0.131879,...,-0.232362,0.325864,-0.952404,0.463821,1.649565,0.297484,-1.092748,0.050640,-0.271989,0.368775
112,-0.338438,0.237314,1.245253,-0.226854,-0.076856,-0.282913,-0.222072,-0.123460,-0.193686,-0.385776,...,-0.065411,-0.170027,-0.404426,-0.244730,0.200933,-0.421444,-0.585737,-0.160997,-1.381007,-0.203760
113,0.699561,-0.319997,-1.304119,1.002224,1.285999,0.769095,1.080090,1.374172,0.832229,1.100946,...,0.552174,1.024079,0.923025,1.278875,-0.227982,1.029520,0.189586,1.018203,,1.077897
114,-0.334533,0.899270,-0.655941,-0.132377,0.334408,-0.368975,0.144575,0.369054,-0.320910,0.002989,...,-0.208176,-0.067439,-0.722403,0.070678,0.301174,-0.289940,0.974255,0.042872,0.414661,-0.083393


#### First, address the class imbalance between the training data and the test data

In [153]:
# Re-split the combined rows 80/20 with a fixed seed, stratified on the target
# so both parts keep the same class ratio.
# NOTE(review): the features passed here are concat_df (unscaled), not
# scaled_df; confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    concat_df,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [154]:
# Class counts in the new training split.
y_train.value_counts()

1    51
0    41
Name: Outcome, dtype: int64

In [155]:
# Class counts in the new test split, to compare against the training split.
y_test.value_counts()

1    13
0    11
Name: Outcome, dtype: int64

In [None]:
## baseline