In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
TRAIN_PATH = 'filtered_train_df_0705.csv'
VAL_PATH = 'filtered_val_df_0705.csv'
TEST_PATH = 'filtered_test_df_0705.csv'

In [3]:
train_df = pd.read_csv(TRAIN_PATH)

In [None]:
for path in [TRAIN_PATH,VAL_PATH,TEST_PATH]:
    df = pd.read_csv(path)
    df = df[['ID','Label']].groupby(by='ID').max().reset_index()
    total = df.shape[0]
    type_d = path.split('_')[1]
    df=df.groupby(by='Label').count().reset_index()
    print(f'{type_d}: Label 0: {df[df.Label==0]["ID"].values[0]}/{total}={df[df.Label==0]["ID"].values[0]/total}')
    print(f'{type_d}: Label 1: {df[df.Label==1]["ID"].values[0]}/{total}={df[df.Label==1]["ID"].values[0]/total} ')

In [38]:
train_small = train_df[['ID','Label']].groupby(by='ID').max().reset_index()
over = RandomOverSampler(sampling_strategy=0.3)
under = RandomUnderSampler(sampling_strategy=0.5)
ids_over,labels_over = over.fit_resample(train_small[['ID']], train_small['Label'])
ids_over_under,labels_over_under = under.fit_resample(ids_over, labels_over)

In [39]:
ids_over,labels_over = over.fit_resample(train_small[['ID']], train_small['Label'])

In [5]:
s = train_df[train_df['ID'].isin([0,1,2,3])]
s

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,ID,Label,max_ICULOS,time_bm
75951,,,,,,,,,,,...,0,0.0,1.0,-98.60,1,0,0,0,23,-22
75952,61.0,99.0,36.44,124.0,65.00,43.0,17.5,,,,...,0,0.0,1.0,-98.60,2,0,0,0,23,-21
75953,64.0,98.0,,125.0,64.00,41.0,27.0,,,,...,0,0.0,1.0,-98.60,3,0,0,0,23,-20
75954,56.0,100.0,,123.0,65.00,41.0,9.0,,,,...,0,0.0,1.0,-98.60,4,0,0,0,23,-19
75955,66.0,99.0,,120.0,67.00,43.0,23.0,,,,...,0,0.0,1.0,-98.60,5,0,0,0,23,-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539766,58.0,99.0,,152.5,100.00,76.0,14.0,,,,...,1,,,-0.02,44,0,3,0,48,-4
539767,68.0,99.0,,128.0,74.33,77.0,17.0,,,,...,1,,,-0.02,45,0,3,0,48,-3
539768,71.0,95.0,,146.5,94.50,73.0,17.0,,,23.0,...,1,,,-0.02,46,0,3,0,48,-2
539769,71.0,96.0,37.00,149.0,94.00,73.0,18.0,,,,...,1,,,-0.02,47,0,3,0,48,-1


In [58]:
# FREQUENCY_ATTR =['5w_sum_BaseExcess', '5w_sum_FiO2', '5w_sum_pH', '5w_sum_PaCO2', '5w_sum_Glucose', '5w_sum_Lactate', '5w_sum_PTT']
LAB_ATTR = [ 'Hct',  'Glucose','Potassium']
CONST_ATTR = ['ID','max_ICULOS','Gender']
OTHER_ATTR = ['HR','MAP','O2Sat', 'Resp','SBP','ICULOS']
LABEL_ATTR= 'Label'
COLS = CONST_ATTR+LAB_ATTR+OTHER_ATTR

In [59]:
s = s[COLS+[LABEL_ATTR]]
s

Unnamed: 0,ID,max_ICULOS,Gender,Hct,Glucose,Potassium,HR,MAP,O2Sat,Resp,SBP,ICULOS,Label
75951,0,23,0,,,,,,,,,1,0
75952,0,23,0,,,,61.0,65.00,99.0,17.5,124.0,2,0
75953,0,23,0,,,,64.0,64.00,98.0,27.0,125.0,3,0
75954,0,23,0,,,,56.0,65.00,100.0,9.0,123.0,4,0
75955,0,23,0,,,,66.0,67.00,99.0,23.0,120.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
539766,3,48,1,,,,58.0,100.00,99.0,14.0,152.5,44,0
539767,3,48,1,,,,68.0,74.33,99.0,17.0,128.0,45,0
539768,3,48,1,39.8,94.0,3.9,71.0,94.50,95.0,17.0,146.5,46,0
539769,3,48,1,,,,71.0,94.00,96.0,18.0,149.0,47,0


In [95]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
patients = list(set(train_df.ID.values))
imputed = pd.DataFrame()
all_data_means = train_df.mean()
for patient in patients:
    tmp_df = train_df[train_df['ID']==patient][COLS+['Label']]
    # print(tmp_df.shape)
    for f in LAB_ATTR+OTHER_ATTR:
        if tmp_df[f].isnull().all:
            tmp_df[f]=tmp_df[f].fillna(all_data_means[f])
    imp = IterativeImputer(max_iter=50, random_state=0)
    try:
        imp.fit(tmp_df)
        tmp_df= pd.DataFrame(imp.transform(tmp_df), columns = COLS+['Label'])
        imputed=imputed.append(tmp_df)
    except:
        print(tmp_df.shape)
print('done')

(33, 13)
(54, 13)
(58, 13)


KeyboardInterrupt: 

In [92]:
train_df.mean()['HR']

84.48628456940398

In [71]:
imputed[imputed.ID==3]

Unnamed: 0,ID,max_ICULOS,Gender,Hct,Glucose,Potassium,HR,MAP,O2Sat,Resp,SBP,ICULOS,Label
0,3.0,48.0,1.0,42.273606,160.732764,3.9,77.0,95.0,97.0,14.0,135.0,4.0,0.0
1,3.0,48.0,1.0,42.094533,160.737112,3.9,73.0,91.0,99.0,16.0,139.0,5.0,0.0
2,3.0,48.0,1.0,41.602434,160.743672,3.9,62.0,96.0,99.0,13.0,143.0,6.0,0.0
3,3.0,48.0,1.0,42.22518,160.736909,3.9,84.909192,91.480435,97.141373,16.401137,138.2946,7.0,0.0
4,3.0,48.0,1.0,42.190665,160.737245,3.9,84.497589,91.321775,97.113864,16.421269,138.219925,8.0,0.0
5,3.0,48.0,1.0,42.15615,160.73758,3.9,84.085986,91.163115,97.086355,16.441401,138.14525,9.0,0.0
6,3.0,48.0,1.0,41.013945,160.758143,3.9,74.0,106.0,98.0,14.0,152.0,10.0,0.0
7,3.0,48.0,1.0,40.966113,160.758971,3.9,75.5,107.5,98.0,14.0,152.0,11.0,0.0
8,3.0,48.0,1.0,41.762034,160.746731,3.9,92.0,99.5,98.0,16.5,143.0,12.0,0.0
9,3.0,48.0,1.0,41.618178,160.752475,3.9,98.0,98.5,97.5,20.5,148.0,13.0,0.0


In [70]:
s[s.ID==3]

Unnamed: 0,ID,max_ICULOS,Gender,Hct,Glucose,Potassium,HR,MAP,O2Sat,Resp,SBP,ICULOS,Label
539726,3,48,1,,,,77.0,95.0,97.0,14.0,135.0,4,0
539727,3,48,1,,,,73.0,91.0,99.0,16.0,139.0,5,0
539728,3,48,1,,,,62.0,96.0,99.0,13.0,143.0,6,0
539729,3,48,1,,,,,,,,,7,0
539730,3,48,1,,,,,,,,,8,0
539731,3,48,1,,,,,,,,,9,0
539732,3,48,1,,,,74.0,106.0,98.0,14.0,152.0,10,0
539733,3,48,1,,,,75.5,107.5,98.0,14.0,152.0,11,0
539734,3,48,1,,,,92.0,99.5,98.0,16.5,143.0,12,0
539735,3,48,1,,,,98.0,98.5,97.5,20.5,148.0,13,0


In [None]:
s = s.sort_values(by=['ID','ICULOS'], ascending =[True,True])
rolling = df[['ID']+attr].groupby('ID').rolling(window=window_size, closed='both').count()
rolling= rolling.rename(columns={at: f'{window_size}w_sum_{at}' for at in attr})
rolling=rolling[list(rolling.columns)[1:]].reset_index().set_index('level_1')
combined = df.join(rolling,how='left', rsuffix= 'r')

In [10]:
s[['Unit1','Unit2']]

Unnamed: 0,Unit1,Unit2
75951,0.0,1.0
75952,0.0,1.0
75953,0.0,1.0
75954,0.0,1.0
75955,0.0,1.0
...,...,...
539766,,
539767,,
539768,,
539769,,


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

xgbc = XGBClassifier()
xgbc.fit(xtrain, ytrain)
y_train_pred = xgbc.predict(xtrain)
print(f'Train F1: {f1_score(ytrain,y_train_pred)}')
y_val_pred = xgbc.predict(xval)
print(f'Val F1: {f1_score(yval,y_val_pred)}')

In [28]:
from xgboost import XGBClassifier


ModuleNotFoundError: No module named 'xgboost'

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)
y_train_pred = clf.predict(xtrain)

y_pred=clf.predict(X_test)

print(f'Train F1: {f1_score(ytrain, y_train_pred)}')
y_val_pred = xgbc.predict(xval)
print(f'Val F1: {f1_score(yval, y_val_pred)}')


In [29]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-1.6.0-py3-none-manylinux2014_x86_64.whl (193.7 MB)
[K     |████████████████████████████████| 193.7 MB 23 kB/s s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.6.0
Note: you may need to restart the kernel to use updated packages.
