In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

from sklearn.svm import LinearSVR
from sklearn.svm import SVR


# Data Loading

In [2]:
dat = pd.read_csv(filepath_or_buffer= "./train_features.csv")
age = pd.Series(dat.Age.values,index=dat.pid).to_dict()
pids = list(set(dat.pid))
column_names = list(dat)
dat_by_id = dat.groupby('pid')

In [3]:
# Loading the labels
labs = pd.read_csv("./train_labels.csv")
labs.isna().sum()
# No NAs

pid                       0
LABEL_BaseExcess          0
LABEL_Fibrinogen          0
LABEL_AST                 0
LABEL_Alkalinephos        0
LABEL_Bilirubin_total     0
LABEL_Lactate             0
LABEL_TroponinI           0
LABEL_SaO2                0
LABEL_Bilirubin_direct    0
LABEL_EtCO2               0
LABEL_Sepsis              0
LABEL_RRate               0
LABEL_ABPm                0
LABEL_SpO2                0
LABEL_Heartrate           0
dtype: int64

### Histogram of the output labels 

We should check for class imbalance.

### Loading the test data

In [4]:
test = pd.read_csv("./test_features.csv")
test_pids = list(set(test.pid))
test_column_names = list(test)
test_by_id = test.groupby('pid')

### Data Aggregation

In [5]:
selected_col_names = [j + str(i) for j in column_names[3:] for i in range(12)]
#[ + str(i) for i in range(37)]

In [6]:
out = dat_by_id.get_group(pids[0])[column_names[3:]].stack(dropna = False).transpose().to_list()

In [7]:
# # This block is for stacking the time-series next to each-other
# rows = []
# for i in pids:
#     rows.append(dat_by_id.get_group(i)[column_names[3:]].stack(dropna = False).transpose().to_list())
# dat_agg = pd.DataFrame(rows, columns = selected_col_names)
# print(dat_agg)

In [8]:
dat_summarystat = dat.describe()
dat_median = dat_summarystat.loc['50%',]

test_summarystat = test.describe()
test_median = test_summarystat.loc['50%',]

In [9]:
# Taking the median for each feature
# Sample for one vector
temp_dat = dat_by_id.get_group(pids[0])
tmp_median = temp_dat.median(skipna = True)
print(tmp_median)
tmp_median[tmp_median.isna()] = dat_median[tmp_median.isna()]
print(tmp_median.to_list())

pid                   1.00
Time                  8.50
Age                  34.00
EtCO2                  NaN
PTT                    NaN
BUN                  12.00
Lactate                NaN
Temp                 37.00
Hgb                   8.50
HCO3                 26.00
BaseExcess            0.00
RRate                18.00
Fibrinogen             NaN
Phosphate             4.60
WBC                   4.70
Creatinine            0.50
PaCO2                43.00
AST                    NaN
FiO2                  0.40
Platelets           143.00
SaO2                   NaN
Glucose             120.00
ABPm                 67.50
Magnesium             1.80
Potassium             4.10
ABPd                 48.50
Calcium               7.60
Alkalinephos           NaN
SpO2                100.00
Bilirubin_direct       NaN
Chloride            111.00
Hct                  23.10
Heartrate            75.00
Bilirubin_total        NaN
TroponinI              NaN
ABPs                111.00
pH                    7.37
d

  overwrite_input=overwrite_input)


### Train dataset aggregation

In [10]:
rows = []
for i in pids:
    temp_dat = dat_by_id.get_group(i)
    tmp_median = temp_dat.median(skipna = True)
    tmp_idx = tmp_median.isna()
    tmp_median[tmp_idx] = dat_median[tmp_idx]
    rows.append(tmp_median)
dat_agg = pd.DataFrame(rows, columns = list(dat))
print(dat_agg)

           pid  Time   Age  EtCO2   PTT   BUN  Lactate  Temp    Hgb  HCO3  \
0          1.0   8.5  34.0   33.0  32.2  12.0      2.1  37.0   8.50  26.0   
1          2.0   6.5  86.0   33.0  31.8  32.0      2.1  36.0  13.10  23.9   
2          4.0   6.5  66.0   33.0  34.6   8.0      2.1  37.0  10.55  23.9   
3          6.0   7.5  66.0   33.0  53.8  32.0      1.8  38.0  10.60  19.5   
4          8.0   6.5  42.0   33.0  32.2  18.0      2.1  36.0  10.50  23.9   
...        ...   ...   ...    ...   ...   ...      ...   ...    ...   ...   
18990  31653.0   6.5  52.0   33.0  25.8  11.0      1.7  36.0   9.10  23.0   
18991  31654.0   6.5  66.0   33.0  32.2  33.0      2.1  37.5  11.20  23.9   
18992  31656.0   6.5  44.0   33.0  32.2  15.0      2.1  38.0  12.40  24.0   
18993  31657.0   6.5  70.0   33.0  32.2  17.0      2.1  36.5  10.50  23.9   
18994  31658.0   6.5  60.0   33.0  32.2  13.0      2.1  36.5  14.40  23.9   

       ...  Alkalinephos   SpO2  Bilirubin_direct  Chloride    Hct  Heartra

### Test data aggregation

In [11]:
rows = []
for i in test_pids:
    temp_dat = test_by_id.get_group(i)
    tmp_median = temp_dat.median(skipna = True)
    tmp_idx = tmp_median.isna()
    tmp_median[tmp_idx] = dat_median[tmp_idx]
    rows.append(tmp_median)
test_agg = pd.DataFrame(rows, columns = list(dat))
print(test_agg)

  overwrite_input=overwrite_input)


           pid  Time   Age  EtCO2    PTT   BUN  Lactate  Temp    Hgb  HCO3  \
0          0.0   6.5  39.0   33.0  41.35  18.5      2.1  36.0   9.65  14.5   
1          3.0   6.5  84.0   33.0  32.20  17.0      2.1  35.0  10.50  23.9   
2          5.0   6.5  62.0   33.0  32.20  17.0      2.1  37.0  10.50  23.9   
3          7.0   6.5  71.0   33.0  41.80  21.0      2.1  36.0   8.70  25.0   
4          9.0   6.5  51.0   33.0  35.40  47.0      2.1  36.5  10.30  22.0   
...        ...   ...   ...    ...    ...   ...      ...   ...    ...   ...   
12659  31647.0   6.5  29.0   33.0  32.20  17.0      2.1  37.0  10.50  23.9   
12660  31649.0   6.5  83.0   33.0  28.80  22.0      2.2  36.0  12.30  24.0   
12661  31651.0   6.5  74.0   33.0  32.40  38.5      2.1  35.5  10.45  31.5   
12662  31652.0   6.5  40.0   33.0  32.20  11.0      2.1  37.0  11.05  23.9   
12663  31655.0   6.5  23.0   33.0  56.80  17.0      2.1  37.0  11.80  23.9   

       ...  Alkalinephos   SpO2  Bilirubin_direct  Chloride    

# Imputation

In [12]:
dat_agg.EtCO2.mean()

32.983969465648855

In [13]:
# # Simple imputation
# imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# dat_agg_imputed = imp_mean.fit_transform(X = dat_agg, y = labs)
# dat_agg_imputed = pd.DataFrame(dat_agg_imputed, columns = selected_col_names)

# Imputing with the median
# Was done before
dat_agg_imputed = dat_agg

In [14]:
print(dat_agg_imputed.isna().sum().sum())
print(dat_agg_imputed.shape)

0
(18995, 37)


# Feature Generation

# Class Imbalance

# Prediction

In [15]:
list(labs)
dat_agg_imputed.iloc[:,3:]

Unnamed: 0,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,BaseExcess,RRate,Fibrinogen,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,33.0,32.2,12.0,2.1,37.0,8.50,26.0,0.0,18.0,233.0,...,72.0,100.0,0.3,111.0,23.10,75.0,0.8,0.15,111.0,7.370
1,33.0,31.8,32.0,2.1,36.0,13.10,23.9,-1.0,19.0,233.0,...,72.0,96.0,0.3,107.0,40.20,59.0,0.8,0.44,129.0,7.370
2,33.0,34.6,8.0,2.1,37.0,10.55,23.9,-1.0,14.0,233.0,...,130.0,100.0,0.1,107.0,33.55,72.0,0.6,0.02,124.0,7.370
3,33.0,53.8,32.0,1.8,38.0,10.60,19.5,-3.0,13.5,233.0,...,72.0,100.0,0.3,113.5,30.45,88.0,0.8,0.15,99.0,7.350
4,33.0,32.2,18.0,2.1,36.0,10.50,23.9,-1.0,18.0,233.0,...,72.0,98.0,0.3,107.0,30.90,81.0,0.8,0.08,209.0,7.370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18990,33.0,25.8,11.0,1.7,36.0,9.10,23.0,0.0,14.0,633.0,...,72.0,96.0,0.3,106.0,32.90,102.0,0.8,0.15,87.0,7.330
18991,33.0,32.2,33.0,2.1,37.5,11.20,23.9,-1.0,17.0,233.0,...,72.0,96.0,0.3,107.0,34.60,90.0,0.8,0.15,163.0,7.370
18992,33.0,32.2,15.0,2.1,38.0,12.40,24.0,-3.5,25.0,233.0,...,72.0,100.0,0.3,97.0,36.70,97.5,0.8,0.15,98.0,7.305
18993,33.0,32.2,17.0,2.1,36.5,10.50,23.9,-1.0,15.5,233.0,...,72.0,99.0,0.3,107.0,30.90,64.0,0.8,0.15,109.0,7.370


In [16]:
#clf = SVR(kernel= 'linear', C = 1.0, epsilon = 0.2)
clf = LinearSVR(random_state=0, tol=1e-5)
clf.fit(dat_agg_imputed.iloc[:,3:], labs.LABEL_BaseExcess)



LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=0, tol=1e-05, verbose=0)

In [17]:
pred = clf.predict(test_agg.iloc[:,3:])
# Should make it between 0 and 1
pred/np.sum(pred)

array([7.28190115e-05, 8.29605093e-05, 8.09207184e-05, ...,
       8.06536457e-05, 8.84117021e-05, 8.91986239e-05])

In [30]:
columns = [test_pids]
for i in range(1,12):
    clf = LinearSVR(random_state=0, tol=1e-5, max_iter=1000)
    clf.fit(dat_agg_imputed.iloc[:,3:], labs.iloc[:,i])
    pred = clf.predict(test_agg.iloc[:,3:])
    # Should make it between 0 and 1
    columns.append(pred/np.sum(pred))
                
for i in range(12, 16):
    clf = LinearSVR(random_state=0, tol=1e-5, max_iter=1000)
    clf.fit(dat_agg_imputed.iloc[:,3:], labs.iloc[:,i])
    pred = clf.predict(test_agg.iloc[:,3:])
    columns.append(pred)



In [40]:
result = pd.DataFrame(columns).transpose()
result.columns = list(labs)
result.to_csv('prediction.csv.zip', index=False, float_format='%.3f', compression='zip')

In [39]:
result.to_csv('prediction.csv', index=False, float_format='%.3f')