In [1]:
import time
from datetime import datetime
from pathlib import Path
from zipfile import ZipFile

import imblearn
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import (
    decomposition,
    discriminant_analysis,
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    pipeline,
    preprocessing,
    svm,
)

In [2]:
DATA = Path("public_data")

DROP_VARS = ["ADMITTIME", "DISCHTIME", "SUBJECT_ID", "HADM_ID"]

features = pd.read_csv(
    DATA / "mimic_synthetic_feat.name", header=None
).values.flatten()

labels = pd.read_csv(
    DATA / "mimic_synthetic_label.name", header=None
).values.flatten()

x_df = pd.read_csv(
    DATA / "mimic_synthetic_train.data",
    header=None,
    names=features,
    sep=" ",
)

# Remove time related data that are not needed
x_df.drop(columns=DROP_VARS, inplace=True)

ys = pd.Series(
    pd.read_csv(
        DATA / "mimic_synthetic_train.solution",
        header=None,
        names=labels,
        sep=" ",
    ).values.flatten()
)

# Load testing set
x_test_df = pd.read_csv(
    DATA / "mimic_synthetic_test.data",
    header=None,
    names=features,
    sep=" ",
)

# Remove time related data that are not needed
x_test_df.drop(columns=DROP_VARS, inplace=True)


In [3]:
def fill_prev (train, test):
    """
    Filling the cell containing NaN values with previous entry
    """
    
    na_cols = set(train.columns[train.isna().any()])
    for col in na_cols:
        train[col] = train[col].fillna(method='ffill').fillna(method='bfill')
    na_cols =  set(test.columns[test.isna().any()])
    for col in na_cols:
        test[col] = test[col].fillna(method='ffill').fillna(method='bfill')
    return train, test

In [4]:
x_df, x_test_df = fill_prev(x_df, x_test_df)

In [5]:
def merge_rare_categories(dfs, col, keeps=None, keep_n=5):
    if keeps is None:
        keeps = x_df[col].value_counts()[:keep_n].index
        print(keeps)

    for df in dfs:
        df.loc[~df[col].isin(keeps), col] = "OTHER"

In [6]:
merge_rare_categories([x_df, x_test_df], col="RELIGION", keep_n=5)
merge_rare_categories([x_df, x_test_df], col="LANGUAGE", keep_n=2)

Index(['CATHOLIC', 'NOT_SPECIFIED', 'UNOBTAINABLE', 'PROTESTANT_QUAKER',
       'JEWISH'],
      dtype='object')
Index(['ENGL', 'SPAN'], dtype='object')


In [7]:
# 1-Hot enconding
x_all_1hot_df = pd.get_dummies(pd.concat([x_df, x_test_df]))

x_1hot_df = x_all_1hot_df[: len(x_df)]
x_test_1hot_df = x_all_1hot_df[len(x_df) :]

In [8]:
const_cols = {col for col in x_1hot_df if len(x_1hot_df[col].unique()) == 1}
x_1hot_df.drop(const_cols, axis="columns", inplace=True)
x_test_1hot_df.drop(const_cols, axis="columns", inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
scaler = preprocessing.MinMaxScaler()
x_1hot_df = scaler.fit_transform(x_1hot_df)
x_test_1hot_df = scaler.fit_transform(x_test_1hot_df)

#pca = decomposition.PCA(n_components=150)
#x_1hot_df = pca.fit_transform(x_1hot_df)

In [10]:
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    x_1hot_df, ys, test_size=0.3
)

In [11]:
#SMOTE
Xb=pd.DataFrame(x_train)
Yb=pd.DataFrame(y_train)

from imblearn.over_sampling import KMeansSMOTE

sm = KMeansSMOTE(k_neighbors=100, kmeans_estimator=1)

x_train, y_train = sm.fit_resample(Xb, np.ravel(Yb))



In [12]:
# Random Forest Parameter tuning 

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 50, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [None]:
#model = linear_model.LogisticRegression(max_iter=10000)
model = ensemble.RandomForestClassifier()

model = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
model.fit(x_train, y_train)
model.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
from sklearn.model_selection import cross_val_score

model.fit(x_train, y_train)
y_pred = model.predict(x_valid)  # predictions

CV_accuracy = cross_val_score(model, x_train, y_train, scoring="balanced_accuracy", cv=10)  # scoring
print(f"Balanced accuracy score: {np.mean(CV_accuracy):.3g}")

In [None]:
predictions = model.predict(x_test_1hot_df)  # compute predictions
# VARS: x_test_df, x_test_fact_df, x_test_1hot_df, x_test_1hot_pcs
# Models: model, search


predictions_file = "mimic_synthetic_test.csv"

pd.Series(predictions).to_csv(predictions_file, index=False, header=False)

print("Predictions saved.")

t_stamp = time.asctime().replace(" ", "_").replace(":", "-")

output_file = f"submission_{t_stamp}.zip"

!zip test_submission.zip mimic_synthetic_test.csv  # create a ZIP

with ZipFile(output_file, "w") as z:
    z.write(predictions_file)

print(f"The submission is ready: {output_file}")