In [18]:
import time
from datetime import datetime
from pathlib import Path
from zipfile import ZipFile

import imblearn
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import (
    decomposition,
    discriminant_analysis,
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    pipeline,
    preprocessing,
    svm,
)

In [19]:
DATA = Path("public_data")

DROP_VARS = ["ADMITTIME", "DISCHTIME", "SUBJECT_ID", "HADM_ID"]

features = pd.read_csv(
    DATA / "mimic_synthetic_feat.name", header=None
).values.flatten()

labels = pd.read_csv(
    DATA / "mimic_synthetic_label.name", header=None
).values.flatten()

x_df = pd.read_csv(
    DATA / "mimic_synthetic_train.data",
    header=None,
    names=features,
    sep=" ",
)

# Remove time related data that are not needed
x_df.drop(columns=DROP_VARS, inplace=True)

ys = pd.Series(
    pd.read_csv(
        DATA / "mimic_synthetic_train.solution",
        header=None,
        names=labels,
        sep=" ",
    ).values.flatten()
)

# Load testing set
x_test_df = pd.read_csv(
    DATA / "mimic_synthetic_test.data",
    header=None,
    names=features,
    sep=" ",
)

# Remove time related data that are not needed
x_test_df.drop(columns=DROP_VARS, inplace=True)


In [20]:
def fill_prev (train, test):
    """
    Filling the cell containing NaN values with previous entry
    """
    
    na_cols = set(train.columns[train.isna().any()])
    for col in na_cols:
        train[col] = train[col].fillna(method='ffill').fillna(method='bfill')
    na_cols =  set(test.columns[test.isna().any()])
    for col in na_cols:
        test[col] = test[col].fillna(method='ffill').fillna(method='bfill')
    return train, test

In [21]:
x_df, x_test_df = fill_prev(x_df, x_test_df)

In [22]:
def merge_rare_categories(dfs, col, keeps=None, keep_n=5):
    if keeps is None:
        keeps = x_df[col].value_counts()[:keep_n].index
        print(keeps)

    for df in dfs:
        df.loc[~df[col].isin(keeps), col] = "OTHER"

In [23]:
merge_rare_categories([x_df, x_test_df], col="RELIGION", keep_n=5)
merge_rare_categories([x_df, x_test_df], col="LANGUAGE", keep_n=2)

Index(['CATHOLIC', 'NOT_SPECIFIED', 'UNOBTAINABLE', 'PROTESTANT_QUAKER',
       'JEWISH'],
      dtype='object')
Index(['ENGL', 'SPAN'], dtype='object')


In [24]:
# 1-Hot enconding
x_all_1hot_df = pd.get_dummies(pd.concat([x_df, x_test_df]))

x_1hot_df = x_all_1hot_df[: len(x_df)]
x_test_1hot_df = x_all_1hot_df[len(x_df) :]

In [25]:
const_cols = {col for col in x_1hot_df if len(x_1hot_df[col].unique()) == 1}
x_1hot_df.drop(const_cols, axis="columns", inplace=True)
x_test_1hot_df.drop(const_cols, axis="columns", inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [26]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = preprocessing.MinMaxScaler()
x_1hot_df = scaler.fit_transform(x_1hot_df)
x_test_1hot_df = scaler.fit_transform(x_test_1hot_df)

pca = decomposition.PCA(n_components=150)
x_1hot_df = pca.fit_transform(x_1hot_df)
x_test_1hot_df = pca.fit_transform(x_test_1hot_df)


In [28]:
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    x_1hot_df, ys, test_size=0.3
)

In [29]:
#SMOTE
Xb=pd.DataFrame(x_train)
Yb=pd.DataFrame(y_train)

from imblearn.over_sampling import SMOTE
sm = SMOTE()
x_train, y_train = sm.fit_resample(Xb, np.ravel(Yb))

In [35]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_validate
from sklearn.metrics import balanced_accuracy_score

lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)

results = cross_validate(lda, x_train, y_train, cv=2, scoring='balanced_accuracy')
sorted(results.keys())


print("Cross validation balanced accuracy: ")
print(results['test_score'])
print("The average score of cross validation: ", results['test_score'].mean())


y_pred = lda.predict(x_valid)  # predictions
score = balanced_accuracy_score(y_valid, y_pred)  # scoring
print(" Balanced accuracy score: {}".format(score))


#model = linear_model.LogisticRegression(max_iter=10000)
#model = ensemble.RandomForestClassifier(n_estimators=10)

Cross validation balanced accuracy: 
[0.79048253 0.79137063]
The average score of cross validation:  0.790926583777383
 Balanced accuracy score: 0.7487729399880051


In [None]:
predictions = model.predict(x_test_1hot_df)  # compute predictions
# VARS: x_test_df, x_test_fact_df, x_test_1hot_df, x_test_1hot_pcs
# Models: model, search


predictions_file = "mimic_synthetic_test.csv"

pd.Series(predictions).to_csv(predictions_file, index=False, header=False)

print("Predictions saved.")

t_stamp = time.asctime().replace(" ", "_").replace(":", "-")

output_file = f"submission_{t_stamp}.zip"

!zip test_submission.zip mimic_synthetic_test.csv  # create a ZIP

with ZipFile(output_file, "w") as z:
    z.write(predictions_file)

print(f"The submission is ready: {output_file}")