In [1]:
# base modules
import warnings
warnings.filterwarnings('ignore')

#Sys tools
import os
import sys
import copy
import logging

# custom module
from emlyon_module.imports import *
from emlyon_module.structured import *

#ML Tools
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV

#Jaona Modifs: Getting to the file
sys.path.append("/Users/jaonaandriamasy/Documents/4 - Courses/Machine Learning/Last assignment/ML - Desktop/ML - Desktop")

#Catboost
#!pip install catboost
import catboost
from catboost import CatBoostClassifier, CatBoostRegressor, Pool, cv

In [2]:
def rmse(y_gold, y_pred): 
    return math.sqrt(((y_gold - y_pred)**2).mean()) #Math is from emlyon_module

def print_score(m, X_train, y_train, X_valid, y_valid):
    print('RMSE on train set: {:.4f}'.format(rmse(m.predict(X_train), y_train)))
    print('RMSE on valid set: {:.4f}'.format(rmse(m.predict(X_valid), y_valid)))
    print('R^2 on train set: {:.4f}'.format(m.score(X_train, y_train)))
    print('R^2 on valid set: {:.4f}'.format(m.score(X_valid, y_valid)))
    if hasattr(m, 'oob_score_'): print('R^2 on oob set: {:.4f}'.format(m.oob_score_))
    return

def split_vals(df, n): 
    return df[:n].copy(), df[n:].copy()

- Import preprocessed features as DataFrames and target variables as Series

In [3]:
# training and test sets with original data distribution
X_train_base = pd.read_csv('data/X_train.csv')
y_train_base = pd.read_csv('data/y_train.csv', squeeze=True)

X_test = pd.read_csv('data/X_test.csv')
y_test = pd.read_csv('data/y_test.csv', squeeze=True)

In [5]:
#Validation set
n_total = len(X_train_base)
n_train = 40000 #20% of the training set

X_train, X_valid = split_vals(X_train_base, n_train)
y_train, y_valid = split_vals(y_train_base, n_train)

In [6]:
X_train.shape

(40000, 169)

In [7]:
model = CatBoostRegressor(
    n_estimators = 5000, # iterations, n_estimators, num_boost_round, num_trees
    learning_rate = 0.01,
    random_seed = 42,
    loss_function = 'RMSE',
    #allow_writing_files = False,
    logging_level = 'Silent',
)

%time model.fit(X_train, y_train, eval_set = (X_valid, y_valid), plot = True)
print_score(model, X_train, y_train, X_valid, y_valid) 

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 58.7 s, sys: 11.2 s, total: 1min 9s
Wall time: 29.9 s
RMSE on train set: 0.2727
RMSE on valid set: 0.2782
R^2 on train set: 0.0767
R^2 on valid set: 0.0248


In [8]:
predictions = model.predict(X_valid.iloc[:10, :])

print(np.exp(y_valid[:10]))
print([int(np.exp(pred)) for pred in predictions])

40000    1.0
40001    1.0
40002    1.0
40003    1.0
40004    1.0
40005    1.0
40006    1.0
40007    1.0
40008    1.0
40009    1.0
Name: readmitted_within_30_days, dtype: float64
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
model.tree_count_

1605

In [10]:
### $\bullet$ Hyperparameter tuning

In [12]:
cv_dataset = Pool(data = X_train, label = y_train)

params = {
    "n_estimators": 500,
    "loss_function": "RMSE",
    "verbose": False,
}
scores = catboost.cv(
    cv_dataset,
    params,
    fold_count = 5, 
    plot = "True",
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.2788255747
bestIteration = 380

Training on fold [1/5]

bestTest = 0.2842805309
bestIteration = 498

Training on fold [2/5]

bestTest = 0.2776225722
bestIteration = 465

Training on fold [3/5]

bestTest = 0.2694857975
bestIteration = 411

Training on fold [4/5]

bestTest = 0.2838998036
bestIteration = 470



In [4]:
# resampled / balanced training sets

X_train_rus = pd.read_csv('data/X_train_rus.csv')
y_train_rus = pd.read_csv('data/y_train_rus.csv', squeeze=True)

X_train_ros = pd.read_csv('data/X_train_ros.csv')
y_train_ros = pd.read_csv('data/y_train_ros.csv', squeeze=True)

X_train_smote = pd.read_csv('data/X_train_smote.csv')
y_train_smote = pd.read_csv('data/y_train_smote.csv', squeeze=True)

- We start by building a logistic regression model with default hyperparameter values and fit it on the preprocessed data.

In [27]:
imp = pd.DataFrame({'Column': X_train.columns, 'Feature Importance': clf.feature_importances_})

In [31]:
imp.sort_values(by='Feature Importance', ascending=False)

Unnamed: 0,Column,Feature Importance
170,readmitted_NO,0.360951
169,readmitted_>30,0.352534
1,num_lab_procedures,0.021185
3,num_medications,0.019504
0,time_in_hospital,0.015173
...,...,...
59,admission_source_id_ Transfer from Ambulatory ...,0.000000
133,chlorpropamide_Up,0.000000
66,admission_source_id_Normal Delivery,0.000000
63,admission_source_id_ Transfer from hospital in...,0.000000
