In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy import stats
import statistics
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_error, make_scorer
import seaborn as sns
import os
from scipy.stats import skew, probplot, norm
import sklearn.model_selection as ms
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV, LassoCV, Lasso, ElasticNet 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold 
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor

### importing data

In [7]:
training_df = pd.read_csv('./whole_train_cleaned.csv')
test_df = pd.read_csv('./whole_test_cleaned.csv')
unique_train_df = pd.read_csv('./unique_train_cleaned.csv')
print(f'whole_train_cleaned dataset has {training_df.shape}')
print(f'whole_test_cleaned dataset has {test_df.shape}')
print(f'unique_train_df dataset has {unique_train_df.shape}')


whole_train_cleaned dataset has (89390, 94)
whole_test_cleaned dataset has (9930, 94)
unique_train_df dataset has (64484, 94)


In [8]:
#dropping the columns encounter_id and patient_nbr
training_df = training_df.drop(['encounter_id','patient_nbr'], axis = 1)
test_df = test_df.drop(['encounter_id','patient_nbr'], axis = 1)
unique_train_df = unique_train_df.drop(['encounter_id','patient_nbr'], axis = 1)


In [9]:
print(training_df.shape)
training_df.head(5)

(89390, 92)


Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,65,2,51,3,11,0,0,0,4,0,...,0,1,0,0,0,0,0,0,1,0
1,45,3,86,1,15,1,0,1,9,0,...,0,1,0,0,0,0,0,0,1,0
2,45,13,88,5,34,0,0,0,9,0,...,0,1,0,0,0,0,0,0,0,1
3,85,4,18,2,17,3,2,0,9,0,...,0,1,0,0,0,0,0,0,0,1
4,65,3,22,0,11,1,0,2,6,0,...,0,1,0,0,0,0,0,0,0,1


## Using the whole data set

In [18]:
#separate the input feature and outpout feature
X = training_df.loc[:, training_df.columns !='readmitted']
y = training_df[['readmitted']]


### Scaling the data

In [19]:
#scalar fit the data to training inputs (X)
scaler  = StandardScaler()
scaler.fit(X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [20]:
#save it using a package called pickle
import pickle
scalerfile = 'scaler.sav'
pickle.dump(scaler, open(scalerfile, 'wb'))


In [21]:
# load it back
scaler = pickle.load(open(scalerfile, 'rb'))

In [22]:
#scale the training inputs (X)
X_tf = scaler.transform(X)

## Applying SMOTE method to balance the training dataset

In [25]:
# Data balancing applied using SMOTE
from imblearn.over_sampling import SMOTE

from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=20)
X_new, y_new = sm.fit_sample(X_tf, y)
print('New dataset shape {}'.format(Counter(y_new)))

Original dataset shape Counter({'readmitted': 1})
New dataset shape Counter({0: 79213, 1: 79213})


## Split train data into 80% train and 20% validate

In [26]:
# split train data set in train and test sets for model training
X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new, test_size=0.20, random_state=42)


### Model Selection

In [27]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [28]:
# logistic regression
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(random_state = 42)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
thresh=0.5
y_train_preds = lr.predict_proba(X_train)[:,1]
y_valid_preds = lr.predict_proba(X_valid)[:,1]

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, \
    lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Logistic Regression
Training:
AUC:0.667
accuracy:0.615
recall:0.566
precision:0.627
specificity:0.665
prevalence:0.499
 
Validation:
AUC:0.669
accuracy:0.618
recall:0.568
precision:0.636
specificity:0.669
prevalence:0.504
 


## Using the unique data set

In [31]:
#separate the input feature and outpout feature
X = unique_train_df.loc[:, unique_train_df.columns !='readmitted']
y = unique_train_df[['readmitted']]


### Scaling the data

In [32]:
#scalar fit the data to training inputs (X)
scaler  = StandardScaler()
scaler.fit(X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [33]:
#save it using a package called pickle
import pickle
scalerfile = 'scaler.sav'
pickle.dump(scaler, open(scalerfile, 'wb'))


In [34]:
# load it back
scaler = pickle.load(open(scalerfile, 'rb'))

In [35]:
#scale the training inputs (X)
X_tf = scaler.transform(X)

## Applying SMOTE method to balance the training dataset

In [36]:
# Data balancing applied using SMOTE
from imblearn.over_sampling import SMOTE

from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=20)
X_new, y_new = sm.fit_sample(X_tf, y)
print('New dataset shape {}'.format(Counter(y_new)))

Original dataset shape Counter({'readmitted': 1})
New dataset shape Counter({0: 59528, 1: 59528})


## Split train data into 80% train and 20% validate

In [37]:
# split train data set in train and test sets for model training
X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new, test_size=0.20, random_state=42)


### Model Selection

In [38]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [39]:
# logistic regression
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(random_state = 42)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
thresh=0.5
y_train_preds = lr.predict_proba(X_train)[:,1]
y_valid_preds = lr.predict_proba(X_valid)[:,1]

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, \
    lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Logistic Regression
Training:
AUC:0.684
accuracy:0.628
recall:0.592
precision:0.639
specificity:0.664
prevalence:0.501
 
Validation:
AUC:0.683
accuracy:0.626
recall:0.591
precision:0.630
specificity:0.660
prevalence:0.495
 
