In [1]:
#from utils.data_loader import train_data_loader, test_data_loader
#from utils.inference_tools import pred_to_binary, export_csv

import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier, ElasticNet, Lars, LassoLars
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer

import pandas as pd
import numpy as np
import pickle
import datetime

import warnings
warnings.filterwarnings('ignore')

# Load Data and Pre-processing

In [None]:
# Log the wall-clock start time of this run as HH:MM:SS
time = datetime.datetime.now().strftime('%H:%M:%S')
print("Start:", time)


# Run metadata echoed to the output
name = 'KHW'
model = 'Stacking'
summary = 'HyperParams tuning with 9 sklearn models'

print('Author Name :', name)
print('Model :', model)
print('Summary :', summary)


# Input locations
pos_dir = "/data/train/positive/"
neg_dir = "/data/train/negative/"
test_dir = '/data/test/'

# Pre-processing switches, forwarded positionally to both loaders
do_n4 = False
do_ws = True
do_resample = True

# Loader behaviour flags (separate values for the train and the test loader)
do_shuffle_train = True
do_shuffle_test = False
save_to_disk = False
return_patient_num_train = False
return_patient_num_test = True


# Data Load
# NOTE(review): train_data_loader / test_data_loader belong to utils.data_loader, whose
# import line is commented out in the first cell -- re-enable it before running this cell.
X_train, y_train = train_data_loader(
    pos_dir, neg_dir,
    do_n4, do_ws, do_resample,
    do_shuffle_train, save_to_disk, return_patient_num_train,
)
X_test, patient_num = test_data_loader(
    test_dir,
    do_n4, do_ws, do_resample,
    do_shuffle_test, save_to_disk, return_patient_num_test,
)

In [2]:
# Example Code for local test
#
# Loads a small CSV sample and rebinds X_train / y_train / X_test so the rest of the
# notebook can be exercised without the real data loaders.

train = pd.read_csv("./example/data/my_train.csv")[:15]
test = pd.read_csv("./example/data/my_test.csv")[:10]

features = ['Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'Embarked', 'Name_length', 'Has_Cabin', 'FamilySize', 'IsAlone', 'Title']

X_train = train[features]
y_train = train['Survived']
# Select exactly the columns (in the same order) the models are fitted on; passing the
# raw test frame would hand the estimators any extra or reordered columns at predict time.
X_test = test[features]

# Base Model

### xgboost

In [None]:
# Fit Model with Training Data
model1 = xgb.XGBClassifier(n_jobs=4)
model1.fit(X_train, y_train)


# Save model to file; the context manager closes (and flushes) the handle even if
# pickling fails, instead of leaving an open file object behind.
with open('/data/model/model1.pickle.dat', 'wb') as model_file:
    pickle.dump(model1, model_file)

### svm

In [None]:
# Fit Model with Training Data
model2 = SVC()
model2.fit(X_train, y_train)


# Save model to file; the context manager closes (and flushes) the handle even if
# pickling fails, instead of leaving an open file object behind.
with open('/data/model/model2.pickle.dat', 'wb') as model_file:
    pickle.dump(model2, model_file)

### logistic regression

In [None]:
# Fit Model with Training Data
model3 = LogisticRegression(n_jobs=4)
model3.fit(X_train, y_train)


# Save model to file; the context manager closes (and flushes) the handle even if
# pickling fails, instead of leaving an open file object behind.
with open('/data/model/model3.pickle.dat', 'wb') as model_file:
    pickle.dump(model3, model_file)

### random forest

In [None]:
# Fit Model with Training Data
model4 = RandomForestClassifier(n_jobs=4)
model4.fit(X_train, y_train)


# Save model to file. The original dumped the misspelled name `mode4l`, which raises
# NameError; the fitted estimator is `model4`. The context manager also closes the handle.
with open('/data/model/model4.pickle.dat', 'wb') as model_file:
    pickle.dump(model4, model_file)

<br><br><br>

# Score

In [None]:
def f_half_score(y_true, y_pred):
    """Return the F-beta score with beta=0.5 for binary predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        The value computed by ``fbeta_score`` with ``average='binary'``.
    """
    return fbeta_score(y_true, y_pred, beta=0.5, average='binary')


# Scorer object handed to the grid searches; a larger F0.5 is treated as better.
scorer = make_scorer(f_half_score, greater_is_better=True)

# Modeling

### MLP

### CNN

### Separated

# Parameter Tuning & CV

### xgboost

In [None]:
# Fresh, unfitted XGBoost classifier used as the template estimator for the grid search
# in the next cell (this rebinds the `model1` fitted in the "Base Model" section).
model1 = xgb.XGBClassifier(n_jobs=4)

In [None]:
# XGBoost search, stage 1: tree-shape, regularisation and sampling parameters.
m1_params1 = dict(
    max_depth=[3, 5, 7, 9, 11],
    min_child_weight=[0.5, 1],
    gamma=[0, 0.1],
    subsample=[0.5, 0.7, 0.9],
    colsample_bytree=[0.5, 0.7, 0.9],
)

# Exhaustive 5-fold search scored with the custom F0.5 scorer
m1_grid_1 = GridSearchCV(
    model1,
    param_grid=m1_params1,
    scoring=scorer,
    cv=5,
    verbose=0,
    n_jobs=4,
)
m1_grid_1.fit(X_train, y_train)

# Keep the winning estimator; the next cell continues tuning from it
best_model1 = m1_grid_1.best_estimator_

print("Best Score : {}".format(m1_grid_1.best_score_))
print("Best Params : {}".format(m1_grid_1.best_params_))

In [None]:
# XGBoost search, stage 2: learning rate and number of boosting rounds, starting from
# the estimator selected in stage 1.
m1_params2 = dict(
    learning_rate=[0.01, 0.05, 0.07, 0.1, 0.2],
    n_estimators=list(range(100, 1101, 200)),
)

m1_grid_2 = GridSearchCV(
    best_model1,
    param_grid=m1_params2,
    scoring=scorer,
    cv=5,
    verbose=0,
    n_jobs=4,
)
m1_grid_2.fit(X_train, y_train)

# Replace the stage-1 winner with the stage-2 winner
best_model1 = m1_grid_2.best_estimator_

print("Best Score : {}".format(m1_grid_2.best_score_))
print("Best Params : {}".format(m1_grid_2.best_params_))

### svm

In [None]:
# Template SVM for the grid search below. probability=True is required because the
# stacking step calls predict_proba on every base model, and SVC only exposes that
# method when probability estimates are enabled (at the cost of slower fitting).
model2 = SVC(probability=True)

In [None]:
# SVM search over the regularisation strength C and the kernel coefficient gamma.
# 'degree' was removed from the grid: it only affects the polynomial kernel, and the
# template estimator uses the default RBF kernel, so its three values tripled the
# number of fits without changing any score.
m2_params1 = {
    'C': [0.001, 0.01, 0.1, 1, 10, 50, 100],
    'gamma' : [0.001, 0.01, 0.1, 1, 2, 5, 10, 20],
}

m2_grid_1 = GridSearchCV(model2, param_grid=m2_params1, scoring=scorer, cv=5, verbose=0, n_jobs=4)
m2_grid_1.fit(X_train, y_train)

# Keep the winning estimator for the stacking step
best_model2 = m2_grid_1.best_estimator_

print("Best Score : {}".format(m2_grid_1.best_score_))
print("Best Params : {}".format(m2_grid_1.best_params_))

### logistic regression

In [None]:
# Fresh, unfitted logistic-regression classifier used as the template estimator for the
# grid search in the next cell (rebinds the `model3` fitted in the "Base Model" section).
model3 = LogisticRegression(n_jobs=4)

In [None]:
# Logistic-regression search: inverse regularisation strength and iteration budget.
m3_params1 = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 50, 100],
    max_iter=list(range(100, 1101, 200)),
)

m3_grid_1 = GridSearchCV(
    model3,
    param_grid=m3_params1,
    scoring=scorer,
    cv=5,
    verbose=0,
    n_jobs=4,
)
m3_grid_1.fit(X_train, y_train)

# Keep the winning estimator for the stacking step
best_model3 = m3_grid_1.best_estimator_

print("Best Score : {}".format(m3_grid_1.best_score_))
print("Best Params : {}".format(m3_grid_1.best_params_))

### random forest

In [None]:
# Fresh, unfitted random-forest classifier used as the template estimator for the grid
# search in the next cell (rebinds the `model4` fitted in the "Base Model" section).
model4 = RandomForestClassifier(n_jobs=4)

In [None]:
# Random-forest search, stage 1: tree depth and leaf/split size constraints.
# min_samples_split no longer includes 1: as an integer it must be at least 2 (a node
# needs two samples to be split), so every candidate using 1 failed to fit.
m4_params1 = {
    'max_depth' : list(range(10, 101, 5)),
    'min_samples_leaf': [1, 2, 3, 4, 5, 10, 20, 50],
    'min_samples_split': [2, 4, 6, 8, 10, 12],
}

m4_grid_1 = GridSearchCV(model4, param_grid=m4_params1, scoring=scorer, cv=5, verbose=0, n_jobs=4)
m4_grid_1.fit(X_train, y_train)

# Keep the winning estimator; the next cell continues tuning from it
best_model4 = m4_grid_1.best_estimator_

print("Best Score : {}".format(m4_grid_1.best_score_))
print("Best Params : {}".format(m4_grid_1.best_params_))

In [None]:
# Random-forest search, stage 2: number of trees, starting from the stage-1 winner.
m4_params2 = dict(
    n_estimators=list(range(100, 1001, 50)),
)

m4_grid_2 = GridSearchCV(
    best_model4,
    param_grid=m4_params2,
    scoring=scorer,
    cv=5,
    verbose=0,
    n_jobs=4,
)
m4_grid_2.fit(X_train, y_train)

# Replace the stage-1 winner with the stage-2 winner
best_model4 = m4_grid_2.best_estimator_

print("Best Score : {}".format(m4_grid_2.best_score_))
print("Best Params : {}".format(m4_grid_2.best_params_))

### lasso regression (L1-penalized logistic regression)

In [None]:
# L1-penalised logistic regression template. The solver is set explicitly because the
# current default solver (lbfgs) rejects penalty="l1"; liblinear supports it. n_jobs was
# dropped since it has no effect with the liblinear solver.
model5 = LogisticRegression(penalty="l1", solver="liblinear")

In [None]:
# L1 logistic-regression search: inverse regularisation strength and iteration budget.
m5_params1 = {
    'C': [0.001, 0.01, 0.1, 1, 10, 50, 100],
    'max_iter' : list(range(100, 1101, 200))
}

m5_grid_1 = GridSearchCV(model5, param_grid=m5_params1, scoring=scorer, cv=5, verbose=0, n_jobs=4)
m5_grid_1.fit(X_train, y_train)

# Take the winner of THIS search. The original read m3_grid_1 here (copy-paste slip),
# which silently made best_model5 a duplicate of the plain logistic regression.
best_model5 = m5_grid_1.best_estimator_

print("Best Score : {}".format(m5_grid_1.best_score_))
print("Best Params : {}".format(m5_grid_1.best_params_))

### ridge regression

In [None]:
# Unfitted ridge classifier with default settings; template estimator for the grid
# search in the next cell.
model6 = RidgeClassifier()

In [None]:
# Ridge-classifier search: regularisation strength and iteration cap
# (None is included as a candidate for max_iter).
m6_params1 = dict(
    alpha=[0.1, 1, 2, 5, 10, 20, 50, 100],
    max_iter=[None] + list(range(100, 1101, 200)),
)

m6_grid_1 = GridSearchCV(
    model6,
    param_grid=m6_params1,
    scoring=scorer,
    cv=5,
    verbose=0,
    n_jobs=4,
)
m6_grid_1.fit(X_train, y_train)

# Keep the winning estimator for the stacking step
best_model6 = m6_grid_1.best_estimator_

print("Best Score : {}".format(m6_grid_1.best_score_))
print("Best Params : {}".format(m6_grid_1.best_params_))

### ElasticNet

In [None]:
# Unfitted ElasticNet with default settings; template estimator for the grid search in
# the next cell.
# NOTE(review): ElasticNet is imported from sklearn.linear_model as a regression model --
# confirm it is intended as a base learner for this classification task.
model7 = ElasticNet()

In [None]:
# ElasticNet search: penalty strength, L1/L2 mixing ratio and iteration cap.
# NOTE(review): the estimator is a regressor while `scorer` is a binary F-beta metric --
# confirm the cross-validation scores printed below are valid (not NaN).
m7_params1 = dict(
    alpha=[0.1, 1, 2, 5, 10, 20, 50, 100],
    l1_ratio=[0.3, 0.4, 0.5, 0.6],
    max_iter=list(range(800, 2001, 200)),
)

m7_grid_1 = GridSearchCV(
    model7,
    param_grid=m7_params1,
    scoring=scorer,
    cv=5,
    verbose=0,
    n_jobs=4,
)
m7_grid_1.fit(X_train, y_train)

# Keep the winning estimator for the stacking step
best_model7 = m7_grid_1.best_estimator_

print("Best Score : {}".format(m7_grid_1.best_score_))
print("Best Params : {}".format(m7_grid_1.best_params_))

### LARS

In [None]:
# Unfitted LARS model with default settings; template estimator for the grid search in
# the next cell.
# NOTE(review): Lars is imported from sklearn.linear_model as a regression model --
# confirm it is intended as a base learner for this classification task.
model8 = Lars()

In [None]:
# LARS search over the target number of non-zero coefficients.
# NOTE(review): the estimator is a regressor while `scorer` is a binary F-beta metric --
# confirm the cross-validation scores printed below are valid (not NaN).
m8_params1 = dict(
    n_nonzero_coefs=list(range(30, 150, 10)),
)

m8_grid_1 = GridSearchCV(
    model8,
    param_grid=m8_params1,
    scoring=scorer,
    cv=5,
    verbose=0,
    n_jobs=4,
)
m8_grid_1.fit(X_train, y_train)

# Keep the winning estimator for the stacking step
best_model8 = m8_grid_1.best_estimator_

print("Best Score : {}".format(m8_grid_1.best_score_))
print("Best Params : {}".format(m8_grid_1.best_params_))

### LARS lasso

In [None]:
# Unfitted LassoLars model with default settings; template estimator for the grid search
# in the next cell.
# NOTE(review): LassoLars is imported from sklearn.linear_model as a regression model --
# confirm it is intended as a base learner for this classification task.
model9 = LassoLars()

In [None]:
# LassoLars search: penalty strength and iteration cap.
# NOTE(review): the estimator is a regressor while `scorer` is a binary F-beta metric --
# confirm the cross-validation scores printed below are valid (not NaN).
m9_params1 = dict(
    alpha=[0.1, 1, 2, 5, 10, 20, 50, 100],
    max_iter=list(range(800, 2001, 200)),
)

m9_grid_1 = GridSearchCV(
    model9,
    param_grid=m9_params1,
    scoring=scorer,
    cv=5,
    verbose=0,
    n_jobs=4,
)
m9_grid_1.fit(X_train, y_train)

# Keep the winning estimator for the stacking step
best_model9 = m9_grid_1.best_estimator_

print("Best Score : {}".format(m9_grid_1.best_score_))
print("Best Params : {}".format(m9_grid_1.best_params_))

# Model Stacking

In [None]:
from keras.models import Sequential, model_from_json
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

In [None]:
def stacking(models, data):
    """Build the level-1 (meta) feature matrix from fitted base models.

    Each model contributes exactly one column holding its score for every sample:

    * ``predict_proba`` is preferred; for a 2-D result only the last column is kept
      (the second class in a two-class problem);
    * otherwise ``decision_function`` is used when available;
    * otherwise the raw ``predict`` output is used (e.g. regression-style models).

    The previous implementation required ``predict_proba`` on every model and returned
    an array of shape ``(2, n_samples, n_models)``, which cannot be fed to a network
    whose input dimension is the number of models.

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing at least one of the three methods above.
    data : array-like of shape (n_samples, n_features)
        Samples to score.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_models)
        One row per sample, one column per model, in the order of ``models``.
    """
    columns = []

    for model in models:
        if hasattr(model, "predict_proba"):
            scores = np.asarray(model.predict_proba(data))
            if scores.ndim == 2:
                # one value per sample: keep the last class column only
                scores = scores[:, -1]
        elif hasattr(model, "decision_function"):
            scores = np.asarray(model.decision_function(data))
        else:
            scores = np.asarray(model.predict(data))
        columns.append(scores.ravel())

    if not columns:
        # no base models: still return a well-formed (n_samples, 0) matrix
        return np.empty((len(data), 0))

    return np.column_stack(columns)

In [None]:
# Tuned base estimators, in the fixed order that defines the meta-feature columns
models = [
    best_model1, best_model2, best_model3,
    best_model4, best_model5, best_model6,
    best_model7, best_model8, best_model9,
]
# Meta-features for the training set
S_train = stacking(models, X_train)

### weight

### NN

In [None]:
def stack_fn(num_models=9):
    """Build and compile the meta-learner network.

    Parameters
    ----------
    num_models : int, default 9
        Width of the input layer, i.e. the number of base-model columns fed in.

    Returns
    -------
    A compiled Keras ``Sequential`` model: a 16-unit ReLU hidden layer followed by a
    2-unit softmax output, trained with categorical cross-entropy and the Adam optimiser.
    """
    net = Sequential([
        Dense(16, input_dim=num_models, activation='relu'),
        Dense(2, activation='softmax'),
    ])
    net.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

In [None]:
# Wrap the network builder in the scikit-learn style Keras classifier and train it on
# the stacked base-model outputs. No fit arguments are passed, so the wrapper's defaults
# for epochs / batch size apply.
meta_model = KerasClassifier(build_fn=stack_fn)
meta_model.fit(S_train, y_train)

# Save

In [None]:
# Persist the tuned, fitted estimators (best_model1..9). The original dumped
# model1..model9, which are the unfitted templates handed to GridSearchCV (the search
# fits clones and leaves the template untouched), so the reloaded models could not predict.
tuned_models = [
    best_model1, best_model2, best_model3,
    best_model4, best_model5, best_model6,
    best_model7, best_model8, best_model9,
]

for model_idx, tuned in enumerate(tuned_models, start=1):
    # context manager closes the handle even if pickling fails
    with open('/data/model/model{}.pickle.dat'.format(model_idx), 'wb') as model_file:
        pickle.dump(tuned, model_file)

In [None]:
# Persist the trained meta-learner: weights as HDF5, architecture as JSON.
meta_model.model.save_weights('/data/model/model_weights.h5')

with open('/data/model/model_architecture.json', 'w') as f :
    # to_json lives on the underlying Keras model, not on the KerasClassifier wrapper
    f.write(meta_model.model.to_json())

# Loading & Prediction

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load model files that were written by this notebook / a trusted source.
loaded_models = []
for model_idx in range(1, 10):
    # context manager closes each handle instead of leaking nine open files
    with open('/data/model/model{}.pickle.dat'.format(model_idx), 'rb') as model_file:
        loaded_models.append(pickle.load(model_file))

(model1, model2, model3,
 model4, model5, model6,
 model7, model8, model9) = loaded_models

with open('/data/model/model_architecture.json', 'r') as f :
    meta = model_from_json(f.read())

# model_from_json returns the Keras model itself, so the weights are loaded directly on
# it (the original `meta.models.load_weights` raised AttributeError).
meta.load_weights('/data/model/model_weights.h5')

In [None]:
# Make Predictions for Test Data
models = [model1, model2, model3, model4, model5, model6, model7, model8, model9]
S_test = stacking(models, X_test)

# Decision threshold applied to the positive-class score
threshold = 0.65
# `predict` returns the softmax output of the network; it replaces `predict_proba`,
# which newer Keras releases no longer provide on Sequential models and which returned
# the same array. Column 1 is presumably the positive class -- verify against the label
# encoding used when the meta-learner was trained.
y_pred = meta.predict(S_test)[:, 1]
y_pred_binary = pred_to_binary(y_pred, threshold = threshold)


# Make 'output.csv'
export_csv(patient_num, y_pred_binary, y_pred)