<a href="https://colab.research.google.com/github/Chansikan/Python-tutorial/blob/master/Hands_on_Python_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Preparation

In [None]:
# 1. Load the diabetes data
import pandas as pd

df = pd.read_csv('diabetes_data.csv')
df = df.loc[:, 'AGE':'Y']  # keep only the columns from AGE through Y (label-based, inclusive)

# 2. Convert Y into a binary categorical variable: 1 if Y >= median(Y), else 0
Y_median = df['Y'].median()

# Vectorized comparison instead of calling a Python lambda once per row;
# int64 matches the dtype produced by mapping to Python ints.
df = df.assign(Y_cat=(df['Y'] >= Y_median).astype('int64'))

# 3. Select variables, and create X and y
X = df.loc[:, 'AGE':'BSL']  # predictors: columns AGE through BSL
y = df['Y_cat']             # binary target created in step 2

# Preview only the first rows rather than dumping the whole frame / series.
print(X.head())
print(y.head())

In [2]:
# 4. Split the dataset into Training and Test sets
from sklearn.model_selection import train_test_split

# Stratified 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

n_train, n_test = len(X_train), len(X_test)
print('No of total cases in training / validation sets: {0} / {1}'
      .format(n_train, n_test))

# y is coded 0/1, so the mean of each split is its share of positive cases.
pos_train, pos_test = y_train.mean(), y_test.mean()
print('Proportion of positive cases in training / validation sets: {0:.2f} / {1:.2f}'
      .format(pos_train, pos_test))

No of total cases in training / validation sets: 309 / 133
Proportion of positive cases in training / validation sets: 0.50 / 0.50


In [None]:
# 5. Standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training set only, then apply the same transform to
# the test set. The results are stored under new names so that X_train and
# X_test keep their raw values: overwriting X_test would leave it on a
# different scale than X_train (which the later cells fit on) and would
# scale it a second time whenever this cell is re-run.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the first standardized training rows.
X_train_scaled[:5]

## Cross validation in the training set: hard coding

In [5]:
# 1. Set up the model in advance
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression with C=0.1; liblinear is used as the solver.
LASSO_1 = LogisticRegression(
    penalty='l1',
    C=0.1,
    solver='liblinear',
    max_iter=10000,
    random_state=0,
)

In [None]:
# 2. Split the training set into cross-validation folds and evaluate by hand
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

k = 5
model = LASSO_1

# Consecutive (unshuffled) folds over the rows of X_train.
kf = KFold(n_splits=k, shuffle=False)

cv_results = []  # one validation AUC per fold

for train_index, val_index in kf.split(X_train):
  print("Index for validation set:", val_index)
  # kf.split yields positional indices, hence .iloc rather than .loc
  X_train_fold, X_val_fold = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
  y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

  # Refit on the k-1 training folds, then score the held-out fold.
  model.fit(X_train_fold, y_train_fold)
  y_proba = model.predict_proba(X_val_fold)
  auc = roc_auc_score(y_val_fold, y_proba[:,1])  # second column of predict_proba is used as the score
  cv_results.append(auc)


In [None]:
import numpy as np

# Summarize the per-fold AUCs: raw values, mean, and standard deviation
# (NumPy's default, ddof=0).
fold_aucs = np.asarray(cv_results)
print(cv_results)
print("mean AUC:", fold_aucs.mean())
print("AUC SD:", fold_aucs.std())

## Cross validation을 이용하여 좋은 모델 정하기

In [9]:
# 1. Set up the candidate models in advance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# Three L1-penalised logistic regressions that differ only in C.
LASSO_1, LASSO_2, LASSO_3 = (
    LogisticRegression(C=c, penalty='l1', solver='liblinear',
                       max_iter=10000, random_state=0)
    for c in (0.1, 0.5, 1)
)

# Three AdaBoost classifiers that differ only in the number of estimators.
ADA_1, ADA_2, ADA_3 = (
    AdaBoostClassifier(n_estimators=n, random_state=0)
    for n in (30, 50, 70)
)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of one candidate on the training set, scored by ROC AUC.
model = LASSO_1
cv = cross_validate(estimator=model, X=X_train, y=y_train,
                    scoring='roc_auc', cv=5)
cv

In [20]:
# loop through several models

model_list = [LASSO_1, LASSO_2, LASSO_3, ADA_1, ADA_2, ADA_3]
model_name = ['LASSO_1', 'LASSO_2', 'LASSO_3', 'ADA_1', 'ADA_2', 'ADA_3']

# name -> [mean AUC, SD of AUC] across the 5 validation folds
cv_results = {}
candidates = dict(zip(model_name, model_list))
for name, model in candidates.items():
  cv = cross_validate(model, X_train, y_train, cv=5, scoring='roc_auc')
  fold_scores = cv['test_score']
  cv_results[name] = [np.mean(fold_scores), np.std(fold_scores)]

In [None]:
# Display the collected cross-validation results
cv_results

In [None]:
# Try testing!
# Rebuild the raw hold-out split (same test_size, seed and stratification as
# the initial split, so the rows are identical). This guarantees the test
# features are on the same scale as the X_train the model is fitted on,
# regardless of any transformation applied to X_test in between.
_, X_test_raw, _, y_test_raw = train_test_split(X, y, test_size=0.3,
                   random_state=0, stratify=y)

LASSO_2.fit(X_train, y_train)
y_proba = LASSO_2.predict_proba(X_test_raw)
roc_auc_score(y_test_raw, y_proba[:,1])

## Hyperparameter optimization

In [None]:
# AdaBoost

# Candidate numbers of estimators: 10, 20, ..., 100
n_estimator_list = list(range(10, 101, 10))

# n_estimators -> [mean AUC, SD of AUC] over 5 folds, rounded to 3 decimals
cv_results = {}
for ne in n_estimator_list:
  model = AdaBoostClassifier(n_estimators=ne, random_state=0)
  cv = cross_validate(model, X_train, y_train, cv=5, scoring='roc_auc')
  fold_scores = cv['test_score']
  cv_results[ne] = [round(np.mean(fold_scores), 3),
                    round(np.std(fold_scores), 3)]

cv_results

In [None]:
# LASSO

# Candidate values of C for the L1-penalised logistic regression
c_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# C -> [mean AUC, SD of AUC] over 5 folds, rounded to 3 decimals
cv_results = {}
for c in c_list:
  model = LogisticRegression(
      C=c, penalty='l1', solver='liblinear', max_iter=10000, random_state=0
  )
  cv = cross_validate(model, X_train, y_train, cv=5, scoring='roc_auc')
  fold_scores = cv['test_score']
  cv_results[c] = [round(np.mean(fold_scores), 3),
                   round(np.std(fold_scores), 3)]

cv_results

In [None]:
from sklearn.model_selection import train_test_split

# Recreate the stratified 70/30 split (seed 0) from X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Final candidates with fixed hyperparameters (C=1 and n_estimators=100).
selected_LASSO = LogisticRegression(
    C=1, penalty='l1', solver='liblinear', max_iter=10000, random_state=0
)
selected_ADA = AdaBoostClassifier(n_estimators=100, random_state=0)

# Fit each model on the training set and predict class probabilities for the test set.
y_proba_LASSO = selected_LASSO.fit(X_train, y_train).predict_proba(X_test)
y_proba_ADA = selected_ADA.fit(X_train, y_train).predict_proba(X_test)

# Test-set ROC AUC computed from the second probability column.
auc_LASSO = roc_auc_score(y_test, y_proba_LASSO[:,1])
auc_ADA = roc_auc_score(y_test, y_proba_ADA[:,1])
print("AUC for LASSO:", round(auc_LASSO, 3))
print("AUC for ADA:", round(auc_ADA, 3))

## Further simplify using SK-Learn API

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimators; the tuned hyperparameters come from the grids below.
LASSO = LogisticRegression(
    penalty='l1', solver='liblinear', max_iter=10000, random_state=0
)
ADA = AdaBoostClassifier(random_state=0)

param_grid_LASSO = {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}
param_grid_ADA = {
    'n_estimators': list(range(10, 101, 10)),  # 10, 20, ..., 100
    'learning_rate': [0.1, 0.5, 1],
}

# 5-fold grid search over each grid, scored by ROC AUC.
selected_LASSO = GridSearchCV(estimator=LASSO, param_grid=param_grid_LASSO,
                              scoring='roc_auc', cv=5)
selected_ADA = GridSearchCV(estimator=ADA, param_grid=param_grid_ADA,
                            scoring='roc_auc', cv=5)

selected_LASSO.fit(X_train, y_train)
selected_ADA.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by each grid search
for label, search in (("LASSO AUC in CV:", selected_LASSO),
                      ("ADA AUC in CV:", selected_ADA)):
  print(label, search.best_score_)

In [None]:
# Hyperparameter combination selected by each grid search (ADA first, then LASSO)
for search in (selected_ADA, selected_LASSO):
  print(search.best_params_)

In [None]:
# Best estimator selected by each grid search (ADA first, then LASSO)
for search in (selected_ADA, selected_LASSO):
  print(search.best_estimator_)

In [None]:
# Try testing!


