<a href="https://colab.research.google.com/github/Chansikan/Python-tutorial/blob/master/SC_MLSG_S1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hands-on machine learning

## Machine learning flow

1. Frame the problem and look at the big picture
- Binary classification (GBM vs. metastasis), evaluated using AUC
- AUC of two human readers: 0.75 and 0.9

2. Get the data
3. Explore the data
4. Prepare the data
- Standardization
- Feature selection
5. Shortlist promising models
6. Fine-tune the system
7. Present your solution
8. Launch, monitor, and maintain your system


## Data preparation

In [1]:
import pandas as pd

# Load the internal (training) cohort and the external (test) cohort.
df_internal = pd.read_csv('GvM_training_cohort.csv')
df_external = pd.read_csv('GvM_ext_cohort.csv')

# Keep the columns whose name starts with 'CE_mask' or 'periT2_mask',
# dropping every column whose name contains 'EdgeContrast'.
bool_idx = (df_internal.columns.str.startswith('CE_mask') | \
  df_internal.columns.str.startswith('periT2_mask')) & \
  (~ df_internal.columns.str.contains('EdgeContrast'))
feature_cols = df_internal.columns[bool_idx]

X_int = df_internal.loc[:, bool_idx]
y_int = df_internal['Label']
# Select the external features by NAME rather than with the positional mask:
# the mask was computed from df_internal's column order, so applying it to a
# differently ordered external file would silently pick the wrong columns.
# A missing feature now raises a KeyError instead.
X_ext = df_external.loc[:, feature_cols]
y_ext = df_external['Label']

# sanity check
print(all(X_int.columns == X_ext.columns))
print(X_int)
print(X_ext)

True
     CE_mask_with_CE_image_firstorder_10Percentile  ...  periT2_mask_with_T2_image_shape_VoxelVolume
0                                        27.397602  ...                                        10922
1                                        17.417849  ...                                        46168
2                                        -8.090730  ...                                       144196
3                                        28.986559  ...                                       171396
4                                         8.421994  ...                                        86985
..                                             ...  ...                                          ...
162                                      26.026626  ...                                        24236
163                                      14.857413  ...                                        56159
164                                      -6.727435  ...                               

## Very simple procedure

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the internal cohort for validation. Stratifying on the
# label keeps the class ratio comparable in both parts; the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_int, y_int,
    test_size=0.3, random_state=2020, stratify=y_int,
)

size_msg = 'No of total cases in training / validation sets: {0} / {1}'
print(size_msg.format(len(X_train), len(X_val)))

ratio_msg = 'Proportion of metastasis in training / validation sets: {0:.2f} / {1:.2f}'
print(ratio_msg.format(y_train.sum()/len(y_train), y_val.sum()/len(y_val)))

No of total cases in training / validation sets: 116 / 51
Proportion of metastasis in training / validation sets: 0.34 / 0.35


In [0]:
# Prepare several model candidates
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# LASSO candidates: L1-penalised logistic regression with increasing C
# (smaller C = stronger regularisation). penalty and solver must be given
# explicitly; with the defaults LogisticRegression fits an L2-penalised
# model, which is not LASSO.
model_LASSO_1 = LogisticRegression(C=0.001, penalty='l1', solver='liblinear', 
                                   max_iter=10000, random_state=0)
model_LASSO_2 = LogisticRegression(C=0.01, penalty='l1', solver='liblinear',
                                   max_iter=10000, random_state=0)
model_LASSO_3 = LogisticRegression(C=0.1, penalty='l1', solver='liblinear',
                                   max_iter=10000, random_state=0)

# AdaBoost candidates with different numbers of estimators / learning rates.
model_ADA_1 = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=0)
model_ADA_2 = AdaBoostClassifier(n_estimators=150, learning_rate=0.05, random_state=0)
model_ADA_3 = AdaBoostClassifier(n_estimators=200, learning_rate=1, random_state=0)

In [13]:
# train and validate!
from sklearn.metrics import roc_auc_score

# Fit the first candidate on the training split and score it on the
# validation split.
model_LASSO_1.fit(X_train, y_train)
y_pred = model_LASSO_1.predict(X_val)  # was `model_LASOO_1`, a NameError
# NOTE(review): y_pred holds hard class labels, so this is the AUC of a single
# thresholded operating point. Kept as-is to match the output shown below;
# predict_proba(X_val)[:, 1] would give the usual ranking-based ROC AUC.
roc_auc_score(y_val, y_pred)

0.5

In [0]:
# try standardization
# try LASSO (regularization)
# how about ADA? try feature selection

In [0]:
# standardization: insert this cell above
from sklearn.preprocessing import StandardScaler

# Standardise the internal feature table in one chained call.
# NOTE(review): the scaler is fitted on all of X_int, which includes the cases
# held out for validation, and only X_int is reassigned here (X_train / X_val
# are untouched unless the split is re-run afterwards).
scaler = StandardScaler()
X_int = scaler.fit(X_int, y_int).transform(X_int)

In [0]:
#### automation ####
# Fit every candidate on the training split and print its validation AUC.
model_list = [model_LASSO_1, model_LASSO_2, model_LASSO_3,
              model_ADA_1, model_ADA_2, model_ADA_3]

for clf in model_list:
  clf.fit(X_train, y_train)
  # ROC AUC is a ranking metric, so it needs continuous scores: use the
  # predicted probability of clf.classes_[1] rather than thresholded labels.
  y_score = clf.predict_proba(X_val)[:, 1]
  print(clf, roc_auc_score(y_val, y_score))

In [16]:
#### feature selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Rank the features by mutual information with the label and keep the 20
# highest-scoring ones; the resulting shape is displayed as the cell output.
X_new = (
    SelectKBest(score_func=mutual_info_classif, k=20)
    .fit(X_int, y_int)
    .transform(X_int)
)
X_new.shape

(167, 20)

In [0]:
# Information leak -> standardization and feature selection have to be brought
# inside the loop, i.e. fitted on the training split only.

# Start again from the raw feature table.
X_int = df_internal.loc[:, bool_idx]
y_int = df_internal['Label']

X_train, X_val, y_train, y_val = train_test_split(
    X_int, y_int,
    test_size=0.3, random_state=2020, stratify=y_int,
)

# The scaler is fitted on the training split and then applied unchanged to
# the validation split.
X_train = scaler.fit(X_train, y_train).transform(X_train)
X_val = scaler.transform(X_val)

# Try every (number of selected features, C) combination and remember the
# one with the highest validation AUC.
results = []
best_auc = 0
comb = [(k, c) for k in [20, 40, 60, 80] for c in [0.001, 0.005, 0.01, 0.05, 0.1]]
for k, c in comb:
  # Feature selection is fitted on the training split only.
  selector = SelectKBest(mutual_info_classif, k=k).fit(X_train, y_train)
  X_train_sel = selector.transform(X_train)
  X_val_sel = selector.transform(X_val)

  clf = LogisticRegression(C=c, penalty='l1', solver='liblinear')
  clf.fit(X_train_sel, y_train)

  # NOTE(review): AUC is computed from hard labels here; kept unchanged so the
  # recorded result stays valid. predict_proba scores would be the usual input.
  y_pred = clf.predict(X_val_sel)
  auc = roc_auc_score(y_val, y_pred)
  results.append([(k, c), auc])

  if not auc > best_auc:
    continue
  best_auc = auc
  best_model = [selector, clf, auc]

In [24]:
# Display the winning [selector, classifier, validation AUC] entry.
best_model

[SelectKBest(k=80, score_func=<function mutual_info_classif at 0x7f6a193988c8>),
 LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l1',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 0.9015151515151516]

In [25]:
# Test on the external cohort with the selector / classifier chosen above.
final_selector, final_clf = best_model[:2]

# Re-fit the scaler on the whole internal cohort; the external cohort is only
# transformed with those statistics.
X_int_scaled = scaler.fit(X_int, y_int).transform(X_int)
X_ext_scaled = scaler.transform(X_ext)

# The selector is applied as fitted during the search (it is not refitted).
X_int_sel = final_selector.transform(X_int_scaled)
X_ext_sel = final_selector.transform(X_ext_scaled)

# Refit the chosen classifier on all internal cases, then predict the
# external cases.
y_pred = final_clf.fit(X_int_sel, y_int).predict(X_ext_sel)

roc_auc_score(y_ext, y_pred)

0.7256250000000001

In [26]:
# Pipeline class -> more convenient, because it keeps information from leaking
from sklearn.pipeline import Pipeline

# Chain scaling, feature selection and the classifier into a single estimator
# so that all three steps are fitted together on the data passed to fit().
# NOTE(review): final_selector and final_clf are the same objects used in the
# cell above, so fitting the pipeline presumably refits them in place.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('feat_selector', final_selector),
    ('clf', final_clf),
])

# Fit on the raw internal cohort, then predict the raw external cohort.
y_pred = pipe.fit(X_int, y_int).predict(X_ext)

roc_auc_score(y_ext, y_pred)

0.7256250000000001

In [0]:
# let's do cross validation, using Pipeline class

from sklearn.model_selection import GridSearchCV

# Scaling, feature selection and the classifier are chained so that each
# cross-validation fold fits all three on its own training part only.
pipe = Pipeline([('scaler', StandardScaler()), 
                 ('fs', SelectKBest(mutual_info_classif)),
                 ('clf', LogisticRegression(penalty='l1', solver='liblinear'))])

# Grid keys follow the '<step name>__<parameter name>' convention.
param_grid = {
    'fs__k': [20, 40, 60, 80],
    'clf__C': [0.001, 0.01, 0.1]
}

# The task is evaluated by AUC, so select hyper-parameters on ROC AUC; without
# `scoring` the search falls back to the estimator's default score (accuracy).
gs_clf = GridSearchCV(pipe, param_grid, scoring='roc_auc')

In [29]:
# Print the search object, run the grid search on the internal cohort, then
# print the search object again.
print(gs_clf)
gs_clf.fit(X_int, y_int)
print(gs_clf)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('fs',
                                        SelectKBest(k=10,
                                                    score_func=<function mutual_info_classif at 0x7f6a193988c8>)),
                                       ('clf',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                 

In [33]:
# NOTE(review): a notebook cell only displays its last expression, so the two
# attribute look-ups below produce no visible output as written.
gs_clf.best_score_
gs_clf.best_estimator_

# Predict the external cohort through the fitted search object.
y_pred = gs_clf.predict(X_ext)

# NOTE(review): AUC computed from hard labels, as in the cells above.
roc_auc_score(y_ext, y_pred)

0.7256250000000001

In [2]:
# Check which type X_int currently has.
type(X_int)

pandas.core.frame.DataFrame