In [1]:
# reference : https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

In [7]:
import numpy as np 
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense 

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [8]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [10]:
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
                , header=None)
d = df.values

In [16]:
X = d[:,:60].astype(float)
Y = d[:,60]

In [18]:
# encode class values (Y) as integers

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

### baseline model

In [31]:
# baseline model ,  a single fully connected hidden layer with the same number of neurons as input variables

def create_baseline():
    model = Sequential()
    model.add(Dense(60, input_dim=60, kernel_initializer="normal", activation="relu"))
    model.add(Dense(1, kernel_initializer= "normal", activation ="sigmoid"))
    
    model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=["accuracy"])
    
    return model

In [41]:
# evaluate model with standardized dataset

estimator = KerasClassifier(build_fn = create_baseline, nb_epoch=100, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)

In [42]:
print("Results:%.2f%%  (%.2f%%)"%(results.mean()*100, results.std()*100))

Results:53.86%  (1.62%)


### baseline model with standardized dataset

In [44]:
# Rather than performing the standardization on the entire dataset, it is good practice to
# train the standardization procedure on the training data within the pass of a cross-validation run and 
# to use the trained standardization to prepare the “unseen” test fold. 

In [46]:
# 關於standardized model 用來預測的討串：
# https://stackoverflow.com/questions/38780302/predicting-new-data-using-sklearn-after-standardizing-the-training-data
# 結論是 testing data 可以被 standardize，參數是 training data訓練出來的！
# 這邊的基本假設是,  training, testing 的分佈必須是差不多的！

In [47]:
# 底下已經用 pipeline完成這些事了！

In [43]:
# evaluate baseline model with standardized dataset

np.random.seed(seed)
estimators = []
estimators.append(("standardized", StandardScaler()))
estimators.append(("mlp",KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state = seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)

print("Standardized: %.2f%%(%.2f%%)"%(results.mean()*100, results.std()*100))

Standardized: 84.11%(4.36%)


### Tuning Layers and Number of Neurons in The Model

In [49]:
# smaller model

def create_smaller():
    model = Sequential()
    model.add(Dense(30, input_dim=60, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_smaller, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Smaller: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Smaller: 85.52% (5.85%)


In [50]:
# larger model

def create_larger():
    model = Sequential()
    model.add(Dense(60, input_dim=60, kernel_initializer='normal', activation='relu'))
    model.add(Dense(30, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_larger, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Larger: 84.57% (6.41%)
