## this notebook takes the results of the ant-colony-optimizer notebook
- The previous notebook stores the results of the best ant, i.e. the ant with the lowest cost.
- The features/tour chosen by that ant are used in this notebook.
- This notebook trains both the baseline and tuned models using the features/path selected by the best ant.
- Performance metrics (binary accuracy, binary cross entropy, loss, validation binary accuracy, validation binary cross entropy, and validation loss) are all recorded using the reduced dataset.

## import libraries

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from utilities.data_preprocessor import preprocess
from utilities.data_visualizer import view_train_cross, train_cross_results_v2
from models.model_arcs import load_baseline, load_tuned

import json
import os

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## check current working directory

In [64]:
# show the working directory: the relative paths used in this notebook
# ('./data.csv', './results/...') are resolved against it
print(os.getcwd())

c:\Users\Mig\Desktop\projects\To Github\breast-cancer-classifier


## load data
- load the original data first

In [65]:
# read the raw dataset -- this path is for a local machine
df = pd.read_csv('./data.csv')

# on Google Colab, read from this path instead
# df = pd.read_csv('./sample_data/breast_cancer_data.csv')

# build the features X and the labels Y from the raw frame
X, Y = preprocess(df)

# first split: 70% for training, 30% held out
# (X_ and Y_ are scratch names; a later cell reassigns them)
X_trains_orig, X_, Y_trains_orig, Y_ = train_test_split(
    X, Y,
    test_size=0.3,
    random_state=0,
)

# second split: halve the held-out part into cross-validation and test sets
X_cross_orig, X_tests_orig, Y_cross_orig, Y_tests_orig = train_test_split(
    X_, Y_,
    test_size=0.5,
    random_state=0,
)

# inspect the training and cross-validation splits
view_train_cross(
    X_trains_orig,
    X_cross_orig,
    Y_trains_orig,
    Y_cross_orig,
)

(398, 30)
(398, 1)
(85, 30)
(85, 1)
     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
478    -0.749028     -1.093640       -0.740560  -0.710995         0.586383   
303    -1.033042     -0.158159       -1.034246  -0.911788         0.742947   
155    -0.533178     -0.314072       -0.564266  -0.553431        -0.698865   
186     1.187949     -0.165140        1.096935   1.098139        -0.745834   
101    -2.029648     -1.363580       -1.984504  -1.454443         1.468835   
..           ...           ...             ...        ...              ...   
277     1.329956      0.160649        1.191672   1.271629        -0.507430   
9      -0.473535      1.105439       -0.329482  -0.509063         1.582699   
359    -1.332393     -0.225644       -1.324225  -1.070205         0.323071   
192    -1.251733     -0.248914       -1.286742  -1.043186        -1.911524   
559    -0.743348      1.079841       -0.718729  -0.714976        -0.266890   

     compactness_mean  conc

- [20, 16, 19, 14, 13, 11, 5, 3, 9, 28, 24, 15, 17, 21, 10] is the path of the best ant, so use these feature indices when loading the data with the selected features
- this dataset is the one with the carefully selected features

In [66]:
# translate the best ant's path (column positions) into column names
# NOTE(review): the positions index df.columns, i.e. the raw frame rather than
# the 30-column feature matrix produced by preprocess -- confirm that the
# optimizer numbered its features against this same column layout
features = df.columns[
    [20, 16, 19, 14, 13, 11, 5, 3, 9, 28, 24, 15, 17, 21, 10]
]

# show the selected column names as the cell output
features


Index(['symmetry_se', 'smoothness_se', 'concave points_se', 'perimeter_se',
       'texture_se', 'fractal_dimension_mean', 'area_mean', 'texture_mean',
       'concave points_mean', 'concavity_worst', 'perimeter_worst', 'area_se',
       'compactness_se', 'fractal_dimension_se', 'symmetry_mean'],
      dtype='object')

In [67]:
# rebuild the features and labels, restricted to the selected columns
X_reduced, Y_reduced = preprocess(df, feat_idxs=features)

# same proportions and seed as the full-feature split: 70% train, 30% held out
X_trains_reduced, X_, Y_trains_reduced, Y_ = train_test_split(
    X_reduced, Y_reduced,
    test_size=0.3,
    random_state=0,
)

# halve the held-out part into cross-validation and test sets
X_cross_reduced, X_tests_reduced, Y_cross_reduced, Y_tests_reduced = train_test_split(
    X_, Y_,
    test_size=0.5,
    random_state=0,
)

# inspect the reduced training and cross-validation splits
view_train_cross(
    X_trains_reduced,
    X_cross_reduced,
    Y_trains_reduced,
    Y_cross_reduced,
)

(398, 15)
(398, 1)
(85, 15)
(85, 1)
     symmetry_se  smoothness_se  concave points_se  perimeter_se  texture_se  \
478    -0.255754      -0.694688          -0.609606     -0.643074   -0.092266   
303    -0.318715       0.611365          -0.421281     -0.906430    0.628029   
155    -0.309029      -0.507680          -0.628098     -0.684162   -0.425561   
186    -1.192902      -1.391716          -0.823561     -0.519316   -1.344707   
101     0.732247       1.049716          -1.913447     -0.650005    0.528240   
..           ...            ...                ...           ...         ...   
277    -0.150416       0.176681           0.459680     -0.249030   -0.705514   
9      -0.321136       0.036008           0.409395     -0.409420    0.693345   
359    -0.135886      -0.068329          -0.848541      0.198478    0.054696   
192     2.112542      -1.776065          -1.913447     -0.314869    6.655279   
559    -0.685583       0.386356           0.141749     -0.460408    3.061064   

   

In [68]:
# display the reduced training features as the cell output
X_trains_reduced

Unnamed: 0,symmetry_se,smoothness_se,concave points_se,perimeter_se,texture_se,fractal_dimension_mean,area_mean,texture_mean,concave points_mean,concavity_worst,perimeter_worst,area_se,compactness_se,fractal_dimension_se,symmetry_mean
478,-0.255754,-0.694688,-0.609606,-0.643074,-0.092266,0.417114,-0.710995,-1.093640,-0.753936,-0.060394,-0.751235,-0.571980,-0.242457,-0.068427,-0.119089
303,-0.318715,0.611365,-0.421281,-0.906430,0.628029,0.453972,-0.911788,-0.158159,-0.802687,-0.901735,-1.087219,-0.665707,-0.900724,-0.180768,-1.203419
155,-0.309029,-0.507680,-0.628098,-0.684162,-0.425561,-0.073377,-0.553431,-0.314072,-0.660562,-0.373672,-0.615412,-0.524236,-0.550967,-0.495851,0.578241
186,-1.192902,-1.391716,-0.823561,-0.519316,-1.344707,-1.211713,1.098139,-0.165140,0.237843,0.391533,0.951324,-0.251195,-0.910840,-1.024268,-0.695938
101,0.732247,1.049716,-1.913447,-0.650005,0.528240,2.180614,-1.454443,-1.363580,-1.261820,-1.305831,-1.693361,-0.671142,-0.818119,0.115403,0.432204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,-0.150416,0.176681,0.459680,-0.249030,-0.705514,-1.819865,1.271629,0.160649,0.245323,-0.245578,0.647508,-0.079142,-0.801352,-0.801478,-0.955155
9,-0.321136,0.036008,0.409395,-0.409420,0.693345,2.783096,-0.509063,1.105439,0.941760,3.995433,-0.286278,-0.360764,2.609587,2.377346,0.797298
359,-0.135886,-0.068329,-0.848541,0.198478,0.054696,0.962892,-1.070205,-0.225644,-0.899156,-0.756994,-0.937396,-0.216873,-0.921962,-0.409232,-1.115796
192,2.112542,-1.776065,-1.913447,-0.314869,6.655279,0.237079,-1.043186,-0.248914,-1.261820,-1.305831,-1.340697,-0.410268,-1.047490,-0.796939,-0.579108


## baseline model training and validation

In [69]:
# import then load baseline model architecture
baseline_model_orig = load_baseline()

# NOTE(review): the training code below is disabled. Before re-enabling it,
# note that it uses X_trains / Y_trains / X_cross / Y_cross, while the splits
# created earlier in this notebook are named *_orig and *_reduced -- pick the
# intended dataset and rename accordingly.
# # begin model training
# baseline_history = baseline_model_orig.fit(
#     X_trains, Y_trains,
#     epochs=100,
#     validation_data=(X_cross, Y_cross)
# )

# # extract the history of accuracy and cost of model
# baseline_results = {
#     'train_loss': baseline_history.history['loss'],
#     'train_binary_crossentropy': baseline_history.history['binary_crossentropy'],
#     'train_binary_accuracy': baseline_history.history['binary_accuracy'],
#     'cross_val_loss': baseline_history.history['val_loss'],
#     'cross_val_binary_crossentropy': baseline_history.history['val_binary_crossentropy'],
#     'cross_val_binary_accuracy': baseline_history.history['val_binary_accuracy']
# }

## results visualization

In [70]:
# NOTE(review): this whole cell is disabled. Before re-enabling it:
#   - `baseline_results` and `baseline_history` only exist once the (also
#     disabled) baseline training code has been run
#   - it saves `baseline_model`, but the baseline model loaded in this notebook
#     is named `baseline_model_orig`
#   - it reads `baseline_history.epochs`; presumably the Keras History
#     attribute `epoch` is intended -- confirm
# train_cross_results_v2(baseline_results, epochs=baseline_history.epochs[-1])

# # save baseline model
# baseline_model.save('./models/baseline_model.h5')

# # save results of trained model
# with open("./results/baseline_model_results.json", "w") as out_file:
#     json.dump(baseline_results, out_file)

## tuned model training and validation

In [71]:
# build the tuned model architecture from the saved hyper-parameter file
tuned_model = load_tuned(
    param_file_path='./results/best_hyper_params.json',
)

{'layer_1': 201, 'layer_2': 901, 'layer_3': 101, 'layer_4': 601, 'activation': 'relu', 'learning_rate': 0.003, 'lambda': 0.01}
