### IMPORTS

In [1]:
# load, preprocess, scale, baseline
from wrangle import final_dataset

# manipulate data
import pandas as pd
import numpy as np

# visualize data
import matplotlib.pyplot as plt
from cluster_plots import (ncomponents_optimal, scatter_component_means,
                           nclusters_optimal, scatter_cluster_centers)
from dimensionality_reduction_plots import optimal_ncomponents, opt_component_plots
from nn_plot import (learn_curve_display,
                    val_curve_display)
import seaborn as sns

# put it all together
from steps import step2
from models import put_it_all_together

# Seed NumPy's legacy global RNG so repeated runs draw the same random numbers.
# NOTE(review): this only affects code that relies on np.random's global state —
# confirm the imported project modules do not create their own unseeded generators.
np.random.seed(123)

$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Load, Clean, Preprocess, Scale, Baseline: Cardiovascular Disease}
$$

In [2]:
%%time
# Build the train/test inputs and targets for the 'cvd' dataset in one call.
(X_train_scaled_cd,
 X_test_scaled_cd,
 y_train_cd,
 y_test_cd) = final_dataset(dataset='cvd')

CVD Loading and Cleaning...
CVD Loaded and Cleaned...

CVD Splitting...
CVD Split...

CVD Scaling...
CVD Scaled...

Baseline Accuracy Score: 0.51%

CPU times: user 96.4 ms, sys: 12.1 ms, total: 108 ms
Wall time: 108 ms


### Step 1
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{KMeans Find and View Optimal Clusters}
$$

In [3]:
# nclusters_optimal(X_train=X_train_scaled_cd, y_train=y_train_cd, dset='cvd', model='km', step3=False)
# scatter_cluster_centers(X_train=X_train_scaled_cd, k=2, dset='cvd', xlabel='Height', ylabel='Weight', model='km', step3=False)

$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Gaussian Mixture Find and View Optimal Components}
$$

In [4]:
# ncomponents_optimal(X_train=X_train_scaled_cd, y_train=y_train_cd, dset='cvd', model='gm', step3=False)
# scatter_component_means(X_train=X_train_scaled_cd, components=12, dset='cvd', xlabel='Height', ylabel='Weight', model='gm', step3=False)

### Step 2
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{PCA, ICA, SRP, HLLE Find and View Optimal Components}
$$

In [5]:
# optimal_ncomponents(X_train_scaled_cd, 'cvd')
# opt_component_plots(X_train_scaled_cd,
#                     pca_comp=3,
#                     ica_comp=4, 
#                     srp_comp=4, 
#                     hlle_comp=4, 
#                     hlle_neigh=15,
#                     dset='cvd')

In [6]:
# dset = 'cvd'
# results_s2_cd = {dset: {'step2': {'pca': None,
#                                     'ica': None,
#                                     'sparseRP': None,
#                                     'manifold': None},}}

# results_s3_cd = step2(X_train=X_train_scaled_cd,
#                     y_train=y_train_cd,
#                     X_test=X_test_scaled_cd,
#                     y_test=y_test_cd,
#                     dset=dset,
#                     results=results_s2_cd,
#                     run_step1=False)

# pca_step3_cd = results_s3_cd['cvd']['step2']['pca'][1]
# ica_step3_cd = results_s3_cd['cvd']['step2']['ica'][1]
# srp_step3_cd = results_s3_cd['cvd']['step2']['sparseRP'][1]
# hlle_step3_cd = results_s3_cd['cvd']['step2']['manifold'][1]

### Step 3
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{KMeans Find and View Optimal Clusters with DR}
$$

In [8]:
# # pca
# nclusters_optimal(X_train=pca_step3_cd, y_train=y_train_cd, dset='cvd', step3=True, model='pca')
# scatter_cluster_centers(X_train=pca_step3_cd, k=2, dset='cvd', step3=True, xlabel='Height', ylabel='Weight', model='pca')

# # ica
# nclusters_optimal(X_train=ica_step3_cd, y_train=y_train_cd, dset='cvd', step3=True, model='ica')
# scatter_cluster_centers(X_train=ica_step3_cd, k=4, dset='cvd', step3=True, xlabel='Height', ylabel='Weight', model='ica')

# # srp
# nclusters_optimal(X_train=srp_step3_cd, y_train=y_train_cd, dset='cvd', step3=True, model='srp')
# scatter_cluster_centers(X_train=srp_step3_cd, k=2, dset='cvd', step3=True, xlabel='Height', ylabel='Weight', model='srp')

# # hlle
# nclusters_optimal(X_train=hlle_step3_cd, y_train=y_train_cd, dset='cvd', step3=True, model='hlle')
# scatter_cluster_centers(X_train=hlle_step3_cd, k=4, dset='cvd', step3=True, xlabel='Height', ylabel='Weight', model='hlle')

$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Gaussian Mixture Find and View Optimal Components with DR}
$$

In [9]:
# # pca
# ncomponents_optimal(pca_step3_cd, y_train_cd, 'cvd', step3=True, model='pca')
# scatter_component_means(pca_step3_cd, 12, 'cvd', 'Height', 'Weight', step3=True, model='pca')

# # ica
# ncomponents_optimal(ica_step3_cd, y_train_cd, 'cvd', step3=True, model='ica')
# scatter_component_means(ica_step3_cd, 12, 'cvd', 'Height', 'Weight', step3=True, model='ica')

# # srp
# ncomponents_optimal(srp_step3_cd, y_train_cd, 'cvd', step3=True, model='srp')
# scatter_component_means(srp_step3_cd, 12, 'cvd', 'Height', 'Weight', step3=True, model='srp')

# # hlle
# ncomponents_optimal(hlle_step3_cd, y_train_cd, 'cvd', step3=True, model='hlle')
# scatter_component_means(hlle_step3_cd, 12, 'cvd', 'Height', 'Weight', step3=True, model='hlle')

$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Perform All Steps, Run All Models}
$$

In [10]:
%%time
# Run CVD Model
# Executes the whole pipeline for the 'cvd' dataset; everything below only
# pulls pieces out of the nested result dictionary it returns.
results_cv = put_it_all_together(
    X_train=X_train_scaled_cd,
    y_train=y_train_cd,
    X_test=X_test_scaled_cd,
    y_test=y_test_cd,
    dset='cvd',
)

# One handle per pipeline step.
step1_cv, step2_cv, step3_cv, step4_cv, step5_cv = (
    results_cv['cvd'][step]
    for step in ('step1', 'step2', 'step3', 'step4', 'step5')
)

# Step-4 entry for each dimensionality-reduction method; the summary cell
# further down reads 'best_score' from each of these.
pca_best_cvd, ica_best_cvd, srp_best_cvd, hlle_best_cvd = (
    step4_cv[method] for method in ('pca', 'ica', 'sparseRP', 'manifold')
)

# Element [1] of each step-2 entry is what the learning/validation curve cells
# pass as their training input.
# NOTE(review): presumably the transformed training matrix — confirm in `steps`.
pca_train_cvd, ica_train_cvd, sparseRP_train_cvd, manifold_train_cvd = (
    step2_cv[method][1] for method in ('pca', 'ica', 'sparseRP', 'manifold')
)

# Sweep settings consumed by the curve cells: two named hyperparameters with
# the values to try for each, plus the `sizes` list handed to learn_curve_display.
cd_params = dict(
    param1_name='beta_1',
    param1_range=[0.25, 0.50, 0.75, 0.90],
    param2_name='beta_2',
    param2_range=[0.22, 0.44, 0.66, 0.88],
    sizes=[0.7, 0.75, 0.80, 0.90],
)


Running All Steps
{'cvd': {'step1': {'gm': None, 'kmeans': None}, 'step2': {'pca': None, 'ica': None, 'sparseRP': None, 'manifold': None}, 'step3': {'pca': {'gm': None, 'kmeans': None}, 'ica': {'gm': None, 'kmean': None}, 'sparseRP': {'gm': None, 'kmeans': None}, 'manifold': {'gm': None, 'kmeans': None}}, 'step4': {'pca': None, 'ica': None, 'sparseRP': None, 'manifold': None}, 'step5': {'gm': None, 'kmeans': None}}}
{'cvd': {'step1': {'gm': None, 'kmeans': None}, 'step2': {'pca': None, 'ica': None, 'sparseRP': None, 'manifold': None}, 'step3': {'pca': {'gm': None, 'kmeans': None}, 'ica': {'gm': None, 'kmean': None}, 'sparseRP': {'gm': None, 'kmeans': None}, 'manifold': {'gm': None, 'kmeans': None}}, 'step4': {'pca': None, 'ica': None, 'sparseRP': None, 'manifold': None}, 'step5': {'gm': None, 'kmeans': None}}}
Step: 1
Fitting and Predicting Expectation Maximization
Fitting and Predicting Gaussian Mixture


Done with Gaussian Mixture
Done with Expectation Maximization

Fitting and Predicting Clustering
Fitting and Predicting KMeans
Done with KMeans
Done with Clustering
Step: 1 Complete

Step: 2
Fitting PCA
Fitting General PCA
Done with General PCA
Done with PCA

Fitting ICA
Fitting and Transforming FastICA
Done with FastICA
Done with ICA

Fitting and Transforming Randomized Projections
Fitting and Transforing with Sparse Random Projection
Done with Sparse Random Projection
Done with Randomized Projections

Fitting and Transforming Manifold Learning
Fitting and Transforming with Locally Linear Embedding: Heissan Mapping
Done with Locally Linear Embedding: Heissan Mapping
Done with Manifold Learning
Step: 2 Complete

Step: 3
Fitting and Predicting Expectation Maximization
Fitting and Predicting Gaussian Mixture
Done with Gaussian Mixture
Done with Expectation Maximization

Fitting and Predicting Clustering
Fitting and Predicting KMeans
Done with KMeans
Done with Clustering
Fitting and Predi



ICA NN Complete

Fitting and Predicting Sparse RP NN
Sparse RP NN Complete

Fitting and Predicting Maniforld Learning NN
Maniforld Learning NN Complete
Step: 4 Complete

Step: 5 (cvd) only
Fitting and Predicting GM NN




GM NN Complete
Fitting and Predicting Clustering NN




Clustering NN Complete
Step: 5 Complete

Completed All Steps
CPU times: user 6min 37s, sys: 3min 18s, total: 9min 55s
Wall time: 1h 55min 28s


### Step 4
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Learning and Validation Curves: Dimensionality-Reduced Features}
$$

In [None]:
# learning curves
# Each call yields a (fit_times, score_times) pair; the timing summary cell
# further down averages them per method. Order: pca, ica, srp, hlle.

# pca
fit_times_pca, score_times_pca = learn_curve_display(
    pca_train_cvd, y_train_cd, 'cvd', cd_params['sizes'], 'pca'
)

# ica
fit_times_ica, score_times_ica = learn_curve_display(
    ica_train_cvd, y_train_cd, 'cvd', cd_params['sizes'], 'ica'
)

# srp
fit_times_srp, score_times_srp = learn_curve_display(
    sparseRP_train_cvd, y_train_cd, 'cvd', cd_params['sizes'], 'srp'
)

# hlle
fit_times_hlle, score_times_hlle = learn_curve_display(
    manifold_train_cvd, y_train_cd, 'cvd', cd_params['sizes'], 'hlle'
)


In [None]:
## validation curves
# One pass per dimensionality-reduction method, in the order pca, ica, srp, hlle.
# Per method: a curve over param1, a curve over param2, then the error=True
# call, which is given no parameter sweep.
for _X_dr, _model in ((pca_train_cvd, 'pca'),
                      (ica_train_cvd, 'ica'),
                      (sparseRP_train_cvd, 'srp'),
                      (manifold_train_cvd, 'hlle')):
    for _name_key, _range_key in (('param1_name', 'param1_range'),
                                  ('param2_name', 'param2_range')):
        val_curve_display(X_train=_X_dr, y_train=y_train_cd, dset='cvd',
                          param_name=cd_params[_name_key],
                          param_range=cd_params[_range_key],
                          model_name=_model, error=False)
    val_curve_display(X_train=_X_dr, y_train=y_train_cd, dset='cvd',
                      param_name=None, param_range=None,
                      model_name=_model, error=True)


### Step 5
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Learning and Validation Curves: GM and KMeans Features}
$$

In [23]:
# Step-5 entries for the GM and KMeans models; the summary cell prints their
# 'best_score'. The double underscore in gm__best_cvd is kept on purpose
# because that cell reads this exact name.
gm__best_cvd, cl_best_cvd = step5_cv['gm'], step5_cv['kmeans']

# Element [1] of each step-1 entry is what the curve cells below pass as X_train.
X_train_gm, X_train_cl = (step1_cv[key][1] for key in ('gm', 'kmeans'))

# Same sweep settings as the dictionary built in the cell that runs
# put_it_all_together; redefined here with identical values.
cd_params = dict(
    param1_name='beta_1',
    param1_range=[0.25, 0.50, 0.75, 0.90],
    param2_name='beta_2',
    param2_range=[0.22, 0.44, 0.66, 0.88],
    sizes=[0.7, 0.75, 0.80, 0.90],
)

In [None]:
# learning curves
# Each call yields a (fit_times, score_times) pair that the timing summary
# cell averages.

# gm
fit_times_gm, score_times_gm = learn_curve_display(
    X_train_gm, y_train_cd, 'cvd', cd_params['sizes'], 'gm'
)

# cl
fit_times_cl, score_times_cl = learn_curve_display(
    X_train_cl, y_train_cd, 'cvd', cd_params['sizes'], 'km'
)

In [None]:
# validation curves
# gm first, then cl (labelled 'km'). Per model: a curve over param1, a curve
# over param2, then the error=True call, which is given no parameter sweep.
for _X_cluster, _model in ((X_train_gm, 'gm'),
                           (X_train_cl, 'km')):
    for _name_key, _range_key in (('param1_name', 'param1_range'),
                                  ('param2_name', 'param2_range')):
        val_curve_display(X_train=_X_cluster, y_train=y_train_cd, dset='cvd',
                          param_name=cd_params[_name_key],
                          param_range=cd_params[_range_key],
                          model_name=_model, error=False)
    val_curve_display(X_train=_X_cluster, y_train=y_train_cd, dset='cvd',
                      param_name=None, param_range=None,
                      model_name=_model, error=True)

In [79]:
# Summary for the four DR-based models; every row is printed in the order
# pca, ica, srp, hlle.

# mean of the fit times collected by the learning-curve cell
print('fit times in seconds')
print(*(np.mean(times) for times in (fit_times_pca, fit_times_ica,
                                     fit_times_srp, fit_times_hlle)))

# mean of the score times collected by the learning-curve cell
print('score times in seconds')
print(*(np.mean(times) for times in (score_times_pca, score_times_ica,
                                     score_times_srp, score_times_hlle)))

# 'best_score' stored in each model's step-4 entry
print('best score')
print(*(best['best_score'] for best in (pca_best_cvd, ica_best_cvd,
                                        srp_best_cvd, hlle_best_cvd)))

fit times in seconds
3.158353269100189 1.5379676520824432 2.022163838148117 0.549873873591423
score times in seconds
0.01816152036190033 0.01261650025844574 0.01673305034637451 0.00808747112751007
best score
0.7312529618967456 0.7260570434788759 0.734070521435291 0.7502351834430856


In [80]:
# Summary for the two clustering-based models; every row is printed in the
# order gm, cl.

# mean of the fit times collected by the learning-curve cell
print('fit times in seconds')
print(*(np.mean(times) for times in (fit_times_gm, fit_times_cl)))

# mean of the score times collected by the learning-curve cell
print('score times in seconds')
print(*(np.mean(times) for times in (score_times_gm, score_times_cl)))

# 'best_score' stored in each model's step-5 entry
print('best score')
print(*(best['best_score'] for best in (gm__best_cvd, cl_best_cvd)))

fit times in seconds
1.4442969858646393 1.3441387861967087
score times in seconds
0.014128506183624268 0.009821459650993347
best score
0.7169038983866063 0.7258258387030606


$$
\textbf{Nutrition Facts}\\~\\
\textbf{Load, Clean, Preprocess, Scale, Baseline: Nutrition Facts}
$$

In [39]:
%%time
# Build the train/test inputs and targets for the 'nf' dataset in one call.
(X_train_scaled_nf,
 X_test_scaled_nf,
 y_train_nf,
 y_test_nf) = final_dataset(dataset='nf')

NF Loading and Cleaning...
NF Loaded and Cleaned...

NF Splitting...
NF Split...

NF Scaling...
NF Scaled...

Baseline Accuracy Score: 0.44%

CPU times: user 17.4 s, sys: 53 ms, total: 17.5 s
Wall time: 17.6 s


### Step 1
$$
\textbf{Nutrition Facts}\\~\\
\textbf{KMeans Find and View Optimal Clusters}
$$

In [40]:
# nclusters_optimal(X_train=X_train_scaled_nf, y_train=y_train_nf, dset='nf', model='km', step3=False)
# scatter_cluster_centers(X_train=X_train_scaled_nf, k=8, dset='nf', xlabel='Protein', ylabel='Carbohydrate', model='km', step3=False)

$$
\textbf{Nutrition Facts}\\~\\
\textbf{Gaussian Mixture Find and View Optimal Components}
$$

In [41]:
# ncomponents_optimal(X_train=X_train_scaled_nf, y_train=y_train_nf, dset='nf', model='gm', step3=False)
# scatter_component_means(X_train=X_train_scaled_nf, components=8, dset='nf', xlabel='Protein', ylabel='Carbohydrates', model='gm', step3=False)

### Step 2
$$
\textbf{Nutrition Facts}\\~\\
\textbf{PCA, ICA, SRP, HLLE Find and View Optimal Components}
$$

In [42]:
# optimal_ncomponents(X_train_scaled_nf, dset='nf')
# opt_component_plots(X_train_scaled_nf,
#                     pca_comp=3, 
#                     ica_comp=4, 
#                     srp_comp=4, 
#                     hlle_comp=4, 
#                     hlle_neigh=15, 
#                     dset='nf')

In [43]:
# dset = 'nf'
# results_s2_nf = {dset: {'step2': {'pca': None,
#                     'ica': None,
#                     'sparseRP': None,
#                     'manifold': None},}}

# results_s3_nf = step2(X_train=X_train_scaled_nf,
#                       y_train=y_train_nf,
#                       X_test=X_test_scaled_nf,
#                       y_test=y_test_nf,
#                       dset=dset,
#                       results=results_s2_nf,
#                       run_step1=False)

# pca_step3_nf = results_s3_nf['nf']['step2']['pca'][1]
# ica_step3_nf = results_s3_nf['nf']['step2']['ica'][1]
# srp_step3_nf = results_s3_nf['nf']['step2']['sparseRP'][1]
# hlle_step3_nf = results_s3_nf['nf']['step2']['manifold'][1]

### Step 3
$$
\textbf{Nutrition Facts}\\~\\
\textbf{KMeans Find and View Optimal Clusters with DR}
$$

In [45]:
# # pca
# nclusters_optimal(X_train=pca_step3_nf, y_train=y_train_nf, dset='nf', step3=True, model='pca')
# scatter_cluster_centers(X_train=pca_step3_nf, k=8, dset='nf', step3=True, xlabel='Protein', ylabel='Carbohydrate', model='pca')

# # ica
# nclusters_optimal(X_train=ica_step3_nf, y_train=y_train_nf, dset='nf', step3=True, model='ica')
# scatter_cluster_centers(X_train=ica_step3_nf, k=8, dset='nf', step3=True, xlabel='Protein', ylabel='Carbohydrate', model='ica')

# # srp
# nclusters_optimal(X_train=srp_step3_nf, y_train=y_train_nf, dset='nf', step3=True, model='srp')
# scatter_cluster_centers(X_train=srp_step3_nf, k=8, dset='nf', step3=True, xlabel='Protein', ylabel='Carbohydrate', model='srp')

# # hlle
# nclusters_optimal(X_train=hlle_step3_nf, y_train=y_train_nf, dset='nf', step3=True, model='hlle')
# scatter_cluster_centers(X_train=hlle_step3_nf, k=8, dset='nf', step3=True, xlabel='Protein', ylabel='Carbohydrate', model='hlle')

$$
\textbf{Nutrition Facts}\\~\\
\textbf{Gaussian Mixture Find and View Optimal Components with DR}
$$

In [46]:
# # pca
# ncomponents_optimal(pca_step3_nf, y_train_nf, 'nf', step3=True, model='pca')
# scatter_component_means(pca_step3_nf, 8, 'nf', 'Protein', 'Carbohydrates', step3=True, model='pca')

# # ica
# ncomponents_optimal(ica_step3_nf, y_train_nf, 'nf', step3=True, model='ica')
# scatter_component_means(ica_step3_nf, 8, 'nf', 'Protein', 'Carbohydrates', step3=True, model='ica')

# # srp
# ncomponents_optimal(srp_step3_nf, y_train_nf, 'nf', step3=True, model='srp')
# scatter_component_means(srp_step3_nf, 8, 'nf', 'Protein', 'Carbohydrates', step3=True, model='srp')

# # hlle
# ncomponents_optimal(hlle_step3_nf, y_train_nf, 'nf', step3=True, model='hlle')
# scatter_component_means(hlle_step3_nf, 8, 'nf', 'Protein', 'Carbohydrates', step3=True, model='hlle')

$$
\textbf{Nutrition Facts}\\~\\
\textbf{Perform All Steps, Run All Models}
$$

In [47]:
%%time
# Run NF Model
# Executes the whole pipeline for the 'nf' dataset and keeps the nested result
# dictionary for the cells that follow.
results_nf = put_it_all_together(
    X_train=X_train_scaled_nf,
    y_train=y_train_nf,
    X_test=X_test_scaled_nf,
    y_test=y_test_nf,
    dset='nf',
)


Running All Steps
{'nf': {'step1': {'gm': None, 'kmeans': None}, 'step2': {'pca': None, 'ica': None, 'sparseRP': None, 'manifold': None}, 'step3': {'pca': {'gm': None, 'kmeans': None}, 'ica': {'gm': None, 'kmean': None}, 'sparseRP': {'gm': None, 'kmeans': None}, 'manifold': {'gm': None, 'kmeans': None}}, 'step4': {'pca': None, 'ica': None, 'sparseRP': None, 'manifold': None}, 'step5': {'gm': None, 'kmeans': None}}}
{'nf': {'step1': {'gm': None, 'kmeans': None}, 'step2': {'pca': None, 'ica': None, 'sparseRP': None, 'manifold': None}, 'step3': {'pca': {'gm': None, 'kmeans': None}, 'ica': {'gm': None, 'kmean': None}, 'sparseRP': {'gm': None, 'kmeans': None}, 'manifold': {'gm': None, 'kmeans': None}}, 'step4': {'pca': None, 'ica': None, 'sparseRP': None, 'manifold': None}, 'step5': {'gm': None, 'kmeans': None}}}
Step: 1
Fitting and Predicting Expectation Maximization
Fitting and Predicting Gaussian Mixture


Done with Gaussian Mixture
Done with Expectation Maximization

Fitting and Predicting Clustering
Fitting and Predicting KMeans
Done with KMeans
Done with Clustering
Step: 1 Complete

Step: 2
Fitting PCA
Fitting General PCA
Done with General PCA
Done with PCA

Fitting ICA
Fitting and Transforming FastICA
Done with FastICA
Done with ICA

Fitting and Transforming Randomized Projections
Fitting and Transforing with Sparse Random Projection
Done with Sparse Random Projection
Done with Randomized Projections

Fitting and Transforming Manifold Learning
Fitting and Transforming with Locally Linear Embedding: Heissan Mapping
Done with Locally Linear Embedding: Heissan Mapping
Done with Manifold Learning
Step: 2 Complete

Step: 3
Fitting and Predicting Expectation Maximization
Fitting and Predicting Gaussian Mixture
Done with Gaussian Mixture
Done with Expectation Maximization

Fitting and Predicting Clustering
Fitting and Predicting KMeans
Done with KMeans
Done with Clustering
Fitting and Predi

In [48]:

step1_nf = results_nf['nf']['step1']
step2_nf = results_nf['nf']['step2']
step3_nf = results_nf['nf']['step3']