### IMPORTS

In [1]:
# load, preprocess, scale, baseline
from wrangle import final_dataset

# manipulate data
import pandas as pd
import numpy as np

# visualize data
import matplotlib.pyplot as plt
from cluster_plots import (ncomponents_optimal, scatter_component_means,
                           nclusters_optimal, scatter_cluster_centers)
from dimensionality_reduction_plots import optimal_ncomponents, opt_component_plots
from nn_plot import (learn_curve_display,
                    val_curve_display)
import seaborn as sns

# put it all together
from steps import step2, step3
from models import put_it_all_together

# Seed NumPy's legacy global RNG once, up front, so code that draws from np.random is
# repeatable across Restart & Run All. The value lives in a named constant so it can be
# changed in one place.
# NOTE(review): whether the project helpers draw from the global RNG or take their own
# random_state is not visible from this notebook — confirm.
SEED = 123
np.random.seed(SEED)

$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Load, Clean, Preprocess, Scale, Baseline: Cardiovascular Disease}
$$

In [2]:
%%time
# (%%time is a cell magic and must remain the first line of the cell.)
# Build the cardiovascular-disease ('cvd') splits with wrangle.final_dataset. Per the log
# this cell prints, the helper loads/cleans, splits, scales, and reports a baseline
# accuracy score. The four unpacked values are used by the CVD cells below as
# train/test feature matrices and train/test labels.
X_train_scaled_cd, X_test_scaled_cd, y_train_cd, y_test_cd = final_dataset(dataset='cvd')

CVD Loading and Cleaning...
CVD Loaded and Cleaned...

CVD Splitting...
CVD Split...

CVD Scaling...
CVD Scaled...

Baseline Accuracy Score: 0.51%

CPU times: user 248 ms, sys: 26.3 ms, total: 275 ms
Wall time: 546 ms


### Step 1
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{KMeans Find and View Optimal Clusters}
$$

In [3]:
# nclusters_optimal(X_train=X_train_scaled_cd, y_train=y_train_cd, dset='cvd')
# scatter_cluster_centers(X_train=X_train_scaled_cd, k=2, dset='cvd', xlabel='Height', ylabel='Weight')

$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Gaussian Mixture Find and View Optimal Components}
$$

In [4]:
# ncomponents_optimal(X_train_scaled_cd, y_train_cd, 'cvd')
# scatter_component_means(X_train_scaled_cd, 12, 'cvd', 'Height', 'Weight')

### Step 2
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{PCA, ICA, SRP, HLLE Find and View Optimal Components}
$$

In [5]:
# Disabled component search. NOTE(review): the only live call to optimal_ncomponents in this
# notebook (Nutrition Facts, Step 2) passes X_train, y_train, X_test, y_test, dset, so the
# disabled call is written in that same five-argument form — confirm against
# dimensionality_reduction_plots before re-enabling.
# optimal_ncomponents(X_train_scaled_cd, y_train_cd, X_test_scaled_cd, y_test_cd, 'cvd')
# Plot the CVD training data under each reduction method with hand-picked component counts.
# Per the printed log the helper fits PCA, FastICA, Sparse Random Projection and
# Hessian Locally Linear Embedding; hlle_neigh is presumably the HLLE neighbour count.
opt_component_plots(X_train_scaled_cd,
                    pca_comp=3,
                    ica_comp=4, 
                    srp_comp=4, 
                    hlle_comp=4, 
                    hlle_neigh=15,
                    dset='cvd')

Fitting PCA
Fitting General PCA
Done with General PCA
Done with PCA

Fitting ICA
Fitting and Transforming FastICA
Done with FastICA
Done with ICA

Fitting and Transforming Randomized Projections
Fitting and Transforing with Sparse Random Projection
Done with Sparse Random Projection
Done with Randomized Projections

Fitting and Transforming Manifold Learning
Fitting and Transforming with Locally Linear Embedding: Heissan Mapping


### Step 3
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{KMeans Find and View Optimal Clusters with DR}
$$

$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Gaussian Mixture Find and View Optimal Components with DR}
$$

$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Perform All Steps, Run All Models}
$$

In [None]:
# %%time
# # Run CVD Model
# results_cv = put_it_all_together(X_train=X_train_scaled_cd,
#                                 y_train=y_train_cd,
#                                 X_test=X_test_scaled_cd,
#                                 y_test=y_test_cd,
#                                 dset='cvd')

In [48]:

# Disabled: unpack the per-step CVD results. Everything here reads results_cv, which is only
# assigned by the put_it_all_together cell above (also commented out) — re-enable that first.
# step1_cv = results_cv['cvd']['step1']
# step2_cv = results_cv['cvd']['step2']
# step3_cv = results_cv['cvd']['step3']
# step4_cv = results_cv['cvd']['step4']
# step5_cv = results_cv['cvd']['step5']

# pca_best_cvd = step4_cv['pca']
# ica_best_cvd = step4_cv['ica']
# srp_best_cvd = step4_cv['sparseRP']
# hlle_best_cvd = step4_cv['manifold']

# Element [1] of each step-2 entry is used below as the transformed training frame
# (it is sliced with .iloc) — NOTE(review): confirm the tuple layout in models/steps.
# pca_train_cvd = step2_cv['pca'][1]
# ica_train_cvd = step2_cv['ica'][1]
# sparseRP_train_cvd = step2_cv['sparseRP'][1]
# manifold_train_cvd = step2_cv['manifold'][1]

# Hyper-parameter grids and training-set fractions for the Step 4 curve helpers.
# cd_params = {'param1_name': 'beta_1',
#                 'param1_range': [0.25, 0.50, 0.75, 0.90],
#                 'param2_name': 'beta_2',
#                 'param2_range': [0.22, 0.44, 0.66, 0.88],
#                 'sizes': [0.7, 0.75, 0.80, 0.90]}

# First two columns of each reduced frame.
# x_pca_cvd = pca_train_cvd.iloc[:, 0:2]
# x_ica_cvd = ica_train_cvd.iloc[:, 0:2]
# x_srp_cvd = sparseRP_train_cvd.iloc[:, 0:2]
# x_hlle_cvd = manifold_train_cvd.iloc[:, 0:2]

### Step 4
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Learning and Validation Curves}
$$

In [None]:
# ## learning curves
# Disabled. The inputs are the reduced training frames defined in the results-unpacking cell
# (pca_train_cvd, ica_train_cvd, sparseRP_train_cvd, manifold_train_cvd); the un-suffixed
# names previously used here are not defined anywhere in this notebook.
# # pca
# fit_times_pca, score_times_pca = learn_curve_display(pca_train_cvd, y_train_cd, 'cvd',
#                                                     cd_params['sizes'], 'pca')

# # ica
# fit_times_ica, score_times_ica = learn_curve_display(ica_train_cvd, y_train_cd, 'cvd',
#                                                     cd_params['sizes'], 'ica')

# # srp
# fit_times_srp, score_times_srp = learn_curve_display(sparseRP_train_cvd, y_train_cd, 'cvd',
#                                                     cd_params['sizes'], 'srp')

# # hlle
# fit_times_hlle, score_times_hlle = learn_curve_display(manifold_train_cvd, y_train_cd, 'cvd',
#                                                     cd_params['sizes'], 'hlle')

In [None]:
# ## validation curves
# Disabled. NOTE(review): every call below passes the unreduced X_train_scaled_cd regardless of
# model_name, whereas the Step 4 learning curves use the per-method reduced frames and the
# Step 5 validation curves pass X_train_gm / X_train_cl. Confirm which input
# val_curve_display expects before re-enabling.
# # pca
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd', 
#                     cd_params['param1_name'], cd_params['param1_range'],
#                     model_name='pca',error=False)
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd', 
#                     cd_params['param2_name'], cd_params['param2_range'],
#                     model_name='pca',error=False)
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd', 
#                     None, None, model_name='pca', error=True)

# # ica
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd',
#                     cd_params['param1_name'], cd_params['param1_range'],
#                     model_name='ica',error=False)
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd', 
#                     cd_params['param2_name'], cd_params['param2_range'],
#                     model_name='ica', error=False)
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd',
#                     None, None, model_name='ica', error=True)

# # srp
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd', 
#                     cd_params['param1_name'], cd_params['param1_range'],
#                     model_name='srp', error=False)
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd', 
#                     cd_params['param2_name'], cd_params['param2_range'],
#                     model_name='srp', error=False)
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd',
#                     None, None, model_name='srp', error=True)

# # hlle
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd', 
#                     cd_params['param1_name'], cd_params['param1_range'],
#                     model_name='hlle', error=False)
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd', 
#                     cd_params['param2_name'], cd_params['param2_range'],
#                     model_name='hlle', error=False)
# val_curve_display(X_train_scaled_cd, y_train_cd, 'cvd',
#                     None, None, model_name='hlle', error=True)


In [63]:
# # fit times
# # pca
# fit_times_pca, score_times_pca

# # ica
# fit_times_ica, score_times_ica

# # srp
# fit_times_srp, score_times_srp

# # hlle
# fit_times_hlle, score_times_hlle

(array([[0.99279189, 0.45091844, 1.16929531, 1.1179595 ],
        [1.09546185, 0.73394585, 1.767488  , 1.46814799],
        [1.88248777, 1.12859964, 1.93125844, 1.35569763],
        [1.92586231, 1.93234634, 1.92384958, 1.15773344]]),
 array([[0.01084948, 0.01101899, 0.01175785, 0.010149  ],
        [0.01082826, 0.01078272, 0.00654244, 0.00530791],
        [0.01033282, 0.01164246, 0.00609827, 0.00461245],
        [0.01057339, 0.01038527, 0.00578284, 0.00544524]]))

### Step 5
$$
\textbf{Cardiovascular Disease}\\~\\
\textbf{Learning and Validation Curves: Gaussian Mixture and KMeans}
$$

In [52]:
# Disabled: Step 5 inputs. Reads step5_cv / step1_cv from the results-unpacking cell.
# gm_best_cvd = step5_cv['gm']
# cl_best_cvd = step5_cv['kmeans']

# Element [1] of each step-1 entry is passed to the curve helpers as the training input —
# NOTE(review): confirm the tuple layout in models/steps.
# X_train_gm = step1_cv['gm'][1]
# X_train_cl = step1_cv['kmeans'][1]

# Same grids as the cd_params dict in the Step 4 prep cell; repeated here so this section
# can be run on its own.
# cd_params = {'param1_name': 'beta_1',
#                 'param1_range': [0.25, 0.50, 0.75, 0.90],
#                 'param2_name': 'beta_2',
#                 'param2_range': [0.22, 0.44, 0.66, 0.88],
#                 'sizes': [0.7, 0.75, 0.80, 0.90]}

In [None]:
# # learning curves
# Disabled. Labels use a double '#' so that removing one comment level from the whole cell
# leaves them as comments, matching the Step 4 cells.
# # gm
# fit_times_gm, score_times_gm = learn_curve_display(X_train_gm, y_train_cd, 'cvd', cd_params['sizes'], 'gm')

# # cl
# fit_times_cl, score_times_cl = learn_curve_display(X_train_cl, y_train_cd, 'cvd', cd_params['sizes'], 'km')

In [None]:
# # validation curves
# # gm
# val_curve_display(X_train_gm, y_train_cd, 'cvd', 
#                     cd_params['param1_name'], cd_params['param1_range'],
#                     model_name='gm', error=False)
# val_curve_display(X_train_gm, y_train_cd, 'cvd', 
#                     cd_params['param2_name'], cd_params['param2_range'],
#                     model_name='gm', error=False)
# val_curve_display(X_train_gm, y_train_cd, 'cvd', 
#                     None, None, model_name='gm', error=True)

# # cl
# val_curve_display(X_train_cl, y_train_cd, 'cvd',
#                     cd_params['param1_name'], cd_params['param1_range'],
#                     model_name='km', error=False)
# val_curve_display(X_train_cl, y_train_cd, 'cvd', 
#                     cd_params['param2_name'], cd_params['param2_range'],
#                     model_name='km', error=False)
# val_curve_display(X_train_cl, y_train_cd, 'cvd',
#                     None, None, model_name='km', error=True)

In [64]:
# fit_times_gm, score_times_gm

# fit_times_cl, score_times_cl

(array([[0.93194413, 1.3964355 , 1.48192883, 1.19972396],
        [1.09315753, 1.11645222, 1.70269585, 1.34612632],
        [1.10801911, 0.80493569, 1.47939348, 0.77539015],
        [1.70526123, 2.05124879, 1.57843089, 1.01127219]]),
 array([[0.01119161, 0.01094532, 0.01124763, 0.01091003],
        [0.01207471, 0.01151919, 0.00619459, 0.0054791 ],
        [0.01100445, 0.01106215, 0.00988054, 0.00644493],
        [0.01128411, 0.01150751, 0.00570965, 0.00392413]]))

$$
\textbf{Nutrition Facts}\\~\\
\textbf{Load, Clean, Preprocess, Scale, Baseline: Nutrition Facts}
$$

In [55]:
%%time
# (%%time is a cell magic and must remain the first line of the cell.)
# Build the nutrition-facts ('nf') splits with wrangle.final_dataset. Per the log this cell
# prints, the helper loads/cleans, splits, scales, and reports a baseline accuracy score.
# The four unpacked values are used by the NF cells below as train/test feature matrices
# and train/test labels.
X_train_scaled_nf, X_test_scaled_nf, y_train_nf, y_test_nf = final_dataset(dataset='nf')

NF Loading and Cleaning...
NF Loaded and Cleaned...

NF Splitting...
NF Split...

NF Scaling...
NF Scaled...

Baseline Accuracy Score: 0.44%

CPU times: user 15.7 s, sys: 9.63 ms, total: 15.7 s
Wall time: 15.8 s


### Step 1
$$
\textbf{Nutrition Facts}\\~\\
\textbf{KMeans Find and View Optimal Clusters}
$$

In [56]:
# KMeans cluster-count search (per the section heading) on the scaled NF training split,
# using cluster_plots.nclusters_optimal.
nclusters_optimal(X_train=X_train_scaled_nf, y_train=y_train_nf, dset='nf')
# Disabled follow-up: cluster-center scatter for k=4, axes labelled Protein / Carbohydrate.
# scatter_cluster_centers(X_train=X_train_scaled_nf, k=4, dset='nf', xlabel='Protein', ylabel='Carbohydrate')

$$
\textbf{Nutrition Facts}\\~\\
\textbf{Gaussian Mixture Find and View Optimal Components}
$$

In [57]:
# Gaussian Mixture component-count search (per the section heading) on the scaled NF
# training split, using cluster_plots.ncomponents_optimal.
ncomponents_optimal(X_train_scaled_nf, y_train_nf, 'nf')
# Disabled follow-up: component-means scatter for 5 components.
# NOTE(review): the y label here reads 'Carbohydrates' while the KMeans cell uses
# 'Carbohydrate' — confirm which is intended.
# scatter_component_means(X_train_scaled_nf, 5, 'nf', 'Protein', 'Carbohydrates')

### Step 2
$$
\textbf{Nutrition Facts}\\~\\
\textbf{PCA, ICA, SRP, HLLE Find and View Optimal Components}
$$

In [None]:
# Component-count search for the dimensionality-reduction methods named in the section
# heading (PCA, ICA, SRP, HLLE) on the NF train/test splits, using
# dimensionality_reduction_plots.optimal_ncomponents.
optimal_ncomponents(X_train_scaled_nf, y_train_nf, X_test_scaled_nf, y_test_nf, 'nf')
# Disabled follow-up plot. NOTE(review): the positional values presumably map to pca_comp,
# ica_comp, srp_comp, hlle_comp, hlle_neigh, dset as spelled out by keyword in the CVD
# call — confirm the parameter order before re-enabling.
# opt_component_plots(X_train_scaled_nf, 4, 4, 4, 5, 21, 'nf')

### Step 3
$$
\textbf{Nutrition Facts}\\~\\
\textbf{KMeans Find and View Optimal Clusters with DR}
$$

$$
\textbf{Nutrition Facts}\\~\\
\textbf{Gaussian Mixture Find and View Optimal Components with DR}
$$

$$
\textbf{Nutrition Facts}\\~\\
\textbf{Perform All Steps, Run All Models}
$$

In [59]:
# %%time
# # Run NF Model
# results_nf = put_it_all_together(X_train=X_train_scaled_nf,
#                                 y_train=y_train_nf,
#                                 X_test=X_test_scaled_nf, 
#                                 y_test=y_test_nf,
#                                 dset='nf')


Running All Steps
Step: 1
Fitting and Predicting Expectation Maximization
Fitting and Predicting Gaussian Mixture
Done with Gaussian Mixture
Done with Expectation Maximization

Fitting and Predicting Clustering
Fitting and Predicting KMeans
Done with KMeans
Done with Clustering
Step: 1 Complete

Step: 2
Fitting PCA
Fitting General PCA
Done with General PCA
Done with PCA

Fitting ICA
Fitting and Transforming FastICA
Done with FastICA
Done with ICA

Fitting and Transforming Randomized Projections
Fitting and Transforing with Sparse Random Projection
Done with Sparse Random Projection
Done with Randomized Projections

Fitting and Transforming Manifold Learning
Fitting and Transforming with Locally Linear Embedding: Heissan Mapping
Done with Locally Linear Embedding: Heissan Mapping
Done with Manifold Learning
Step: 2 Complete

Step: 3
Fitting and Predicting Expectation Maximization
Fitting and Predicting Gaussian Mixture
Done with Gaussian Mixture
Done with Expectation Maximization

Fitt

In [1]:

# Disabled: unpack the per-step NF results. Everything here reads results_nf, which is only
# assigned by the put_it_all_together cell above (currently commented out); running these
# lines without it raises NameError: name 'results_nf' is not defined.
# step1_nf = results_nf['nf']['step1']
# step2_nf = results_nf['nf']['step2']
# step3_nf = results_nf['nf']['step3']

# Element [1] of each step-2 entry is used below as the transformed training frame
# (it is sliced with .iloc) — NOTE(review): confirm the tuple layout in models/steps.
# pca_train_nf = step2_nf['pca'][1]
# ica_train_nf = step2_nf['ica'][1]
# sparseRP_train_nf = step2_nf['sparseRP'][1]
# manifold_train_nf = step2_nf['manifold'][1]

# First two columns of each reduced frame.
# x_pca_nf = pca_train_nf.iloc[:, 0:2]
# x_ica_nf = ica_train_nf.iloc[:, 0:2]
# x_srp_nf = sparseRP_train_nf.iloc[:, 0:2]
# x_hlle_nf = manifold_train_nf.iloc[:, 0:2]

NameError: name 'results_nf' is not defined