# Naive Bayes Classifier

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.naive_bayes import GaussianNB

from common import load_image, show_image, load_data, split_data, validation_scores, merge_results

In [2]:
# Directory to save results
results_dir = 'naive_bayes_results'

if not os.path.exists(results_dir):
    os.makedirs(results_dir)
    print(f'Results directory "{results_dir}" created')
else:
    print(f'Results directory "{results_dir}" already exists')

Results directory "naive_bayes_results" already exists


In [3]:
# Load data
df = load_data()

# Split data
X_train, X_test, y_train, y_test = split_data(df)
# Preview shape of split data
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(5335, 256) (5335,) (1334, 256) (1334,)


In [4]:
# Setup a pipeline for a strict ordering operations (and names for defining param grids)
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Placehold for Scalers
    ('pca', PCA()),  # Placeholder for PCA
    ('feature_selection', SelectKBest()),  # Placeholder for feature selection
    ('gnb', GaussianNB())  # placeholder for MLPClassifier
])

# These parameters will be the same for each of the separate cases defined later
param_grid = {
    # Scaler: 2 normalization methods
    'scaler': [StandardScaler(), MinMaxScaler()],
    # Principal component analysis
    'pca': [
        PCA(n_components=8),
        PCA(n_components=16),
        PCA(n_components=32),
        PCA(n_components=64),
        PCA(n_components=64),
        PCA(n_components=128),
        None,
    ],
    # Feature Selection: 2 methods + 1 (no method)
    'feature_selection': [
        # VarianceThreshold(threshold=0.1),  # No feature meets this threshold
        VarianceThreshold(threshold=0.2),
        VarianceThreshold(threshold=0.4),
        VarianceThreshold(threshold=0.6),
        VarianceThreshold(threshold=0.8),
        None,
    ],
    # No tunable hyperparamters for GaussianNB
}

results = validation_scores(pipe, param_grid, X_train, y_train, verbose=3)
for scorer_name, scorer_results in results.items():
    print(scorer_name)
    print(scorer_results)

Fitting 4 folds for each of 70 candidates, totalling 280 fits


16 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\sande\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File

Fitting 4 folds for each of 70 candidates, totalling 280 fits


16 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\sande\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File

Fitting 4 folds for each of 70 candidates, totalling 280 fits


16 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\sande\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\sande\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File

accuracy
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.336000      0.051130         0.025030        0.005112   
1        0.444397      0.041159         0.024009        0.003261   
2        0.580381      0.024130         0.044122        0.010910   
3        0.546144      0.022720         0.036302        0.005771   
4        0.509896      0.032069         0.040430        0.005667   
..            ...           ...              ...             ...   
65       1.205175      0.338277         0.038027        0.009383   
66       1.084683      0.078109         0.109263        0.027594   
67       0.942559      0.075672         0.054252        0.011693   
68       0.299403      0.125932         0.106505        0.037448   
69       0.339498      0.122171         0.097499        0.016652   

             param_feature_selection              param_pca      param_scaler  \
0   VarianceThreshold(threshold=0.2)    PCA(n_components=8)  StandardScaler()   
1   Variance

In [5]:
mdf = merge_results(results)

# Remove entries for fits that failed
mdf = mdf[~mdf['mean_test_score'].isna()]
mdf.head()

# Save merged results to csv file
mdf.to_csv(os.path.join(results_dir, 'gnb_results.csv'), index=False)

In [7]:
# Get top scoring for each scorer method
top_each = pd.concat([
    mdf[mdf['scorer']==scorer].sort_values(by='rank_test_score').head(3)
    for scorer in sorted(mdf['scorer'].unique())
], ignore_index=True)

# Save top 3 performing models for each scorers (9 total results, but likely fewer than 9 unique param options)
top_each.to_csv(os.path.join(results_dir, 'top_3_per_scorer.csv'), index=False)

top_each

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_feature_selection,param_pca,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,scorer
0,0.576683,0.02649,0.036005,0.005098,VarianceThreshold(threshold=0.4),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866567,0.884558,0.87931,0.87997,0.877601,0.006684,1,accuracy
1,0.687598,0.090507,0.035748,0.010782,VarianceThreshold(threshold=0.4),PCA(n_components=8),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866567,0.884558,0.87931,0.87997,0.877601,0.006684,1,accuracy
2,1.140128,0.070333,0.051755,0.007523,VarianceThreshold(threshold=0.4),PCA(n_components=128),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866567,0.884558,0.87931,0.87997,0.877601,0.006684,1,accuracy
3,0.552957,0.038427,0.046578,0.008779,VarianceThreshold(threshold=0.4),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866571,0.88506,0.879819,0.880411,0.877965,0.006884,1,f1_score
4,0.622134,0.052201,0.037324,0.007792,VarianceThreshold(threshold=0.4),PCA(n_components=8),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866571,0.88506,0.879819,0.880411,0.877965,0.006884,1,f1_score
5,1.102552,0.045036,0.044763,0.003061,VarianceThreshold(threshold=0.4),PCA(n_components=128),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866571,0.88506,0.879819,0.880411,0.877965,0.006884,1,f1_score
6,0.751019,0.205066,0.042502,0.003204,VarianceThreshold(threshold=0.2),PCA(n_components=16),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.981876,0.982053,0.983194,0.983396,0.98263,0.000672,1,roc_auc
7,0.542536,0.06147,0.053504,0.008077,VarianceThreshold(threshold=0.2),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.981877,0.982052,0.983195,0.983396,0.98263,0.000672,2,roc_auc
8,1.091359,0.094195,0.055502,0.004923,VarianceThreshold(threshold=0.2),PCA(n_components=128),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.981876,0.982053,0.983194,0.983396,0.98263,0.000672,3,roc_auc


In [8]:
"""
Best model parameters from results is as follows:
{
    'feature_selection': VarianceThreshold(threshold=0.4),
    'pca': PCA(n_components=32),
    'scaler': StandardScaler()
}

"""
best_params = top_each.iloc[0]['params']
best_params

{'feature_selection': VarianceThreshold(threshold=0.4),
 'pca': PCA(n_components=32),
 'scaler': StandardScaler()}

In [9]:
# Results for all scoring methods for chosen best model
best_model_scores = mdf[mdf['params']==best_params]

# Save all 3 results for best model (3x different scorer methods)
best_model_scores.to_csv(os.path.join(results_dir, 'best_model_params_and_scores.csv'))

# Show all 3 scores for best model params
best_model_scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_feature_selection,param_pca,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,scorer
18,0.576683,0.02649,0.036005,0.005098,VarianceThreshold(threshold=0.4),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866567,0.884558,0.87931,0.87997,0.877601,0.006684,1,accuracy
88,0.634748,0.068854,0.052251,0.008377,VarianceThreshold(threshold=0.4),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.981895,0.981951,0.983106,0.983542,0.982624,0.000718,13,roc_auc
158,0.552957,0.038427,0.046578,0.008779,VarianceThreshold(threshold=0.4),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866571,0.88506,0.879819,0.880411,0.877965,0.006884,1,f1_score


In [10]:
for scorer in sorted(best_model_scores['scorer'].unique()):
    mean_test_score = best_model_scores[best_model_scores['scorer']==scorer]['mean_test_score'].values[0]
    print(f'{scorer}: {mean_test_score:.4%}')

accuracy: 87.7601%
f1_score: 87.7965%
roc_auc: 98.2624%


In [11]:
# Try a couple clustering options with the best model params
kmeans = KMeans(n_clusters=5, random_state=0)

X_train_clust = X_train.copy()
X_train_clust['cluster'] = kmeans.fit_predict(X_train)

X_test_clust = X_test.copy()
X_test_clust['cluster'] = kmeans.predict(X_test)

In [12]:
# Best model
# Setup a pipeline for a strict ordering operations (and names for defining param grids)
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Placehold for Scalers
    ('pca', PCA()),  # Placeholder for PCA
    ('feature_selection', VarianceThreshold()),  # Placeholder for feature selection
    ('gnb', GaussianNB())  # placeholder for MLPClassifier
])

# These parameters will be the same for each of the separate cases defined later
param_grid = {
    # Scaler: 2 normalization methods
    'scaler': [StandardScaler()],
    # Principal component analysis
    'pca': [
        PCA(n_components=32),
    ],
    # Feature Selection: 2 methods + 1 (no method)
    'feature_selection': [
        VarianceThreshold(threshold=0.4),
    ],
    # No tunable hyperparamters for GaussianNB
}

results_clust, best_params_clust = validation_scores(pipe, param_grid, X_train_clust, y_train, verbose=3, return_params=True)
for scorer_name, scorer_results in results_clust.items():
    print(scorer_name)
    print(scorer_results)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
accuracy
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.290333      0.011108         0.016499        0.005766   

            param_feature_selection             param_pca      param_scaler  \
0  VarianceThreshold(threshold=0.4)  PCA(n_components=32)  StandardScaler()   

                                              params  split0_test_score  \
0  {'feature_selection': VarianceThreshold(thresh...           0.866567   

   split1_test_score  split2_test_score  split3_test_score  mean_test_score  \
0           0.885307            0.88081            0.88072         0.878351   

   std_test_score  rank_test_score  
0        0.007052                1  
roc_auc
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.205404      0.005932         0.017751        0.0034

In [13]:
mdf_clust = merge_results(results_clust)

# Remove entries for fits that failed
mdf_clust = mdf_clust[~mdf_clust['mean_test_score'].isna()].sort_values(by='scorer')

# Save merged results to csv file
mdf_clust.to_csv(os.path.join(results_dir, 'gnb_clust_results.csv'), index=False)

mdf_clust.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_feature_selection,param_pca,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,scorer
0,0.290333,0.011108,0.016499,0.005766,VarianceThreshold(threshold=0.4),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866567,0.885307,0.88081,0.88072,0.878351,0.007052,1,accuracy
2,0.203236,0.011883,0.01225,0.002488,VarianceThreshold(threshold=0.4),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.866476,0.885796,0.881297,0.881058,0.878657,0.007281,1,f1_score
1,0.205404,0.005932,0.017751,0.003491,VarianceThreshold(threshold=0.4),PCA(n_components=32),StandardScaler(),{'feature_selection': VarianceThreshold(thresh...,0.981934,0.981899,0.983052,0.983496,0.982595,0.000697,1,roc_auc


In [14]:
# There was a small improvement in accuracy and f1_score. Small decrease in roc_auc
# Conclusion: clustering is helpful to this model
print(best_params_clust)

{'feature_selection': VarianceThreshold(threshold=0.4), 'pca': PCA(n_components=32), 'scaler': StandardScaler()}


In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Check final test scores on test data with best setup (w/ clustering)
best_model = GaussianNB(random_state=0)
y_pred = best_model.predict(X_test_clust)
y_pred_proba = best_model.predict_proba(X_test_clust)
best_acc = accuracy_score(y_test, y_pred)
best_f1 = f1_score(y_test, y_pred, average='weighted')
best_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

best_scores = pd.DataFrame({
    'Accuracy': best_acc,
    'f1 Score': best_f1,
    'ROC AUC': best_roc_auc
})

best_scores

TypeError: __init__() got an unexpected keyword argument 'feature_selection'