In [None]:
import pandas as pd
import numpy as np
import sklearn.naive_bayes as nB
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn.manifold import LocallyLinearEmbedding

## Plot Function ##

In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw three diagnostic panels for an estimator.

    Panel 0 shows the training and cross-validation scores against the number
    of training examples, panel 1 shows fit time against the number of
    training examples, and panel 2 shows cross-validation score against fit
    time.

    Parameters
    ----------
    estimator : object
        Estimator forwarded unchanged to ``learning_curve``.
    title : str
        Title of the first panel.
    X, y : array-like
        Features and target forwarded unchanged to ``learning_curve``.
    axes : sequence of three matplotlib axes, optional
        Axes to draw on. When ``None`` a 1x3 figure of size (20, 5) is created.
    ylim : tuple, optional
        Unpacked into ``set_ylim`` of the first panel when given.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module (``plt``), so callers can chain
        ``.show()`` / ``.savefig()``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)

    # Aggregate over the CV splits (axis 1); one value per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score.
    # Mean fit times are not guaranteed to grow with the training size (for a
    # fast estimator they are dominated by noise), so order the points by fit
    # time first; otherwise the line and the shaded band fold back on
    # themselves.
    order = np.argsort(fit_times_mean)
    fit_times_sorted = fit_times_mean[order]
    test_scores_mean_sorted = test_scores_mean[order]
    test_scores_std_sorted = test_scores_std[order]
    axes[2].grid()
    axes[2].plot(fit_times_sorted, test_scores_mean_sorted, 'o-')
    axes[2].fill_between(fit_times_sorted,
                         test_scores_mean_sorted - test_scores_std_sorted,
                         test_scores_mean_sorted + test_scores_std_sorted,
                         alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
# NOTE(review): presumably an already-preprocessed table, judging by the file name - confirm.
df = pd.read_csv('preprocess.csv')

In [None]:
# The 'prob' column is the target; every other column is used as a feature.
y = df['prob']
X = df.drop(columns='prob')

## Bayes with original data ##

In [None]:
# NOTE(review): `gnb` is not referenced by any other cell visible in this
# notebook; each learning-curve cell builds its own `estimator` instead.
gnb = GaussianNB()

In [None]:
# Baseline run: Gaussian Naive Bayes on the untransformed feature matrix.
estimator = GaussianNB()
title = "Learning Curve (Naive Bayes)"

# 200 random splits holding out 30% of the rows each time; the fixed seed
# makes the splits repeatable.
cv = ShuffleSplit(random_state=0, test_size=0.3, n_splits=200)

plot_learning_curve(
    estimator=estimator,
    title=title,
    X=X,
    y=y,
    cv=cv,
    ylim=(-0.01, 1.01),
    n_jobs=16,
)

## Naive Bayes with Standardized Data ##

In [None]:
# Columns to standardize; every other column of X is copied through unchanged.
columns_to_standardize = [
    'age', 'presure_blood_resting', 'colesterol',
    'cigarettes_per_day', 'smoker_years', 'max_heart_rate',
    'res_heart_rate', 'blood_presure_sistoles', 'blood_presure_diastoles',
    'rest_after_exercicie_presure',
]

# Fit the named scaler itself (instead of a second throw-away instance) so the
# fitted statistics stay available, e.g. for inverse_transform or new data.
StandardScalerr = StandardScaler()
X_standart = X.copy()
X_standart[columns_to_standardize] = StandardScalerr.fit_transform(
    X_standart[columns_to_standardize])

# NOTE(review): the scaler is fitted on the full dataset before any
# cross-validation split, so held-out rows influence the scaling statistics.
# Wrapping scaler + estimator in a sklearn Pipeline would avoid that leakage.

In [None]:
# Display the feature column labels of X.
X.columns

In [None]:
# Histogram grid of every column (pandas creates its own figure for this).
hist = X_standart.hist(bins=50, figsize=(20, 20))

# Combined boxplot on a dedicated figure. Without an explicit `ax`, pandas
# draws on the current axes, which at this point is the last subplot of the
# histogram grid above, so the boxplot used to land inside that grid.
# The original two consecutive plt.xticks(rotation=...) calls cancelled each
# other; vertical labels are kept so the long column names do not overlap.
_, box_ax = plt.subplots(figsize=(20, 6))
box = X_standart.boxplot(ax=box_ax, rot=90)

# Per-column detail: narrow boxplot on the left, histogram on the right.
for name in X_standart.columns:
    fig, axes = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(9, 5))
    X_standart.boxplot(column=name, ax=axes[0])
    X_standart.hist(column=name, ax=axes[1])
plt.show()

In [None]:
# Same experiment as the baseline, but on the standardized feature matrix.
estimator = GaussianNB()
title = "Learning Curve (Naive Bayes) Standarized"

# 200 random splits holding out 30% of the rows each time; the fixed seed
# makes the splits repeatable.
cv = ShuffleSplit(random_state=0, test_size=0.3, n_splits=200)

plot_learning_curve(
    estimator=estimator,
    title=title,
    X=X_standart,
    y=y,
    cv=cv,
    ylim=(-0.01, 1.01),
    n_jobs=16,
)

## Naive Bayes with Standardized Data and PCA Applied ##

In [None]:
# Fit a 4-component PCA on the standardized features.
n_components = 4
columns = df.columns
pca = decomposition.PCA(n_components=n_components).fit(X_standart)

# Explained-variance ratio of each component, largest first.
eigenvalues = sorted(pca.explained_variance_ratio_, reverse=True)

# For every component, find the original feature with the largest absolute
# loading and remember its column label.
n_pcs = len(pca.components_)
most_important = [np.abs(component).argmax() for component in pca.components_]
initial_feature_names = X.columns
most_important_names = [initial_feature_names[idx] for idx in most_important]



In [None]:
# Project the standardized features onto the fitted principal components.
# Columns are labelled "PC<k> (<dominant feature>)": a component is a mix of
# all features, not the single feature with the largest loading, and two
# components may share the same dominant feature. Reusing the bare feature
# names could therefore create duplicate column labels, which breaks the
# per-column plots that select columns by name.
pca_column_names = [
    f"PC{i + 1} ({name})" for i, name in enumerate(most_important_names)
]
X_pca = pd.DataFrame(pca.transform(X_standart), columns=pca_column_names)
X_pca

In [None]:
# Histogram grid of every PCA column (pandas creates its own figure for this).
hist = X_pca.hist(bins=50, figsize=(20, 20))

# Combined boxplot on a dedicated figure. Without an explicit `ax`, pandas
# draws on the current axes, which at this point is the last subplot of the
# histogram grid above, so the boxplot used to land inside that grid.
# The original two consecutive plt.xticks(rotation=...) calls cancelled each
# other; vertical labels are kept so the column names do not overlap.
_, box_ax = plt.subplots(figsize=(20, 6))
box = X_pca.boxplot(ax=box_ax, rot=90)

# Per-column detail: narrow boxplot on the left, histogram on the right.
for name in X_pca.columns:
    fig, axes = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(9, 5))
    X_pca.boxplot(column=name, ax=axes[0])
    X_pca.hist(column=name, ax=axes[1])
plt.show()

In [None]:
# Same experiment as the baseline, but on the PCA-projected features.
estimator = GaussianNB()
title = "Learning Curve (Naive Bayes) with PCA and Standarized"

# 200 random splits holding out 30% of the rows each time; the fixed seed
# makes the splits repeatable.
cv = ShuffleSplit(random_state=0, test_size=0.3, n_splits=200)

plot_learning_curve(
    estimator=estimator,
    title=title,
    X=X_pca,
    y=y,
    cv=cv,
    ylim=(-0.01, 1.01),
    n_jobs=16,
)

In [None]:
# Fit a PCA that keeps every component so the whole explained-variance
# spectrum can be inspected.
# NOTE(review): this rebinds `n_components`, `pca` and `most_important_names`,
# which the 4-component PCA cells also use; re-running those cells after this
# one will pick up the full-size PCA.
n_components = X.shape[1]
columns = df.columns
pca = decomposition.PCA(n_components=n_components)
pca.fit(X_standart)

# Explained-variance ratio per component. scikit-learn documents the
# components as already ordered by decreasing explained variance, so the sort
# is only a safeguard and keeps the values aligned with the component order.
eigenvalues = sorted(list(pca.explained_variance_ratio_), reverse=True)
n_pcs = pca.components_.shape[0]
# Index and label of the feature with the largest absolute loading per component.
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]
initial_feature_names = X.columns
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]

# Several components can share the same dominant feature. Matplotlib treats
# string x values as categories, so repeated labels would stack their bars on
# one position and hide components. Prefixing the component number keeps
# every bar distinct.
bar_labels = [f"PC{i + 1} ({name})" for i, name in enumerate(most_important_names)]

plt.figure(figsize=(8, 6), dpi=80)
plt.bar(bar_labels, eigenvalues)
plt.xlabel("Principal component (dominant feature)")
plt.ylabel("Explained variance ratio")
plt.title("PCA explained variance per component")
plt.xticks(rotation='vertical')
plt.show()

## Naive Bayes with Min-Max Scaled Data and Locally Linear Embedding ##

In [None]:
# Rescale every column of X with a default MinMaxScaler and wrap the result
# back into a DataFrame that carries X's column labels.
MinMaxScalerr = MinMaxScaler()
X_scaled = pd.DataFrame(MinMaxScalerr.fit_transform(X), columns=X.columns)
X_scaled

In [None]:
# Embed the min-max scaled features with Locally Linear Embedding, keeping as
# many output dimensions as there are input columns.
# NOTE(review): the embedding columns reuse the original feature labels, but
# each column is an embedding coordinate, not the feature it is named after.
columns = X_scaled.columns
# random_state pins the result when the ARPACK eigen-solver is selected, so
# the embedding is reproducible across runs.
LocallyLinear = LocallyLinearEmbedding(n_components=len(columns), random_state=0)
X_locally = pd.DataFrame(LocallyLinear.fit_transform(X_scaled), columns=columns)


# The input is min-max scaled (X_scaled), not standardized, so say so.
title = "Learning Curve (Naive Bayes) with Locally Linear Embedding and Min-Max Scaled"

# 200 random splits holding out 30% of the rows each time; the fixed seed
# makes the splits repeatable.
cv = ShuffleSplit(n_splits=200, test_size=0.3, random_state=0)

estimator = GaussianNB()
plot_learning_curve(estimator, title, X_locally, y, ylim=(-0.01, 1.01),
                    cv=cv, n_jobs=16)

In [None]:
# Histogram grid of every embedding column (pandas creates its own figure for this).
hist = X_locally.hist(bins=50, figsize=(20, 20))

# Combined boxplot on a dedicated figure. Without an explicit `ax`, pandas
# draws on the current axes, which at this point is the last subplot of the
# histogram grid above, so the boxplot used to land inside that grid.
# The original two consecutive plt.xticks(rotation=...) calls cancelled each
# other; vertical labels are kept so the long column names do not overlap.
_, box_ax = plt.subplots(figsize=(20, 6))
box = X_locally.boxplot(ax=box_ax, rot=90)

# Per-column detail: narrow boxplot on the left, histogram on the right.
for name in X_locally.columns:
    fig, axes = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(9, 5))
    X_locally.boxplot(column=name, ax=axes[0])
    X_locally.hist(column=name, ax=axes[1])
plt.show()