In [220]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import fetch_openml, load_iris, load_breast_cancer, fetch_20newsgroups
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator
from scipy import stats
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

iris_dataset = load_iris()

X = iris_dataset.data
y = iris_dataset.target

## 5 Gaussowski Naiwny Klasyfikator Bayesa

### Zadanie 1

In [221]:
class Bayes(BaseEstimator):
    def __init__(self, *, param=1):
        self.param = param
        self.C = []
        self.amt_of_C = []
        self.Cpriors = []
        self.mean = []
        self.std = []

    def fit(self, X_train, y_train=None):
        self.C = np.unique(y_train)
        for c in self.C:
            self.amt_of_C.append(np.sum(y_train == c))

        self.Cpriors = [c / len(y_train) for c in self.amt_of_C]

        self.mean = []
        self.std = []

        for c in self.C:
            X = X_train[y_train == c]
            self.mean.append(np.mean(X, axis=0))
            self.std.append(np.std(X, axis=0))

        return self
    
    def predict(self, X):
        post_arg = []
        
        for row in X:

            max_posteriori_idx = -1
            max_posteriori = -np.inf
            for c_idx, c in enumerate(self.C):

                posteriori = 0
                for x_idx, x in enumerate(row):
                    mean = self.mean[c_idx][x_idx]
                    std = self.std[c_idx][x_idx]
                    
                    posteriori += np.log(self.likelihood(x, mean, std))
                posteriori += np.log(self.Cpriors[c_idx])

                if posteriori > max_posteriori:
                    max_posteriori = posteriori
                    max_posteriori_idx = c_idx
            
            post_arg.append(max_posteriori_idx)
        
        return post_arg

    
    def likelihood(self, x, mean, std):
        return np.exp(-((x - mean) ** 2 / (2 * std ** 2))) / (np.sqrt(2 * np.pi * std))

### Zadanie 3

In [222]:
Bayesian_scores = {
    'accuracy': [],
    'f1': [],
    'precision': []
}

GaussianNB_scores = {
    'accuracy': [],
    'f1': [],
    'precision': []
}

for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=i+np.random.randint(0, 100))
    clf_Bayes = Bayes()
    clf_Bayes.fit(X_train, y_train)
    y_pred = clf_Bayes.predict(X_test)
    Bayesian_scores["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    Bayesian_scores["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    Bayesian_scores["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    clf_GaussianNB = GaussianNB()
    clf_GaussianNB.fit(X_train, y_train)
    y_pred = clf_GaussianNB.predict(X_test)
    GaussianNB_scores["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    GaussianNB_scores["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    GaussianNB_scores["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

print("Bayesian")
print("Accuracy: ", np.mean(Bayesian_scores["accuracy"]))
print("F1: ", np.mean(Bayesian_scores["f1"]))
print("Precision: ", np.mean(Bayesian_scores["precision"]))
print("")
print("GaussianNB")
print("Accuracy: ", np.mean(GaussianNB_scores["accuracy"]))
print("F1: ", np.mean(GaussianNB_scores["f1"]))
print("Precision: ", np.mean(GaussianNB_scores["precision"]))



Bayesian
Accuracy:  0.9549999999999998
F1:  0.9551457882329567
Precision:  0.9601374236879531

GaussianNB
Accuracy:  0.9574999999999998
F1:  0.9575988722597664
Precision:  0.9621128185866172


  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))


### Zadanie 4

In [223]:
breast_cancer_data = load_breast_cancer()
X = breast_cancer_data.data
y = breast_cancer_data.target
X = np.where(X == 0, 1e-6, X)

In [224]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'cosine']

for kernel_name in kernels:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i+np.random.randint(0, 100))
    kernel_pca = KernelPCA(n_components=2, kernel=kernel_name)
    clf_Bayes = Bayes()
    clf_GaussianNB = GaussianNB()
    clf_RandomForest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

    X_train_pca = kernel_pca.fit_transform(X_train)
    X_test_pca = kernel_pca.transform(X_test)

    print("========== Kernel: ", kernel_name, " ==========")
    # KernelPCA Bayes
    print("Bayes:")
    clf_Bayes.fit(X_train_pca, y_train)
    y_pred = clf_Bayes.predict(X_test_pca)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("F1: ", metrics.f1_score(y_test, y_pred, average='weighted'))
    print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))

    # KernelPCA GaussianNB
    print("GaussianNB:")
    clf_GaussianNB.fit(X_train_pca, y_train)
    y_pred = clf_GaussianNB.predict(X_test_pca)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("F1: ", metrics.f1_score(y_test, y_pred, average='weighted'))
    print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))

    # KernelPCA RandomForest
    print("RandomForest:")
    clf_RandomForest.fit(X_train_pca, y_train)
    y_pred = clf_RandomForest.predict(X_test_pca)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("F1: ", metrics.f1_score(y_test, y_pred, average='weighted'))
    print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))


Bayes:
Accuracy:  0.9181286549707602
F1:  0.9187853107344632
Precision:  0.9205166418281173
GaussianNB:
Accuracy:  0.9122807017543859
F1:  0.9112099854313502
Precision:  0.9119263089851325
RandomForest:
Accuracy:  0.9181286549707602
F1:  0.9181286549707602
Precision:  0.9181286549707602
Bayes:
Accuracy:  0.935672514619883
F1:  0.9347065550615989
Precision:  0.937305276486563
GaussianNB:
Accuracy:  0.9122807017543859
F1:  0.9094968393144929
Precision:  0.9193499500784482
RandomForest:
Accuracy:  0.935672514619883
F1:  0.935294360053832
Precision:  0.935549381030646


  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Bayes:
Accuracy:  0.6666666666666666
F1:  0.5333333333333333
Precision:  0.4444444444444444
GaussianNB:
Accuracy:  0.6666666666666666
F1:  0.5333333333333333
Precision:  0.4444444444444444
RandomForest:
Accuracy:  0.6666666666666666
F1:  0.5333333333333333
Precision:  0.4444444444444444
Bayes:
Accuracy:  0.0
F1:  0.0
Precision:  0.0
GaussianNB:
Accuracy:  0.3567251461988304
F1:  0.18758822343214357
Precision:  0.12725282993057693
RandomForest:
Accuracy:  0.6432748538011696
F1:  0.503631558136147
Precision:  0.41380253753291607
Bayes:
Accuracy:  0.8830409356725146
F1:  0.8830409356725146
Precision:  0.8830409356725146
GaussianNB:
Accuracy:  0.8947368421052632
F1:  0.893730407523511
Precision:  0.8938279501119285
RandomForest:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return np.exp(-((x - mean) ** 2 / (2 * std ** 2))) / (np.sqrt(2 * np.pi * std))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.9064327485380117
F1:  0.9055381400208987
Precision:  0.9057669758021533


Kernel 'poly' daje najlepsze rezultaty.

In [225]:
choosen_kernel = 'poly'

In [226]:
Bayesian_scores = {
    'basic': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'scaler': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'pca': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'kernel_pca': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'box_cox': {
        'accuracy': [],
        'f1': [],
        'precision': []
    }
}

GaussianNB_scores = {
    'basic': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'scaler': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'pca': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'kernel_pca': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'box_cox': {
        'accuracy': [],
        'f1': [],
        'precision': []
    }
}

RandomForest_scores = {
    'basic': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'scaler': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'pca': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'kernel_pca': {
        'accuracy': [],
        'f1': [],
        'precision': []
    },
    'box_cox': {
        'accuracy': [],
        'f1': [],
        'precision': []
    }
}

for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i+np.random.randint(0, 100))
    
    clf_Bayes = Bayes()
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    kernel_pca = KernelPCA(n_components=2, kernel=choosen_kernel)
    boxcox = PowerTransformer(method="box-cox", standardize=True)
    clf_GaussianNB = GaussianNB()
    clf_RandomForest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    
    X_train_scaler = scaler.fit_transform(X_train)
    X_test_scaler = scaler.transform(X_test)

    X_train_kernel_pca = kernel_pca.fit_transform(X_train)
    X_test_kernel_pca = kernel_pca.transform(X_test)
    
    X_train_box_cox = boxcox.fit_transform(X_train)
    X_test_box_cox = boxcox.transform(X_test)

    # Basic Bayes
    clf_Bayes.fit(X_train, y_train)
    y_pred = clf_Bayes.predict(X_test)
    Bayesian_scores["basic"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    Bayesian_scores["basic"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    Bayesian_scores["basic"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # Scaler Bayes
    clf_Bayes.fit(X_train_scaler, y_train)
    y_pred = clf_Bayes.predict(X_test_scaler)
    Bayesian_scores["scaler"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    Bayesian_scores["scaler"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    Bayesian_scores["scaler"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # PCA Bayes
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    clf_Bayes.fit(X_train_pca, y_train)
    y_pred = clf_Bayes.predict(X_test_pca)
    Bayesian_scores["pca"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    Bayesian_scores["pca"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    Bayesian_scores["pca"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # KernelPCA Bayes
    clf_Bayes.fit(X_train_kernel_pca, y_train)
    y_pred = clf_Bayes.predict(X_test_kernel_pca)
    Bayesian_scores["kernel_pca"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    Bayesian_scores["kernel_pca"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    Bayesian_scores["kernel_pca"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # Box-Cox Bayes
    clf_Bayes.fit(X_train_box_cox, y_train)
    y_pred = clf_Bayes.predict(X_test_box_cox)
    Bayesian_scores["box_cox"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    Bayesian_scores["box_cox"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    Bayesian_scores["box_cox"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # Basic GaussianNB
    clf_GaussianNB.fit(X_train, y_train)
    y_pred = clf_GaussianNB.predict(X_test)
    GaussianNB_scores["basic"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    GaussianNB_scores["basic"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    GaussianNB_scores["basic"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # Scaler GaussianNB
    clf_GaussianNB.fit(X_train_scaler, y_train)
    y_pred = clf_GaussianNB.predict(X_test_scaler)
    GaussianNB_scores["scaler"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    GaussianNB_scores["scaler"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    GaussianNB_scores["scaler"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # PCA GaussianNB
    clf_GaussianNB.fit(X_train_pca, y_train)
    y_pred = clf_GaussianNB.predict(X_test_pca)
    GaussianNB_scores["pca"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    GaussianNB_scores["pca"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    GaussianNB_scores["pca"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # KernelPCA GaussianNB
    clf_GaussianNB.fit(X_train_kernel_pca, y_train)
    y_pred = clf_GaussianNB.predict(X_test_kernel_pca)
    GaussianNB_scores["kernel_pca"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    GaussianNB_scores["kernel_pca"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    GaussianNB_scores["kernel_pca"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # Box-Cox GaussianNB
    clf_GaussianNB.fit(X_train_box_cox, y_train)
    y_pred = clf_GaussianNB.predict(X_test_box_cox)
    GaussianNB_scores["box_cox"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    GaussianNB_scores["box_cox"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    GaussianNB_scores["box_cox"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # Basic RandomForest
    clf_RandomForest.fit(X_train, y_train)
    y_pred = clf_RandomForest.predict(X_test)
    RandomForest_scores["basic"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    RandomForest_scores["basic"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    RandomForest_scores["basic"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # Scaler RandomForest
    clf_RandomForest.fit(X_train_scaler, y_train)
    y_pred = clf_RandomForest.predict(X_test_scaler)
    RandomForest_scores["scaler"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    RandomForest_scores["scaler"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    RandomForest_scores["scaler"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # PCA RandomForest
    clf_RandomForest.fit(X_train_pca, y_train)
    y_pred = clf_RandomForest.predict(X_test_pca)
    RandomForest_scores["pca"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    RandomForest_scores["pca"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    RandomForest_scores["pca"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # KernelPCA RandomForest
    clf_RandomForest.fit(X_train_kernel_pca, y_train)
    y_pred = clf_RandomForest.predict(X_test_kernel_pca)
    RandomForest_scores["kernel_pca"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    RandomForest_scores["kernel_pca"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    RandomForest_scores["kernel_pca"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))

    # Box-Cox RandomForest
    clf_RandomForest.fit(X_train_box_cox, y_train)
    y_pred = clf_RandomForest.predict(X_test_box_cox)
    RandomForest_scores["box_cox"]["accuracy"].append(metrics.accuracy_score(y_test, y_pred))
    RandomForest_scores["box_cox"]["f1"].append(metrics.f1_score(y_test, y_pred, average='weighted'))
    RandomForest_scores["box_cox"]["precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))



for (Bayesian, GNB, RandomForest) in zip(Bayesian_scores.items(), GaussianNB_scores.items(), RandomForest_scores.items()):
    print(Bayesian[0])
    for key in "accuracy f1 precision".split():
        print("\t====== ", key, " ======")
        print("\tBayesian: ", np.mean(Bayesian[1][key]))
        print("\tGaussianNB: ", np.mean(GNB[1][key]))
        print("\tRandomForest: ", np.mean(RandomForest[1][key]))


  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.likelihood(x, mean, std))
  posteriori += np.log(self.

basic
	Bayesian:  0.9362573099415205
	GaussianNB:  0.9394736842105266
	RandomForest:  0.941812865497076
	Bayesian:  0.936788619034685
	GaussianNB:  0.9391534944678996
	RandomForest:  0.941536669405469
	Bayesian:  0.940089005257612
	GaussianNB:  0.9404064997230502
	RandomForest:  0.9431895905566205
scaler
	Bayesian:  0.9362573099415205
	GaussianNB:  0.9350877192982457
	RandomForest:  0.941812865497076
	Bayesian:  0.936788619034685
	GaussianNB:  0.9349647937459571
	RandomForest:  0.941536669405469
	Bayesian:  0.940089005257612
	GaussianNB:  0.936066592511063
	RandomForest:  0.9431895905566205
pca
	Bayesian:  0.8973684210526317
	GaussianNB:  0.904093567251462
	RandomForest:  0.9192982456140351
	Bayesian:  0.8973128920232819
	GaussianNB:  0.9017915475724674
	RandomForest:  0.9179392074062456
	Bayesian:  0.8989062020681112
	GaussianNB:  0.9085606190915962
	RandomForest:  0.9215769327091916
kernel_pca
	Bayesian:  0.9011695906432748
	GaussianNB:  0.888888888888889
	RandomForest:  0.9081871345

## 6 Naiwny Klasyfikator Bayesa z rozkładem Bernoulliego

In [227]:
class BayesBernoulli(BaseEstimator):
    def __init__(self, *, param=1):
        self.param = param
        self.C = []
        self.amt_of_C = []
        self.Cpriors = []
        self.pxC = []

    def fit(self, X_train, y_train=None):
        self.C = np.unique(y_train)
        for c in self.C:
            self.amt_of_C.append(np.sum(y_train == c))

        self.Cpriors = [ (c / len(y_train)).astype(np.float64) for c in self.amt_of_C]
        self.pxC = [ [] for _ in range(len(self.C)) ]

        for idx_c, c in enumerate(self.C):
            XC = X_train[y_train == c].astype(np.float64)
            self.pxC[idx_c] = (XC.sum(axis=0).astype(np.float64) / XC.shape[0])
            self.pxC[idx_c] = np.squeeze(self.pxC[idx_c])
            self.pxC[idx_c] = np.where(self.pxC[idx_c] == 0, 1, self.pxC[idx_c]).astype(np.float64)
            self.pxC[idx_c] = np.squeeze(self.pxC[idx_c])

        return self

    def predict(self, X):
        post_arg = []
        
        for row in X:
            max_posteriori_idx = -1
            max_posteriori = -np.inf
            for c_idx, c in enumerate(self.C):

                posteriori = 1
                for x_idx, x in enumerate(row):
                    posteriori *= self.pxC[c_idx][x_idx]
                
                posteriori *= self.Cpriors[c_idx]
                posteriori = posteriori.astype(np.float64)
                
                if posteriori > max_posteriori:
                    max_posteriori = posteriori
                    max_posteriori_idx = c_idx
            
            post_arg.append(self.C[max_posteriori_idx])
        
        return post_arg


In [228]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

cv = CountVectorizer(binary=True)
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

X_train, X_test, y_train, y_test = newsgroups_train.data, newsgroups_test.data, newsgroups_train.target, newsgroups_test.target
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [229]:
BBnb = BayesBernoulli()
BBnb.fit(X_train, y_train)
y_pred_bbnb = BBnb.predict(X_test)

print(classification_report(y_test, y_pred_bbnb))
print(sum(y_pred_bbnb == y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       319
           1       0.00      0.00      0.00       389
           2       0.00      0.00      0.00       394
           3       0.00      0.00      0.00       392
           4       0.00      0.00      0.00       385
           5       0.00      0.00      0.00       395
           6       0.05      1.00      0.10       390
           7       0.00      0.00      0.00       396
           8       0.00      0.00      0.00       398
           9       0.00      0.00      0.00       397
          10       0.00      0.00      0.00       399
          11       0.00      0.00      0.00       396
          12       0.00      0.00      0.00       393
          13       0.00      0.00      0.00       396
          14       0.00      0.00      0.00       394
          15       0.00      0.00      0.00       398
          16       0.00      0.00      0.00       364
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [230]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB(binarize=0.0)
model = bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)

print(classification_report(y_test, y_pred))
print(sum(y_pred == y_test))

              precision    recall  f1-score   support

           0       0.92      0.32      0.47       319
           1       0.58      0.63      0.61       389
           2       0.33      0.01      0.01       394
           3       0.43      0.81      0.56       392
           4       0.64      0.76      0.70       385
           5       0.84      0.61      0.70       395
           6       0.30      0.93      0.45       390
           7       0.67      0.78      0.72       396
           8       0.74      0.91      0.82       398
           9       0.77      0.87      0.82       397
          10       0.99      0.83      0.90       399
          11       0.82      0.69      0.75       396
          12       0.57      0.67      0.62       393
          13       0.84      0.52      0.64       396
          14       0.88      0.68      0.77       394
          15       0.53      0.80      0.64       398
          16       0.74      0.57      0.64       364
          17       0.96    

## 7 Naiwny Klasyfikator Bayesa dla zbioru danych Adult Income

In [231]:
adult = fetch_openml("adult", version=2)  
X = adult.data
y = adult.target

In [232]:
import pandas as pd

df_adult_X = pd.DataFrame(X, columns=adult.feature_names)
print(df_adult_X.head())

   age  workclass  fnlwgt     education  education-num      marital-status  \
0   25    Private  226802          11th              7       Never-married   
1   38    Private   89814       HS-grad              9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm             12  Married-civ-spouse   
3   44    Private  160323  Some-college             10  Married-civ-spouse   
4   18        NaN  103497  Some-college             10       Never-married   

          occupation relationship   race     sex  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                NaN    Own-child  White  Female             0             0   

   hours-per-week native-country  
0              

In [233]:
df_adult_y = pd.DataFrame(y)
df_adult_y.rename(columns={'class': 'income'}, inplace=True)
print(df_adult_y.income.value_counts())

income
<=50K    37155
>50K     11687
Name: count, dtype: int64


In [234]:
import sklearn

le = sklearn.preprocessing.LabelEncoder()
le.fit(df_adult_y)
df_adult_y = le.transform(df_adult_y)
print(df_adult_y)

[0 0 1 ... 0 0 1]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [235]:
feature_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", 
                 "capital-gain", "capital-loss","hours-per-week", "native-country"]

categorical_features_names = ["workclass", "education", "marital-status", "occupation", "relationship", 'race', 'sex', 'native-country']
categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]

In [236]:
feature_classes = {}
for feature in categorical_features_names:
    le = sklearn.preprocessing.LabelEncoder()
    le.fit(df_adult_X[feature])
    df_adult_X[feature] = le.transform(df_adult_X[feature])
    feature_classes[feature] = le.classes_

In [237]:
print(df_adult_X.head())

   age  workclass  fnlwgt  education  education-num  marital-status  \
0   25          3  226802          1              7               4   
1   38          3   89814         11              9               2   
2   28          1  336951          7             12               2   
3   44          3  160323         15             10               2   
4   18          8  103497         15             10               4   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0           6             3     2    1             0             0   
1           4             0     4    1             0             0   
2          10             0     4    1             0             0   
3           6             0     2    1          7688             0   
4          14             3     4    0             0             0   

   hours-per-week  native-country  
0              40              38  
1              50              38  
2              40              38  
3       

In [238]:
df_adult_X2 = df_adult_X.copy()

for col in categorical_features_names:
    df_adult_X2[col] = df_adult_X2[col].astype(str).fillna('missing')
    

In [239]:
for col in feature_names:
    if col not in categorical_features_names:
        df_adult_X2[col] = df_adult_X2[col].astype(float).fillna(0)

In [240]:
X_train, X_test, y_train, y_test = train_test_split(df_adult_X2, df_adult_y, test_size=0.3, random_state=np.random.randint(0, 100))

In [241]:
clf_GaussianNB = GaussianNB()
clf_GaussianNB.fit(X_train, y_train)
y_pred = clf_GaussianNB.predict(X_test)
print("GaussianNB")
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))


GaussianNB
Accuracy:  0.7957414863850406
F1:  0.7655591197807137
Precision:  0.7761256922788576
