In [1]:
import numpy as np

In [2]:
# build train set
mul1, sigma1 = [1, 0], [[1, 0.75], [0.75, 1]]
mul2, sigma2 = [0, 1], [[1, 0.75], [0.75, 1]]
size = 500

train_data2D_1 = np.random.multivariate_normal(mean=mul1, cov=sigma1, size=size)
train_data2D_1_label = np.zeros((size, 1))
print(train_data2D_1_label.shape)

train_data2D_2 = np.random.multivariate_normal(mean=mul2, cov=sigma2, size=size)
train_data2D_2_label = np.ones((size, 1))
X_train = np.vstack([train_data2D_1, train_data2D_2])
y_train = np.vstack([train_data2D_1_label, train_data2D_2_label])
print(X_train[:5], X_train[-5:])
print(y_train[:5], y_train[-5:])

(500, 1)
[[ 1.77908893  1.22007509]
 [ 2.75658475  3.15790226]
 [ 1.51467271  0.18760104]
 [ 0.22450054 -0.67231769]
 [-0.06588937 -0.94176076]] [[ 1.83223457  0.93511325]
 [ 0.07420417  0.91668364]
 [-0.58189593  0.3754807 ]
 [-0.84756133  1.58579771]
 [-1.54280439  0.4474047 ]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]] [[1.]
 [1.]
 [1.]
 [1.]
 [1.]]


In [3]:
# build test set
mul1, sigma1 = [1, 0], [[1, 0.75], [0.75, 1]]
mul2, sigma2 = [0, 1], [[1, 0.75], [0.75, 1]]
size = 500

test_data2D_1 = np.random.multivariate_normal(mean=mul1, cov=sigma1, size=size)
test_data2D_1_label = np.zeros((size, 1))
print(test_data2D_1_label.shape)

test_data2D_2 = np.random.multivariate_normal(mean=mul2, cov=sigma2, size=size)
test_data2D_2_label = np.ones((size, 1))
X_test = np.vstack([test_data2D_1, test_data2D_2])
y_test = np.vstack([test_data2D_1_label, test_data2D_2_label])
print(X_test[:5], X_test[-5:])
print(y_test[:5], y_test[-5:])

(500, 1)
[[ 0.11914754 -0.41073085]
 [ 0.96742746 -0.74897948]
 [ 1.00061425 -1.85875687]
 [ 3.47030771  2.65788671]
 [-0.4764433  -2.12404637]] [[-0.37098839  0.29117371]
 [-0.58536361  0.87804696]
 [ 1.69529099  1.73701308]
 [ 0.28342398 -0.26921007]
 [ 1.63006445  2.44321577]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]] [[1.]
 [1.]
 [1.]
 [1.]
 [1.]]


In [15]:
import pandas as pd
import numpy as np
print(pd.__version__)

def myNB(X, y, X_test, y_test):    
    def norm(x, mean, std):
        return 1/(np.sqrt(2*np.pi)*std)*np.exp(-(x-mean)**2/2*(std**2))
        
    X_train, y_train = pd.DataFrame(X, columns=['Feature1', 'Feature2']), pd.DataFrame(y, columns=['Class'])
    X_test, y_test = pd.DataFrame(X_test, columns=['Feature1', 'Feature2']), pd.DataFrame(y_test, columns=['Class'])
    y_unique = y_train['Class'].unique()
    train_set = pd.concat([X_train, y_train], axis=1)
    prior = np.zeros(len(y_unique))
    conditional = np.reshape(np.zeros(len(y_unique)*len(X_train.columns)*2), (len(y_unique), len(X_train.columns), 2))
    

    for i in range(0,len(y_unique)):
        prior[i]=(sum(y_train['Class']==y_unique[i])+1)/(len(y_train['Class'])+len(y_unique))
#     print("The prior probability of each class is: ", prior)

    for i, h in enumerate(X_train.columns.values.tolist()):
        for j in range(0,len(y_unique)):
            class_feature = train_set[h].loc[(train_set['Class']==y_unique[j])]
            mean = np.mean(class_feature)
            var = np.std(class_feature)
            conditional[i][j] = [mean, var]
#             print("mean and standard variance of current feature is: ", h, j, mean, var)
    
#     print(conditional)
#     print("Prior distribution is: ", prior)
    # prediction
    pred_probs = []
    for idx, row in X_test.iterrows():
        probs = []
        for cIdx, pri in enumerate(prior): # class 0 1
            for fIdx, feat in enumerate(row): # feature 0 1
                pri *= norm(feat, conditional[fIdx][cIdx][0], conditional[fIdx][cIdx][1])
            probs.append(pri)
        pred_probs.append(probs)
    pred = [np.argmax(p) for p in pred_probs]
    # calculate error rate
#     print(np.array(pred).shape)
#     print(y_test.to_numpy().shape)
    err = np.mean(np.array(pred)!=y_test.to_numpy().squeeze())
#     print(err)
    
    return pred, conditional, err
    
# pred, posterior, error = myNB(X_train, y_train, X_test, y_test)
pred, posterior, err = myNB(X_train, y_train, X_test, y_test)

1.0.3


In [17]:
# perform the experiments 10 times
avg_err, run_time = 0, 10
for i in range(run_time):
    train_data2D_1 = np.random.multivariate_normal(mean=mul1, cov=sigma1, size=size)
    train_data2D_1_label = np.zeros((size, 1))

    train_data2D_2 = np.random.multivariate_normal(mean=mul2, cov=sigma2, size=size)
    train_data2D_2_label = np.ones((size, 1))
    X_train = np.vstack([train_data2D_1, train_data2D_2])
    y_train = np.vstack([train_data2D_1_label, train_data2D_2_label])
    
    test_data2D_1 = np.random.multivariate_normal(mean=mul1, cov=sigma1, size=size)
    test_data2D_1_label = np.zeros((size, 1))

    test_data2D_2 = np.random.multivariate_normal(mean=mul2, cov=sigma2, size=size)
    test_data2D_2_label = np.ones((size, 1))
    X_test = np.vstack([test_data2D_1, test_data2D_2])
    y_test = np.vstack([test_data2D_1_label, test_data2D_2_label])
    
    pred, posterior, err = myNB(X_train, y_train, X_test, y_test)
    print("current time err: ", err)
    avg_err += err/run_time

print(avg_err)


current time err:  0.094
current time err:  0.084
current time err:  0.105
current time err:  0.107
current time err:  0.086
current time err:  0.085
current time err:  0.079
current time err:  0.083
current time err:  0.081
current time err:  0.087
0.0891


In [25]:
from sklearn import metrics
train_data2D_1 = np.random.multivariate_normal(mean=mul1, cov=sigma1, size=size)
train_data2D_1_label = np.zeros((size, 1))

train_data2D_2 = np.random.multivariate_normal(mean=mul2, cov=sigma2, size=size)
train_data2D_2_label = np.ones((size, 1))
X_train = np.vstack([train_data2D_1, train_data2D_2])
y_train = np.vstack([train_data2D_1_label, train_data2D_2_label])

test_data2D_1 = np.random.multivariate_normal(mean=mul1, cov=sigma1, size=size)
test_data2D_1_label = np.zeros((size, 1))

test_data2D_2 = np.random.multivariate_normal(mean=mul2, cov=sigma2, size=size)
test_data2D_2_label = np.ones((size, 1))
X_test = np.vstack([test_data2D_1, test_data2D_2])
y_test = np.vstack([test_data2D_1_label, test_data2D_2_label])

pred, posterior, err = myNB(X_train, y_train, X_test, y_test)

acc = metrics.accuracy_score(np.array(pred), y_test.squeeze())
print(acc)
precision = metrics.precision_score(np.array(pred), y_test.squeeze())
print(precision)
recall = metrics.recall_score(np.array(pred), y_test.squeeze())
print(recall)

0.913
0.928
0.9009708737864077
