# In [20]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd
from pandas import DataFrame

#Per-feature mean and variance of a block of samples
def update_mean_variance(x):
    """Return the column-wise mean and variance of ``x``.

    Both statistics are reduced along axis 0 (one value per column).
    ``np.var`` is called with its default ``ddof``, i.e. the population
    variance rather than the sample variance.

    Args:
        x: 2-D array-like of samples (rows) by features (columns).

    Returns:
        A ``(mean, variance)`` tuple, each holding one value per column.
    """
    return np.mean(x, axis=0), np.var(x, axis=0)

#Estimate the per-category statistics from the training data
def fit(X_train, y_train):
    """Compute per-category feature means, variances and priors.

    Categories are the sorted unique values of ``y_train`` (as returned by
    ``np.unique``); row ``k`` of each returned array belongs to the k-th
    category in that order.

    Args:
        X_train: 2-D samples-by-features data that supports boolean-mask
            row selection (e.g. a DataFrame or ndarray).
        y_train: Labels, one per row of ``X_train``.

    Returns:
        ``(means, variances, category_ratios)`` where the first two are
        arrays of shape ``(n_categories, n_features)`` and the last is a
        list with the fraction of training rows belonging to each category.
    """
    categories = np.unique(y_train)
    stats_shape = (len(categories), X_train.shape[1])
    means = np.zeros(stats_shape)
    variances = np.zeros(stats_shape)
    category_ratios = []
    # np.unique returns sorted, distinct labels, so the enumerate position is
    # exactly the row that belongs to each label.
    for row, label in enumerate(categories):
        members = X_train[y_train == label]
        share = members.shape[0]/X_train.shape[0]
        means[row, :], variances[row, :] = update_mean_variance(members)
        category_ratios.append(share)
    return means, variances, category_ratios

#Gaussian (normal) probability density function
def gaussian(x, mean, variance):
    """Evaluate the normal density with the given mean and variance at ``x``.

    Computes ``1 / sqrt(2*pi*variance) * exp(-(x - mean)**2 / (2*variance))``.
    Arguments may be scalars or NumPy arrays; array arguments are combined
    element-wise.
    """
    normaliser = 1 / (np.sqrt(2 * np.pi * variance))
    exponent = -(np.square(x - mean) / (2 * variance))
    return normaliser * np.exp(exponent)

#The log-probability score of every row of data for one category
def probability_of_each_data(X_test, mean, variance,category_ratio):
    """Return the unnormalised log-posterior of each row for one category.

    For every row the score is ``log(category_ratio)`` plus the sum over
    features of the Gaussian log-density with the given per-feature ``mean``
    and ``variance``.

    Args:
        X_test: 2-D samples-by-features data (DataFrame or array-like).
            Rows are processed positionally, so the index labels of a
            DataFrame (including duplicated labels) do not matter.
        mean: Per-feature means of the category, one per column of X_test.
        variance: Per-feature variances of the category, one per column.
        category_ratio: Prior probability of the category.

    Returns:
        A list with one float per row of ``X_test``. Every entry is
        ``-inf`` when ``category_ratio`` is 0.
    """
    rows = np.asarray(X_test, dtype=float)

    # A category with zero prior is impossible. Its log-score must be -inf;
    # a score of 0 would mean probability 1 and could win the argmax.
    if category_ratio == 0:
        return [-np.inf] * rows.shape[0]

    mean = np.asarray(mean, dtype=float)
    variance = np.asarray(variance, dtype=float)

    # Evaluate the log-density directly instead of log(pdf(x)): the pdf
    # underflows to 0 for samples far from the mean, which would turn every
    # category's score into -inf and make the categories indistinguishable.
    # NOTE: a zero variance still produces inf/nan here, as it would with
    # the plain density formula.
    log_density = -0.5 * (np.log(2 * np.pi * variance)
                          + np.square(rows - mean) / variance)
    scores = np.log(category_ratio) + log_density.sum(axis=1)
    return scores.tolist()

#Score every row of data against every category
def probability_of_each_category(X_test, category_total, means, variances, category_ratios):
    """Build the rows-by-categories matrix of per-category scores.

    Args:
        X_test: Data to score, passed through to probability_of_each_data.
        category_total: Number of categories to evaluate; categories
            ``0 .. category_total - 1`` are used.
        means: 2-D array whose row ``k`` holds the feature means of category k.
        variances: 2-D array whose row ``k`` holds the feature variances.
        category_ratios: Sequence whose item ``k`` is the prior of category k.

    Returns:
        An array with one row per data row and one column per category.
    """
    per_category = [
        probability_of_each_data(X_test, means[k, :], variances[k, :], category_ratios[k])
        for k in range(category_total)
    ]
    # per_category is categories-by-rows; transpose so each data row is a row.
    return np.array(per_category).T

#Predict the category of every row of data
def predict(X_test, y_test, means, variances, category_ratios):
    """Print and return the most likely category for each row of X_test.

    Args:
        X_test: Data to classify.
        y_test: Labels whose sorted unique values name the categories.
            Despite the name, these must cover exactly the categories the
            statistics were computed for (the caller in this file passes
            the training labels), because row ``k`` of ``means`` is mapped
            to the k-th unique label.
        means: 2-D array of per-category feature means.
        variances: 2-D array of per-category feature variances.
        category_ratios: Per-category priors.

    Returns:
        Array with the predicted label for each row of ``X_test``. The same
        array is printed, as before.

    Raises:
        ValueError: If the number of unique labels differs from the number
            of rows in ``means``.
    """
    labels = np.unique(y_test)
    # A mismatch would either index past the fitted statistics or silently
    # attach the wrong label to a category, so fail loudly instead.
    if len(labels) != len(means):
        raise ValueError(
            "y_test has {} unique labels but means has {} categories".format(
                len(labels), len(means)))
    probability = probability_of_each_category(X_test, len(labels), means, variances, category_ratios)
    predictions = labels[np.argmax(probability, axis=1)]
    print(predictions)
    return predictions

if __name__ == '__main__':
    # Train on the four measurement columns of the iris CSV, using the
    # "Species" column as the label.
    class_features = ["Sepal.Length","Sepal.Width","Petal.Length","Petal.Width"]
    data = pd.read_csv('d:/iris.csv')
    X_train, y_train = data[class_features], data["Species"]
    means, variances, category_ratios = fit(X_train, y_train)

    # Classify a single hand-written sample. The training labels are passed
    # so predict can map each winning category index back to its label.
    sample = {'Sepal.Length':[3.1], 'Sepal.Width':[4.4], 'Petal.Length':[2.1], 'Petal.Width':[0.2]}
    X_test = pd.DataFrame(sample)
    predict(X_test, y_train, means, variances,category_ratios)

# Output of the cell above: ['setosa']
