In [21]:
import numpy as np
import pandas as pd
from scipy.stats import norm
pdf = norm.pdf

In [4]:
# Toy sample: eight people described by height, weight and shoe size,
# labelled 'm' (first four rows) or 'f' (last four rows).
_sample_columns = {
    'height': [6, 5.92, 5.58, 5.92, 5, 5.5, 5.42, 5.75],
    'weight': [180, 190, 170, 165, 100, 150, 130, 150],
    'shoe_size': [12, 11, 12, 10, 6, 8, 7, 9],
    'gender': ['m', 'm', 'm', 'm', 'f', 'f', 'f', 'f'],
}
# Column order is passed explicitly so it never depends on dict ordering.
data = pd.DataFrame(
    _sample_columns,
    columns=['height', 'weight', 'shoe_size', 'gender'],
)

In [5]:
data

Unnamed: 0,height,weight,shoe_size,gender
0,6.0,180,12,m
1,5.92,190,11,m
2,5.58,170,12,m
3,5.92,165,10,m
4,5.0,100,6,f
5,5.5,150,8,f
6,5.42,130,7,f
7,5.75,150,9,f


In [28]:
x = data[['height', 'weight', 'shoe_size']].values

In [11]:
x

array([[   6.  ,  180.  ,   12.  ],
       [   5.92,  190.  ,   11.  ],
       [   5.58,  170.  ,   12.  ],
       [   5.92,  165.  ,   10.  ],
       [   5.  ,  100.  ,    6.  ],
       [   5.5 ,  150.  ,    8.  ],
       [   5.42,  130.  ,    7.  ],
       [   5.75,  150.  ,    9.  ]])

In [34]:
x.shape

(8, 3)

In [12]:
y = data['gender'].values

In [24]:
y == 'm'

array([ True,  True,  True,  True, False, False, False, False], dtype=bool)

In [18]:
np.unique(y)

array(['f', 'm'], dtype=object)

In [22]:
# NOTE(review): `distr` is not referenced by any other visible cell —
# presumably leftover scratch from exploring `norm`; confirm before removing.
distr=norm(loc=2, scale=0.5)

In [334]:
class Bayes():
    """Gaussian naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal distribution
    whose mean and sample standard deviation (``ddof=1``) are estimated in
    ``fit``. A sample's score for a class is the class prior multiplied by the
    product of the per-feature normal densities (an unnormalised posterior).

    Parameters
    ----------
    priors : dict or None, optional
        Mapping ``class label -> prior probability``. When falsy (``None`` or
        empty), priors are estimated from class frequencies on every ``fit``.

    Attributes
    ----------
    classes : ndarray
        Sorted unique class labels seen in the last ``fit``.
    means_dict, stds_dict : dict
        ``class label -> 1-D array`` of per-feature means / standard deviations.
    priors : dict
        Priors in use (supplied by the caller or estimated by ``fit``).
    scores : ndarray or None
        Scores from the most recent ``score``/``predict`` call, shape
        ``(n_samples, n_classes)``; ``None`` before the first ``fit``.
    """

    def __init__(self, priors=None):
        self.means_dict = {}
        self.stds_dict = {}
        self.priors = priors
        self.classes = []
        self.scores = None
        # The exact dict object fit() estimated, kept so a later fit() can tell
        # its own estimate apart from priors supplied by the caller.
        self._estimated_priors = None

    def _create_class_stats_dict(self, x_data, y_data):
        """Store per-class feature means and sample standard deviations."""
        for class_ in self.classes:
            class_rows = x_data[y_data == class_]
            self.means_dict[class_] = np.mean(class_rows, axis=0)
            self.stds_dict[class_] = np.std(class_rows, axis=0, ddof=1)

    def fit(self, x_data, y_data):
        """Estimate per-class feature statistics (and priors, if not supplied).

        Parameters
        ----------
        x_data : array-like of shape (n_samples, n_features)
            Numeric feature matrix; ndarrays, nested lists and DataFrames are
            accepted and converted to a float ndarray.
        y_data : array-like of shape (n_samples,)
            Class label for each row of ``x_data``.

        Raises
        ------
        ValueError
            If ``x_data`` is not 2-D or its row count differs from ``y_data``.
        """
        # Work on plain ndarrays so the stored statistics are positionally
        # indexable regardless of the container the caller passed in.
        x_data = np.asarray(x_data, dtype=float)
        y_data = np.asarray(y_data)
        if x_data.ndim != 2:
            raise ValueError("x_data must be 2-D (n_samples, n_features)")
        if x_data.shape[0] != y_data.shape[0]:
            raise ValueError("x_data and y_data must have the same number of rows")

        self.classes, counts = np.unique(y_data, return_counts=True)
        # Drop statistics from any previous fit before recomputing.
        self.means_dict = {}
        self.stds_dict = {}
        self._create_class_stats_dict(x_data, y_data)
        self.scores = np.empty(shape=(0, len(self.classes)))

        # Re-estimate when no priors were given, or when the current priors are
        # the ones a previous fit() estimated (they would be stale otherwise).
        if not self.priors or self.priors is self._estimated_priors:
            self._estimated_priors = dict(zip(self.classes, counts / counts.sum()))
            self.priors = self._estimated_priors

    def score(self, data):
        """Return the unnormalised posterior of every sample for every class.

        Parameters
        ----------
        data : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Columns follow the order of ``self.classes``. The same array is
            stored in ``self.scores``, replacing the previous call's result.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``data`` is non-empty and not 2-D, or has the wrong number of
            features.
        """
        n_classes = len(self.classes)
        if n_classes == 0:
            raise RuntimeError("Bayes.score called before fit")

        samples = np.asarray(data, dtype=float)
        if samples.size == 0:
            self.scores = np.empty(shape=(0, n_classes))
            return self.scores
        if samples.ndim != 2:
            raise ValueError("data must be 2-D (n_samples, n_features)")

        means = np.array([self.means_dict[class_] for class_ in self.classes], dtype=float)
        stds = np.array([self.stds_dict[class_] for class_ in self.classes], dtype=float)
        priors = np.array([self.priors[class_] for class_ in self.classes], dtype=float)
        if samples.shape[1] != means.shape[1]:
            raise ValueError("data has %d features, model was fitted with %d"
                             % (samples.shape[1], means.shape[1]))

        # Broadcast (n_samples, 1, n_features) against (n_classes, n_features)
        # to get every sample/class/feature density in one call.
        densities = norm.pdf(samples[:, np.newaxis, :], loc=means, scale=stds)
        self.scores = priors * densities.prod(axis=2)
        return self.scores

    def predict(self, data):
        """Return the highest-scoring class label for each row of ``data``.

        Parameters
        ----------
        data : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Labels taken from ``self.classes``.
        """
        scores = self.score(data)
        return self.classes[np.argmax(scores, axis=1)]
    

In [335]:
b = Bayes()

In [336]:
b.fit(x, y)

In [337]:
b.predict([[6, 130, 8],
         [5, 122, 4],
         [6, 222, 4]])

array(['f', 'f', 'f'],
      dtype='<U1')

In [338]:
import seaborn as sns

In [339]:
import pandas as pd

In [340]:
iris = sns.load_dataset('iris')

In [341]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [342]:
y_iris = iris['species']

In [343]:
# Rebind rather than mutate in place, and tolerate an already-removed column,
# so re-running this cell does not raise KeyError.
iris = iris.drop('species', axis=1, errors='ignore')

In [344]:
from sklearn.model_selection import train_test_split

In [345]:
X_train, X_test, y_train, y_test = train_test_split(iris, y_iris, test_size=.1, random_state=42)

In [346]:
b = Bayes()

In [347]:
b.fit(X_train, y_train)

In [348]:
res = b.predict(X_test.values)

In [349]:
y_test.values == res

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True], dtype=bool)