# Problem 7.3
# Implemetation of Naïve Bayes Classifier with Laplacian Correction

In [80]:
import pandas as pd
import numpy as np
import scipy.stats

# 1. Conditional Probability Estimation

- The prior probability of a class can be estimated as
$$ \hat{P} (c) = \frac{|D_{c}| + 1}{|D|+ N} $$

- The conditional probability $ \hat{P}(x_{i}|c) $ can be estimated as
$$ \hat{P} (x_{i}|c) = \frac{|D_{c,x_{i}}|+1}{|D_{c}| + N_{i}} $$

or the PDF of the supposed distribution for continuous variables

In [33]:
def prior_prob(df_target, c):
    """
    This function returns the prior probaility of a class, c, after Laplacian Correction
    It has two inputs:
    1. df_target: the train df column of target
    2. c: value of class, c
    """
    return (sum([1 for e in df_target if e == c]) + 1)/(len(df_target) + len(set(df_target)))    

In [105]:
def condition_prob(df_train, target_name, c, x_name, x_value, dtype):
    """
    This function returns the conditional probability of a class, c, 
    after Laplacian Correction.
    It has six inputs:
    1. df_train: the train df
    2. target name: the column name of the target
    3. c: value of class, c
    4. x_name: the column name of variable, x
    5. x_value: the i-th value of x
    6. dtype: the dtype of x
    """
    
    d_c = df_train[df_train[target_name] == c] # the sub_df_1 of class c
    
    # for continuous variables
    if np.issubdtype(dtype, np.number):
        u = np.mean(d_c[x_name])
        sigma = np.std(d_c[x_name])
        return scipy.stats.norm(u, sigma) # here suppose normal distribution
    
    # for category variables
    else:
        # the sub_df_2 of x == x_value of d_c
        d_c_x = d_c[d_c[x_name]==x_value] 
        
        # the number of unqiue class of sub_df_3 of x == x_value of train_df
        n_i = len(set(df_train[target_name][df_train[x_name]==x_value])) 
        return (len(d_c_x)+1)/(len(d_c)+n_i)

# 2. Training

In [107]:
def bayes_train(df_train, target_name, types):
    """
    This function return the dictionary of all the probabilities
    It has three inputs:
    1. df_train: train df, including target
    2. target_name: string, the column name of target
    3. types: the dtypes of the df
    """
    # build the dictionary
    train_dict = {'prior':{}, 'conditional':{}}
    
    # select the feature names, except target name
    cols = list(df_train.columns)
    cols.remove(target_name)
    
    # compute the prior probability
    for c in set(df_train[target_name]):
        train_dict['prior'][c] = prior_prob(df_train[target_name], c)
        train_dict['conditional'][c] = {}
    
    # compute the conditional probability
    for col in cols: # select x
        for c in train_dict['conditional'].keys(): # select c
            train_dict['conditional'][c][col] = {}
            
            # for continuous variables
            if np.issubdtype(types[cols.index(col)], np.number): 
                # computing the distribution
                train_dict['conditional'][c][col] = condition_prob(df_train, 
                                                                   target_name,
                                                                   c, col, x_value,
                                                                   types[cols.index(col)])
            else: # for category variables
                for x_value in set(df_train[col]):
                    # computing the condititonal probability of c_xi
                    train_dict['conditional'][c][col][x_value] = condition_prob(df_train, 
                                                                                target_name,
                                                                                c, col, x_value,
                                                                                types[cols.index(col)])
            
    return train_dict

# 3. Predict

In [113]:
def bayes_predict(df_validation, classes, train_dict):
    """
    This function returns prediction values of df_validation based on train_dict
    It has three inputs:
    1. df_validation: validation datafram
    2. classes: the list/set of all possible classes
    3. train_dict: dict, the trained dictionary of bayes classification
    """
    
    cols = df_validation.columns
    predictions = []
    
    for i in range(len(df_validation)): # iterate through the df
        p = 0
        variables = df_validation.iloc[i] # features variables
        for c in classes:
            prob = train_dict['prior'][c] # get the prior probability
            for col in cols:
                try: # for category variables
                    
                    # compute p(c) * p(x_i|c)
                    prob *= train_dict['conditional'][c][col][variables[col]]
                except TypeError: # for continuous variables
                    
                    # get distribution
                    norm = train_dict['conditional'][c][col] 
                    
                    # compute p(c) * p(x_i|c)
                    prob *= train_dict['conditional'][c][col].pdf(variables[col]) 
            
            # storing the class with the largest probability
            if prob > p:
                p = prob
                prediction = c
                
        # storing the predictions
        predictions.append(prediction)
    return predictions
            

# 4. Load Data and Prediction

In [117]:
data = pd.read_csv('../data/data.txt').drop(['Id'], axis=1)

Train and test set splitting

In [118]:
data_tr = data.iloc[[0,1,2,5,6,9,13,14,15,16]]
data_test = data.drop(data_tr.index)
data_test_x, data_test_y = data_test.drop(['quality'],axis=1), data_test.quality

Start training...

In [119]:
d = bayes_train(data_tr, 'quality', data_tr.dtypes)
d

{'conditional': {'bad': {'color': {'dark-green': 0.42857142857142855,
    'pitch-dark': 0.2857142857142857,
    'white': 0.5},
   'density': <scipy.stats._distn_infrastructure.rv_frozen at 0x10c4b2b00>,
   'root': {'roll-up': 0.42857142857142855,
    'slighly-curled': 0.42857142857142855,
    'stiff': 0.3333333333333333},
   'sound': {'crisp': 0.3333333333333333,
    'dead': 0.42857142857142855,
    'dull': 0.42857142857142855},
   'stripes': {'blurred': 0.3333333333333333,
    'clear': 0.42857142857142855,
    'indistinct': 0.42857142857142855},
   'sugar': <scipy.stats._distn_infrastructure.rv_frozen at 0x10c4b26a0>,
   'touch': {'hard': 0.5714285714285714, 'soft': 0.42857142857142855},
   'umbilical': {'hollow': 0.2857142857142857,
    'plain': 0.5,
    'slightly-hollow': 0.42857142857142855}},
  'good': {'color': {'dark-green': 0.42857142857142855,
    'pitch-dark': 0.5714285714285714,
    'white': 0.16666666666666666},
   'density': <scipy.stats._distn_infrastructure.rv_frozen at 

Making Predictions

In [115]:
pred = bayes_predict(data_test_x, set(data_tr.quality), d)

In [116]:
# accuracy
sum([data_test_y.iloc[i] == pred[i] for i in range(len(pred))]) / len(pred)

0.8571428571428571