# Problem 7.6
# Implemetation of Semi-Naïve Bayes Classifier 
# Averaged One Dependent Estimator (AODE)

In [1]:
import pandas as pd
import numpy as np
import scipy.stats

# 1. Conditional Probability

- The prior probability of a class $\hat{P} (c,x_{i})$ can be estimated as
$$ \hat{P} (c,x_{i}) = \frac{|D_{c, x_{i}}| + 1}{|D|+ N_{i}} $$

- The conditional probability $ \hat{P} (x_{j}|c, x_{i}) $ can be estimated as
$$ \hat{P} (x_{j}|c, x_{i}) = \frac{|D_{c,x_{i}, x_{j}}|+1}{|D_{c,x_{i}}| + N_{j}} $$

or the PDF of the supposed distribution for continuous variables

In [118]:
def sample_prob(df_target, c):
    """
    This function returns the prior probaility of a class, c, after Laplacian Correction
    It has two inputs:
    1. df_target: the train df column of target
    2. c: value of class, c
    """
    upper = sum([1 for e in df_target if e == c]) + 1
    bottom = len(df_target) + len(set(df_target))
    
    return upper/bottom

In [4]:
def prior_prob(df_train,target_name, c, x_name, x_value, dtype):
    """
    This function returns the prior probaility of a class, c, after Laplacian Correction
    It has six inputs:
    1. df_train: the train df 
    2. target_name: string, column name of target
    3. c: value of class, c
    4. x_name: string, column name of x
    5. x_value: value of xi
    6. dtype: the dtype of column x
    """
    # for continuous features:
    # p(c, xi) = p(c) * p(xi|c)
    if np.issubdtype(dtype, np.number):
        # calculate p(c)
        p_c = sample_prob(df_train[target_name], c) 
        
        # calculate p(xi|c)
        d_c = df_train[df_train[target_name] == c]
        u = np.mean(d_c[x_name])
        sigma = np.std(d_c[x_name])
        p_xi_c = scipy.stats.norm(u, sigma).pdf(x_value)
        return p_c * p_xi_c
    
    # for categorical features    
    else:
        d_c_xi = len(df_train[(df_train[target_name]==c) & (df_train[x_name]==x_value)])
        ni = len(set(df_train[target_name]))
        return (d_c_xi+1)/(len(df_train) + ni)    

In [115]:
def condition_prob(df_train, target_name, c, xi_info, xj_info):
    """
    This function returns the conditional probability P(x_j|c, x_i)
    It has five inputs:
    1. df_train: the train df
    2. target_name: string, the column name of target
    3. c: the class vlaue
    4. xi_info: a list of [xi_name, xi_value, xi_dtype]
    5. xj_info: a list of [xj_name, xj_value, xj_dtype]
    """
    [xi_name, xi_value, xi_dtype] = xi_info
    [xj_name, xj_value, xj_dtype] = xj_info
    
    if not np.issubdtype(xj_dtype, np.number):
        if not np.issubdtype(xi_dtype, np.number):
            d_c_xi = df_train[(df_train[target_name]==c)&(df_train[xi_name]==xi_value)]
        else:
            array_i_bins = pd.cut(df_train[xi_name], 3)
            selected_index = [i for i in range(len(array_i_bins)) 
                              if xi_value in array_i_bins[i]]
            d_xi = df_train.iloc[selected_index]
            d_c_xi = d_xi[d_xi[target_name]==c]
        
        n_j = len(set(df_train[df_train[xj_name]==xj_value][target_name]))
        d_c_xi_xj = d_c_xi[d_c_xi[xj_name]==xj_value]
        return (len(d_c_xi_xj)+1)/(len(d_c_xi)+n_j)
    else:
        if not np.issubdtype(xi_dtype, np.number):
            d_c_xi = df_train[(df_train[target_name]==c)&(df_train[xi_name]==xi_value)]
            u_xj = np.mean(d_c_xi[xj_name])
            sigma_xj = np.std(d_c_xi[xj_name])
            return scipy.stats.norm(u_xj, sigma_xj)
        else: 
        #if np.issubdtype(xj_dtype, np.number):
            d_c = df_train[(df_train[target_name]==c)]
            u_i = np.mean(d_c[xi_name])
            sigma_i = np.std(d_c[xi_name])
            p_xi_c_norm = scipy.stats.norm(u_i, sigma_i)
        
            array_ij = np.vstack((d_c[xi_name].values, d_c[xj_name].values))
            u_ij = np.mean(array_ij, axis=1)
            cov_ij = np.cov(array_ij)
            p_xj_xi_c_norm = scipy.stats.multivariate_normal(u_ij, cov_ij)
            
            return p_xi_c_norm, p_xj_xi_c_norm

# 2. AODE

In [116]:
def aode(df_train, target_name, pred_var, m_value):
    """
    This function return the dictionary of all the probabilities
    It has four inputs:
    1. df_train: train df, including target
    2. target_name: string, the column name of target
    3. pred_var: a row of validation variabble values
    4. m_value: the threshold
    """
    possible_class = list(set(df_train[target_name]))
    variable_names = pred_var.columns
    dtypes = pred_var.dtypes
    
    p = 0
    predictions ={}
    for c_i in possible_class:
        p_i = 0
        for i in range(len(pred_var.values[0])):
            xi_info = [variable_names[i], 
                       pred_var.values[0][i], 
                       dtypes[i]]
            if (not np.issubdtype(xi_info[2], np.number)) and \
            len(df_train[df_train[xi_info[0]]==xi_info[1]]) < m_value:
                prob = 0
            else:
                prob = prior_prob(df_train,target_name, 
                                  c_i, xi_info[0], 
                                  xi_info[1], 
                                  xi_info[2])
                for j in range(len(pred_var.values[0])):
                    if i != j :
                        xj_info = [variable_names[j], 
                                   pred_var.values[0][j], 
                                   dtypes[j]]
                        cond_prob = condition_prob1(df_train, 
                                                   target_name, 
                                                   c_i, 
                                                   xi_info, 
                                                   xj_info)
                        if not np.issubdtype(xj_info[2], np.number):
                            prob *= cond_prob
                        elif not np.issubdtype(xi_info[2], np.number):
                            cond_prob = cond_prob.pdf(xj_info[1])
                            prob *= cond_prob
                        else:
                            p_xji_c = cond_prob[1].pdf([xi_info[1], xj_info[1]])
                            p_xi_c = cond_prob[0].pdf(xi_info[1])
                            cond_prob = p_xji_c / p_xi_c
                            prob *= cond_prob
            p_i += prob
            
        predictions[c_i] = p_i    
        if p_i > p:
            p = p_i
            prediction = c_i    
    
    return prediction, predictions

# 4. Load Data and Prediction

In [5]:
data = pd.read_csv('../data/data.txt').drop(['Id'], axis=1)

In [6]:
data.head()

Unnamed: 0,color,root,sound,stripes,umbilical,touch,density,sugar,quality
0,dark-green,roll-up,dull,clear,hollow,hard,0.697,0.46,good
1,pitch-dark,roll-up,dead,clear,hollow,hard,0.744,0.376,good
2,pitch-dark,roll-up,dull,clear,hollow,hard,0.634,0.264,good
3,dark-green,roll-up,dead,clear,hollow,hard,0.608,0.318,good
4,white,roll-up,dull,clear,hollow,hard,0.556,0.215,good


In [80]:
vali = data.iloc[[0]].drop(['quality'],axis=1)

In [117]:
aode(data, 'quality', vali, 2)

('good', {'bad': 0.00051651694867467068, 'good': 1.2183266607232781})