#### I reimplemented the naive Bayes method in python
Victor Ludvig, November 2023

In [1]:
import pandas as pd
import numpy as np
from math import prod
from sklearn.metrics import confusion_matrix

#### Read train file

In [2]:
df_train = pd.read_csv('./data/haggis_data.csv')
target = df_train.columns[-1]
print(f'target = {target}')
df_train

target = is_haggis


Unnamed: 0,skin,colour,size,flesh,eats_shortbread,length,is_haggis
0,hairy,brown,large,hard,1,3.25,1
1,hairy,green,?,hard,1,4.22,1
2,?,red,small,soft,0,1.27,0
3,hairy,green,large,hard,1,3.55,1
4,smooth,red,small,soft,0,2.13,0
5,smooth,green,large,soft,1,2.67,0
6,hairy,?,large,soft,0,3.77,1


#### Get categorical and numerical column names
Sets are also created to know during inference time if we have to use probabilities (categorical) or PDF (numerical).

In [3]:
list_categorical_columns = df_train.dtypes[df_train.dtypes != float].index.to_list()[:-1]
list_numerical_columns = df_train.dtypes[df_train.dtypes == float].index.to_list()
set_categorical, set_numerical = set(list_categorical_columns), set(list_numerical_columns)
print(f'Categorical columns: {list_categorical_columns}\nNumerical columns: {list_numerical_columns}')

Categorical columns: ['skin', 'colour', 'size', 'flesh', 'eats_shortbread']
Numerical columns: ['length']


#### Get the list of values possible for each column
The value '?' is not saved because the occurences of missing values are not counted

In [4]:
list_values = {label: [x for x in np.unique(df_train[label].values) if x != '?'] for label in list_categorical_columns}
list_values

{'skin': ['hairy', 'smooth'],
 'colour': ['brown', 'green', 'red'],
 'size': ['large', 'small'],
 'flesh': ['hard', 'soft'],
 'eats_shortbread': [0, 1]}

#### Create a data structure to save the number of occurences of yes and no for each label of each column
A nested dictionary is used

In [5]:
dictionnary_frequencies = {
    column_name: {label: {'yes': 0, 'no':0} for label in list_values[column_name]} for column_name in list_categorical_columns
}
print(f'dictionnary_frequencies = {dictionnary_frequencies}')

dictionnary_frequencies = {'skin': {'hairy': {'yes': 0, 'no': 0}, 'smooth': {'yes': 0, 'no': 0}}, 'colour': {'brown': {'yes': 0, 'no': 0}, 'green': {'yes': 0, 'no': 0}, 'red': {'yes': 0, 'no': 0}}, 'size': {'large': {'yes': 0, 'no': 0}, 'small': {'yes': 0, 'no': 0}}, 'flesh': {'hard': {'yes': 0, 'no': 0}, 'soft': {'yes': 0, 'no': 0}}, 'eats_shortbread': {0: {'yes': 0, 'no': 0}, 1: {'yes': 0, 'no': 0}}}


####  Add the number of occurences for categorical data
$\epsilon = 1$ is used for Laplace smoothing. <br>
Q1 : We see that Laplace smoothing adds the value 1 for each count of yes and no, so that no count is zero. Hence, a count of 0 in the number of yes or no for a label in the database will not set the corresponding probability to 0. <br>
I added a print after each column processed, so that we can see the effect of Laplace smoothing.

In [6]:
epsilon = 1
for column_name in list_categorical_columns:
    total_entries = 0
    mode = len(list_values[column_name])

    # First loop to get the number of yes and no corresponding to the label
    for label in list_values[column_name]:
        n_yes = len(df_train[(df_train[column_name]==label) & (df_train['is_haggis']==1)])
        n_no = len(df_train[(df_train[column_name]==label) & (df_train['is_haggis']==0)])
        total_entries += n_yes + n_no
        dictionnary_frequencies[column_name][label]['yes'] = n_yes + epsilon
        dictionnary_frequencies[column_name][label]['no'] = n_no + epsilon

    print(f'After adding the number of occurences of {column_name}, dictonnary_frequencies = {dictionnary_frequencies[column_name]}')

    # Second loop to divide by the total number of entries and add the epsilon
    for label in list_values[column_name]:
        dictionnary_frequencies[column_name][label]['yes'] = (dictionnary_frequencies[column_name][label]['yes'])/(total_entries+mode*epsilon)
        dictionnary_frequencies[column_name][label]['no'] = (dictionnary_frequencies[column_name][label]['no'])/(total_entries+mode*epsilon)

print(f'After dividing by the number of entries + the number of modes, dictionary_frequencies = {dictionnary_frequencies}')

After adding the number of occurences of skin, dictonnary_frequencies = {'hairy': {'yes': 5, 'no': 1}, 'smooth': {'yes': 1, 'no': 3}}
After adding the number of occurences of colour, dictonnary_frequencies = {'brown': {'yes': 2, 'no': 1}, 'green': {'yes': 3, 'no': 2}, 'red': {'yes': 1, 'no': 3}}
After adding the number of occurences of size, dictonnary_frequencies = {'large': {'yes': 4, 'no': 2}, 'small': {'yes': 1, 'no': 3}}
After adding the number of occurences of flesh, dictonnary_frequencies = {'hard': {'yes': 4, 'no': 1}, 'soft': {'yes': 2, 'no': 4}}
After adding the number of occurences of eats_shortbread, dictonnary_frequencies = {0: {'yes': 2, 'no': 3}, 1: {'yes': 4, 'no': 2}}
After dividing by the number of entries + the number of modes, dictionary_frequencies = {'skin': {'hairy': {'yes': 0.625, 'no': 0.125}, 'smooth': {'yes': 0.125, 'no': 0.375}}, 'colour': {'brown': {'yes': 0.2222222222222222, 'no': 0.1111111111111111}, 'green': {'yes': 0.3333333333333333, 'no': 0.2222222222

#### Add Mean and std for numerical data
We need to add the values from the test data as well here. <br>
The mean and std are computed with both train and test data. <br>
The classifier can in fact update the mean/std when it sees new data, so we compute everything beforehand.

First the test data set is loaded

In [7]:
df_test = pd.read_csv('./data/haggis_test.csv')
df_test

Unnamed: 0,skin,colour,size,flesh,eats_shortbread,length,is_haggis
0,smooth,red,large,hard,1,3.25,1
1,hairy,brown,small,hard,1,2.56,0
2,smooth,green,small,hard,1,3.05,1
3,hairy,red,large,soft,0,2.05,1


In [8]:
for column_name in list_numerical_columns:
    n_yes = len(df_train[df_train['is_haggis']==1]) + len(df_test[df_test['is_haggis']==1])
    sum_yes = sum(df_train[(df_train['is_haggis']==1)][column_name]) + sum(df_test[(df_test['is_haggis']==1)][column_name])
    n_no = len(df_train[df_train['is_haggis']==0]) + len(df_test[df_test['is_haggis']==0])
    sum_no = sum(df_train[(df_train['is_haggis']==0)][column_name]) + sum(df_test[(df_test['is_haggis']==0)][column_name])

    print(f'n_yes = {n_yes}, n_no = {n_no}')
    mean_yes, mean_no = sum_yes/n_yes, sum_no/n_no
    std_yes = np.sqrt((sum(df_train[(df_train['is_haggis']==1)]['length']-mean_yes)**2 + sum(df_test[(df_test['is_haggis']==1)]['length']-mean_yes)**2)/(n_yes-1))
    std_no = np.sqrt((sum(df_train[(df_train['is_haggis']==0)]['length']-mean_no)**2 + sum(df_test[(df_test['is_haggis']==0)]['length']-mean_no)**2)/(n_no-1))

    print(f'mean_yes = {mean_yes}, mean_no = {mean_no}')
    print(f'std_yes = {std_yes}, std_no = {std_no}')

    dictionnary_frequencies[column_name] = {'mean_1': mean_yes, 'mean_0': mean_no, 'std_yes': std_yes, 'std_no': std_no}

dictionnary_frequencies

n_yes = 7, n_no = 4
mean_yes = 3.3057142857142856, mean_no = 2.1575
std_yes = 0.9047903504300279, std_no = 0.32863987382341


{'skin': {'hairy': {'yes': 0.625, 'no': 0.125},
  'smooth': {'yes': 0.125, 'no': 0.375}},
 'colour': {'brown': {'yes': 0.2222222222222222, 'no': 0.1111111111111111},
  'green': {'yes': 0.3333333333333333, 'no': 0.2222222222222222},
  'red': {'yes': 0.1111111111111111, 'no': 0.3333333333333333}},
 'size': {'large': {'yes': 0.5, 'no': 0.25},
  'small': {'yes': 0.125, 'no': 0.375}},
 'flesh': {'hard': {'yes': 0.4444444444444444, 'no': 0.1111111111111111},
  'soft': {'yes': 0.2222222222222222, 'no': 0.4444444444444444}},
 'eats_shortbread': {0: {'yes': 0.2222222222222222, 'no': 0.3333333333333333},
  1: {'yes': 0.4444444444444444, 'no': 0.2222222222222222}},
 'length': {'mean_1': 3.3057142857142856,
  'mean_0': 2.1575,
  'std_yes': 0.9047903504300279,
  'std_no': 0.32863987382341}}

#### 2) Computation of confusion matrix


#### Conversion of the test dataframe into a matrix X

#### Creation of the predict function

In [9]:
def pdf(x, std, mean):
    return (1/(np.sqrt(2*np.pi)*std))*np.exp(-((x-mean)**2)/(2*std**2))

Reduction of variable name size to save space

In [10]:
df = dictionnary_frequencies

In [11]:
def predict_instance(i, df_test):
    """ Make the prediction of the i-th instance in the test_dataframe"""
    proba_yes = prod(df[c][df_test.iloc[i][c]]['yes'] for c in list_categorical_columns) * prod(pdf(df_test.iloc[i][c], df[c]['std_yes'], df[c]['mean_1']) for c in list_numerical_columns)
    proba_no = prod(df[c][df_test.iloc[i][c]]['no'] for c in list_categorical_columns) * prod(pdf(df_test.iloc[i][c], df[c]['std_no'], df[c]['mean_0']) for c in list_numerical_columns)
    proba_yes, proba_no = proba_yes/(proba_yes+proba_no), proba_no/(proba_yes+proba_no)
    return 1 if proba_yes > proba_no else 0

def predict(df_test):
    """ Make the prediction on the whole test_dataframe"""
    return [predict_instance(i, df_test) for i in range(len(df_test))]

list_predictions = predict(df_test)
print(f'list_predictions = {list_predictions}')
cm = confusion_matrix(df_test[target].to_list(), list_predictions)
print(f'Confusion matrix: \n{cm}')

list_predictions = [1, 1, 1, 0]
Confusion matrix: 
[[0 1]
 [1 2]]


The naive Bayse classifier outputs 2 True negative, 1 False Positive, 1 False negative.

#### 3)
> 1. Missing data are of type skin, colour and size
> 2. Missing data are not added in the computations of the statistics.
> 3. Numerical values are processed using a Probability Distribution Function. We make the hypothesis that the numerical values are distributed according to a normal law.

#### 4)

Just by looking at the training data, we see that samples with smooth hair are all negative. <br>
Color red are all associated with a negative target. <br>
A large size is associated with a negative target 1/3 of the time, and 2/3 of the time with a positive target. <br>
The flesh hard is associated with only a positive target <br>
The eats_shortbread is associated with a positive target 3/4 of the time, and 1/4 of the time with a negative target. <br>
The length 3.25 is closer to values of length associated with a negative target. <br>
Overall, the class prediction would be positive.

#### 5) 6)
I used the train/test sets division since the beginning of my work. <br>
> 6a.The sample of question 5 is classified as haggis, see the list_predictions above. <br>
The confusion matrix is shown above. <br>
> 6b. We get a 50% accuracy on this small test set.