In this notebook I will try to show how to implement Naive Bayes in raw Python on Primal Indians Diabetes dataset from UCI Machine Learning repository (https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes).

In [1]:
import os
import pandas as pd
import numpy as np
import random
from urllib.request import urlretrieve
from tqdm import tqdm

### Constants

In [2]:
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
DATA_FILE = "./" + "pima-indians-diabetes.data"

### Dataset info

#### Features:
1. Number of times pregnant 
2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
3. Diastolic blood pressure (mm Hg) 
4. Triceps skin fold thickness (mm) 
5. 2-Hour serum insulin (mu U/ml) 
6. Body mass index (weight in kg/(height in m)^2) 
7. Diabetes pedigree function 
8. Age (years) 
9. Class variable (0 or 1) 

### Prepare data

#### Download dataset

In [3]:
class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

if not os.path.isfile(DATA_FILE):
    with DLProgress(unit="B", unit_scale=True, miniters=1, desc="Prima Indians Diabetes dataset") as pbar:
        urlretrieve(DATA_URL, DATA_FILE, pbar.hook)

#### Loading files & extracting data

In [4]:
df_data = pd.read_csv(DATA_FILE, names = ["Pregnant_Times", 
                                          "Glucose", 
                                          "Diastolic_Blood_Pressure", 
                                          "Triceps_Skin_Fold_Thickness",
                                          "2-Hour_Serum_Insulin",
                                          "Body_Mass_Index",
                                          "Diabetes_Pedigree_Function",
                                          "Age",
                                          "Class"])

#### Presenting data

In [5]:
df_data.head()

Unnamed: 0,Pregnant_Times,Glucose,Diastolic_Blood_Pressure,Triceps_Skin_Fold_Thickness,2-Hour_Serum_Insulin,Body_Mass_Index,Diabetes_Pedigree_Function,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
df_data.describe()

Unnamed: 0,Pregnant_Times,Glucose,Diastolic_Blood_Pressure,Triceps_Skin_Fold_Thickness,2-Hour_Serum_Insulin,Body_Mass_Index,Diabetes_Pedigree_Function,Age,Class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
print("Samples num: {}".format(df_data.shape[0]))
print("Features num: {}".format(df_data.shape[1] - 1))
print("Class 0 samples: {}".format(df_data.loc[(df_data["Class"] == 0)].shape[0]))
print("Class 1 samples: {}".format(df_data.loc[(df_data["Class"] == 1)].shape[0]))

Samples num: 768
Features num: 8
Class 0 samples: 500
Class 1 samples: 268


### Dataset preparation

#### Group data by class

In [8]:
df_diabetes = df_data.loc[(df_data["Class"] == 1)]
df_non_diabetes = df_data.loc[(df_data["Class"] == 0)]

In [9]:
# Convert diabetes datafreme into numpy array
df_diabetes_label = df_diabetes["Class"].as_matrix()
df_diabetes_data = df_diabetes.drop(["Class"], axis=1).as_matrix()

# Convert non_diabetes datafreme into numpy array
df_non_diabetes_label = df_non_diabetes["Class"].as_matrix()
df_non_diabetes_data = df_non_diabetes.drop(["Class"], axis=1).as_matrix()

In [10]:
# Stack diabetes samples into (features, class) tuple list
diabetes_stacked_list = list()
for data, label in zip(df_diabetes_data, df_diabetes_label):
    diabetes_stacked_list.append((data, label))
    
# Stack non_diabetes samples into (features, class) tuple list
non_diabetes_stacked_list = list()
for data, label in zip(df_non_diabetes_data, df_non_diabetes_label):
    non_diabetes_stacked_list.append((data, label))
    
# Shuffle data
random.shuffle(diabetes_stacked_list)
random.shuffle(non_diabetes_stacked_list)
    
classes = [diabetes_stacked_list, non_diabetes_stacked_list]

In [11]:
print("Diabetes samples: {}".format(len(diabetes_stacked_list)))
print("Non-diabetes samples: {}".format(len(non_diabetes_stacked_list)))

Diabetes samples: 268
Non-diabetes samples: 500


#### Split into train, val, test

In [12]:
train_data = list()
val_data = list()
test_data = list()

train_data_split_ratio = 0.7
val_data_split_ratio = 0.2
test_data_split_ration = 0.1

In [13]:
# Divide data so each split has the same ratio of each sample
for class_data in classes: 
    train_split_first_index = 0
    train_split_last_index = round(len(class_data) * train_data_split_ratio)
    
    val_split_first_index = train_split_last_index
    val_split_last_index = round((len(class_data)) * (val_data_split_ratio + train_data_split_ratio))
    
    test_split_first_index = val_split_last_index
    test_split_last_index = round((len(class_data)) * (
        val_data_split_ratio + train_data_split_ratio + test_data_split_ration))
    
    train_data.extend(class_data[train_split_first_index:train_split_last_index])
    val_data.extend(class_data[val_split_first_index:val_split_last_index])
    test_data.extend(class_data[test_split_first_index:test_split_last_index])

#### Present result after split

In [14]:
print("Train data samples: {}".format(len(train_data)))
print("Validation data samples: {}".format(len(val_data)))
print("Test data data samples: {}".format(len(test_data)))

Train data samples: 538
Validation data samples: 153
Test data data samples: 77


#### Helper functions

In [15]:
def unwrap_data(tuple_list):
    """Function that unwraps list of tuples into sample list and label list."""
    sample_list = list()
    label_list = list()
    
    for sample, label in tuple_list:
        sample_list.append(sample)
        label_list.append(label)
    
    return sample_list, label_list

### Naive Bayes - raw implementation

In [16]:
class NaiveBayes():
    def __init__(self):
        """Implementation of Gaussian Naive Bayes using following rule:
            
                P(C|X) = (P(X|C) * P(C)) / P(X)
            
            where:
            
            - P(C|X): Posterior Probability
            - P(X|C): Likelihood of sample X given Gaussian distribution C. 
            - P(C): Class Prior Probability of X belonging to specific class.
            - P(X): Predictor Prior Probability which brings posterior to proper probability
                    distribution value. It is ignored as we pick class P(C|X) with largest value
                    and P(X) doesn't affect the result.
        """
        self.class_representations = dict()
        self.class_samples_num = dict()
        
    def _count_class_samples(self, y):
        for y_class in y:
            self.class_samples_num.setdefault(y_class, 0)
            self.class_samples_num[y_class] += 1
    
    def _create_representation_of_each_class(self, X, y):
        """Groups all samples from X by classes in y. Using values from all samples creates
        so called 'representation classes' - samples holding mean and std values based on
        all samples of specific class."""
        
        # Group samples by class
        class_samples_dict = dict()
        for sample, label in zip(X,y):
            class_samples_dict.setdefault(label, list()) 
            class_samples_dict[label].append(sample)
        
        # Calculate mean and std of each class feature based on values from all it's samples
        for class_label, class_samples_list in class_samples_dict.items():
            # Calculating mean and std values columnwise 
            mean_features = np.mean(class_samples_list, axis = 0)
            std_features = np.std(class_samples_list, axis = 0)
            
            # Stacking mean and std into tuples and forming sample which represents values of
            # all samples from specific group
            std_mean_stacked = [(mean, std) for mean, std in zip(mean_features, std_features)]
            
            # Saving created representation sample 
            self.class_representations[class_label] = std_mean_stacked
            
    def fit(self, X, y):
        """Given data X and it's labels it calculates mean and std of each feature
        from all classes."""
        self._count_class_samples(y)
        self._create_representation_of_each_class(X, y)
    
    def _calculate_class_prior_probability(self, c):
        """Calculate prior of class c."""
        all_classes_num = np.sum(list(self.class_samples_num.values()))
        c_class_occurances = self.class_samples_num[c]
        return c_class_occurances / c_class_occurances
    
    def _calculate_likelihood(self, feature_val, mean, std):
        """Likelihood calculated with Gaussian Density Function. Returns likelihood of data X 
        basedon its single feature value given mean and var based on all samples of specific 
        class."""
        coefficient = (1.0 / np.sqrt(2.0 * np.pi * std**2 + 1e-7))
        exponent = np.exp(-((feature_val - mean)**2) / (2.0 * std**2 + 1e-7))
        return coefficient * exponent
            
    def _calculate_posteriors(self, x_sample):
        """Calculate probabilities of x_sample belonging to specific class"""
        posteriors = dict()
        
        for class_label, class_representation_values in self.class_representations.items():
            # Setting posterior to 1 * prior P(C) value
            class_posterior = self._calculate_class_prior_probability(class_label)
            
            # Calculating likelihood P(X|C) based on each feature of x_sample given mean, 
            # std of representation class and forming complete posterior
            for feature_index, feature_val in enumerate(x_sample):
                mean, std = class_representation_values[feature_index]
                class_posterior += self._calculate_likelihood(feature_val, mean, std)
            
            # Storing posterior for each class
            posteriors[class_label] = class_posterior
            
        return posteriors
            
    def predict(self, X):
        """Predicting value for every sample in array X."""
        predictions = list()
        
        # Making pradiction for every X sample
        for x_sample in X:
            posteriors = self._calculate_posteriors(x_sample)
            
            # Retriving position of posterior with max value and using it to get class name
            probabilities_list = list(posteriors.values())
            classes_list = list(posteriors.keys())
            predictions.append(classes_list[probabilities_list.index(max(probabilities_list))])
        
        return predictions

### Model

In [17]:
def accuracy(predictions, targets):
    """Calculates what percent of predictions have the same value as target."""
    correct_predictions = 0
    for prediction, target in zip(predictions, targets):
        if prediction == target:
            correct_predictions += 1
    return correct_predictions / len(targets)

#### Creating new NaiveBayes model

In [18]:
naiveBayes = NaiveBayes()

#### Fitting data

In [19]:
train_samples, train_labels = unwrap_data(train_data)
naiveBayes.fit(train_samples, train_labels)

#### Validation

In [20]:
val_samples, val_labels = unwrap_data(val_data)
predictions = naiveBayes.predict(val_samples)

In [21]:
val_acc = accuracy(predictions, val_labels)
print("Validation accuracy: {}".format(val_acc))

Validation accuracy: 0.6993464052287581


#### Test

In [22]:
test_samples, test_labels = unwrap_data(test_data)
predictions = naiveBayes.predict(test_samples)

In [23]:
test_acc = accuracy(predictions, test_labels)
print("Test accuracy: {}".format(test_acc))

Test accuracy: 0.7012987012987013
