# Naive Bayes Classifier

## Importing required libraries

In [1]:
# To load breast cancer dataset
from sklearn import datasets  

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from matplotlib import pyplot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import time

# To execute a cell line by line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings('ignore')

### Import Data

In [57]:
# Loading boys_names.csv dataset 
df_boys = pd.read_csv('boy_names.csv', index_col=0)

# Adding a label '-1' to indicate a boy's name
df_boys['label'] = '-1'
print("The number of rows in df_boys : {}".format(df_boys.shape[0]))
df_boys.head()

The number of rows in df_boys : 1000


Unnamed: 0,x,label
1,Otho,-1
2,Caswell,-1
3,Deforest,-1
4,Eddy,-1
5,Corbett,-1


In [58]:
# Loading girls_names.csv dataset 
df_girls = pd.read_csv('girl_names.csv', index_col=0)

# Adding a label '+1' to indicate a boy's name
df_girls['label'] = '+1'
print("The number of rows in df_girls : {}".format(df_girls.shape[0]))
df_girls.head()

The number of rows in df_girls : 1000


Unnamed: 0,x,label
1,Elaina,1
2,Deedee,1
3,Aaliyah,1
4,Kathey,1
5,Antonetta,1


In [59]:
# Joining both the datasets to create a common dataset with names and respective labels
df_names = pd.concat([df_boys,df_girls])
print("The number of rows in data : {}".format(df_names.shape[0]))
df_names = df_names.reset_index(drop = True)
df_names.head()

The number of rows in data : 2000


Unnamed: 0,x,label
0,Otho,-1
1,Caswell,-1
2,Deforest,-1
3,Eddy,-1
4,Corbett,-1


### Data Preprocessing

In [60]:
# Renaming the name column
df_names = df_names.rename(columns={'x': 'name'})

In [61]:
# Converting all the characters to lower case in all names
df_names['name'] = df_names['name'].str.lower()

In [62]:
# Removing any white spaces in all the names
df_names['name'] = df_names['name'].str.strip()

Creating additional features using existing feature

I have created 4 additional features using the 'name' feature
1. #Consonants : To count number of consonants in every name
2. #Vowels : To count number of vowels in every name
3. Name starting with a consonant? : To check if a name starts with a consonant
4. Name starting with a vowel? : To check if a name starts with a vowel

In [63]:
# Initialising columns for additional features
def colInit(df):
  df['#Consonants'] = 0
  df['#Vowels'] = 0
  df['Name starting with a consonant?'] = 0
  df['Name starting with a vowel?'] = 0
  return df

In [64]:
df_names = colInit(df_names)  

In [65]:
# To count numner of consonants and vowels in every name (code citation : code4coding.com)
def calc_count(df):
  for i in range(len(df)):
      name = df.iloc[i]['name']
      vowels=0
      consonants=0
      # Checking each letter in a name if it is a consonant or a vowel
      for letter in name:
          if (letter == 'a'or letter == 'e'or letter == 'i' or letter == 'o'or letter == 'u'):
                vowels=vowels+1;#vowel counter is incremented by 1
          else:
              consonants=consonants+1;
          # Setting the count in the columns based on above calculations
          df.loc[i,'#Consonants'] = consonants
          df.loc[i,'#Vowels'] = vowels
  return df

In [66]:
# Function to checck if the name is starting with a vowel or consonant (code citation : code4coding.com)
def calc_nameStart(df):
  for i in range(len(df)):
    name = df.iloc[i]['name']
    # Checking if the first letter in a name it is a consonant or a vowel
    if (name[0] == 'a'or name[0] == 'e'or name[0] == 'i' or name[0] == 'o' or name[0] == 'u'):
          df.loc[i,'Name starting with a vowel?'] = 1
    else:
      df.loc[i,'Name starting with a consonant?'] = 1
  return df

In [67]:
df_names = calc_count(df_names)
df_names = calc_nameStart(df_names)
df_names.head()

Unnamed: 0,name,label,#Consonants,#Vowels,Name starting with a consonant?,Name starting with a vowel?
0,otho,-1,2,2,0,1
1,caswell,-1,5,2,1,0
2,deforest,-1,5,3,1,0
3,eddy,-1,3,1,0,1
4,corbett,-1,5,2,1,0


### Implementing Naive Bayes from scratch

We need to estimate the likelihood of the features and as all the features are continuous, we can choose Gaussian distribution for this purpose. Gaussian Density function is given by:

> f
(
x
)
=
(
1
/
s
q
r
t
(
2
∗
P
I
)
∗
s
i
g
m
a
)
∗
e
x
p
(
−
(
(
x
−
m
e
a
n
)
2
/
(
2
∗
s
i
g
m
a
2
)
)
)

> where μ is mean, σ² is variance, σ is square root of variance (standard deviation).

For this, we need to first calculate mean and variance for each feature and calculate Gaussian density function

In [68]:
# Calculating the count of boys, girls and total names
boys_count = df_names['label'][df_names['label'] == '-1'].count()
girls_count = df_names['label'][df_names['label'] == '+1'].count()
total_count = df_names['label'].count()

In [69]:
# Calculating the probability of a name being girl or boy
boys_prob = boys_count/total_count
girls_prob = girls_count/total_count

In [70]:
# Calculating the mean and variance for each feature based on the label
calc_mean = df_names.groupby('label').mean()
calc_variance = df_names.groupby('label').var()
calc_mean
calc_variance

Unnamed: 0_level_0,#Consonants,#Vowels,Name starting with a consonant?,Name starting with a vowel?
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.416,2.66,0.83,0.17
-1,3.58,2.213,0.805,0.195


Unnamed: 0_level_0,#Consonants,#Vowels,Name starting with a consonant?,Name starting with a vowel?
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.248192,0.737137,0.141241,0.141241
-1,1.180781,0.632263,0.157132,0.157132


In [71]:
# Mean of each feature given it belongs to 'boys' class
boy_consCount_mean = calc_mean['#Consonants'][calc_mean.index == '-1'].values[0]
boy_vowCount_mean = calc_mean['#Vowels'][calc_mean.index == '-1'].values[0]
boy_consStart_mean = calc_mean['Name starting with a consonant?'][calc_mean.index == '-1'].values[0]
boy_vowStart_mean = calc_mean['Name starting with a vowel?'][calc_mean.index == '-1'].values[0]

# Variance Mean of each feature given it belongs to 'boys' class
boy_consCount_var = calc_variance['#Consonants'][calc_variance.index == '-1'].values[0]
boy_vowCount_var = calc_variance['#Vowels'][calc_variance.index == '-1'].values[0]
boy_consStart_var = calc_variance['Name starting with a consonant?'][calc_variance.index == '-1'].values[0]
boy_vowStart_var = calc_variance['Name starting with a vowel?'][calc_variance.index == '-1'].values[0]

# Mean of each feature given it belongs to 'girls' class
girl_consCount_mean = calc_mean['#Consonants'][calc_mean.index == '+1'].values[0]
girl_vowCount_mean = calc_mean['#Vowels'][calc_mean.index == '+1'].values[0]
girl_consStart_mean = calc_mean['Name starting with a consonant?'][calc_mean.index == '+1'].values[0]
girl_vowStart_mean = calc_mean['Name starting with a vowel?'][calc_mean.index == '+1'].values[0]

# Variance Mean of each feature given it belongs to 'girls' class
girl_consCount_var = calc_variance['#Consonants'][calc_variance.index == '+1'].values[0]
girl_vowCount_var = calc_variance['#Vowels'][calc_variance.index == '+1'].values[0]
girl_consStart_var = calc_variance['Name starting with a consonant?'][calc_variance.index == '+1'].values[0]
girl_vowStart_var = calc_variance['Name starting with a vowel?'][calc_variance.index == '+1'].values[0]

In [72]:
# Function to calculate the probability density of each attribute of the terms of the likelihood
def calc_prob(x, mean, variance):
  # Input the arguments into a probability density function
  prob = 1/(np.sqrt(2*np.pi*variance)) * np.exp((-(x-mean)**2)/(2*variance))
  return prob

In [86]:
X = df_names.loc[:,df_names.columns != 'label']
y = df_names['label']

In [87]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in actual.index:
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

Here we have built the Naive Bayes clasifier usin the existing data. Now we can predict the label for train data and calculate accuracy. I'm calculating the probability density of each feature of the terms of likelihood. For example:

CodeCogsEqn.png

In [88]:
X['label'] = 0

for i in range(len(X)):
  prob_boys = boys_prob * calc_prob(X['#Consonants'][i], boy_consCount_mean, boy_consCount_var) * \
  calc_prob(X['#Vowels'][i], boy_vowCount_mean, boy_vowCount_var) * \
  calc_prob(X['Name starting with a consonant?'][i], boy_consStart_mean, boy_consStart_var) * \
  calc_prob(X['Name starting with a vowel?'][i], boy_vowStart_mean, boy_vowStart_var) 

  prob_girls = girls_prob * calc_prob(X['#Consonants'][i], girl_consCount_mean, girl_consCount_var) * \
  calc_prob(X['#Vowels'][i], girl_vowCount_mean, girl_vowCount_var) * \
  calc_prob(X['Name starting with a consonant?'][i], girl_consStart_mean, girl_consStart_var) * \
  calc_prob(X['Name starting with a vowel?'][i], girl_vowStart_mean, girl_vowStart_var) 

  if prob_boys > prob_girls:
    X.loc[i,'label'] = '-1'
  else:
    X.loc[i,'label'] = '+1'

accuracy = accuracy_metric(y,X['label'])
print("The accuracy on test dataset is {:.2f}%".format(accuracy))

The accuracy on test dataset is 62.95%


### Labelling the test data

In [89]:
# Loading test_names.csv dataset 
df_test = pd.read_csv('test_names.csv', index_col=0)
print("The number of rows in df_girls : {}".format(df_test.shape[0]))

The number of rows in df_girls : 100


In [90]:
df_test = colInit(df_test)
df_test.head(2)

Unnamed: 0,x,#Consonants,#Vowels,Name starting with a consonant?,Name starting with a vowel?
1,Brittani,0,0,0,0
2,Brandin,0,0,0,0


In [91]:
# Renaming the name column
df_test = df_test.rename(columns={'x': 'name'})

In [92]:
# Converting all the characters to lower case in all names
df_test['name'] = df_test['name'].str.lower()

In [93]:
# Removing any white spaces in all the names
df_test['name'] = df_test['name'].str.strip()

In [94]:
df_test['label'] = 0

for i in df_test.index:
  prob_boys = boys_prob * calc_prob(df_test['#Consonants'][i], boy_consCount_mean, boy_consCount_var) * \
  calc_prob(df_test['#Vowels'][i], boy_vowCount_mean, boy_vowCount_var) * \
  calc_prob(df_test['Name starting with a consonant?'][i], boy_consStart_mean, boy_consStart_var) * \
  calc_prob(df_test['Name starting with a vowel?'][i], boy_vowStart_mean, boy_vowStart_var) 

  prob_girls = girls_prob * calc_prob(df_test['#Consonants'][i], girl_consCount_mean, girl_consCount_var) * \
  calc_prob(df_test['#Vowels'][i], girl_vowCount_mean, girl_vowCount_var) * \
  calc_prob(df_test['Name starting with a consonant?'][i], girl_consStart_mean, girl_consStart_var) * \
  calc_prob(df_test['Name starting with a vowel?'][i], girl_vowStart_mean, girl_vowStart_var) 

  if prob_boys > prob_girls:
    df_test.loc[i,'label'] = '-1'
  else:
    df_test.loc[i,'label'] = '+1'

In [95]:
df_test.head()

Unnamed: 0,name,#Consonants,#Vowels,Name starting with a consonant?,Name starting with a vowel?,label
1,brittani,0,0,0,0,-1
2,brandin,0,0,0,0,-1
3,darry,0,0,0,0,-1
4,tresa,0,0,0,0,-1
5,fabiola,0,0,0,0,-1


In [96]:
# Exporting the output as a csv file
df_test.to_csv('test_output.csv', index=False)