# Gaussian Naive Bayes (Gender Prediction)

https://www.youtube.com/watch?v=H3EjCKtlVog

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.model_selection import train_test_split

*Note:* the `gender` column is encoded as `1` = women, `2` = men.

In [4]:
# Read the semicolon-separated CSV and keep only the two predictor columns
# plus the target column, in that order.
data = pd.read_csv('./data/cardio.csv', sep=';').loc[:, ['weight', 'height', 'gender']]
data.head(5)

Unnamed: 0,weight,height,gender
0,62.0,168,2
1,85.0,156,1
2,64.0,165,1
3,82.0,169,2
4,56.0,156,1


In [8]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every number computed from it) reproducible across kernel
# restarts; without it each run shuffles differently.
train, test = train_test_split(data, test_size=0.2, random_state=42)

train.head(5)

Unnamed: 0,weight,height,gender
13270,80.0,150,1
67975,80.0,153,1
7345,63.0,150,1
39255,78.0,164,1
47715,58.0,156,1


In [11]:
# Model configuration: the target column and the predictor columns.
label_column = 'gender'
features = ['weight', 'height']

# Distinct class values that occur in the training split.
labels = pd.unique(train[label_column])

In [12]:
# Partition the training rows by class: label -> DataFrame holding only the
# rows whose label column equals that label.
train_by_label = {label: train[train[label_column] == label] for label in labels}

# Sanity check: the partitions together must account for every training row.
# Summing over all partitions (rather than hard-coding labels 1 and 2) keeps
# the check valid for any label encoding.
assert sum(part.shape[0] for part in train_by_label.values()) == train.shape[0]

In [13]:
# Prior probability of each class: its share of the training rows.
total = train.shape[0]

prior = {label: train_by_label[label].shape[0] / total for label in labels}

# The priors are floating-point ratios, so their sum may differ from 1 by a
# rounding error; compare with a tolerance instead of exact equality. Summing
# over all priors also avoids hard-coding the labels 1 and 2.
assert np.isclose(sum(prior.values()), 1.0)

In [14]:
def _fit_gaussian(column):
  """Return a frozen normal distribution using the column's mean and std."""
  return stats.norm(column.mean(), column.std())


# Class-conditional likelihood models: dists[label][feature] is the normal
# distribution fitted to that feature over the training rows of that label.
dists = {
  label: {name: _fit_gaussian(train_by_label[label][name]) for name in features}
  for label in labels
}

Multiplying many small probabilities can underflow to zero in floating point. To avoid this, we work with log-probabilities and add them instead of multiplying:

https://en.wikipedia.org/wiki/Log_probability

In [15]:
def classify(input):
  """Return the label with the highest log-posterior score for one sample.

  For each label the score is log(prior[label]) plus the sum, over every
  feature, of the log-density of the sample's value under that label's fitted
  distribution (the naive independence assumption).

  Args:
    input: mapping-like object (e.g. a DataFrame row) that can be indexed by
      every name in `features`.

  Returns:
    The element of `labels` with the largest score. Ties resolve to the label
    that comes first in `labels`.
  """
  scores = {}

  for label in labels:
    score = np.log(prior[label])

    for f in features:
      # Evaluate the log-density directly. np.log(pdf(x)) underflows to
      # log(0) = -inf for values far from the mean, which would make every
      # label tie at -inf and the result arbitrary.
      score += dists[label][f].logpdf(input[f])

    scores[label] = score

  return max(scores, key=scores.get)

# Test

In [18]:
from sklearn.metrics import confusion_matrix, f1_score

In [19]:
# Ground-truth labels of the held-out rows, and the classifier's prediction
# for each of those rows in the same order.
label_correct = test[label_column]
label_predicted = [classify(sample) for _index, sample in test.iterrows()]

# Print the confusion matrix first, then the F1 score.
for metric in (confusion_matrix, f1_score):
  print(metric(label_correct, label_predicted))

[[8096 1006]
 [2313 2585]]
0.8298908308133873
