# Import statements

In [10]:
import pandas as pd
from sklearn import naive_bayes
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing as pp
import numpy as np

# Reading and printing data

In [6]:
# Load the patient enrollment records; the path is resolved relative to the working directory
data = pd.read_csv(filepath_or_buffer='patient_enrollment_diet.csv')

In [7]:
# Preview the first ten rows to check the column names and value formats
data.head(10)

Unnamed: 0,Patient ID,Gender,Weight (kg),Height (cm),BMI,Age,Physical Activity,Needs_Diet
0,1,Male,101,151,44.3,58,Lightly active,1
1,2,Female,64,155,26.64,45,Sedentary,1
2,3,Male,110,191,30.15,24,Very active,1
3,4,Female,70,153,29.9,26,Moderately active,1
4,5,Male,73,178,23.04,25,Moderately active,0
5,6,Male,52,167,18.65,29,Moderately active,0
6,7,Male,71,175,23.18,51,Lightly active,0
7,8,Female,102,193,27.38,50,Lightly active,1
8,9,Female,51,183,15.23,65,Lightly active,0
9,10,Female,79,159,31.25,40,Lightly active,1


# Cleaning dataset

In [None]:
# Replace the activity-level strings with integer codes, in place.
# LabelEncoder numbers the classes in sorted (alphabetical) order, so the codes
# identify a category but do not rank activity levels by intensity.
le = pp.LabelEncoder()
le.fit(data['Physical Activity'])

data['Physical Activity'] = le.transform(data['Physical Activity'])

In [22]:
# The integer activity codes are written back into 'Physical Activity' in place,
# so every column already carries its final name. Renaming position 7 would
# relabel the last column instead ('Needs_Diet' in the frame as loaded), which
# duplicates 'Physical Activity' and removes the target the later cells select.
# Check the expected names rather than renaming by position.
required_columns = ['Physical Activity', 'Needs_Diet']
missing_columns = [name for name in required_columns if name not in data.columns]
if missing_columns:
    raise KeyError(f'Expected columns missing after encoding: {missing_columns}')
if not data.columns.is_unique:
    raise ValueError('Duplicate column names found after encoding')

In [23]:
# Show the first five rows to inspect the integer-coded activity column
data.head()

Unnamed: 0,Patient ID,Gender,Weight (kg),Height (cm),BMI,Age,Needs_Diet,Physical Activity
0,1,Male,101,151,44.3,58,1,0
1,2,Female,64,155,26.64,45,1,2
2,3,Male,110,191,30.15,24,1,3
3,4,Female,70,153,29.9,26,1,1
4,5,Male,73,178,23.04,25,0,1


In [24]:
# One-hot encode Gender; drop_first keeps a single indicator column for it
data = pd.get_dummies(
    data=data,
    columns=['Gender'],
    drop_first=True,
)


In [26]:
# Show the first five rows to confirm Gender was replaced by its indicator column
data.head()

Unnamed: 0,Patient ID,Weight (kg),Height (cm),BMI,Age,Needs_Diet,Physical Activity,Gender_Male
0,1,101,151,44.3,58,1,0,True
1,2,64,155,26.64,45,1,2,False
2,3,110,191,30.15,24,1,3,True
3,4,70,153,29.9,26,1,1,False
4,5,73,178,23.04,25,0,1,True


In [27]:
# Target is the Needs_Diet column; the features are every remaining column
# except the Patient ID identifier.
y = data['Needs_Diet']
X = data.drop(columns=['Needs_Diet', 'Patient ID'])

In [28]:
# Number of target values (one per row)
y.shape

(50,)

In [29]:
# (rows, feature columns) of the design matrix
X.shape

(50, 6)

# Dividing data into train and test

In [30]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [31]:
# Size of the training split
X_train.shape

(37, 6)

In [32]:
# Size of the held-out test split
X_test.shape

(13, 6)

# Fit model and evaluate

In [33]:
# Fit a Gaussian naive Bayes classifier on the training split (fit returns the
# estimator itself), then predict labels for the held-out rows.
nb = naive_bayes.GaussianNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [35]:

# Report each classification metric for the held-out split, one per line
for label, scorer in (
    ('accuracy score:', metrics.accuracy_score),
    ('precision score:', metrics.precision_score),
    ('recall score:', metrics.recall_score),
    ('F1 score:', metrics.f1_score),
):
    print(label, scorer(y_test, y_pred))

accuracy score: 0.6923076923076923
precision score: 0.7272727272727273
recall score: 0.8888888888888888
F1 score: 0.7999999999999999


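So we see that the model predicted correctly whether a strict diet was needed or not about 69% of the time (9 of the 13 test patients). Note that 9 of the 13 test patients actually need a diet, so always predicting "needs diet" would reach the same accuracy on this split; with a test set this small, the scores above are only a rough indication of model quality.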

In [36]:
# Predicted labels for the held-out rows
y_pred

array([1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [39]:
# True labels as a plain array, for side-by-side comparison with the predictions
np.array(y_test)

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1])