In [94]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [95]:
# Load the obesity classification CSV and preview the first rows

csv_path = "/content/drive/MyDrive/DS Course Uploads/Datasets/Obesity Classification.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
0,1,25,Male,175,80,25.3,Normal Weight
1,2,30,Female,160,60,22.5,Normal Weight
2,3,35,Male,180,90,27.3,Overweight
3,4,40,Female,150,50,20.0,Underweight
4,5,45,Male,190,100,31.2,Obese


### EDA

In [96]:
# Column dtypes, non-null counts and memory usage of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      108 non-null    int64  
 1   Age     108 non-null    int64  
 2   Gender  108 non-null    object 
 3   Height  108 non-null    int64  
 4   Weight  108 non-null    int64  
 5   BMI     108 non-null    float64
 6   Label   108 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 6.0+ KB


In [97]:
# Number of rows per Gender value
df["Gender"].value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,56
Female,52


In [98]:
# Number of rows per target class (shows how balanced the classes are)
df["Label"].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Underweight,47
Normal Weight,29
Overweight,20
Obese,12


In [99]:
# Missing values per column
df.isna().sum()

Unnamed: 0,0
ID,0
Age,0
Gender,0
Height,0
Weight,0
BMI,0
Label,0


In [100]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

0

In [101]:
# Encode the categorical columns and drop the identifier.
#
# The step only runs while the raw "ID" column is still present, and it builds
# a new frame instead of mutating `df` in place. Re-running the cell is
# therefore a no-op, rather than turning Gender into NaN (the mapping has no
# entries for the already-encoded 0/1 values) and then failing on the drop.
if "ID" in df.columns:
    # Gender -> binary flag.
    gender_codes = df["Gender"].map({"Female": 1, "Male": 0})
    # Series.map yields NaN for any value missing from the dictionary;
    # fail loudly instead of passing NaNs on to the model.
    assert gender_codes.notna().all(), "unexpected value in Gender column"

    # Label -> integer class ids. `le` is kept so that predictions can be
    # translated back to class names with le.inverse_transform().
    le = LabelEncoder()
    df = (
        df
        .assign(Gender=gender_codes, Label=le.fit_transform(df["Label"]))
        .drop(columns="ID")
    )

In [102]:
# Preview the first rows of the current frame
df.head()

Unnamed: 0,Age,Gender,Height,Weight,BMI,Label
0,25,0,175,80,25.3,0
1,30,1,160,60,22.5,0
2,35,0,180,90,27.3,2
3,40,1,150,50,20.0,3
4,45,0,190,100,31.2,1


In [103]:
# Separate input and output

# Features: every column except the target.
X = df.drop(columns="Label")
# Target: the Label column.
y = df["Label"]

In [109]:
# Standardise the data

# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so the test rows contribute to the mean and standard deviation
# used for scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
# Column names and index are taken from X itself, so the scaled frame stays
# aligned with y and does not depend on the column order of df.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X

Unnamed: 0,Age,Gender,Height,Weight,BMI
0,-0.876032,-0.963624,0.303700,0.714053,0.629376
1,-0.672829,1.037749,-0.236952,0.017730,0.258448
2,-0.469625,-0.963624,0.483917,1.062214,0.894325
3,-0.266422,1.037749,-0.597387,-0.330431,-0.072738
4,-0.063219,-0.963624,0.844352,1.410375,1.410975
...,...,...,...,...,...
103,-1.445001,-0.963624,0.303700,-1.723076,-2.205577
104,-1.241798,1.037749,-0.236952,-1.723076,-2.205577
105,-1.038594,-0.963624,0.483917,-1.548995,-1.980370
106,-0.835391,1.037749,-0.597387,-1.548995,-1.980370


In [105]:
# Perform train test split

# stratify=y keeps the class proportions of the full data set in both the
# training and the test part, which matters here because the classes are
# imbalanced and the data set is small.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Modeling

In [106]:
# Create and fit the decision tree, then predict on the test set

# A fixed random_state makes the fitted tree reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [107]:
# Evaluating the model on the held-out test set

# Precision, recall and F1 are support-weighted averages over the classes.
test_scores = {
    "Accuracy": metrics.accuracy_score(y_test, y_pred),
    "Precision": metrics.precision_score(y_test, y_pred, average="weighted"),
    "Recall": metrics.recall_score(y_test, y_pred, average="weighted"),
    "F1 Score": metrics.f1_score(y_test, y_pred, average="weighted"),
}

for metric_name, score in test_scores.items():
    # Built-in round() accepts both plain Python floats and NumPy scalars;
    # the .round() method exists only on NumPy scalars, so calling it breaks
    # if a metric returns a plain float (may depend on the scikit-learn
    # version; verify).
    print(f"{metric_name} - ", round(score, 2))

Accuracy -  1.0
Precision -  1.0
Recall -  1.0
F1 Score -  1.0


In [108]:
# Evaluating model on training data

# Scores on the rows the tree was fitted on; compare them with the test-set
# scores to judge how well the model generalises.
y_pred_train = dt.predict(X_train)

# Precision, recall and F1 are support-weighted averages over the classes.
train_scores = {
    "Accuracy": metrics.accuracy_score(y_train, y_pred_train),
    "Precision": metrics.precision_score(y_train, y_pred_train, average="weighted"),
    "Recall": metrics.recall_score(y_train, y_pred_train, average="weighted"),
    "F1 Score": metrics.f1_score(y_train, y_pred_train, average="weighted"),
}

for metric_name, score in train_scores.items():
    # Built-in round() accepts both plain Python floats and NumPy scalars;
    # the .round() method exists only on NumPy scalars, so calling it breaks
    # if a metric returns a plain float (may depend on the scikit-learn
    # version; verify).
    print(f"{metric_name} - ", round(score, 2))

Accuracy -  1.0
Precision -  1.0
Recall -  1.0
F1 Score -  1.0
