In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
ob_data = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

In [3]:
ob_data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
ob_data.shape

(2111, 17)

In [5]:
ob_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [6]:
ob_data.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [7]:
# Preprocess the dataset
encoder = LabelEncoder()
ob_data['Gender'] = encoder.fit_transform(ob_data['Gender'])
ob_data['family_history_with_overweight'] = encoder.fit_transform(ob_data['family_history_with_overweight'])
ob_data['FAVC'] = encoder.fit_transform(ob_data['FAVC'])
ob_data['CAEC'] = encoder.fit_transform(ob_data['CAEC'])
ob_data['SMOKE'] = encoder.fit_transform(ob_data['SMOKE'])
ob_data['SCC'] = encoder.fit_transform(ob_data['SCC'])
ob_data['CALC'] = encoder.fit_transform(ob_data['CALC'])
ob_data['MTRANS'] = encoder.fit_transform(ob_data['MTRANS'])
ob_data['NObeyesdad'] = encoder.fit_transform(ob_data['NObeyesdad'])

In [8]:
#spliting data into dependent and independent features
x = ob_data.drop('NObeyesdad',axis=1)
y = ob_data['NObeyesdad']

In [9]:
#splitiing into test and train
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [10]:
#Linear Regression Classifier model
lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)
lr_prediction = lr_model.predict(x_test)


In [11]:
#Descision tree classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train,y_train)
dt_prediction = dt_model.predict(x_test)

In [12]:
#Random forest classifier
rf_model = RandomForestClassifier()
rf_model.fit(x_train,y_train)
rf_prediction = rf_model.predict(x_test)

In [13]:
#K-neighbour
kn_model = KNeighborsClassifier()
kn_model.fit(x_train,y_train)
kn_prediction =kn_model.predict(x_test)

In [14]:
#SVM classifier
svm_model = SVC()
svm_model.fit(x_train,y_train)
svm_prediction = svm_model.predict(x_test)

In [15]:
#Classification report for each classification model
print("Linear Regressor classifier model")
print(classification_report(y_test,lr_prediction))

print("Descision Tree classifier")
print(classification_report(y_test,dt_prediction))

print("Random Forest Classifier")
print(classification_report(y_test,rf_prediction))

print("K-Neighbout classifier")
print(classification_report(y_test,kn_prediction))

print("SVM Classifier")
print(classification_report(y_test,svm_prediction))

Linear Regressor classifier model
              precision    recall  f1-score   support

           0       0.74      0.93      0.83        56
           1       0.53      0.42      0.47        62
           2       0.58      0.60      0.59        78
           3       0.82      0.84      0.83        58
           4       0.90      1.00      0.95        63
           5       0.54      0.38      0.44        56
           6       0.35      0.38      0.37        50

    accuracy                           0.65       423
   macro avg       0.64      0.65      0.64       423
weighted avg       0.64      0.65      0.64       423

Descision Tree classifier
              precision    recall  f1-score   support

           0       0.93      0.98      0.96        56
           1       0.89      0.90      0.90        62
           2       0.95      0.94      0.94        78
           3       0.95      0.95      0.95        58
           4       1.00      1.00      1.00        63
           5      