# Import Libraries

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder , StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

# Reading Data by using pandas

In [2]:
df=pd.read_csv("/kaggle/input/train-obesity-risk/train.csv")

# Explore Data

### Reading first 5 rows

In [3]:
df.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


### Displaying the Dimensions of the DataFrame

In [4]:
df.shape

(20758, 18)

### Listing the Column Names of the DataFrame

In [5]:
df.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

### Displaying DataFrame Information and Summary

In [6]:
 df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

### Counting Missing Values in Each Column

In [7]:
df.isnull().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

### Counting Duplicate Rows in the DataFrame

In [8]:
df.duplicated().sum()

0

### Generating Statistical Summary of the DataFrame

In [9]:
df.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


# Transform Data

### Selecting and Sampling Categorical Data from the DataFrame

In [10]:
df_object=df.select_dtypes(include="object")
df_object.sample(5)

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
14807,Male,yes,yes,Sometimes,no,no,Sometimes,Automobile,Overweight_Level_I
20638,Female,no,yes,Sometimes,no,no,Sometimes,Public_Transportation,Overweight_Level_I
10299,Female,yes,no,Frequently,no,no,no,Public_Transportation,Insufficient_Weight
1834,Female,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Obesity_Type_III
3817,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Overweight_Level_II


### Selecting and Sampling Non-Categorical Data from the DataFrame

In [11]:
df_non_object=df.select_dtypes(exclude="object")
df_non_object.sample(5)

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
10519,10519,26.0,1.65757,111.868169,3.0,3.0,2.623489,0.0,0.17403
19518,19518,23.0,1.68,68.0,2.0,4.0,1.0,0.0,0.0
8047,8047,26.0,1.7,75.0,3.0,3.0,2.0,1.0,0.0
169,169,18.024853,1.68,90.0,2.0,3.0,1.49681,0.874643,0.0
3901,3901,22.884722,1.622999,82.0,1.00876,1.0,2.0,0.0,1.772463


### Initializing the Label Encoder for Categorical Data

In [12]:
la=LabelEncoder()

### Encoding Categorical Features into Numeric Format Using Label Encoding

In [13]:
for i in range(0,df_object.shape[1]):
    df_object.iloc[:,i]=la.fit_transform(df_object.iloc[:,i])

### Displaying the First Few Rows of the Categorical DataFrame

In [14]:
df_object.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
0,1,1,1,2,0,0,1,3,6
1,0,1,1,1,0,0,2,0,1
2,0,1,1,2,0,0,2,3,0
3,0,1,1,2,0,0,1,3,4
4,1,1,1,2,0,0,1,3,6


### Displaying Data Types of Categorical Columns in the DataFrame

In [15]:
for j in df_object.columns:
    print(j,":",df_object[j].dtype)

Gender : object
family_history_with_overweight : object
FAVC : object
CAEC : object
SMOKE : object
SCC : object
CALC : object
MTRANS : object
NObeyesdad : object


### Converting Categorical DataFrame to Integer Type

In [16]:
df_object=df_object.astype("int")

### Displaying Data Types of Categorical Columns in the DataFrame

In [17]:
for s in df_object.columns:
    print(s,":",df_object[j].dtype)

Gender : int64
family_history_with_overweight : int64
FAVC : int64
CAEC : int64
SMOKE : int64
SCC : int64
CALC : int64
MTRANS : int64
NObeyesdad : int64


### Combining Categorical and Non-Categorical DataFrames and Displaying the Result

In [18]:
df=pd.concat([df_object,df_non_object],axis=1)
df.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,1,3,6,0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473
1,0,1,1,1,0,0,2,0,1,1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0
2,0,1,1,2,0,0,2,3,0,2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584
3,0,1,1,2,0,0,1,3,4,3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199
4,1,1,1,2,0,0,1,3,6,4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721


### Loading Test Data from CSV File for Obesity Analysis

In [19]:
df_test=pd.read_csv("/kaggle/input/test-obesity/test.csv")

### Selecting and Sampling Categorical Data from the Test DataFrame

In [20]:
df_object=df_test.select_dtypes(include="object")
df_object.sample(5)

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS
12966,Male,yes,no,Sometimes,no,no,no,Public_Transportation
8091,Female,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation
11470,Male,yes,yes,Sometimes,no,no,no,Public_Transportation
10045,Female,yes,yes,Sometimes,no,no,Sometimes,Automobile
11790,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation


### Selecting and Sampling Non-Categorical Data from the Test DataFrame

In [21]:
df_non_object=df_test.select_dtypes(exclude="object")
df_object.sample(5)

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS
7430,Male,yes,yes,Frequently,no,no,Sometimes,Public_Transportation
12382,Female,no,yes,Sometimes,no,no,Sometimes,Automobile
13423,Female,yes,yes,Sometimes,no,no,Sometimes,Automobile
11199,Female,yes,yes,Sometimes,no,no,no,Public_Transportation
6918,Male,yes,yes,Sometimes,no,no,no,Public_Transportation


### Encoding Categorical Features in the Test DataFrame Using Label Encoding

In [22]:
for i in range(0,df_object.shape[1]):
    df_object.iloc[:,i]=la.fit_transform(df_object.iloc[:,i])

### Displaying Data Types of Categorical Columns in the Test DataFrame

In [23]:
for j in df_object.columns:
    print(j,":",df_object[j].dtype)

Gender : object
family_history_with_overweight : object
FAVC : object
CAEC : object
SMOKE : object
SCC : object
CALC : object
MTRANS : object


### Converting Categorical Data in Test DataFrame to Integer Type

In [24]:
df_object=df_object.astype("int")

### Displaying Data Types of Categorical Columns in the Transformed Test DataFrame

In [25]:
for j in df_object.columns:
    print(j,":",df_object[j].dtype)

Gender : int64
family_history_with_overweight : int64
FAVC : int64
CAEC : int64
SMOKE : int64
SCC : int64
CALC : int64
MTRANS : int64


### Combining Categorical and Non-Categorical Data in the Test DataFrame

In [26]:
df_test=pd.concat([df_object,df_non_object],axis=1)
df_test.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,2,3,20758,26.899886,1.848294,120.644178,2.938616,3.0,2.825629,0.8554,0.0
1,0,1,1,2,0,0,2,3,20759,21.0,1.6,66.0,2.0,1.0,3.0,1.0,0.0
2,0,1,1,2,0,0,2,3,20760,26.0,1.643355,111.600553,3.0,3.0,2.621877,0.0,0.250502
3,1,1,1,2,0,0,2,3,20761,20.979254,1.553127,103.669116,2.0,2.977909,2.786417,0.094851,0.0
4,0,1,1,2,0,0,2,3,20762,26.0,1.627396,104.835346,3.0,3.0,2.653531,0.0,0.741069


### Initializing the Standard Scaler for Feature Scaling

In [27]:
ss=StandardScaler()

### Standardizing 'Age' and 'Weight' Features in the DataFrame

In [28]:
df["Age"]=ss.fit_transform(df[["Age"]])
df["Weight"]=ss.fit_transform(df[["Weight"]])

### Displaying the First Few Rows of the Updated DataFrame

In [29]:
df.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,1,1,1,2,0,0,1,3,6,0,0.105699,1.699998,-0.235713,2.0,2.983297,2.763573,0.0,0.976473
1,0,1,1,1,0,0,2,0,1,1,-1.027052,1.56,-1.170931,2.0,3.0,2.0,1.0,1.0
2,0,1,1,2,0,0,2,3,0,2,-1.027052,1.71146,-1.430012,1.880534,1.411685,1.910378,0.866045,1.673584
3,0,1,1,2,0,0,1,3,4,3,-0.507929,1.71073,1.64477,3.0,3.0,1.674061,1.467863,0.780199
4,1,1,1,2,0,0,1,3,6,4,1.371197,1.914186,0.224054,2.679664,1.971472,1.979848,1.967973,0.931721


# Create model

### Dropping Unnecessary Columns from the DataFrame for Feature Selection

In [30]:
x=df.drop(["id","NObeyesdad"],axis=1)

### Extracting the Target Variable 'NObeyesdad' for Model Training

In [31]:
y=df["NObeyesdad"]

### Splitting Data into Training and Testing Sets

In [32]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

### Displaying the Dimensions of the Training Feature Set

In [33]:
x_train.shape

(16606, 16)

### Displaying the Dimensions of the Training Target Variable

In [34]:
y_train.shape

(16606,)

### Initializing Multiple Classification Models for Comparison

In [35]:
model1=LogisticRegression()
model2=RandomForestClassifier()
model3=GaussianNB()
model4=SVC()
model5=XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=300)
model6=GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=300)

### Training and Evaluating a Classification Model

In [36]:
def pred(model):
    model.fit(x_train,y_train)
    pre=model.predict(x_test)
    print(classification_report(pre,y_test))

### Training and Evaluating the Logistic Regression Model

In [37]:
pred(model1)

              precision    recall  f1-score   support

           0       0.85      0.79      0.82       569
           1       0.69      0.70      0.69       615
           2       0.80      0.77      0.78       565
           3       0.97      0.95      0.96       668
           4       1.00      0.99      1.00       807
           5       0.59      0.64      0.62       448
           6       0.60      0.65      0.62       480

    accuracy                           0.81      4152
   macro avg       0.79      0.78      0.78      4152
weighted avg       0.81      0.81      0.81      4152



### Training and Evaluating the Random Forest Classifier Model

In [38]:
pred(model2)

              precision    recall  f1-score   support

           0       0.91      0.94      0.93       506
           1       0.88      0.84      0.86       657
           2       0.87      0.89      0.88       530
           3       0.97      0.97      0.97       659
           4       1.00      1.00      1.00       804
           5       0.76      0.76      0.76       484
           6       0.79      0.79      0.79       512

    accuracy                           0.90      4152
   macro avg       0.88      0.89      0.88      4152
weighted avg       0.90      0.90      0.90      4152



### Training and Evaluating the Gaussian Naive Bayes Model

In [39]:
pred(model3)

              precision    recall  f1-score   support

           0       0.85      0.70      0.77       641
           1       0.47      0.65      0.54       450
           2       0.61      0.37      0.46       897
           3       0.93      0.71      0.81       858
           4       1.00      0.96      0.98       832
           5       0.30      0.61      0.40       239
           6       0.24      0.53      0.33       235

    accuracy                           0.66      4152
   macro avg       0.63      0.65      0.61      4152
weighted avg       0.74      0.66      0.68      4152



### Training and Evaluating the Support Vector Classifier Model

In [40]:
pred(model4)

              precision    recall  f1-score   support

           0       0.94      0.87      0.90       564
           1       0.77      0.81      0.79       593
           2       0.84      0.82      0.83       557
           3       0.97      0.97      0.97       659
           4       1.00      1.00      1.00       804
           5       0.66      0.69      0.67       463
           6       0.69      0.70      0.69       512

    accuracy                           0.85      4152
   macro avg       0.84      0.83      0.84      4152
weighted avg       0.85      0.85      0.85      4152



### Training and Evaluating the XGBoost Classifier Model

In [41]:
pred(model5)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       511
           1       0.90      0.88      0.89       639
           2       0.88      0.89      0.89       537
           3       0.97      0.97      0.97       658
           4       1.00      1.00      1.00       805
           5       0.79      0.79      0.79       478
           6       0.82      0.80      0.81       524

    accuracy                           0.91      4152
   macro avg       0.90      0.90      0.90      4152
weighted avg       0.91      0.91      0.91      4152



### Training and Evaluating the Gradient Boosting Classifier Model

In [42]:
pred(model6)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       516
           1       0.88      0.88      0.88       629
           2       0.87      0.88      0.88       538
           3       0.97      0.97      0.97       656
           4       1.00      1.00      1.00       803
           5       0.79      0.77      0.78       493
           6       0.81      0.80      0.80       517

    accuracy                           0.90      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152



### Defining Hyperparameter Grid and Scoring Metric for Model Tuning

In [43]:
prim_grid={'n_estimators':[100,200,300],
         'learning_rate':[0.1,0.01,0.001],
          'max_depth':[3,5,7]}
scorer="accuracy"

### Performing Hyperparameter Tuning for the XGBoost Classifier Using Grid Search

In [44]:
m5=GridSearchCV(model5,prim_grid,scoring=scorer, n_jobs=-1)
m5.fit(x_train,y_train)
print(m5.best_params_)
print(m5.best_score_)

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.9082865301192982


### Performing Hyperparameter Tuning for the Gradient Boosting Classifier Using Grid Search

In [45]:
m6=GridSearchCV(model6,prim_grid,scoring=scorer, n_jobs=-1)
m6.fit(x_train,y_train)
print(m6.best_params_)
print(m6.best_score_)

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.9056972205951906


### Removing 'id' Column from the Test Data for Feature Preparation

In [47]:
testx=df_test.drop('id',axis=1)

### Making Predictions on the Test Data Using the Optimized XGBoost Model

In [48]:
prex=model5.predict(testx)

### Creating Submission DataFrame for Test Predictions

In [50]:
submission=pd.DataFrame({"id":df_test['id'],"NObeyesdad":prex})

### Transforming Predicted Labels Back to Original Categorical Values

In [51]:
submission['NObeyesdad']=la.inverse_transform(prex)

### Saving Submission DataFrame to CSV File for Submission

In [52]:
submission.to_csv("submission.csv",index=False)

### Loading and Displaying the Submission Data from CSV File

In [54]:
df_submission=pd.read_csv("/kaggle/working/submission.csv")
df_submission.head()

Unnamed: 0,id,NObeyesdad
0,20758,Public_Transportation
1,20759,Motorbike
2,20760,Walking
3,20761,Public_Transportation
4,20762,Walking
