# Question-2

Imagine you have a dataset with features such as Age, Gender, Height, Weight,
BMI, and Blood Pressure, and you have to classify people into classes such as
Normal, Overweight, Obesity, Underweight, and Extreme Obesity using any 4
different classification algorithms. Build a model that can classify people
into these classes.

Data_link - https://www.kaggle.com/datasets/ankurbajaj9/obesity-levels

In [1]:
#  Importing necessary libraries
import pandas as pd             # for data analysis
import numpy as np                # for numerical calulation
import matplotlib.pyplot as plt    # for visualisation
import seaborn as sns             # for visualisation
import warnings                 
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")   # Load the obesity dataset from a CSV file located next to the notebook

In [3]:
data.head()  # Preview the first five rows

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
data.isnull().sum()      # Count missing values per column

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [5]:
data.duplicated().sum()    # Count rows that are exact duplicates of an earlier row

24

In [6]:
# Remove exact duplicate rows (the first occurrence is kept, original index
# labels are preserved). Rebinding the name instead of using inplace=True
# follows the pandas recommendation and keeps the statement chainable.
data = data.drop_duplicates()

In [7]:
data.info()          # Show the index range, column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2087 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2087 non-null   object 
 1   Age                             2087 non-null   float64
 2   Height                          2087 non-null   float64
 3   Weight                          2087 non-null   float64
 4   family_history_with_overweight  2087 non-null   object 
 5   FAVC                            2087 non-null   object 
 6   FCVC                            2087 non-null   float64
 7   NCP                             2087 non-null   float64
 8   CAEC                            2087 non-null   object 
 9   SMOKE                           2087 non-null   object 
 10  CH2O                            2087 non-null   float64
 11  SCC                             2087 non-null   object 
 12  FAF                             20

In [8]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_features = data.select_dtypes(exclude="object").columns
categorical_features = data.select_dtypes(include="object").columns

# Show both column indexes, categorical first.
print(categorical_features, numerical_features, sep="\n")

Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')
Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], dtype='object')


In [9]:
data['NObeyesdad'].value_counts()   # Number of rows in each class of the target column 'NObeyesdad'

Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_II    290
Normal_Weight          282
Overweight_Level_I     276
Insufficient_Weight    267
Name: NObeyesdad, dtype: int64

In [10]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-typed column (the categorical features and the
# 'NObeyesdad' target). A fresh encoder is fitted for each column and stored in
# `label_encoders`, so the integer codes of any column can be mapped back to
# the original strings via `label_encoders[column].inverse_transform(...)` or
# inspected through `label_encoders[column].classes_`.
# Reusing one encoder for all columns would overwrite the learned mapping on
# every iteration and keep only the last column's classes.
#
# NOTE(review): once every object column is encoded here, later dtype-based
# selection of categorical columns on the feature matrix finds none, so the
# OneHotEncoder branch of the preprocessing pipeline receives no columns.
# Consider encoding only the target in this cell - confirm the intent.
label_encoders = {}
for column in categorical_features:
    # `lable` ends the loop bound to the encoder of the last column processed.
    lable = LabelEncoder()
    data[column] = lable.fit_transform(data[column])
    label_encoders[column] = lable

In [11]:
# Separate the target from the predictors.
Y = data['NObeyesdad']                 # target column
X = data.drop(columns='NObeyesdad')    # every remaining column is a feature

In [12]:
# Recompute the dtype-based column split on the feature matrix only (the
# target has been removed), so these indexes can drive the preprocessing.
numerical_features = X.select_dtypes(exclude="object").columns
categorical_features = X.select_dtypes(include="object").columns

# Show both column indexes, categorical first.
print(categorical_features, numerical_features, sep="\n")

Index([], dtype='object')
Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS'],
      dtype='object')


In [13]:
from sklearn.impute import SimpleImputer                   # impute/fill missing values
from sklearn.preprocessing import StandardScaler,OneHotEncoder     
from sklearn.pipeline import Pipeline       # creating pipeline
from sklearn.compose import ColumnTransformer    # combinning Numerical Pipline and  categorical Pipline

In [14]:
# Numerical branch: fill missing values with the column median, then
# standardise each column.
num_pipline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# one-hot encode (categories unseen during fit are ignored at transform time),
# then scale without centring (with_mean=False).
cato_pipline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ('Encoder', OneHotEncoder(handle_unknown="ignore")),
    ("scaler", StandardScaler(with_mean=False)),
])

# Route each group of columns through its own branch and concatenate the
# results into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipline", num_pipline, numerical_features),
        ("cato_pipline", cato_pipline, categorical_features),
    ]
)

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

In [16]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformation to the test split, so no statistics from the test rows
# are used when fitting.
# NOTE(review): both names are rebound to the transformed output, so re-running
# this cell on its own would feed already-transformed data back into the
# preprocessor; re-run the train/test split cell first.
X_train = preprocessor.fit_transform(X_train)     # fit on the training rows and transform them
X_test = preprocessor.transform(X_test)

In [17]:
#    importing necessary ml algorithms
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Candidate classifiers, keyed by the display name used when reporting scores.
# random_state is fixed on the two tree-based models (same seed as the
# train/test split) because both draw random numbers while fitting; without it
# their scores change from run to run and the comparison is not reproducible.
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(
        max_depth=10,
        class_weight="balanced",
        min_samples_split=5,
        random_state=42,
    ),
    'SVM': SVC(C=10),
    'RandomForest': RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight="balanced",
        min_samples_split=5,
        random_state=42,
    ),
}

In [19]:
def evaluate_models(X_train,X_test,y_test,y_train,models):
    """Fit each model, print its classification report and return test accuracies.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``model.fit`` and ``model.predict``.
    y_test, y_train : array-like
        True labels of the test and training rows. Note the positional order:
        ``y_test`` comes *before* ``y_train``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Every estimator is fitted in place.

    Returns
    -------
    dict
        Display name -> accuracy on the test split, in the iteration order of
        ``models``. Empty when ``models`` is empty.

    Side effects
    ------------
    Prints one classification report per model and, when at least one model
    was evaluated, a final line naming the model with the highest accuracy
    (the first one in ``models`` order when several tie).
    """
    report = {}

    for name, model in models.items():
        model.fit(X_train, y_train)

        y_test_pred = model.predict(X_test)
        acc_score = accuracy_score(y_test, y_test_pred)

        print(f'{name} :')
        print(classification_report(y_test, y_test_pred))

        report[name] = acc_score

    # Nothing was evaluated, so there is no best model to announce.
    if not report:
        return report

    # Pick the winner once, after all scores are known; max() returns the
    # first key holding the maximum, so ties resolve to the earliest model.
    best_model = max(report, key=report.get)
    best_score = report[best_model]
    print(f'The Best score of models:{best_model} : {best_score}')
    return report
    
        

In [20]:
evaluate_models(X_train,X_test,y_test,y_train,models)  # Fit and score every candidate model; arguments follow the function's (X_train, X_test, y_test, y_train, models) order

LogisticRegression :
              precision    recall  f1-score   support

           0       0.83      0.93      0.88        59
           1       0.84      0.62      0.72        61
           2       0.93      0.91      0.92        70
           3       0.97      1.00      0.98        64
           4       1.00      1.00      1.00        60
           5       0.75      0.75      0.75        55
           6       0.70      0.82      0.75        49

    accuracy                           0.87       418
   macro avg       0.86      0.86      0.86       418
weighted avg       0.87      0.87      0.86       418

DecisionTree :
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        59
           1       0.79      0.67      0.73        61
           2       0.96      0.99      0.97        70
           3       0.98      0.98      0.98        64
           4       1.00      1.00      1.00        60
           5       0.77      0.80      0.7

{'LogisticRegression': 0.8660287081339713,
 'DecisionTree': 0.9114832535885168,
 'SVM': 0.9114832535885168,
 'RandomForest': 0.9569377990430622}