# Requirements

In [1]:
import pandas as pd
import missingno as msno
import numpy as np
from scipy.stats import ppcc_max
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [7]:
# Add as many imports as you need.

## Obesity Analysis


# Dataset Description

## Attributes

- id: represents a unique identifier for each participant or record in the dataset.
- Gender: Represents the gender of the individual.
- Age: Indicates the age of the individual.
- Height - Represents the height of the individual.
- Weight: Represents the weight of the individual.
- FCVC Frequency of Consumption of Vegetables
- NCP - Number of Main Meals per Day
- SMOKE - Indicates whether the individual smokes
- CH2O - Consumption of Water
- TUE - Time Using Electronic Devices
- MTRANS - Mode of Transportation
- NObeyesdad - Nutritional Obesity Status, obesity class

### Target variable
- FAF Frequency of Physical Activity

# Read the dataset

In [2]:
# Write your code here. Add as many boxes as you need.
data=pd.read_csv('obesity.csv')
data=data.drop('id',axis=1)
data=data.drop('Unnamed: 0',axis=1)

Explore the dataset using visualizations of your choice.

In [3]:
# Write your code here. Add as many boxes as you need.
data.sample(3)

Unnamed: 0,Gender,Age,Height,Weight,FCVC,NCP,SMOKE,CH2O,FAF,TUE,MTRANS,NObeyesdad
12435,Female,18.0,1.65,82.0,,1.0,no,1.0,,1.0,Public_Transportation,
15762,Male,,,84.75383,2.973569,,no,1.751723,,0.038253,Automobile,Overweight_Level_I
16421,Male,23.0,1.7,80.0,2.0,3.0,no,2.0,0.0,,Public_Transportation,Overweight_Level_II


## Detecting Missing Values
Calculate the percentage of missing values present in each column of the dataset.

In [4]:
# Write your code here. Add as many boxes as you need.
data.isnull().sum()/len(data)*100

Gender        19.867039
Age           19.992292
Height        19.708064
Weight        20.623374
FCVC          19.833317
NCP           19.563542
SMOKE         19.799595
CH2O          20.045284
FAF           19.939301
TUE           19.977840
MTRANS        20.030831
NObeyesdad    19.910396
dtype: float64

## Understanding the Causes Behind Missing Values
Using visualization tools such as heatmaps, and dendrograms, illustrate the interdependence between attributes with missing values. Also, visualize the distribution of the missing values within the dataset using matrices and bar charts. To achieve this use the `missingno` library.

In [None]:
# Write your code here. Add as many boxes as you need.

## Handling the Missing Values
Handle the missing values using suitable method based on the insights obtained from the various visualizations.

In [5]:
def label_data(data: pd.DataFrame, columns: list):
    encoder = LabelEncoder()
    data_copy = data.copy()

    for column in columns:
        data_copy[column] = encoder.fit_transform(data_copy[[column]].astype(str).values.ravel())

        if 'nan' in encoder.classes_:
            data_copy.loc[data_copy[column] == data_copy[column].max(), column] = np.nan
    return data_copy

In [6]:
data_imputed=label_data(data,['Gender','SMOKE','MTRANS','NObeyesdad'])

In [7]:
def knn_impute_data(data: pd.DataFrame, columns: list, n_neighbors: int):
    imputer = KNNImputer(n_neighbors=n_neighbors)
    data_copy = data.copy()

    for column in columns:
        data_copy[column] = imputer.fit_transform(data_copy[column].to_numpy().reshape(-1, 1))
    return data_copy


In [8]:
data_imputed=knn_impute_data(data_imputed,['Gender','Age','Height','Weight','FCVC','NCP','SMOKE','CH2O','TUE','MTRANS','NObeyesdad'],2)

In [9]:
data_imputed.sample(10)

Unnamed: 0,Gender,Age,Height,Weight,FCVC,NCP,SMOKE,CH2O,FAF,TUE,MTRANS,NObeyesdad
5560,1.0,17.188754,1.868931,58.943347,3.0,3.0,1.0,2.654702,1.166064,0.138418,0.0,0.0
16037,0.0,26.0,1.643332,111.868169,2.445041,3.0,0.0,2.707201,,0.125235,3.0,4.0
11272,1.0,23.850701,1.829142,129.19449,3.0,3.0,1.0,2.027584,1.266866,0.034897,3.0,2.999338
4821,1.0,34.993835,1.70015,83.314157,2.0,1.80993,1.0,2.793505,1.07672,1.099764,0.0,5.0
14638,0.498978,28.404332,1.774647,112.173731,1.924632,3.0,1.0,2.067741,,0.618256,0.0,3.0
2205,0.0,23.850701,1.605469,45.0,3.0,3.0,0.0,2.746197,1.417228,0.0,3.0,0.0
16894,0.498978,18.0,1.6,48.0,3.0,3.0,1.0,1.0,,0.0,3.0,1.0
13915,0.498978,24.178638,1.690437,97.58826,2.445041,2.759481,0.0,1.0,0.0,1.612432,3.0,2.0
9532,0.0,21.652229,1.716497,133.94608,3.0,3.0,0.0,2.833566,1.413239,0.716327,3.0,4.0
5536,0.0,21.491055,1.70015,43.087508,3.0,1.73762,1.0,2.55975,0.119643,0.0,3.0,2.999338


In [10]:
data_imputed.isnull().sum()/len(data_imputed)*100

Gender         0.000000
Age            0.000000
Height         0.000000
Weight         0.000000
FCVC           0.000000
NCP            0.000000
SMOKE          0.000000
CH2O           0.000000
FAF           19.939301
TUE            0.000000
MTRANS         0.000000
NObeyesdad     0.000000
dtype: float64

## Feature Preprocessing


Preprocess the features to make them ready for using the model

In [20]:
# Write your code here. Add as many boxes as you need.
data_imputed=data_imputed.dropna(subset='FAF',axis=0)
X=data_imputed.drop('FAF',axis=1)
Y=data_imputed['FAF'].dropna()
X.sample(3)


Unnamed: 0,Gender,Age,Height,Weight,FCVC,NCP,SMOKE,CH2O,TUE,MTRANS,NObeyesdad
1455,0.0,23.850701,1.7,87.90535,2.445041,1.095223,1.0,2.027584,1.619596,3.0,6.0
6585,1.0,21.0,1.85,105.0,3.0,1.0,0.0,3.0,1.0,3.0,2.999338
7412,0.0,18.94093,1.746416,87.90535,3.0,3.0,1.0,2.838893,0.937492,3.0,4.0


In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

## Hyper-parameter optimization

Train an XGBoost Model, optimize it's hyper-parameters with grid-search and 5-fold cross validation on the training set.

In [22]:
# Write your code here. Add as many boxes as you need.
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=10)
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2], 
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],  
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

grid_search.fit(X_train, Y_train)




In [24]:
import numpy as np
bestModel=grid_search.best_estimator_
bestModel.fit(X_train, Y_train)
y_pred=bestModel.predict(X_test)


In [28]:
mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)

range=data['FAF'].max()-data['FAF'].min()

print(f'Range: {range}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')

Range: 3.0
Mean Absolute Error (MAE): 0.5078539440500971
Mean Squared Error (MSE): 0.48213035188285747
Root Mean Squared Error (RMSE): 0.6943560699546433
R-squared (R²): 0.3394315173581277


# Cross-validation evaluation

Perform leave-one out cross validation on the entire dataset, with an untrained model with the chosen hyper-parameters from the previous step.

In [32]:
# Write your code here. Add as many boxes as you need.
from sklearn.model_selection import LeaveOneOut,cross_validate
loo = LeaveOneOut()
result=cross_validate(bestModel,X,Y,cv=loo,n_jobs=-1,scoring='neg_mean_absolute_error')



Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [35]:
# Write your code here. Add as many boxes as you need.
print(result['test_score'].mean()*-1)

0.5026388983833374
