# Model 

In [1]:
import pandas as pd # type: ignore
import numpy as np   # type: ignore
from sklearn.pipeline import Pipeline    # type: ignore
from sklearn.preprocessing import StandardScaler, OrdinalEncoder  # type: ignore
from sklearn.compose import ColumnTransformer  # type: ignore
from lightgbm import LGBMClassifier # type: ignore
import joblib

# Name of the label column; every other column is used as a feature.
TARGET = 'NObeyesdad'

# Load data
# NOTE(review): these are absolute, machine-specific paths; adjust them (or
# move them into a config cell) before running the notebook elsewhere.
train_sample = pd.read_csv(r"D:\Projects ITI\DV\Obesity\data\train.csv")
train_orgin_extra = pd.read_csv(r"D:\Projects ITI\DV\Obesity\data\ObesityDataSet.csv")

# Stack both files into one frame. 'id' is removed before de-duplicating so it
# does not take part in the duplicate comparison. The chain builds a new frame
# instead of mutating one in place.
train = (
    pd.concat([train_sample, train_orgin_extra], ignore_index=True)
    .drop(columns='id')
    .drop_duplicates()
)

# Define categorical columns: every object-dtype column except the target.
# errors='ignore' keeps this a no-op when the target is not object-typed.
categorical_columns = (
    train.select_dtypes(include='object')
    .columns
    .drop(TARGET, errors='ignore')
)

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        (
            'ordinal',
            # Category levels that were not present during fit are encoded as
            # -1 instead of raising, so the saved pipeline can still score rows
            # containing a previously unseen level. Known levels are encoded
            # exactly as with the default settings.
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            categorical_columns,
        ),
        ('scaling', StandardScaler(), ['Age', 'Weight']),
    ],
    # All remaining columns are forwarded to the classifier unchanged.
    remainder='passthrough',
)

# Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        objective='multiclass',
        metric='multi_logloss',
        # NOTE(review): presumably the number of distinct target labels;
        # confirm against train[TARGET].nunique().
        num_class=7,
        verbosity=-1,
        boosting_type='gbdt',
        random_state=42,
        learning_rate=0.03276219058578542,
        n_estimators=500,
        lambda_l1=0.009879324515507773,
        lambda_l2=0.04509276523818003,
        max_depth=10,
        colsample_bytree=0.451686663982718,
        # NOTE(review): LightGBM documents that row subsampling is only active
        # when subsample_freq (bagging_freq) is non-zero, and it is not set
        # here, so this value likely has no effect. Left unchanged to keep the
        # trained model identical; confirm before relying on it.
        subsample=0.9636469087931024,
        min_child_samples=28,
        n_jobs=-1
    ))
])

# Prepare data
X = train.drop(columns=TARGET)
y = train[TARGET]

# Train model (kept as the last expression so the fitted pipeline is displayed)
pipeline.fit(X, y)



In [3]:
# Persist the fitted pipeline and immediately read it back, so the
# predictions below come from the serialized artifact rather than from the
# in-memory object.
MODEL_FILE = 'model_pipeline.joblib'
joblib.dump(pipeline, MODEL_FILE)
pipeline_loaded = joblib.load(MODEL_FILE)

# Read the test file and predict a label for each of its rows with the
# reloaded pipeline.
test_sample = pd.read_csv(r"D:\Projects ITI\DV\Obesity\data\test.csv")
y_pred = pipeline_loaded.predict(test_sample)

# Bare expression: displays the predicted labels as the cell output.
y_pred

array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype=object)

In [4]:
# Print each categorical feature together with the distinct raw values it
# takes in the training frame.
for column in categorical_columns:
    distinct_values = train[column].unique()
    print(f"{column}  ", distinct_values)

Gender   ['Male' 'Female']
family_history_with_overweight   ['yes' 'no']
FAVC   ['yes' 'no']
CAEC   ['Sometimes' 'Frequently' 'no' 'Always']
SMOKE   ['no' 'yes']
SCC   ['no' 'yes']
CALC   ['Sometimes' 'no' 'Frequently' 'Always']
MTRANS   ['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']
