In [156]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from ydata_profiling import ProfileReport
# Walk the Kaggle input directory and print the full path of every file found,
# in the order os.walk yields them.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


In [157]:
# Read the competition's train and test CSVs into DataFrames.
train_csv_path = "/kaggle/input/playground-series-s4e2/train.csv"
test_csv_path = "/kaggle/input/playground-series-s4e2/test.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [158]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


## EDA

In [159]:
# Print the column names, non-null counts and dtypes of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [160]:
# Build the ydata-profiling report for the training frame; it is displayed in
# the next cell.
profile_report = ProfileReport(df_train)

In [192]:
# Display the profiling report created in the previous cell as this cell's output.
profile_report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [162]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as ltgm
import catboost as cat
from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingCVClassifier

In [163]:
# Turn the string class labels into integer codes. The fitted encoder is kept
# so predictions can be mapped back to label names later on.
label_encoder = LabelEncoder()
df_train['NObeyesdad'] = label_encoder.fit_transform(df_train['NObeyesdad'])

# One-hot encode the categorical columns of both frames. The target is already
# numeric at this point, so it is left untouched.
df_train, df_test = (pd.get_dummies(frame) for frame in (df_train, df_test))

# Feature matrix (everything except the row id and the target) and target vector.
y = df_train['NObeyesdad']
X = df_train.drop(columns=['id', 'NObeyesdad'])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Instantiating Models

In [180]:
# Seed shared by every stochastic estimator so a fresh Restart & Run All
# reproduces the same scores; same value as the train/validation split seed.
RANDOM_STATE = 42

# Base learners, all with library-default hyperparameters apart from the seed.
xgb_model = xgb.XGBClassifier(random_state=RANDOM_STATE)
bag_model = BaggingClassifier(random_state=RANDOM_STATE)
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
ada_model = AdaBoostClassifier(random_state=RANDOM_STATE)
gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Stacking models: (name, estimator) pairs reused by the ensembles below.
stacking_models = [('xgb', xgb_model), ('bag', bag_model), ('rf', rf_model), ('ada', ada_model), ('gb', gb_model)]
# LogisticRegression (sklearn.linear_model) is the meta-learner; it must be
# imported for this cell to run on a fresh kernel.
stacking_model = StackingClassifier(estimators=stacking_models, final_estimator=LogisticRegression())

# Voting classifier: majority vote over the same base learners. A copy of the
# list is passed so the two ensembles do not share one mutable list object.
voting_model = VotingClassifier(estimators=list(stacking_models), voting='hard')

# StackingCV classifier: mlxtend's cross-validated stacking, fed with the
# base learners' class probabilities over 5 folds.
stackingcv_model = StackingCVClassifier(classifiers=[estimator for _, estimator in stacking_models],
                                        meta_classifier=LogisticRegression(),
                                        use_probas=True,
                                        cv=5)

### Fitting models individually

In [183]:
# Train every base learner on the training split, in the same order as before.
for estimator in (xgb_model, bag_model, rf_model, ada_model):
    estimator.fit(X_train, y_train)

# Kept as the cell's trailing expression so the notebook still echoes the
# fitted gradient-boosting estimator.
gb_model.fit(X_train, y_train)

In [186]:
# Predict the validation split with each fitted base learner.
(
    predictions_xgb,
    predictions_bag,
    predictions_rf,
    predictions_ada,
    predictions_gb,
) = (
    estimator.predict(X_val)
    for estimator in (xgb_model, bag_model, rf_model, ada_model, gb_model)
)

In [187]:
# Validation accuracy of each model's predictions against the held-out labels.
(
    accuracy_xgb,
    accuracy_bag,
    accuracy_rf,
    accuracy_ada,
    accuracy_gb,
) = (
    accuracy_score(y_val, predicted)
    for predicted in (
        predictions_xgb,
        predictions_bag,
        predictions_rf,
        predictions_ada,
        predictions_gb,
    )
)

In [190]:
# Column-oriented record of each model's display name and validation accuracy;
# the two lists are index-aligned.
model_names = ['XGB', 'Bagging', 'Random Forest', 'AdaBoost', 'Gradient Boosting']
model_scores = [accuracy_xgb, accuracy_bag, accuracy_rf, accuracy_ada, accuracy_gb]

data = {'Model': model_names, 'Accuracy Score': model_scores}

In [193]:
# Turn the name/score record into a DataFrame, one row per model.
accuracy_table = pd.DataFrame.from_dict(data)

# Print the comparison table as plain text.
print(accuracy_table)

               Model  Accuracy Score
0                XGB        0.901975
1            Bagging        0.883189
2      Random Forest        0.882466
3           AdaBoost        0.432803
4  Gradient Boosting        0.903902


In [194]:
# Align the one-hot encoded test frame with the feature columns the models were
# trained on. The train and test frames were dummy-encoded independently, so a
# category present in only one of them (e.g. 'CALC_Always' in the test data)
# produces mismatched columns. Reindexing against X.columns handles every such
# case at once: columns unknown to the models are dropped, columns missing from
# the test data are added filled with 0, and the training column order is
# enforced. 'id' is kept in front because the submission cells still need it.
# Rebinding instead of dropping in place keeps the cell safe to re-run.
df_test = df_test.reindex(columns=['id', *X.columns], fill_value=0)

In [195]:
# Remove the row identifier so only feature columns are passed to the model,
# then predict the encoded class of every test row with gradient boosting.
test = df_test.drop(columns='id')
test_pred = gb_model.predict(test)

In [196]:
# Assemble the submission: the test ids next to the predicted classes, decoded
# from integer codes back to the original label strings.
submission = pd.DataFrame(
    {
        'id': df_test['id'],
        'NObeyesdad': label_encoder.inverse_transform(test_pred),
    }
)

In [199]:
# Write the submission to the working directory without the DataFrame index.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)

In [202]:
# Display the submission frame as the cell output for a final visual check.
submission

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Overweight_Level_I
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight
