# **Imports**

In this pivotal section of the notebook, we meticulously curate the foundation for seamless execution by loading and importing essential libraries. This pivotal step ensures the smooth functioning of the notebook, laying the groundwork for subsequent analyses and tasks.

In [107]:
# Basic imports
import numpy as np
import pandas as pd
from tqdm import tqdm

# Data Processing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Visualization
import plotly.express as px

# Feature Selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# ML Models
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform

# Metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# **Data Loading**

This section of the notebook centers around the critical task of loading data. Here, our primary focus is on proficient data loading and preprocessing, laying the groundwork for subsequent analyses and modeling endeavors.

In [139]:
# Load Dataframe
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

# Quick look
train_df.head(5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [140]:
class_mapping = LabelEncoder()
train_df.NObeyesdad = class_mapping.fit_transform(train_df.NObeyesdad)

In [141]:
# Converting categorical labels into numeric values
for df in [train_df, test_df]:
    for col in df.columns:
        if df[col].dtype == "object":

            # Initialize Label Encoder
            label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])

In [142]:
# Quick Look
train_df.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,1,24.443011,1.699998,81.66995,1,1,2.0,2.983297,2,0,2.763573,0,0.0,0.976473,1,3,6
1,1,0,18.0,1.56,57.0,1,1,2.0,3.0,1,0,2.0,0,1.0,1.0,2,0,1
2,2,0,18.0,1.71146,50.165754,1,1,1.880534,1.411685,2,0,1.910378,0,0.866045,1.673584,2,3,0
3,3,0,20.952737,1.71073,131.274851,1,1,3.0,3.0,2,0,1.674061,0,1.467863,0.780199,1,3,4
4,4,1,31.641081,1.914186,93.798055,1,1,2.679664,1.971472,2,0,1.979848,0,1.967973,0.931721,1,3,6


In [143]:
# Dropping uneccessary columns
train_df.drop(columns=['id'], inplace=True)
submission_ids = test_df.pop('id')

In [144]:
# Splitting into features and labels
data = train_df.copy()
y = data.pop('NObeyesdad')
X = data

# Scaling data features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_submission = scaler.transform(test_df)

# Training Testing Spliting
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    train_size = 0.95,
    test_size = 0.05,
    shuffle = True,
    stratify = y,
    random_state = 42
)

In [145]:
features = list(train_df.columns[:-1])
n_classes = len(train_df.NObeyesdad.unique())

# **Feature Selection**

The most simple, basic form of feature selection is feature correlation.

In [115]:
# Compute Spearman correlation
corr = np.round(train_df.corr(method='spearman'), 2)

# heatmap visualization
heatmap = px.imshow(corr, text_auto=True, title="Spearman Correlation", height=1000)
heatmap.show()

To be clear, I must say that we will be not using the feature correlation as a feature selection step, although we will indeed look for a decorrelation scores of the final top 6 or 5 features that we select in order to select the best among them. This will be one of the measures that we will be looking at.

---

Previously, we didn't perform the feature selection and directly hypertube under the extreme gradient boosting model, which achieved a pretty decent performance, but its ranking isn't that good. And the model should be improved further, although by performing further hyperparameters. So it's, it turned out that the model was not as good as to be expected. Thus, we come to the feature selection step. Using **RFE**

In [116]:
# Create a base classifier
base_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Define the range of features to select
num_features_range = list(range(2, 17, 2))

# Lists to store results
f1_scores = []
rfe_models = []
accuracy_scores = []

# Iterate over the number of features
for num_features in tqdm(num_features_range, desc="Testing"):

    # Create RFE model
    rfe = RFE(base_classifier, n_features_to_select=num_features)

    # Fit RFE
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)

    # Train a model using the selected features
    rfe_classifier = base_classifier.fit(X_train_rfe, y_train)
    rfe_models.append(rfe)

    # Make predictions on the test set
    y_pred_rfe = rfe_classifier.predict(X_test_rfe)

    # Compute accuracy and F1 score
    accuracy_rfe = accuracy_score(y_test, y_pred_rfe)
    f1_rfe = f1_score(y_test, y_pred_rfe, average="weighted")

    accuracy_scores.append(accuracy_rfe)
    f1_scores.append(f1_rfe)

Testing: 100%|██████████| 8/8 [03:29<00:00, 26.14s/it]


In [117]:
# Plot the results
line_acc_plot = px.line(x=num_features_range, y=accuracy_scores, title="Accuracy Curve")
line_acc_plot.update_layout(
    xaxis_title="Features",
    yaxis_title="Accuracy Score"
)
line_acc_plot.show()

line_f1_plot = px.line(x=num_features_range, y=f1_scores, title="F1 Socre Curve")
line_f1_plot.update_layout(
    xaxis_title="Features",
    yaxis_title="F1 Score"
)
line_f1_plot.show()

Looking at both the curves of the F1 score and the accuracy score, we can clearly see that we only require six features instead of the total 16 features in order to achieve the most high accuracy. In this case, it watched as achieved as 90% which is still quite low. This is because these models are not hypertuned, but the. shows that just by using the. six features and hypertuning the final model we can get an awesome performance.

In [118]:
index = 2
rfe_model = rfe_models[2]
rfe_features = sorted(list(np.array(features)[rfe_model.support_]))
print(f"Selected Features: {rfe_features}")

Selected Features: ['Age', 'CH2O', 'FCVC', 'Gender', 'Height', 'Weight']


Using **Random Forest**

In [119]:
# Train a Random Forest model
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_train, y_train)

# Get feature importances
feature_importances = random_forest_classifier.feature_importances_

# Sort features based on importance
sorted_indices = feature_importances.argsort()[::-1]

# Select top k features (adjust k as needed)
k = 6
X_train_topk = X_train[:, sorted_indices[:k]]
X_test_topk = X_test[:, sorted_indices[:k]]

# Train a model using the selected features
topk_classifier = base_classifier.fit(X_train_topk, y_train)

# Make predictions on the test set
y_pred_topk = topk_classifier.predict(X_test_topk)

# Compute accuracy
accuracy_topk = accuracy_score(y_test, y_pred_topk)
print(f'Top-{k} Features Accuracy: {accuracy_topk}')

Top-6 Features Accuracy: 0.8988439306358381


In [120]:
top_rfc_features = sorted([features[index] for index in sorted_indices[:6]])
print(f"Top RFC Features: {top_rfc_features}")
print(f"Top RFE Features: {rfe_features}")

Top RFC Features: ['Age', 'FCVC', 'Gender', 'Height', 'TUE', 'Weight']
Top RFE Features: ['Age', 'CH2O', 'FCVC', 'Gender', 'Height', 'Weight']


Via **SelectKBest** using **ANOVA F-statistic**

In [121]:
# Recording results
accuracy_scores = []
skb_models = []

for k in tqdm(num_features_range, desc="Testing"):

    # Select top k features using SelectKBest and ANOVA F-statistic
    selector = SelectKBest(score_func=f_classif, k=k)
    skb_models.append(selector)

    X_train_skb = selector.fit_transform(X_train, y_train)
    X_test_skb = selector.transform(X_test)

    # Train a model using the selected features
    skb_classifier = base_classifier.fit(X_train_skb, y_train)

    # Make predictions on the test set
    y_pred_skb = skb_classifier.predict(X_test_skb)

    # Compute accuracy
    accuracy_skb = accuracy_score(y_test, y_pred_skb)
    accuracy_scores.append(accuracy_skb)


Testing: 100%|██████████| 8/8 [00:19<00:00,  2.42s/it]


In [122]:
# Visualize the results
line_acc_plot = px.line(x=num_features_range, y=accuracy_scores, title="Accuracy Curve")
line_acc_plot.update_layout(
    xaxis_title="Features",
    yaxis_title="Accuracy Score"
)
line_acc_plot.show()

In [123]:
index = num_features_range.index(10)
skb_model = skb_models[index]
top_k_features = sorted(list(np.array(features)[skb_model.get_support()]))

print(f"Top RFC Features: {top_rfc_features}")
print(f"Top RFE Features: {rfe_features}")
print(f"Top SKB Features: {top_k_features}")

Top RFC Features: ['Age', 'FCVC', 'Gender', 'Height', 'TUE', 'Weight']
Top RFE Features: ['Age', 'CH2O', 'FCVC', 'Gender', 'Height', 'Weight']
Top SKB Features: ['Age', 'CAEC', 'CALC', 'CH2O', 'FCVC', 'Gender', 'Height', 'MTRANS', 'Weight', 'family_history_with_overweight']


In [124]:
common_features = set(top_rfc_features) & set(rfe_features) & set(top_k_features)
print(f"Common Features: {common_features}")

Common Features: {'Gender', 'Age', 'Weight', 'Height', 'FCVC'}


In [125]:
X_train_imps = X_train[:, [features.index(f) for f in common_features]]
X_test_imps = X_test[:, [features.index(f) for f in common_features]]

# **XGB Model**

In our pursuit of optimal model performance, we begin by assessing whether utilizing the complete dataset or a subset containing only crucial features yields superior results. To make this determination, we initiate an Extreme Gradient Boosting (XGBoost) model and train it on both datasets. The ensuing comparison of accuracy scores serves as a pivotal gauge, guiding our decision on the most effective dataset for subsequent analyses.

In [126]:
# Define XGB Model
xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=n_classes)

# Model training
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set with the best model
y_pred_best = xgb_classifier.predict(X_test)

# Compute accuracy for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)

# Print the results
print(f'XGBoost Model FUll - Accuracy: {accuracy_best}')

XGBoost Model FUll - Accuracy: 0.9046242774566474


In [127]:
# Define XGB Model
xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=n_classes)

# Model training
xgb_classifier.fit(X_train_imps, y_train)

# Make predictions on the test set with the best model
y_pred_best = xgb_classifier.predict(X_test_imps)

# Compute accuracy for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)

# Print the results
print(f'XGBoost Model PARTIAL - Accuracy: {accuracy_best}')

XGBoost Model PARTIAL - Accuracy: 0.9026974951830443


The marginal disparity in accuracy between the model tested on the complete dataset and the subset of crucial features aligns with our expectations. As observed during feature exploration, incremental feature additions did lead to a decrease in accuracy, albeit not significantly.

Building on this insight, we now shift our focus to fine-tuning the Extreme Gradient Boosting (XGBoost) model specifically on the subset of essential features, employing the XGB model. This tailored approach aims to extract optimal performance from the most influential features identified earlier.

# **Hyperparameter Search**

In [128]:
# Define the XGBoost classifier
xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=n_classes)

# Define the hyperparameter distributions for Randomized Search
param_dist = {
    'max_depth': randint(3, 11),
    'learning_rate': loguniform(0.01, 0.3),
    'n_estimators': randint(50, 201),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': randint(0, 5)
}

# Perform Randomized Search with 5-fold cross-validation
random_search = RandomizedSearchCV(
    xgb_classifier,
    param_distributions=param_dist,
    n_iter=10, scoring='accuracy',
    cv=5, verbose=1, n_jobs=-1
)
random_search.fit(X_train_imps, y_train)

# Get the best model and its hyperparameters
best_xgb_model = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions on the test set with the best model
y_pred_best = best_xgb_model.predict(X_test_imps)

# Compute accuracy for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)

# Print the results
print(f'Best XGBoost Model - Hyperparameters: {best_params}')
print(f'Best XGBoost Model - Accuracy: {accuracy_best}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best XGBoost Model - Hyperparameters: {'colsample_bytree': 0.6139699470534671, 'gamma': 2, 'learning_rate': 0.03184732800021393, 'max_depth': 9, 'n_estimators': 169, 'subsample': 0.8407702006250868}
Best XGBoost Model - Accuracy: 0.9017341040462428


In [129]:
# Create a scatter plot of hyperparameter combinations and their corresponding accuracy scores
fig = px.scatter_3d(random_search.cv_results_, x='param_learning_rate', y='param_max_depth', z='param_n_estimators', color='mean_test_score',
                 labels={'param_learning_rate': 'Learning Rate', 'param_max_depth': 'Max Depth', 'param_n_estimators':'N Estimators'},
                 title='Hyperparameter Search Results', color_continuous_scale='Viridis')

# Show the plot
fig.show()

# **Hyperparameter Search (Full Data)**

In [156]:
# Define the XGBoost classifier
xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=n_classes)

# Define the hyperparameter distributions for Randomized Search
param_dist = {
    'max_depth': randint(3, 11),
    'learning_rate': loguniform(0.01, 0.3),
    'n_estimators': randint(50, 201),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': randint(0, 5)
}

# Perform Randomized Search with 5-fold cross-validation
random_search = RandomizedSearchCV(
    xgb_classifier,
    param_distributions=param_dist,
    n_iter=10, scoring='accuracy',
    cv=5, verbose=1, n_jobs=-1
)
random_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_xgb_model_full = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions on the test set with the best model
y_pred_best = best_xgb_model_full.predict(X_test)

# Compute accuracy for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)

# Print the results
print(f'Best XGBoost Model - Hyperparameters: {best_params}')
print(f'Best XGBoost Model - Accuracy: {accuracy_best}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best XGBoost Model - Hyperparameters: {'colsample_bytree': 0.6862116573666228, 'gamma': 1, 'learning_rate': 0.1126738233471937, 'max_depth': 5, 'n_estimators': 188, 'subsample': 0.8631343523103607}
Best XGBoost Model - Accuracy: 0.9132947976878613


Despite feature selection, the ultimate recommendation leans towards utilizing the full dataset. This decision stems from the discernible advantage of leveraging the complete set of features, resulting in a subtle yet consequential improvement in model accuracy. Even a modest enhancement holds significance, affirming the value of incorporating the entire dataset for optimal model performance.

---

**DeepNets**