In [None]:
# 📘 Obesity Prediction - Kaggle Playground S4E2 (REVISED)

# === Imports ===
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from xgboost import XGBClassifier, plot_importance

# === 1. Load data ===
# Read the two competition splits from the working directory (train first, then test).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Report the dimensions of each split, then preview the first training rows.
for caption, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(caption, frame.shape)
display(train_df.head())



In [None]:
# === 2. Data cleaning ===
# Drop incomplete rows from the TRAINING data only; .copy() avoids
# SettingWithCopyWarning on the later in-place edits.
train_df = train_df.dropna().copy()

# Keep every test row: the submission is built from test_df's ids, so a row
# dropped here would leave that id without a prediction.
# NOTE(review): any missing values left in test_df flow into the later steps
# (categorical cells are cast to str during encoding) — confirm this is acceptable.
test_df = test_df.copy()

# Explicit yes/no -> True/False conversion, applied to every column of both frames.
for df in [train_df, test_df]:
    df.replace({'yes': True, 'no': False}, inplace=True)


In [None]:
# === 3. Exploratory Data Analysis before encoding ===
# Every chart puts the obesity category on the x axis, ordered from the most
# to the least frequent category in the training data.
category_order = train_df['NObeyesdad'].value_counts().index

# One entry per figure: (seaborn plotting function, chart-specific keywords, title).
eda_charts = [
    (sns.countplot, {'palette': 'Set2'},
     'Distribution of Obesity Risk Categories'),
    (sns.boxplot, {'y': 'Age', 'palette': 'Pastel1'},
     'Age Distribution by Obesity Category'),
    (sns.countplot, {'hue': 'Gender', 'palette': 'Set3'},
     'Obesity Category Distribution by Gender'),
    (sns.countplot, {'hue': 'family_history_with_overweight', 'palette': 'Set1'},
     'Obesity Category by Family History'),
    (sns.violinplot, {'y': 'FCVC', 'palette': 'Set2'},
     'Vegetable Consumption (FCVC) by Obesity Category'),
]

for draw_chart, chart_kwargs, chart_title in eda_charts:
    plt.figure(figsize=(10, 6))
    draw_chart(data=train_df, x='NObeyesdad', order=category_order, **chart_kwargs)
    plt.title(chart_title)
    plt.xticks(rotation=45)  # category names are long; tilt them so they do not overlap
    plt.tight_layout()
    plt.show()



In [None]:
# === 4. Encode target labels ===
# Integer code assigned to each obesity category. Note that the entries are
# not listed in code order ('Normal_Weight' comes first but has code 1).
weight_map = {
    'Normal_Weight': 1,
    'Insufficient_Weight': 0,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
target_codes = train_df['NObeyesdad'].map(weight_map)

# Labels missing from weight_map come back as NaN; fail with a message that
# names them instead of letting astype(int) raise an opaque cast error.
unmapped_labels = train_df.loc[target_codes.isna(), 'NObeyesdad'].unique()
if len(unmapped_labels):
    raise ValueError(
        f"NObeyesdad contains labels that are not in weight_map: {list(unmapped_labels)}"
    )
train_df['NObeyesdad'] = target_codes.astype(int)

# === 5. Categorical column encoding (improved version) ===
# Columns still stored as Python objects in the training frame are label-encoded.
categorical_cols = train_df.select_dtypes(include='object').columns

label_encoders = {}  # column name -> fitted LabelEncoder
for col in categorical_cols:
    # Only columns shared with the test set are encoded.
    if col not in test_df.columns:
        continue

    # Fit on the training values (cast to str) and encode them in place.
    encoder = LabelEncoder()
    train_df[col] = encoder.fit_transform(train_df[col].astype(str))

    # Test values the encoder never saw are bucketed under an extra
    # 'unknown' class appended after the fitted classes.
    test_values = test_df[col].astype(str)
    unseen = ~test_values.isin(encoder.classes_)
    if unseen.any():
        test_values = test_values.mask(unseen, 'unknown')
        encoder.classes_ = np.append(encoder.classes_, 'unknown')

    test_df[col] = encoder.transform(test_values)
    label_encoders[col] = encoder

# === 6. Train/Validation Split ===
# Features are every column except the row id and the encoded target.
X = train_df.drop(columns=['id', 'NObeyesdad'])
y = train_df['NObeyesdad']

# Hold out 20% for validation. Stratifying on y keeps the class proportions
# the same in both parts, so the validation metrics are not skewed by an
# unlucky draw of the less frequent classes.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === 7. Feature Scaling ===
# Learn the scaling statistics from the training split only, then apply the
# same transformation to every split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Test features: everything except the row id (ignored if the column is absent).
X_test = test_df.drop(columns=['id'], errors='ignore')
X_test_scaled = scaler.transform(X_test)

# === 8. Modeling with XGBoost (improved version) ===
# Hyper-parameter grid explored by the search below.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# option and the target is already integer-encoded.
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric='mlogloss',
    early_stopping_rounds=10  # stop adding trees once the eval metric stalls
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1  # run the candidate fits in parallel
)

# Early stopping needs a monitoring set. Carve it out of the training data
# rather than using (X_val_scaled, y_val): the validation split is scored in
# the evaluation step, and tuning the number of trees against it would make
# those scores optimistic.
X_fit, X_early_stop, y_fit, y_early_stop = train_test_split(
    X_train_scaled, y_train, test_size=0.1, random_state=42, stratify=y_train
)

grid_search.fit(
    X_fit, y_fit,
    eval_set=[(X_early_stop, y_early_stop)],
    verbose=False
)

best_model = grid_search.best_estimator_
print("Best parameters found:", grid_search.best_params_)

# === 9. Validation Evaluation (improved) ===
y_pred = best_model.predict(X_val_scaled)
print("\nClassification Report:\n")
print(classification_report(y_val, y_pred))

# Confusion matrix.
# Rows/columns of the matrix follow the integer class codes, so the tick
# labels must be ordered by code too. weight_map's insertion order is NOT the
# code order ('Normal_Weight' is listed first but has code 1), so using
# weight_map.keys() directly would swap the first two labels.
class_names = sorted(weight_map, key=weight_map.get)
class_codes = [weight_map[name] for name in class_names]

# Passing labels= keeps the matrix aligned with class_names even if a class
# happens to be absent from the validation split.
cm = confusion_matrix(y_val, y_pred, labels=class_codes)
cm_fig, cm_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Feature importance.
# plot_importance opens its own figure unless it is handed an Axes, which
# would leave an empty 10x8 figure behind; draw on an explicit Axes instead.
# NOTE(review): the model was fitted on the scaler's output rather than a
# DataFrame, so the bars are probably labelled by position (f0, f1, ...) —
# map them back through X.columns if readable names are needed.
imp_fig, imp_ax = plt.subplots(figsize=(10, 8))
plot_importance(best_model, ax=imp_ax, max_num_features=20)
imp_ax.set_title('Feature Importance')
plt.tight_layout()
plt.show()

# === 10. Predictions on Test Set (more robust version) ===
# Reverse lookup: integer class code -> category name.
inverse_map = dict(zip(weight_map.values(), weight_map.keys()))

# Predict, then explicitly clamp to the valid code range 0..6 and cast to int.
predictions = np.clip(best_model.predict(X_test_scaled), 0, 6).astype(int)
pred_labels = [inverse_map.get(code, 'Unknown') for code in predictions]

# Submission frame: test ids plus the predicted label, in a column named
# like the training target for consistency with the competition.
output = test_df[['id']].assign(NObeyesdad=pred_labels)
output.to_csv("submission_revised.csv", index=False)

print("\nSample predictions:")
print(output.head(10))