In [4]:

# Obesity Prediction - Preprocessing Pipeline
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# === 1. Caricamento dei dati ===
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# === 2. Pulizia valori nulli ===
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# === 3. Mappatura della colonna target 'NObeyesdad' ===
weight_map = {
    'Normal_Weight': 0,
    'Insufficient_Weight': -1,
    'Overweight_Level_I': 1,
    'Overweight_Level_II': 2,
    'Obesity_Type_I': 3,
    'Obesity_Type_II': 4,
    'Obesity_Type_III': 5
}
train_df['NObeyesdad'] = train_df['NObeyesdad'].map(weight_map)

# === 4. Encoding yes/no → True/False ===
for df in [train_df, test_df]:
    df.replace({'yes': True, 'no': False}, inplace=True)

# === 5. Label Encoding delle colonne categoriche (esclusa la colonna target) ===
categorical_cols = train_df.select_dtypes(include='object').columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    
    # Fit su dati combinati per evitare problemi con categorie nuove
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    
    label_encoders[col] = le

# === 6. Separazione input e target ===
X = train_df.drop(columns=['id', 'NObeyesdad'])  # Rimuove id e target
y = train_df['NObeyesdad']

# === 7. Train/validation split ===
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



  df.replace({'yes': True, 'no': False}, inplace=True)


In [5]:
# === 7. Feature selection (opzionale, basata su VIF e p-value) ===
def calculate_vif(X):
    # Assicura solo colonne numeriche e converte bool in int
    X_numeric = X.select_dtypes(include=[np.number, 'bool']).copy()
    X_numeric = X_numeric.astype(float)
    
    # Rimuove eventuali infiniti
    X_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_numeric.dropna(inplace=True)
    
    return pd.DataFrame({
        'feature': X_numeric.columns,
        'VIF': [variance_inflation_factor(X_numeric.values, i) for i in range(X_numeric.shape[1])]
    })

def calculate_pvalues(X, y):
    X_const = sm.add_constant(X)
    model = sm.OLS(y, X_const).fit()
    return model.pvalues[1:]  # exclude intercept

def feature_selection(X, y, vif_threshold=10, pval_threshold=0.05):
    while True:
        vif = calculate_vif(X)
        pvals = calculate_pvalues(X, y)

        drop_cols = [
            col for col in X.columns
            if vif[vif.feature == col]['VIF'].values[0] > vif_threshold or pvals[col] > pval_threshold
        ]

        if not drop_cols:
            break

        print(f"Rimosse feature: {drop_cols}")
        X = X.drop(columns=drop_cols)

    return X

In [6]:
print(X.dtypes)
print(X.isnull().sum())
print(np.isfinite(X.select_dtypes(include=[np.number])).all())

X_train_selected = feature_selection(X_train.copy(), y_train)

# === 8. Modello ===
model = RandomForestClassifier(random_state=42)
model.fit(X_train_selected, y_train)

# === 9. Preparazione del test set ===
X_test = test_df.drop(columns=['id', 'NObeyesdad'], errors='ignore')  # Rimuovi 'id' e 'NObeyesdad' se presenti
X_test_selected = X_test[X_train_selected.columns]  # Allinea le colonne con quelle selezionate nel train

predictions = model.predict(X_test_selected)

# Inverti l'encoding per avere le etichette leggibili
pred_labels = le.inverse_transform(predictions)# === 7. Feature selection (opzionale, basata su VIF e p-value) ===
def calculate_vif(X):
    # Rimuovi colonne non numeriche
    X = X.select_dtypes(include=[np.number])
    # Gestisci valori mancanti
    X = X.fillna(0)  # Sostituisci NaN con 0 (o usa un'altra strategia)
    return pd.DataFrame({
        'feature': X.columns,
        'VIF': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    })

def calculate_pvalues(X, y):
    X_const = sm.add_constant(X)
    model = sm.OLS(y, X_const).fit()
    return model.pvalues[1:]  # exclude intercept

def feature_selection(X, y, vif_threshold=10, pval_threshold=0.05):
    # Rimuovi colonne non numeriche
    X = X.select_dtypes(include=[np.number])
    # Gestisci valori mancanti
    X = X.fillna(0)  # Sostituisci NaN con 0 (o usa un'altra strategia)
    
    while True:
        vif = calculate_vif(X)
        pvals = calculate_pvalues(X, y)

        drop_cols = [
            col for col in X.columns
            if vif[vif.feature == col]['VIF'].values[0] > vif_threshold or pvals[col] > pval_threshold
        ]

        if not drop_cols:
            break

        print(f"Rimosse feature: {drop_cols}")
        X = X.drop(columns=drop_cols)

    return X

# Applica la funzione di selezione delle feature
X_train_selected = feature_selection(X_train.copy(), y_train)

# Crea un DataFrame con gli ID e le predizioni
output = test_df[['id']].copy()
output['Predicted_Obesity_Level'] = pred_labels

# Mostra le prime 10 predizioni
print("📊 Prime 10 predizioni sul test set:")
print(output.head(10))

Gender                              int32
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight       bool
FAVC                                 bool
FCVC                              float64
NCP                               float64
CAEC                                int32
SMOKE                                bool
CH2O                              float64
SCC                                  bool
FAF                               float64
TUE                               float64
CALC                                int32
MTRANS                              int32
dtype: object
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                      

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).