In [5]:
# Obesity Risk Prediction with Validation & Reproducibility
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# ========================
# 1. Initial Setup
# ========================
# One seed constant is shared by NumPy's global RNG and by every estimator /
# splitter below that accepts random_state, so reruns give the same results.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Read the training and test tables from CSV files in the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# ========================
# 2. Data Checks
# ========================
# Total number of null cells in each table (summed over columns, then overall)
train_missing = train.isnull().sum().sum()
print("Missing values in train:", train_missing)
test_missing = test.isnull().sum().sum()
print("Missing values in test:", test_missing)

# Summary statistics for every column, numeric and non-numeric alike
print("\nTrain data description:")
train_summary = train.describe(include='all')
print(train_summary)

# ========================
# 3. Data Preparation
# ========================
# Target is the 'NObeyesdad' column; features are everything except it and 'id'
y = train['NObeyesdad']
X = train.drop(columns=['id', 'NObeyesdad'])

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; seeded for a repeatable split.
split_options = dict(test_size=0.2, stratify=y, random_state=RANDOM_SEED)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# ========================
# 4. Preprocessing
# ========================
# Columns that get one-hot encoded; every other feature column is passed
# through unchanged (remainder='passthrough').
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC',
                        'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros instead of raising at predict time.
#
# sparse_threshold=0: always return a dense array. With the default (0.3) the
# transformer returns a sparse matrix whenever the combined output happens to
# be sparse enough, and the downstream GaussianNB and mean-centering
# StandardScaler steps reject sparse input. Forcing dense output removes that
# dependence on the data's density.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# ========================
# 5. Model Setup
# ========================
def _build_pipeline(model, scaler=None):
    """Return a Pipeline: preprocessor -> optional scaler -> model.

    Pipeline does not copy the steps it is given, so each pipeline receives
    its own unfitted clone of the shared ``preprocessor``. Without the clone,
    fitting any one pipeline would refit the encoder used by all the others.

    Args:
        model: The final estimator, stored under the step name 'model'.
        scaler: Optional transformer inserted between preprocessing and the
            model under the step name 'scaler'; omitted when None.

    Returns:
        An unfitted sklearn Pipeline.
    """
    steps = [('preprocessor', clone(preprocessor))]
    if scaler is not None:
        steps.append(('scaler', scaler))
    steps.append(('model', model))
    return Pipeline(steps)


# Candidate classifiers keyed by the name used in reports and output filenames.
models = {
    # with_mean=False: scale to unit variance without centering
    'LogisticRegression': _build_pipeline(
        LogisticRegression(max_iter=2000, random_state=RANDOM_SEED),
        scaler=StandardScaler(with_mean=False),
    ),
    'LDA': _build_pipeline(
        LinearDiscriminantAnalysis(),
        scaler=StandardScaler(),
    ),
    # No scaler step for Naive Bayes: it is fit on the preprocessed features directly
    'NaiveBayes': _build_pipeline(GaussianNB()),
    'SVM': _build_pipeline(
        SVC(kernel='linear', random_state=RANDOM_SEED),
        scaler=StandardScaler(),
    ),
}

# ========================
# 6. Model Evaluation
# ========================
# For each candidate: report 5-fold cross-validated performance on the
# training split, then fit on the whole training split and report on the
# held-out validation split.
for name, pipeline in models.items():
    print(f"\n=== {name} ===")

    # Each training row is predicted by a model fit on the other folds
    y_pred = cross_val_predict(pipeline, X_train, y_train, cv=5)

    print("Cross-validated Performance:")
    cv_report = classification_report(y_train, y_pred)
    print(cv_report)

    # Fit on the full training split and score the held-out rows
    pipeline.fit(X_train, y_train)
    val_pred = pipeline.predict(X_val)

    print("\nValidation Set Performance:")
    val_report = classification_report(y_val, val_pred)
    print(val_report)

    # Per-class error breakdown on the validation split
    print(f"Confusion Matrix ({name}):")
    val_confusion = confusion_matrix(y_val, val_pred)
    print(val_confusion)

# ========================
# 7. Generate Submissions
# ========================
# The test feature table is the same for every model, so build it once
test_features = test.drop(columns='id')

for name, pipeline in models.items():
    # Refit on all labelled rows (training + validation) before predicting
    pipeline.fit(X, y)
    test_pred = pipeline.predict(test_features)

    # One CSV per model: the test ids alongside the predicted class
    submission = pd.DataFrame({'id': test['id'], 'NObeyesdad': test_pred})
    submission.to_csv(f'submission_{name}.csv', index=False)

Missing values in train: 0
Missing values in test: 0

Train data description:
                 id  Gender           Age        Height        Weight  \
count   20758.00000   20758  20758.000000  20758.000000  20758.000000   
unique          NaN       2           NaN           NaN           NaN   
top             NaN  Female           NaN           NaN           NaN   
freq            NaN   10422           NaN           NaN           NaN   
mean    10378.50000     NaN     23.841804      1.700245     87.887768   
std      5992.46278     NaN      5.688072      0.087312     26.379443   
min         0.00000     NaN     14.000000      1.450000     39.000000   
25%      5189.25000     NaN     20.000000      1.631856     66.000000   
50%     10378.50000     NaN     22.815416      1.700000     84.064875   
75%     15567.75000     NaN     26.000000      1.762887    111.600553   
max     20757.00000     NaN     61.000000      1.975663    165.057269   

       family_history_with_overweight   FAVC 