In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss

# ==========================================
# 1. USER CONFIGURATION (EDIT THIS PART ONLY)
# ==========================================
# Locations of the training and test CSV files.
TRAIN_PATH = '/kaggle/input/ai-201-b-mse-2-aiml-a/train.csv'
TEST_PATH = '/kaggle/input/ai-201-b-mse-2-aiml-a/test.csv'

# Name of the column the model is trained to predict.
TARGET_COL = 'NObeyesdad'

# Name of the row-identifier column; it is removed from the features and
# written back out next to the predictions in the submission file.
ID_COL = 'id'

# File name the submission CSV is saved under.
OUTPUT_FILE = 'Name.csv'
# ==========================================

# 2. Load Data
# Read both CSV files into DataFrames, then report their (rows, columns) shapes.
print("Loading data...")
train_data, test_data = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

# 3. Handle ID Columns
# Save test IDs for submission later
# Keep the test-set identifiers for the submission file. The drops below
# already tolerate a missing ID column, so the lookup is guarded the same way:
# when the column is absent, fall back to the row index so that the submission
# step still has exactly one identifier per test row instead of a KeyError.
if ID_COL in test_data.columns:
    test_ids = test_data[ID_COL]
else:
    print(f"Warning: '{ID_COL}' not found in test data; using the row index as ID.")
    test_ids = pd.Series(test_data.index, index=test_data.index, name=ID_COL)

# Remove the ID column from both frames so it is never used as a feature.
# errors="ignore" turns the drop into a no-op when the column does not exist.
train_data = train_data.drop(columns=[ID_COL], errors="ignore")
test_data = test_data.drop(columns=[ID_COL], errors="ignore")

# 4. Separate Features (X) and Target (y)
# Target vector: the column the model learns to predict.
y = train_data[TARGET_COL]
# Feature matrix: every other column. drop() returns a new frame, so
# train_data itself still contains the target column afterwards.
X = train_data.drop(TARGET_COL, axis=1)

# 5. Dynamic Feature Selection
# Automatically detect which columns are numbers and which are categories
# Partition the feature columns by dtype: numeric dtypes are treated as
# numerical features, object/category dtypes as categorical features.
num_cols = X.select_dtypes(include='number').columns.to_list()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.to_list()

print(f"\nDetected {len(cat_cols)} categorical columns: {cat_cols}")
print(f"Detected {len(num_cols)} numerical columns: {num_cols}")

# 6. Impute Missing Values (Preprocessing Step 1)
# Fill numbers with MEAN, categories with MODE
print("\nImputing missing values...")

# Collect (columns, fill values) pairs; the fill values are always computed
# from the training features X and then reused unchanged for the test set.
imputation_plan = []

if num_cols:
    # Numerical columns are filled with their per-column mean.
    mean_vals = X[num_cols].mean()
    imputation_plan.append((num_cols, mean_vals))

if cat_cols:
    # Categorical columns are filled with their most frequent value;
    # row 0 of mode() is the first mode when several values tie.
    mode_vals = X[cat_cols].mode().iloc[0]
    imputation_plan.append((cat_cols, mode_vals))

for cols, fill_vals in imputation_plan:
    X[cols] = X[cols].fillna(fill_vals)
    test_data[cols] = test_data[cols].fillna(fill_vals)

# 7. Visualization (Optional)
# Plotting correlation for numerical features
# Correlation heatmap; skipped unless there are at least two numerical features.
if len(num_cols) > 1:
    _, heatmap_ax = plt.subplots(figsize=(10, 6))
    corr_matrix = train_data[num_cols].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
    heatmap_ax.set_title("Feature Correlation Heatmap")
    plt.show()

# Bar chart of the number of training rows per target class.
_, count_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=TARGET_COL, data=train_data, ax=count_ax)
count_ax.set_title(f"Class Distribution for {TARGET_COL}")
plt.show()

# 8. Define Transformation Pipeline (Preprocessing Step 2)
# Categorical columns are one-hot encoded; handle_unknown='ignore' lets
# transform() accept categories that were not present when fitting.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
# Numerical columns are standardised.
numerical_step = ('num', StandardScaler(), num_cols)

preprocessor = ColumnTransformer(transformers=[categorical_step, numerical_step])

# 9. Train/Validation Split
# Hold out 30% of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 10. Fit Preprocessor
print("\nTransforming data...")
# Fit the encoder and scaler on the training split only, then apply the
# already-fitted transformation to the validation split and the test set.
X_train_pre = preprocessor.fit_transform(X_train)
X_val_pre, test_data_pre = (
    preprocessor.transform(frame) for frame in (X_val, test_data)
)

# 11. Train Model
print("\nTraining Random Forest...")
# 1000 trees with class weights inversely proportional to class frequency.
# n_jobs=-1 builds (and later predicts with) the trees on all available CPU
# cores instead of a single one; the fixed random_state keeps the run
# reproducible.
rfc = RandomForestClassifier(
    n_estimators=1000,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
rfc.fit(X_train_pre, y_train)

# 12. Evaluate
print("Evaluating model...")
val_proba = rfc.predict_proba(X_val_pre)

# The columns of val_proba follow rfc.classes_. Passing that array as `labels`
# keeps each metric aligned with the probability columns even if the
# validation split happens to be missing one of the classes.
class_labels = rfc.classes_

# ROC AUC: a binary target needs the 1-D score of the positive class (second
# column of predict_proba); the multiclass one-vs-rest form takes the full matrix.
try:
    if len(class_labels) == 2:
        roc = roc_auc_score(y_val, val_proba[:, 1])
    else:
        roc = roc_auc_score(
            y_val,
            val_proba,
            multi_class='ovr',
            average='macro',
            labels=class_labels,
        )
    print(f"Validation ROC AUC: {roc:.4f}")
except ValueError as e:
    print(f"Could not calculate ROC AUC: {e}")

# Log loss gets its own try block so a ROC AUC failure does not skip it.
try:
    loss = log_loss(y_val, val_proba, labels=class_labels)
    print(f"Validation Log Loss: {loss:.4f}")
except ValueError as e:
    print(f"Could not calculate log loss: {e}")

# 13. Generate Submission
print("\nGenerating submission...")

# Predict a class label for every test row.
test_pred_labels = rfc.predict(test_data_pre)

# Two-column submission: the saved test IDs followed by the predicted labels.
submission_df = pd.DataFrame({ID_COL: test_ids})
submission_df[TARGET_COL] = test_pred_labels

# Write without the DataFrame index so the CSV holds only those two columns.
submission_df.to_csv(OUTPUT_FILE, index=False)
print(f"Submission saved to {OUTPUT_FILE}")
print(submission_df.head())