# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

# ==========================================
# 1. USER CONFIGURATION (EDIT ONLY THESE)
# ==========================================
# Dataset-specific settings read by the sections below. The two paths are
# placeholders and must point at real CSV files before the script is run.
TRAIN_PATH = "/kaggle/input/your-dataset/train.csv"   # Training CSV, loaded into train_data
TEST_PATH = "/kaggle/input/your-dataset/test.csv"     # Test CSV, loaded into test_data
TARGET_COL = "NObeyesdad"                             # Label column in the training CSV; also the prediction column in the submission
ID_COL = "id"                                         # Row-identifier column: dropped from the features, kept for the submission
OUTPUT_FILE = "submission.csv"                        # Path the submission CSV is written to

# ==========================================

# 2. Load Data
# Read the training CSV first, then the test CSV, each into its own DataFrame.
print("Loading data...")
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

# Report the (rows, columns) shape of each frame, one per line.
print(
    f"Train shape: {train_data.shape}",
    f"Test shape: {test_data.shape}",
    sep="\n",
)

# 3. Handle ID Columns
# Keep the test IDs for the submission file, then remove the ID column from
# both frames so it is never fed to the model as a feature.
if ID_COL in test_data.columns:
    test_ids = test_data[ID_COL]
    test_data = test_data.drop(columns=[ID_COL])
else:
    # Reading test_data[ID_COL] unconditionally would raise KeyError here.
    # Fall back to the row index so a submission can still be produced.
    print(f"Warning: column '{ID_COL}' not found in test data; using the row index as ID.")
    test_ids = pd.Series(test_data.index, index=test_data.index, name=ID_COL)

# errors="ignore" makes the drop a no-op when the training file has no ID column.
train_data = train_data.drop(columns=[ID_COL], errors="ignore")

# 4. Separate Features and Target
# X holds every training column except the label; y is the label column itself.
X = train_data.drop(labels=[TARGET_COL], axis="columns")
y = train_data[TARGET_COL]

# 5. Detect Categorical & Numerical Columns
# Columns are routed by dtype: object/category columns are treated as
# categorical, numeric dtypes as numerical.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.to_list()
num_cols = X.select_dtypes(include='number').columns.to_list()

print(f"\nCategorical columns: {cat_cols}")
print(f"Numerical columns: {num_cols}")

# 6. Train/Validation Split
# The split happens BEFORE any imputation statistic is computed, so the fill
# values (and therefore the validation metrics) are not influenced by
# validation rows. The split arguments are unchanged.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# 7. Impute Missing Values
print("\nImputing missing values...")

# Column name -> fill value, learned from the training partition only.
fill_values = {}

if num_cols:
    # Numerical columns are filled with their training-partition mean.
    mean_vals = X_train[num_cols].mean()
    fill_values.update(mean_vals.to_dict())

if cat_cols:
    # mode() may return several rows when values tie; row 0 holds one mode
    # per column.
    mode_vals = X_train[cat_cols].mode().iloc[0]
    fill_values.update(mode_vals.to_dict())

if fill_values:
    # fillna(dict) returns a new frame and fills each listed column with its
    # own value; columns not listed are left untouched.
    X_train = X_train.fillna(fill_values)
    X_val = X_val.fillna(fill_values)
    test_data = test_data.fillna(fill_values)
    # X is not used again in this section; it is filled as well so it ends up
    # imputed, as it was before this section was reordered.
    X = X.fillna(fill_values)

# 8. Optional Visualization
# Bar chart of how many training rows fall into each target class.
plt.figure(figsize=(6, 4))
sns.countplot(x=TARGET_COL, data=train_data)
plt.title(f"Class Distribution for {TARGET_COL}")
plt.xticks(rotation=45)
plt.show()

# 9. Preprocessing Pipeline
# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and standardize the numerical columns.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', StandardScaler(), num_cols)
])

# 10. Transform Data
# The preprocessor is fitted on the training partition only and then applied
# unchanged to the validation and test data.
print("\nTransforming data...")
X_train_pre = preprocessor.fit_transform(X_train)
X_val_pre = preprocessor.transform(X_val)
test_data_pre = preprocessor.transform(test_data)

# 11. Train Model
# Random forest with balanced class weights, fitted on the preprocessed
# training partition.
print("\nTraining Random Forest...")
rfc = RandomForestClassifier(
    n_estimators=800,
    max_depth=20,
    class_weight='balanced',
    random_state=42,
    # Build (and later query) the 800 trees on all available cores instead of
    # one; with random_state fixed the fitted forest should be unchanged.
    n_jobs=-1,
)
rfc.fit(X_train_pre, y_train)

# 12. Evaluate
# Score the model on the held-out validation partition.
print("\nEvaluating model...")
val_preds = rfc.predict(X_val_pre)
val_proba = rfc.predict_proba(X_val_pre)

print(f"Accuracy: {accuracy_score(y_val, val_preds):.4f}")
# predict_proba columns follow rfc.classes_. Passing that order explicitly
# keeps log_loss from raising when a rare class is absent from y_val, and
# avoids relying on label inference from y_val.
print(f"Log Loss: {log_loss(y_val, val_proba, labels=rfc.classes_):.4f}")

# 13. Generate Final Submission (CLASS LABELS ONLY)
print("\nGenerating submission...")

# Hard class predictions (not probabilities) for every test row.
test_pred_labels = rfc.predict(test_data_pre)

# Two-column frame: the saved test IDs first, then the predicted label.
submission_df = pd.DataFrame({ID_COL: test_ids})
submission_df[TARGET_COL] = test_pred_labels

# Write without the DataFrame index, then show a short preview.
submission_df.to_csv(OUTPUT_FILE, index=False)
print(f"Submission saved to {OUTPUT_FILE}")
print(submission_df.head())