# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, r2_score

# Silence every warning category for the rest of the run.
warnings.filterwarnings("ignore")

# ================================================================
# 1. USER CONFIGURATION
# ================================================================
# Kind of problem being solved:
#   "Classification" -> the target is a category
#   "Regression"     -> the target is a number
TASK_TYPE = "Classification"

# Locations of the competition files
TRAIN_PATH = "/kaggle/input/ai-201-b-mse-2-aiml-a/train.csv"
TEST_PATH = "/kaggle/input/ai-201-b-mse-2-aiml-a/test.csv"
SAMPLE_PATH = "/kaggle/input/ai-201-b-mse-2-aiml-a/sample_submission.csv"

# Column roles
TARGET_COL = "NObeyesdad"  # column to predict
ID_COL = "id"  # row identifier, excluded from the features

# Columns removed from both train and test right after loading
# (identifiers, names, ...). Entries that are absent from the data are
# skipped, so extend this list freely.
DROP_COLS = [
    ID_COL,
    "CustomerId",
    "Surname",
    "RowNumber",
]

# ================================================================
# 2. LOAD DATA & TEXT EDA
# ================================================================
print("--- LOADING DATA ---")
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_PATH)

# Save IDs for submission before dropping them; fall back to the row
# index when the test file has no ID column.
test_ids = test[ID_COL] if ID_COL in test.columns else test.index

# --- USER REQUESTED EDA STEPS ---
print("\n--- 1. DATA INFO ---")
# DataFrame.info() writes its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None".
train.info()

print("\n--- 2. FIRST 5 ROWS (RAW) ---")
print(train.head())

print(f"\n--- 3. DROPPING UNWANTED COLUMNS: {DROP_COLS} ---")
# Drop in place; errors='ignore' skips names that are not present.
train.drop(columns=DROP_COLS, inplace=True, errors='ignore')
test.drop(columns=DROP_COLS, inplace=True, errors='ignore')

print("\n--- 4. FIRST 5 ROWS (AFTER DROP) ---")
print(train.head())

print("\n--- 5. MISSING VALUES ---")
print(train.isnull().sum())

print("\n--- 6. DUPLICATES ---")
print(train.duplicated().sum())

print("\n--- 7. UNIQUE VALUES PER COLUMN ---")
print(train.nunique())

# ================================================================
# 3. VISUAL EDA (BOXPLOTS & HEATMAP ONLY)
# ================================================================
print("\n--- GENERATING PLOTS ---")

# Only numeric columns can be drawn as boxplots / correlated
eda_num_cols = train.select_dtypes(include=['number']).columns
n_numeric = len(eda_num_cols)

if n_numeric > 0:
    numeric_frame = train[eda_num_cols]

    # PLOT 1: boxplots for spotting outliers. Each column is centred on
    # its mean and divided by its standard deviation so that all boxes
    # share one axis scale.
    standardized = (numeric_frame - numeric_frame.mean()) / numeric_frame.std()
    plt.figure(figsize=(15, 6))
    sns.boxplot(data=standardized)
    plt.title("Outlier Detection (Normalized Boxplots)")
    plt.xticks(rotation=90)
    plt.show()

    # PLOT 2: correlation heatmap (needs at least two numeric columns)
    if n_numeric > 1:
        correlations = numeric_frame.corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            correlations,
            annot=True,
            fmt=".1f",
            cmap='coolwarm',
            linewidths=0.5,
        )
        plt.title("Feature Correlation Matrix")
        plt.show()

# ================================================================
# 4. PREPROCESSING & SPLIT
# ================================================================
print("\n--- PREPROCESSING ---")

# Separate X and y
X = train.drop(columns=[TARGET_COL])
y = train[TARGET_COL]

# Identify columns automatically.
# NOTE(review): only int64/float64 and object columns are picked up;
# a column of any other dtype lands in neither list — confirm that is
# intended for your data.
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# --- 4.1 Handle Null Values (Median/Mode) ---
# Replacing nulls BEFORE capping outliers to ensure data consistency.
# The fill values are computed from the TRAINING features only and then
# reused for the test set, so both sets go through the same
# transformation (the test set never contributes its own statistics).
num_fill_values = X[num_cols].median()
X[num_cols] = X[num_cols].fillna(num_fill_values)
test[num_cols] = test[num_cols].fillna(num_fill_values)

if len(cat_cols) > 0:
    # mode() can return several rows on ties; the first row is used.
    cat_fill_values = X[cat_cols].mode().iloc[0]
    X[cat_cols] = X[cat_cols].fillna(cat_fill_values)
    test[cat_cols] = test[cat_cols].fillna(cat_fill_values)

# --- 4.2 Robust Outlier Capping (Category-wise) ---
def cap_outliers_categorywise(df, cat_col, num_cols, max_categories=50, whisker=1.5):
    """Clip numeric columns to per-category IQR fences.

    For every column in ``num_cols`` the rows are grouped by ``cat_col``;
    within each group the values are clipped to
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]`` where Q1/Q3 are that
    group's 25th/75th percentiles.

    Args:
        df: Input frame. It is not modified; a copy is returned.
        cat_col: Name of the column whose values define the groups.
        num_cols: Iterable of numeric column names to clip.
        max_categories: Capping is applied only when ``cat_col`` has
            fewer than this many distinct values; otherwise the copy is
            returned untouched.
        whisker: Multiplier applied to the IQR to place the fences.

    Returns:
        A copy of ``df`` with the selected columns clipped.
    """
    capped = df.copy()

    # The number of categories does not depend on the numeric column
    # being processed, so the guard is evaluated once up front.
    if capped[cat_col].nunique() >= max_categories:
        return capped

    for col in num_cols:
        # One groupby per column, shared by both quantile computations.
        grouped = capped.groupby(cat_col)[col]
        q1 = grouped.transform(lambda s: s.quantile(0.25))
        q3 = grouped.transform(lambda s: s.quantile(0.75))
        iqr = q3 - q1
        capped[col] = capped[col].clip(q1 - whisker * iqr, q3 + whisker * iqr)

    return capped

print("Capping Outliers (using robust IQR method)...")
# Apply the category-wise capping once per categorical column; each pass
# works on the output of the previous one. Train features and test set
# are capped independently, each against its own group quantiles.
for grouping_col in cat_cols:
    X, test = (
        cap_outliers_categorywise(X, grouping_col, num_cols),
        cap_outliers_categorywise(test, grouping_col, num_cols),
    )

# --- Target Encoding ---
# Class labels are turned into integer codes for classification; a
# regression target is used as-is. The encoder object is created in both
# cases because later steps refer to it.
le = LabelEncoder()
is_classification = TASK_TYPE == "Classification"
if not is_classification:
    y_encoded = y
else:
    y_encoded = le.fit_transform(y)
    print(f"Target Encoded. Mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

# --- Pipeline Setup ---
# Categorical columns are one-hot encoded (categories unseen during fit
# are ignored at predict time); numeric columns pass through unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# --- Model Selection ---
# Same pipeline layout for both tasks; only the final estimator differs.
if is_classification:
    estimator = RandomForestClassifier(n_estimators=100, random_state=42)
else:
    estimator = RandomForestRegressor(n_estimators=100, random_state=42)

model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("clf", estimator),
    ]
)

# ================================================================
# 5. TRAINING & EVALUATION
# ================================================================
print("\n--- TRAINING MODEL ---")
# Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Fit on the training part, then predict the held-out part
model.fit(X_train, y_train)
y_pred_val = model.predict(X_val)

print(f"\n===== {TASK_TYPE.upper()} PERFORMANCE =====")
if TASK_TYPE != "Classification":
    # Regression metrics
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    print("RMSE:", rmse)
    r2 = r2_score(y_val, y_pred_val)
    print("R2 Score:", r2)
else:
    # Classification metrics (macro-averaged F1 weighs all classes equally)
    accuracy = accuracy_score(y_val, y_pred_val)
    print("Accuracy:", accuracy)
    macro_f1 = f1_score(y_val, y_pred_val, average='macro')
    print("F1 Score:", macro_f1)

# ================================================================
# 6. FINAL PREDICTION & SUBMISSION (SMART LOGIC)
# ================================================================
print("\n--- GENERATING SUBMISSION ---")

# 1. Retrain on full data
model.fit(X, y_encoded)

# 2. Setup Submission DataFrame using IDs from TEST file (Safe Method)
submission = pd.DataFrame()
submission[ID_COL] = test_ids

# 3. Identify sample submission format: every non-ID column of the
#    sample file is a column the submission is expected to contain.
sample_cols = [c for c in sample_sub.columns if c != ID_COL]

if len(sample_cols) == 1:
    # --- SCENARIO 1: Simple Prediction ---
    print(f"Detected Single Target: {sample_cols[0]}")
    preds = model.predict(test)

    if TASK_TYPE == "Classification":
        # Map integer codes back to the original class labels
        submission[sample_cols[0]] = le.inverse_transform(preds) # Words
    else:
        submission[sample_cols[0]] = preds # Numbers

else:
    # --- SCENARIO 2: Probabilities ---
    print(f"Detected Multi-Class Probabilities ({len(sample_cols)} cols)")
    probs = model.predict_proba(test)

    unmatched_classes = []
    for i, class_name in enumerate(le.classes_):
        label = str(class_name)
        if label in sample_cols:
            # An exact header match always wins. Plain substring matching
            # would let a class such as "Obesity_Type_I" be written into
            # "Obesity_Type_II" when that column happens to come first.
            target_col = label
        else:
            # Fallback for decorated headers (prefix/suffix around the
            # class name): among the columns containing the label, take
            # the shortest one, i.e. the one with the least extra text.
            candidates = [col for col in sample_cols if label in col]
            target_col = min(candidates, key=len) if candidates else None

        if target_col is None:
            unmatched_classes.append(label)
            continue
        submission[target_col] = probs[:, i]

    if unmatched_classes:
        # Surface the problem instead of silently writing an incomplete file
        print(f"WARNING: no submission column found for classes: {unmatched_classes}")

# 4. Save
submission.to_csv("submission_final.csv", index=False)
print("\nsubmission_final.csv CREATED!")
print(submission.head())