# In [None]:
# ================================================================
#  MACHINE LEARNING PIPELINE – EXTREMELY CLEAN + GOD-LEVEL COMMENTS
#  This code is written in a generalised way so you can reuse it
#  for ANY classification dataset by changing file paths + target.
# ================================================================

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# ================================================================
# 1. LOAD DATA
#    ✔ Change file paths when using a new dataset.
# ================================================================
# Read the training and test CSVs in one pass; the two paths are the only
# thing to edit when pointing the script at another dataset.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/mock-test-2-mse-2/train.csv",
        "/kaggle/input/mock-test-2-mse-2/test.csv",
    )
)

# ================================
# 2. BASIC EDA CHECKS
# ================================
# First rows of the training frame.
print("\n===== HEAD OF DATA =====")
print(train.head())

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
print("\n===== DATA INFO =====")
train.info()

# Number of missing values per column.
print("\n===== NULL COUNT =====")
print(train.isnull().sum())

# Row count for each target class.
print("\n===== TARGET DISTRIBUTION =====")
print(train['Status'].value_counts())
# NOTE: When using a new dataset → replace "Status" with your new target column name.

# ================================================================
# 3. SEPARATE FEATURES & TARGET
# ================================================================
y = train["Status"]                     # TARGET COLUMN → change for new dataset
X = train.drop(columns="Status")        # every remaining column is a candidate feature

# Detect column types from the dtypes (helps generalize code).
# "number" matches every integer/float width rather than only int64/float64,
# and "category" columns are treated like plain object (string) columns.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

# "id" is copied into the submission as the row identifier further down.
# Being numeric it would otherwise be outlier-capped and handed to the model
# as a feature, so it is removed from the numeric feature list.
# errors="ignore" keeps this a no-op when there is no such column.
num_cols = num_cols.drop("id", errors="ignore")

print("\nNumeric Columns:", list(num_cols))
print("Categorical Columns:", list(cat_cols))

# ================================================================
# 4. CATEGORY-WISE OUTLIER CAPPING
#    ✔ Caps each numeric column at its per-category IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
#    ✔ Useful when a feature's distribution differs from one category to another.
# ================================================================
def cap_outliers_categorywise_all(df, cat_col, num_cols, *, reference=None, factor=1.5):
    """
    Cap numeric outliers separately inside each category of ``cat_col``.

    For every column in ``num_cols`` the per-category quartiles Q1 and Q3 are
    computed and values are clipped to
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` with ``IQR = Q3 - Q1``.
    This is useful when the distribution varies between categories.

    HOW TO REUSE:
    - df        : dataset to cap (it is not modified; a capped copy is returned)
    - cat_col   : any categorical column of ``df`` used to form the groups
    - num_cols  : list of numeric columns to cap
    - reference : optional dataset the quartiles are computed from; it needs
                  ``cat_col`` and the ``num_cols``. Defaults to ``df`` itself.
                  Pass the training data here to cap a test set with the
                  bounds learned on train.
    - factor    : IQR multiplier for the fences (1.5 = the usual Tukey rule)

    Rows whose category is missing, or does not occur in ``reference``, get
    NaN bounds; pandas treats a missing threshold as "no bound", so those
    rows are left as they are.

    Returns the capped copy of ``df``.
    """
    source = df if reference is None else reference
    capped = df.copy()

    # Group once for all columns instead of regrouping for every quantile.
    grouped = source.groupby(cat_col)
    keys = capped[cat_col]

    for col in num_cols:
        q1 = grouped[col].quantile(0.25)
        q3 = grouped[col].quantile(0.75)
        spread = factor * (q3 - q1)

        # Broadcast the per-category fences back onto the rows being capped.
        lower = keys.map(q1 - spread).astype(float)
        upper = keys.map(q3 + spread).astype(float)

        capped[col] = capped[col].clip(lower, upper)

    return capped

# Run the category-wise capping once per categorical column; every pass
# groups by that column and works on the output of the previous pass.
# NOTE(review): the test frame is capped with quartiles computed from the
# test frame itself, not with the ones derived from X.
for group_col in cat_cols:
    X, test = (
        cap_outliers_categorywise_all(frame, group_col, num_cols)
        for frame in (X, test)
    )

# ================================================================
# 5. VISUALIZATION – EASY MARKS IN EXAM
# ================================================================
# Bar chart: how many values are missing in each column of the raw train frame.
_, missing_ax = plt.subplots(figsize=(10, 4))
missing_per_column = train.isnull().sum()
missing_per_column.plot(kind='bar', ax=missing_ax)
missing_ax.set_title("Missing Values per Column")
plt.show()

# Count plot: number of rows per target class.
_, target_ax = plt.subplots(figsize=(5, 4))
sns.countplot(x=train['Status'], ax=target_ax)
target_ax.set_title("Target Class Distribution")
plt.show()

# Histograms of the numeric features (after outlier capping).
numeric_view = X[num_cols]
numeric_view.hist(figsize=(14, 10))
plt.gcf().suptitle("Numeric Feature Distributions")
plt.show()

# Heatmap of the pairwise correlations between the numeric features.
_, corr_ax = plt.subplots(figsize=(12, 6))
corr_matrix = numeric_view.corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")
plt.show()

# ================================================================
# 6. HANDLE NULL VALUES (VERY IMPORTANT FOR MODEL STABILITY)
# ================================================================
# Fill values are learned from the training features only and then reused for
# the test set, so both frames are imputed with the same numbers and nothing
# is estimated from the test data.

# For numeric → median (safe for skewed data)
numeric_fill = X[num_cols].median()
X[num_cols] = X[num_cols].fillna(numeric_fill)
test[num_cols] = test[num_cols].fillna(numeric_fill)

# For categorical → mode (most common class)
# mode() has no rows when there are no categorical columns (or they are all
# NaN); .iloc[0] would raise IndexError in that case, so the step is skipped.
category_modes = X[cat_cols].mode()
if not category_modes.empty:
    category_fill = category_modes.iloc[0]
    X[cat_cols] = X[cat_cols].fillna(category_fill)
    test[cat_cols] = test[cat_cols].fillna(category_fill)

# Reset index to avoid ID issues
X = X.reset_index(drop=True)
test = test.reset_index(drop=True)

# ================================================================
# 7. LABEL ENCODE TARGET COLUMN
#    ✔ Converts labels (strings) → numeric
# ================================================================
from sklearn.preprocessing import LabelEncoder

# Learn the class → integer mapping from the target, then encode the target.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Show which integer code each original label received.
label_to_code = dict(zip(le.classes_, le.transform(le.classes_)))
print("\nLabel Mapping:", label_to_code)

# ================================================================
# 8. ONE-HOT ENCODING + PIPELINE
# ================================================================
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Preprocessing: one-hot encode the categorical columns (categories not seen
# during fit are ignored at predict time) and pass the numeric columns through.
encode_categoricals = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
keep_numerics = ("num", "passthrough", num_cols)
preprocess = ColumnTransformer(transformers=[encode_categoricals, keep_numerics])

# Full model pipeline: preprocessing followed by a random forest with a fixed seed.
forest = RandomForestClassifier(random_state=42)
model = Pipeline(steps=[("preprocess", preprocess), ("clf", forest)])

# ================================================================
# 9. TRAIN-VALIDATION SPLIT
# ================================================================
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.
# The split is stratified on the encoded target so both parts keep the class
# proportions of the full data; an unstratified split can leave a rare class
# out of the validation part altogether.
try:
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
except ValueError:
    # train_test_split refuses to stratify when a class is too small to be
    # split; fall back to the plain random split in that case.
    print("\nStratified split not possible, using a plain random split instead.")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

# ================================================================
# 10. TRAIN MODEL
# ================================================================
model.fit(X_train, y_train)

# ================================================================
# 11. EVALUATION METRICS
# ================================================================
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Hard class predictions and per-class probabilities for the validation rows.
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)

# roc_auc_score expects 1-D scores of the positive class for a two-class
# target and the full probability matrix for a multiclass target; passing the
# two-column matrix for a binary target raises a ValueError.
if y_prob.shape[1] == 2:
    roc_auc = roc_auc_score(y_val, y_prob[:, 1])
else:
    roc_auc = roc_auc_score(y_val, y_prob, multi_class='ovr')

# Precision, recall and F1 are macro-averaged: every class counts equally.
print("\n========= MODEL PERFORMANCE =========")
print("Accuracy :", accuracy_score(y_val, y_pred))
print("Precision:", precision_score(y_val, y_pred, average='macro'))
print("Recall   :", recall_score(y_val, y_pred, average='macro'))
print("F1 Score :", f1_score(y_val, y_pred, average='macro'))
print("ROC AUC  :", roc_auc)

# ================================================================
# 12. TRAIN FINAL MODEL ON FULL DATA
# ================================================================
# Refit the same pipeline on every training row (train + validation parts).
model.fit(X, y_encoded)

# ================================================================
# 13. GENERATE FINAL PREDICTIONS
# ================================================================
# Class probabilities for the test rows: one column per encoded class.
test_prob = model.predict_proba(test)

# NOTE:
#  The submission holds one probability column per class, named
#  "Status_<label>". The position of a label in le.classes_ is its encoded
#  value, which is also the index of its column in test_prob.
# ================================================================

submission = pd.DataFrame({"id": test["id"]})

# Add probability for each class
for column_index, class_label in enumerate(le.classes_):
    submission[f"Status_{class_label}"] = test_prob[:, column_index]

# Sanity check: count the ids that occur more than once in the submission.
duplicate_id_count = submission["id"].duplicated().sum()
print("\nDuplicate IDs in Submission:", duplicate_id_count)

# ================================================================
# 14. SAVE FINAL SUBMISSION
# ================================================================
# Write the table without the index column, then show a preview.
submission.to_csv("submission1.csv", index=False)
print("\nsubmission1.csv CREATED SUCCESSFULLY!\n")
submission_preview = submission.head()
print(submission_preview)
