# In [None]:
# =====================================================================
# 📌 COMPLETE PIPELINE WITH EDA + OUTLIER ANALYSIS + RANDOM FOREST
# 📁 FILE: COMPLETE_EDA_OUTLIER_RF_PIPELINE.py
# =====================================================================

# ==============================================================
# 1️⃣ IMPORT LIBRARIES
#    - numpy/pandas → data processing
#    - seaborn/matplotlib → visualization
#    - sklearn → ML processing + modeling
# ==============================================================

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, log_loss, confusion_matrix,
    classification_report
)
from sklearn.pipeline import Pipeline

# =====================================================================
# 2️⃣ LOAD DATA
# =====================================================================
# `train` carries the "Status" target used for fitting; `test` is the frame
# that is scored at the end of the script to build the submission file.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

# Quick sanity check of (rows, columns) for both frames.
print("Train Shape:", train.shape)
print("Test Shape:", test.shape)


# =====================================================================
# 🎨 3️⃣ EXPLORATORY DATA ANALYSIS (EDA)
# =====================================================================
print("\n================ EDA & VISUALIZATION ================")

# --- (A) Check missing values / basic structure ---
print("\nNull Values per Column:")
print(train.isnull().sum())
train.info()         # prints datatypes + non-null counts by itself

# A bare expression is only echoed when it is the LAST line of a notebook
# cell (and never in a plain script), so these summaries are printed
# explicitly instead of being computed and silently discarded.
print("\nFirst Rows:")
print(train.head())
print("\nDuplicate Rows:", train.duplicated().sum())
print("\nUnique Values per Column:")
print(train.nunique())

# --- (B) Target Distribution ---
plt.figure(figsize=(6, 4))
sns.countplot(x=train["Status"], palette="viridis")
plt.title("Distribution of Target Variable (Status)")
plt.show()

# --- (C) Correlation Heatmap ---
# Only float64/int64 columns take part in the correlation matrix.
numeric_df = train.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()

# --- (D) Boxplots for Outlier Detection ---
# One boxplot per numeric column, laid out on a grid that is 3 plots wide.
cols_to_plot = numeric_df.columns
n_cols = 3
n_rows = (len(cols_to_plot) - 1) // n_cols + 1   # ceil(len / n_cols)

plt.figure(figsize=(15, n_rows * 4))
for i, col in enumerate(cols_to_plot):
    plt.subplot(n_rows, n_cols, i + 1)   # subplot positions are 1-based
    sns.boxplot(x=train[col], color="skyblue")
    plt.title(f"Boxplot of {col}")
plt.tight_layout()
plt.show()


# =====================================================================
# 4️⃣ DATA CLEANING – DROP NULLS (Training only)
# =====================================================================
print("\n================ DATA CLEANING ================")
initial_rows = len(train)

# Removes any rows in TRAIN with null values
train.dropna(inplace=True)

print(f"Rows dropped: {initial_rows - len(train)}")
print(f"Remaining Training Rows: {len(train)}")

# Test dataset must keep original row count → fill instead of drop
for col in test.columns:
    if not test[col].isnull().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(test[col]):
        # numerical → column mean
        test[col] = test[col].fillna(test[col].mean())
    else:
        # Every non-numeric dtype (object, category, string, ...) is filled
        # with its most frequent value; calling .mean() on those would raise.
        modes = test[col].mode()
        if not modes.empty:
            test[col] = test[col].fillna(modes.iloc[0])
        # An all-missing column has no mode; it is left untouched rather
        # than crashing on an out-of-range lookup.


# =====================================================================
# 5️⃣ SEPARATE FEATURES & TARGET
# =====================================================================
y = train["Status"]               # target column (raises KeyError if absent)

# "id" is the row identifier that the submission step reads from
# test["id"]; it is not a predictor. Leaving it in X would feed it to the
# outlier detector, the IQR capping and the classifier as a numeric feature.
# errors="ignore" keeps this working when the training file has no "id"
# column; a missing "Status" has already failed on the line above.
X = train.drop(columns=["Status", "id"], errors="ignore")    # feature columns


# =====================================================================
# 6️⃣ OUTLIER ANALYSIS (Isolation Forest + IQR Capping)
# =====================================================================
print("\n================ OUTLIER REMOVAL ================")

# Only float64/int64 feature columns are used for outlier handling.
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns

# --- (A) Isolation Forest Outlier Detection ---
# fit_predict labels every row: 1 = inlier, -1 = outlier.
iso = IsolationForest(contamination=0.03, random_state=42)
outlier_flags = iso.fit_predict(X[numeric_cols])

# The same boolean mask filters features and target so they stay aligned.
inlier_mask = outlier_flags == 1
X_clean = X.loc[inlier_mask]
y_clean = y.loc[inlier_mask]

print(f"Rows removed by IsolationForest: {len(X) - len(X_clean)}")


# --- (B) IQR Capping (per category group) ---
def cap_outliers_categorywise_all(df, cat_col, num_cols):
    """Cap numeric columns to their per-category IQR fences.

    For every column in ``num_cols`` the rows are grouped by ``cat_col`` and
    each value is clipped to ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where the
    quartiles are computed inside the row's own group.

    The bounds are broadcast back to the rows with ``groupby(...).transform``
    rather than ``groupby(...).apply``. ``apply`` on a frame that still
    contains the grouping column is deprecated since pandas 2.2, and once the
    grouping column is excluded from the operation it would be missing from
    the returned frame.

    Args:
        df: Input frame. It is not modified.
        cat_col: Name of the column whose values define the groups.
        num_cols: Iterable of numeric column names to cap.

    Returns:
        A copy of ``df`` with the same columns (including ``cat_col``), the
        same index and the same row order, with ``num_cols`` capped.

    Note:
        Rows whose ``cat_col`` is missing belong to no group; they are
        expected to pass through uncapped because NaN bounds do not clip.
        TODO confirm on the pandas version in use.
    """
    capped = df.copy()

    # Group the untouched input so bounds never depend on already-capped data.
    grouped = df.groupby(cat_col, observed=True, sort=False)

    for col in num_cols:
        # transform broadcasts each group's scalar quartile to its rows,
        # giving Series aligned with df's index.
        q1 = grouped[col].transform(lambda values: values.quantile(0.25))
        q3 = grouped[col].transform(lambda values: values.quantile(0.75))
        iqr = q3 - q1
        capped[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

    return capped

# Pick the grouping column for capping: "Region" when present, otherwise the
# first object-dtype column. Without any object column the step is skipped.
cat_columns = X_clean.select_dtypes(include="object").columns

if len(cat_columns) == 0:
    print("No categorical column found for IQR method. Skipping...")
else:
    if "Region" in X_clean.columns:
        categorical_col_for_capping = "Region"
    else:
        categorical_col_for_capping = cat_columns[0]

    # Carry the target through the capping call so features and labels come
    # back from the same frame, row-aligned.
    temp_df = X_clean.assign(Status=y_clean)

    capped_df = cap_outliers_categorywise_all(
        temp_df, categorical_col_for_capping, numeric_cols
    )

    # Split the capped frame back into features and target.
    X_clean = capped_df.drop(columns=["Status"])
    y_clean = capped_df["Status"]

    print("IQR Outlier Capping completed!")


# =====================================================================
# 7️⃣ TRAIN–VALIDATION SPLIT
# =====================================================================
# 80/20 split with a fixed seed; stratifying on the target keeps the class
# proportions the same in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_clean}
X_train, X_val, y_train, y_val = train_test_split(X_clean, y_clean, **split_options)


# =====================================================================
# 8️⃣ PREPROCESSING PIPELINE
#    Combines OneHotEncoding + numeric passthrough
# =====================================================================
categorical_cols = X.select_dtypes(include=['object']).columns

# handle_unknown='ignore' lets the encoder accept categories at predict time
# that it did not see during fit, instead of raising.
one_hot = OneHotEncoder(handle_unknown='ignore')

# Object columns are one-hot encoded, numeric columns are forwarded as-is.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

# Preprocessing and classifier live in one estimator, so the encoder is
# fitted together with the model on whatever data the pipeline is fitted on.
forest = RandomForestClassifier(random_state=42)
base_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", forest),
    ]
)


# =====================================================================
# 9️⃣ GRIDSEARCH HYPERPARAMETER TUNING
# =====================================================================
print("\n================ MODEL TRAINING ================")

# The "classifier__" prefix routes each parameter to the pipeline step named
# "classifier". 2 x 2 x 2 = 8 candidate settings.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[10, 20],
    classifier__min_samples_split=[2, 5],
)

# 3-fold cross-validation scored by accuracy, using all available cores.
grid = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Pipeline refitted on all of X_train with the winning parameters.
best_model = grid.best_estimator_

print("\nüîé Best Parameters:", grid.best_params_)


# =====================================================================
# 🔟 VALIDATION EVALUATION
# =====================================================================
y_pred = best_model.predict(X_val)
y_proba = best_model.predict_proba(X_val)   # columns follow best_model.classes_

print("\n================ VALIDATION METRICS ================")
print("Accuracy :", accuracy_score(y_val, y_pred))
# The submission consists of class probabilities, so the probability-based
# loss is reported as well. `labels` pins the column order of y_proba and
# keeps the call valid even if a class is absent from the validation fold.
print("Log Loss :", log_loss(y_val, y_proba, labels=best_model.classes_))
print("Classification Report:\n", classification_report(y_val, y_pred))

# Confusion Matrix Plot
# Rows/columns are fixed to best_model.classes_ and the ticks show the class
# names, so the cells can be read without guessing the label order.
sns.heatmap(
    confusion_matrix(y_val, y_pred, labels=best_model.classes_),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=best_model.classes_,
    yticklabels=best_model.classes_,
)
plt.title("Confusion Matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()


# =====================================================================
# 1️⃣1️⃣ GENERATE SUBMISSION FILE
# =====================================================================
# Select the test columns in the same order as the training features.
test_processed = test[X.columns]

# One probability column per class; column i belongs to class_labels[i].
predict_proba = best_model.predict_proba(test_processed)
class_labels = list(best_model.classes_)

# Submission layout: "id" followed by one "Status_<class>" column per class.
proba_columns = {
    f"Status_{cls}": predict_proba[:, i]
    for i, cls in enumerate(class_labels)
}
submission = pd.DataFrame({"id": test["id"], **proba_columns})

submission.to_csv("submission.csv", index=False)
print("\nüìÅ Submission saved to submission.csv")

