In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [8]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
train_df = pd.read_csv("/content/train (5).csv")
test_df = pd.read_csv("/content/test (4).csv")
sample_submission = pd.read_csv("/kaggle/input/mock-test-2-mse-2/sample_submission.csv")

In [10]:
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Sample submission shape:", sample_submission.shape)

Train shape: (2250, 16)
Test shape: (750, 15)


NameError: name 'sample_submission' is not defined

In [None]:
target = train_df.columns[-1]

X = train_df.drop(columns=[target])
y_raw = train_df[target]

le = LabelEncoder()
y = le.fit_transform(y_raw)

X_test_final = test_df.copy()

In [None]:
num_cols_viz = X.select_dtypes(include=np.number).columns
num_cols_viz = num_cols_viz.drop("id", errors="ignore")

# ðŸ”¹ Histograms
for col in num_cols_viz:
    plt.figure(figsize=(5,3))
    sns.histplot(train_df[col], kde=True)
    plt.title(f"Histogram of {col}")
    plt.show()

In [None]:
for col in num_cols_viz:
    plt.figure(figsize=(5,2))
    sns.boxplot(x=train_df[col])
    plt.title(f"Boxplot of {col}")
    plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(train_df[num_cols_viz].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Numerical Feature Correlation Heatmap")
plt.show()

In [None]:
num_cols_out = X.select_dtypes(include=np.number).columns
num_cols_out = num_cols_out.drop("id", errors="ignore")

for col in num_cols_out:
    Q1 = X[col].quantile(0.25)
    Q3 = X[col].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    X[col] = np.where(X[col] < lower, lower, X[col])
    X[col] = np.where(X[col] > upper, upper, X[col])

    if col in X_test_final.columns:
        X_test_final[col] = np.where(X_test_final[col] < lower, lower, X_test_final[col])
        X_test_final[col] = np.where(X_test_final[col] > upper, upper, X_test_final[col])

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [None]:
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [None]:
preprocess = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols),

        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ]), num_cols),
    ]
)

In [None]:
models = {
    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        eval_metric="mlogloss",
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1
    ),
    "LightGBM": LGBMClassifier(
        objective="multiclass",
        n_estimators=400,
        learning_rate=0.05,
        n_jobs=-1
    ),
    "CatBoost": CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="MultiClass",
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        n_jobs=-1
    ),
    "LogisticRegression": LogisticRegression(
        max_iter=2000,
        n_jobs=-1
    )
}

In [None]:
best_model = None
best_logloss = np.inf
best_acc = 0.0
best_name = ""

for name, model in models.items():
    print(f"\nTraining {name} ...")

    pipe = Pipeline([
        ("pre", preprocess),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_valid)
    prob = pipe.predict_proba(X_valid)

    acc = accuracy_score(y_valid, preds)
    ll = log_loss(y_valid, prob)

    print(f"===== {name} =====")
    print("Accuracy:", acc)
    print("LogLoss :", ll)

    if ll < best_logloss:
        best_logloss = ll
        best_acc = acc
        best_model = pipe
        best_name = name

In [None]:
print("\n===============================")
print(" BEST MODEL SELECTED AUTOMATICALLY ")
print("===============================")
print("Model      :", best_name)
print("Accuracy   :", best_acc)
print("LogLoss    :", best_logloss)

In [None]:
final_prob = best_model.predict_proba(X_test_final)

submission = sample_submission.copy()

for i, class_name in enumerate(le.classes_):
    submission[f"{target}_{class_name}"] = final_prob[:, i]

submission.to_csv("submission_final.csv", index=False)

print("\nsubmission_final.csv saved!")
print(submission.head())