In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, accuracy_score

from xgboost import XGBClassifier


# ===============================
# 1. LOAD DATA
# ===============================
# Raw competition tables, read from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/mock-test-2-mse-2/train.csv")
test_df = pd.read_csv("/kaggle/input/mock-test-2-mse-2/test.csv")

# Features are every training column except the target; the target is kept
# as raw labels here.
target_col = "Status"
X = train_df.drop(columns=[target_col])
y_raw = train_df[target_col]

# Work on a copy so test_df itself is never modified; the ids are set aside
# for the submission file.
X_test = test_df.copy()
test_ids = test_df["id"]


# ===============================
# 2. LABEL ENCODING
# ===============================
# Turn the raw target labels into integer codes. The fitted encoder is kept
# in `le` so codes can be translated back to the original labels.
le = LabelEncoder().fit(y_raw)
y = le.transform(y_raw)


# ===============================
# 3. COLUMN TYPES
# ===============================
# Numeric features: select every numeric dtype rather than only
# int64/float64, so columns stored as e.g. int32 or float32 are not left out
# of both lists (and therefore dropped by the ColumnTransformer).
num_cols = X.select_dtypes(include="number").columns.tolist()

# Categorical features: plain object columns plus pandas "category" columns.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# "id" is a row identifier, not a feature: keep it out of both lists.
num_cols = [col for col in num_cols if col != "id"]
cat_cols = [col for col in cat_cols if col != "id"]


# ===============================
# 4. PREPROCESSOR
# ===============================
# Numeric columns: only fill missing values, using the column median.
numeric_tf = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a sparse matrix. min_frequency=5 groups rare
# categories together; handle_unknown="ignore" keeps transform from raising
# on categories that were not seen during fit.
one_hot = OneHotEncoder(
    min_frequency=5,
    sparse_output=True,
    handle_unknown="ignore",
)
categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", one_hot),
])

# Route each column list to its own sub-pipeline.
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols),
])


# ===============================
# 5. TRAIN-VALID SPLIT
# ===============================
# Hold out 20% of the rows for validation, stratified on the encoded target,
# with a fixed seed so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts


# ===============================
# 6. FIT PREPROCESSING ON TRAIN ONLY
# ===============================
# The preprocessor is fitted on the training fold alone; the validation and
# test features are then passed through that already-fitted transform.
X_train_p = preprocess.fit_transform(X_train)
X_valid_p, X_test_p = (preprocess.transform(part) for part in (X_valid, X_test))


# ===============================
# 7. XGBOOST MODEL
# ===============================
# All hyperparameters in one place, grouped by role.
xgb_params = {
    # Task definition
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    # Boosting schedule: up to 3000 rounds, stopped early on the eval set
    "n_estimators": 3000,
    "learning_rate": 0.03,
    "early_stopping_rounds": 100,
    # Tree shape
    "max_depth": 5,
    "min_child_weight": 3,
    "gamma": 0.2,
    # Row / column subsampling
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    # L1 / L2 regularisation
    "reg_alpha": 0.1,
    "reg_lambda": 1.5,
    # Runtime
    "tree_method": "hist",
    "n_jobs": -1,
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)


# ===============================
# 8. TRAIN WITH EARLY STOPPING
# ===============================
# The validation fold is the only eval set, so it is the one monitored for
# early stopping; per-round logging is switched off.
xgb.fit(X_train_p, y_train, eval_set=[(X_valid_p, y_valid)], verbose=False)


# ===============================
# 9. VALIDATION METRICS
# ===============================
# Class probabilities for the validation fold, one column per encoded class.
valid_probs = xgb.predict_proba(X_valid_p)
# Hard predictions: map the most probable column back to its encoded label.
valid_preds = xgb.classes_[np.argmax(valid_probs, axis=1)]

# Pass the full label set explicitly. Without it, log_loss infers the labels
# from y_valid alone and raises when a rare class is missing from the
# validation fold while valid_probs still has a column for it.
val_logloss = log_loss(y_valid, valid_probs, labels=xgb.classes_)
val_accuracy = accuracy_score(y_valid, valid_preds)

# NOTE: this fold was also passed as eval_set during fit, so these numbers
# are somewhat optimistic estimates of performance on unseen data.
print("Validation Log Loss:", val_logloss)
print("Validation Accuracy:", val_accuracy)


# ===============================
# 10. TEST PREDICTIONS
# ===============================
# Class probabilities for the test rows, one column per encoded class.
test_probs = xgb.predict_proba(X_test_p)

# Translate the model's encoded classes back to the original label strings,
# then build a label -> probability-column lookup once instead of searching
# the label array again for every submission column.
class_order = xgb.classes_
label_map = le.inverse_transform(class_order)
prob_col_of = {str(label): idx for idx, label in enumerate(label_map)}


# ===============================
# 11. SUBMISSION
# ===============================
# The sample submission defines which probability columns are expected and
# in what order.
sample_sub = pd.read_csv("/kaggle/input/mock-test-2-mse-2/sample_submission.csv")
print(sample_sub.columns)

submission = pd.DataFrame()
submission["id"] = test_ids

prefix = "Status_"
for col in sample_sub.columns[1:]:  # skip id
    # Strip only the leading prefix; str.replace would also remove any later
    # occurrence of the same text inside a class name.
    class_name = col[len(prefix):] if col.startswith(prefix) else col

    if class_name in prob_col_of:
        submission[col] = test_probs[:, prob_col_of[class_name]]
    else:
        submission[col] = 0.0  # class not seen in training

submission.to_csv("submission.csv", index=False)
print("✔ submission.csv created successfully")


Validation Log Loss: 0.36079385378336837
Validation Accuracy: 0.8576666666666667
Index(['id', 'Status_C', 'Status_CL', 'Status_D'], dtype='object')
✔ submission.csv created successfully
