# In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# -------------------- LOAD DATA --------------------
# Competition CSVs, read from the Kaggle input mount.
train = pd.read_csv("/kaggle/input/mse-2-ai-201-b-aiml-a/train.csv")
test = pd.read_csv("/kaggle/input/mse-2-ai-201-b-aiml-a/test.csv")

# -------------------- FEATURES & TARGET --------------------
# "Class" is the target column. "id" is the row identifier the submission is
# keyed on, not a predictor: leaving it in lets the forest split on an
# arbitrary row number. errors="ignore" keeps this working if train.csv has
# no "id" column (assumed to mirror test.csv -- confirm against the data).
X = train.drop(columns="Class").drop(columns="id", errors="ignore")

# Integer-encode the class labels; LabelEncoder orders classes by sorted
# label value, so the mapping is deterministic for a given label column.
y = LabelEncoder().fit_transform(train["Class"])

# -------------------- PREPROCESSING --------------------
# Column groups are derived from the training features by dtype.
numeric_cols = X.select_dtypes(np.number).columns
categorical_cols = X.select_dtypes("object").columns

# Numeric columns: fill gaps with the median, then standardise.
numeric_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])

# Object columns: fill gaps with the mode, then one-hot encode. Categories
# that were not present at fit time are ignored instead of raising.
categorical_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("oh", OneHotEncoder(handle_unknown="ignore")),
])

# Columns in neither group are not passed on (ColumnTransformer's default
# remainder behaviour).
prep = ColumnTransformer(transformers=[
    ("num", numeric_pipe, numeric_cols),
    ("cat", categorical_pipe, categorical_cols),
])

# -------------------- MODEL PIPELINE --------------------
# Random forest with a fixed seed for reproducibility, using all CPU cores.
forest = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Preprocessing and classifier chained into one estimator. The step names
# "prep" and "rf" are what double-underscore parameter keys such as
# "rf__max_depth" resolve against.
model = Pipeline(steps=[
    ("prep", prep),
    ("rf", forest),
])

# -------------------- TRAIN / VALIDATE --------------------
# Stratified 80/20 hold-out split with a fixed seed.
Xtr, Xval, ytr, yval = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit the untuned pipeline on the training part and score it on the hold-out.
model.fit(Xtr, ytr)
val_pred = model.predict(Xval)
val_acc = accuracy_score(yval, val_pred)
print("Validation Accuracy:", val_acc)

# -------------------- HYPERPARAMETER TUNING --------------------
# 3-fold grid search over forest size and depth. The "rf__" prefix routes
# each key to the "rf" step of the pipeline.
param_grid = {"rf__n_estimators": [100, 200], "rf__max_depth": [None, 10]}
grid = GridSearchCV(model, param_grid, cv=3)
grid.fit(Xtr, ytr)

# Report the winning configuration and score it on the hold-out while that
# data is still unseen by the tuned model.
print("Best params:", grid.best_params_)
print("Tuned Validation Accuracy:", accuracy_score(yval, grid.predict(Xval)))

# The search only saw the training split. Refit the winning pipeline on every
# labelled row so the model used for the submission is not limited to 80% of
# the data. This must stay after the hold-out scoring above.
best_model = grid.best_estimator_.fit(X, y)

# -------------------- FEATURE ALIGNMENT CHECK --------------------
# Diagnostic only: show the column names present on one side but not the other.
train_cols, test_cols = X.columns, test.columns
print("Train - Test:", train_cols.difference(test_cols))
print("Test - Train:", test_cols.difference(train_cols))

# -------------------- PREDICTION & SUBMISSION --------------------
# Encoder fitted on the training labels, used to map the predicted integer
# codes back to the original class names.
le = LabelEncoder()
le.fit(train["Class"])

encoded_pred = best_model.predict(test)
pred = le.inverse_transform(encoded_pred)

# Two-column submission file: row id and predicted class label.
submission = pd.DataFrame({"id": test["id"], "Class": pred})
submission.to_csv("submission.csv", index=False)

# NOTE: unused draft of a per-class probability submission. It refers to
# `label_map`, `probs` and `test_ids`, none of which are defined in the code
# above, so it cannot simply be uncommented.
# sample_sub = pd.read_csv("/kaggle/input/mse-2-ai-201-b-aiml-a/sample_submission.csv")
# print(sample_sub.columns)
# submission = pd.DataFrame()
# #submission["id"] = test_ids

# for col in sample_sub.columns[1:]:  # skip id
#     class_name = col.replace("Status_", "")

#     if class_name in label_map:
#         submission[col] = probs[:, list(label_map).index(class_name)]
#     else:
#         submission[col] = 0.0  # class not seen in training

