In [2]:

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


In [None]:
# =========================
# 2. LOAD DATA (KAGGLE)
# =========================
# Read the three competition CSVs in one pass (train, test, sample submission).
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/your-dataset/train.csv",
        "/kaggle/input/your-dataset/test.csv",
        "/kaggle/input/your-dataset/sample_submission.csv",
    )
)

# Report (rows, columns) of each frame as a quick sanity check.
for shape_label, frame in (
    ("Train shape:", train_df),
    ("Test shape :", test_df),
    ("Submission shape:", sample_submission),
):
    print(shape_label, frame.shape)


In [None]:
# =========================
# 3. TARGET & FEATURES
# =========================
# The target is taken to be the last column of the training frame.
target = train_df.columns[-1]

# The first column of sample_submission is treated as the row identifier.
# An identifier carries no signal about the target, so it is kept out of the
# feature matrices instead of being fed to the models as a feature.
id_col = sample_submission.columns[0]
non_feature_cols = [target] if id_col == target else [target, id_col]

# errors="ignore": the identifier may be absent from train, and the target is
# normally absent from test.
X = train_df.drop(columns=non_feature_cols, errors="ignore")
y = train_df[target]

# Encode target (Classification): labels become integers 0..n_classes-1;
# `le` is kept so predictions can be mapped back to the original labels.
le = LabelEncoder()
y = le.fit_transform(y)

# drop() returns a new frame, so test_df itself is left untouched.
X_test_final = test_df.drop(columns=non_feature_cols, errors="ignore")


In [None]:
# =========================
# 4. TRAIN-VALID SPLIT
# =========================
# Hold out 20% of the rows for validation. stratify=y keeps the class
# proportions of y in both parts; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
# =========================
# 5. DATA PRE-PROCESSING
# =========================
# Group the feature columns by dtype. "number" covers every numeric width
# (not only int64/float64) and bool flags are handled as 0/1 numerics, so
# those columns are no longer silently discarded: the ColumnTransformer below
# uses the default remainder="drop", which removes every unlisted column.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()

# Make any column that still matches neither group (e.g. datetimes) visible
# instead of letting it vanish without notice.
ignored_cols = [
    col for col in X.columns
    if col not in cat_cols and col not in num_cols
]
if ignored_cols:
    print("Columns ignored by preprocessing:", ignored_cols)

preprocess = ColumnTransformer([
    # Categorical: fill gaps with the most frequent value, then one-hot
    # encode; categories unseen during fit are encoded as all zeros.
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols),

    # Numeric: fill gaps with the column mean, then standardize.
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ]), num_cols)
])


In [None]:
# =========================
# 6. MODEL SELECTION
# =========================
# Candidate classifiers keyed by display name. Insertion order is the order
# in which they are trained and reported.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=2000, n_jobs=-1),
    RandomForest=RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    XGBoost=XGBClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric="logloss", n_jobs=-1,
    ),
    LightGBM=LGBMClassifier(n_estimators=300, learning_rate=0.05, n_jobs=-1),
    CatBoost=CatBoostClassifier(iterations=300, learning_rate=0.05, depth=6, verbose=0),
)


In [None]:
# =========================
# 7. TRAIN, EVALUATE & BENCHMARK
# =========================
# Fit every candidate on the training split, score it on the validation
# split, and keep the fitted pipeline with the lowest validation log-loss.
results = []

best_model = None
best_logloss = np.inf

for name, model in models.items():
    print(f"\nTraining {name} ...")

    # Pipeline.fit fits its steps in place (no cloning). Giving each pipeline
    # its own copy of the preprocessor keeps the stored best_model from
    # aliasing a transformer that later iterations refit.
    pipeline = Pipeline([
        ("preprocessing", clone(preprocess)),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)

    # Flatten: some estimators may return predictions as an (n, 1) column
    # rather than a 1-D array.
    preds = np.ravel(pipeline.predict(X_valid))
    probs = pipeline.predict_proba(X_valid)

    acc = accuracy_score(y_valid, preds)
    # Pass the classes the model was fitted on so the probability columns are
    # matched to labels even if a rare class is missing from y_valid
    # (log_loss would otherwise infer the label set from y_valid alone).
    ll = log_loss(y_valid, probs, labels=pipeline.classes_)

    results.append({
        "Model": name,
        "Accuracy": acc,
        "LogLoss": ll
    })

    print("Accuracy :", acc)
    print("LogLoss  :", ll)

    # Lower log-loss is better; ties keep the earlier model.
    if ll < best_logloss:
        best_logloss = ll
        best_model = pipeline


In [None]:
# =========================
# 8. BENCHMARK TABLE
# =========================
# One row per trained model, ordered from lowest to highest validation
# log-loss.
benchmark_df = pd.DataFrame(results).sort_values("LogLoss")

print("\nMODEL BENCHMARK COMPARISON", benchmark_df, sep="\n")


In [None]:
# =========================
# 9. SIMILARITY SCORE
# (Accuracy normalized vs best model)
# =========================
# The reference accuracy is read from the first row of the benchmark table;
# every model's accuracy is expressed as a ratio to that value.
best_acc = benchmark_df["Accuracy"].iloc[0]

benchmark_df["Similarity_Score"] = benchmark_df["Accuracy"].div(best_acc)

print("\nBENCHMARK + SIMILARITY SCORE", benchmark_df, sep="\n")


In [None]:
# =========================
# 10. FINAL PREDICTION & SUBMISSION
# =========================
# Fail with a clear message if no model was ever selected, instead of an
# AttributeError on None.
if best_model is None:
    raise RuntimeError(
        "No fitted model is available: run the training cell first and "
        "check that at least one model produced a finite log-loss."
    )

# Flatten first: some estimators may return predictions as an (n, 1) column,
# while the label decoder and the submission column expect 1-D data.
final_preds = np.ravel(best_model.predict(X_test_final))
# Map the integer class codes back to the original target labels.
final_preds = le.inverse_transform(final_preds)

# The first column of the sample submission names the identifier column.
# Use the test set's own ids when it has that column, else a 0..n-1 counter.
id_col = sample_submission.columns[0]
row_ids = test_df[id_col] if id_col in test_df.columns else np.arange(len(test_df))

# Build the frame in one step rather than column by column.
submission = pd.DataFrame({id_col: row_ids, target: final_preds})

submission.to_csv("submission_final.csv", index=False)
print("\nsubmission_final.csv saved!")
print(submission.head())
