In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, file_name)
    for root, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


In [None]:
# =========================
# 2. LOAD DATA (3 LINKS)
# =========================
# Read the training set, the test set and the sample submission from CSV.
# NOTE(review): "your-dataset" looks like a placeholder directory name -
# confirm it matches the dataset actually attached to the notebook.
train_df = pd.read_csv(filepath_or_buffer="/kaggle/input/your-dataset/train.csv")
test_df = pd.read_csv(filepath_or_buffer="/kaggle/input/your-dataset/test.csv")
sample_submission = pd.read_csv(
    filepath_or_buffer="/kaggle/input/your-dataset/sample_submission.csv"
)

# Report the (rows, columns) shape of each loaded frame.
for label, frame in (
    ("Train shape:", train_df),
    ("Test shape :", test_df),
    ("Sample submission shape:", sample_submission),
):
    print(label, frame.shape)


In [None]:
# =========================
# 3. TARGET & FEATURES
# =========================
# The column layout comes from the sample submission: its first column is the
# row identifier written back into the final submission, and every remaining
# column is a regression target (multi-output regression).
ID_COL = sample_submission.columns[0]
TARGET_COLS = sample_submission.columns[1:].tolist()

# Fail early, with a readable message, if the training file lacks a target.
missing_targets = [col for col in TARGET_COLS if col not in train_df.columns]
if missing_targets:
    raise KeyError(f"Target columns missing from train_df: {missing_targets}")

# Features are everything except the targets and the identifier. The
# identifier labels rows rather than describing them, so it is kept out of
# the model inputs. errors="ignore" keeps this working when the identifier
# column is absent from a frame.
# NOTE(review): assumes the first sample-submission column carries no
# predictive signal (e.g. it is not a timestamp) - confirm for this dataset.
X = train_df.drop(columns=TARGET_COLS).drop(columns=[ID_COL], errors="ignore")
y = train_df[TARGET_COLS]

# drop() returns a new frame, so test_df itself is left untouched.
X_test_final = test_df.drop(columns=[ID_COL], errors="ignore")


In [None]:
# =========================
# 4. TRAIN-VALID SPLIT
# =========================
# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)


In [None]:
# =========================
# 5. DATA PRE-PROCESSING
# =========================
# Categorical features: string/object and pandas categorical columns.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
# Numeric features: "number" matches every numeric dtype (int32, float32,
# uint8, ...), not only int64/float64, so down-cast numeric columns are no
# longer silently discarded by the ColumnTransformer.
num_cols = X.select_dtypes(include="number").columns.tolist()

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
categorical_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: fill gaps with the column mean, then standardise.
numeric_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_steps, cat_cols),
        ("num", numeric_steps, num_cols),
    ],
    # Columns in neither list (e.g. bool or datetime) are dropped; this is the
    # library default, spelled out so the behaviour is visible to the reader.
    remainder="drop",
)


In [None]:
# =========================
# 6. MODEL SELECTION
# =========================
# One seed shared by every stochastic learner, so all candidates are seeded
# explicitly and consistently instead of relying on library defaults.
RANDOM_STATE = 42

# Candidate regressors keyed by display name. Each base estimator is wrapped
# in MultiOutputRegressor, which fits one independent copy per target column.
models = {
    "LinearRegression": MultiOutputRegressor(
        LinearRegression()
    ),

    "RandomForest": MultiOutputRegressor(
        RandomForestRegressor(
            n_estimators=300,
            random_state=RANDOM_STATE,
            n_jobs=-1
        )
    ),

    "XGBoost": MultiOutputRegressor(
        XGBRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            # Row and column subsampling are random, hence the explicit seed.
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=RANDOM_STATE,
            n_jobs=-1
        )
    ),

    "LightGBM": MultiOutputRegressor(
        LGBMRegressor(
            n_estimators=300,
            learning_rate=0.05,
            random_state=RANDOM_STATE,
            n_jobs=-1
        )
    ),

    "CatBoost": MultiOutputRegressor(
        CatBoostRegressor(
            iterations=300,
            learning_rate=0.05,
            depth=6,
            random_seed=RANDOM_STATE,
            verbose=0  # silence per-iteration training logs
        )
    )
}


In [None]:
# =========================
# 7. TRAIN, EVALUATE & BENCHMARK
# =========================
# Fit every candidate on the training split, score it on the validation split,
# and remember the fitted pipeline with the lowest validation RMSE.
results = []
best_model = None
best_rmse = np.inf

for name, model in models.items():
    print(f"\nTraining {name} ...")

    # Pipeline keeps the step objects it is given without copying them, so
    # each pipeline receives its own unfitted clone of the preprocessor.
    # Otherwise every later fit would overwrite the fitted state inside the
    # pipeline already stored as best_model.
    pipe = Pipeline([
        ("pre", clone(preprocess)),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_valid)

    # mean_squared_error averages over all targets by default, so this is a
    # single RMSE for the whole multi-output problem.
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    r2 = r2_score(y_valid, preds, multioutput="uniform_average")

    results.append({
        "Model": name,
        "RMSE": rmse,
        "R2": r2
    })

    print("RMSE:", rmse)
    print("R2  :", r2)

    # Strict comparison: ties keep the earlier model, and a NaN score never wins.
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = pipe

# Guard against an empty model dict or all-NaN scores, which would otherwise
# surface later as an opaque error when best_model is used for prediction.
if best_model is None:
    raise RuntimeError("No model produced a finite validation RMSE.")


In [None]:
# =========================
# 8. BENCHMARK TABLE
# =========================
# One row per evaluated model, ordered from lowest to highest RMSE.
benchmark_df = pd.DataFrame(results)
benchmark_df = benchmark_df.sort_values(by="RMSE")

print("\nMODEL BENCHMARK COMPARISON", benchmark_df, sep="\n")


In [None]:
# =========================
# 9. FINAL PREDICTION
# =========================
# Predict every target for the test rows with the best validated pipeline.
final_preds = best_model.predict(X_test_final)
submission = pd.DataFrame(final_preds, columns=TARGET_COLS)

# The identifier column is named after the first sample-submission column.
# Its values come from the test set when that column exists there; otherwise
# a 0..n-1 running index is used.
id_col = sample_submission.columns[0]
if id_col in test_df.columns:
    id_values = test_df[id_col]
else:
    id_values = np.arange(len(test_df))

# Put the identifier first so it precedes the target columns.
submission.insert(0, id_col, id_values)


In [None]:
# =========================
# 10. SAVE SUBMISSION
# =========================
# Write the submission to the working directory without the pandas index,
# then confirm and preview the first five rows.
submission.to_csv(path_or_buf="submission_final.csv", index=False)
print("\nsubmission_final.csv saved!", submission.head(5), sep="\n")
