In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only Kaggle input directory and
# print its full path, one per line.
input_file_paths = (
    os.path.join(folder, leaf)
    for folder, _subdirs, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# pip install -q pandas numpy scikit-learn optuna tensorflow
import os, random, math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras import layers, regularizers, callbacks, Model
import optuna

# ---------- Config ----------
# Module-level settings read by the pipeline functions defined in the cells below.
DATA_PATH = "/kaggle/input/training2/train_presentornot.csv"  # CSV handed to load_and_prepare() by main()
SEED = 42  # seeds TensorFlow / NumPy / random and the train/validation split
VAL_SIZE = 0.2  # test_size passed to train_test_split
TFIDF_MAX_FEATURES = 5000  # max_features of the Description TfidfVectorizer
CAT_LATENT = 16  # upper bound on the product_category autoencoder latent width
UNIT_LATENT = 8  # upper bound on the unit autoencoder latent width
DESC_LATENT = 128  # latent width of the Description (TF-IDF) autoencoder
AE_EPOCHS_SMALL = 200  # max epochs for the product_category and unit autoencoders
AE_EPOCHS_TEXT = 200  # max epochs for the Description autoencoder
AE_BATCH = 512  # batch size for the product_category and unit autoencoders
AE_PATIENCE = 12  # EarlyStopping patience shared by all three autoencoders
# ----------------------------

In [3]:
# Runtime setup: quieten TensorFlow's native logging, seed every RNG the
# pipeline touches, and ask TensorFlow to grow GPU memory on demand.
# NOTE(review): tensorflow is already imported by the previous cell, so this
# environment variable is probably set too late to affect native log output;
# consider moving it above the `import tensorflow` line.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
tf.keras.utils.set_random_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

gpus = tf.config.list_physical_devices("GPU")
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except (RuntimeError, ValueError) as err:
        # Best effort only: memory growth cannot be changed once the runtime is
        # initialised (RuntimeError) or for an invalid device (ValueError).
        # Report it instead of silently swallowing every exception.
        print(f"Could not enable memory growth on {g.name}: {err}")

2025-10-12 14:07:47.951099: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [None]:
def load_and_prepare(path: str) -> pd.DataFrame:
    """Read the training CSV and normalise the columns used by the pipeline.

    Steps:
      * ``present``: missing values become 0 and the column is cast to int;
        if the column is absent it is created as all zeros.
      * ``Description``: missing values, and every row whose ``present`` flag
        is non-zero, are replaced by ``"no extra description"``.
      * ``product_category`` / ``unit``: missing values become ``"unknown"``.
      * ``value``: coerced to numeric; unparsable/missing entries are filled
        with the median of the parsable ones.
      * ``price``: coerced to numeric; rows without a usable price are dropped.

    Args:
        path: Location of the CSV file.

    Returns:
        The cleaned frame with a fresh 0..n-1 index.

    Raises:
        KeyError: if one of the required columns (Description,
            product_category, unit, value, price) is missing.
    """
    df = pd.read_csv(path)

    def _text_column(col: pd.Series, default: str) -> pd.Series:
        # Fill missing values BEFORE casting: astype(str) would turn NaN into
        # the literal string "nan", which a later fillna can no longer see.
        col = col.astype(object)
        return col.where(col.notna(), default).astype(str)

    # df.get("present", 0) returns a plain int when the column is absent, which
    # has no .fillna(); handle the missing-column case explicitly.
    if "present" in df.columns:
        df["present"] = df["present"].fillna(0).astype(int)
    else:
        df["present"] = 0

    description = _text_column(df["Description"], "no extra description")
    # NOTE(review): descriptions are kept only where present == 0 — confirm
    # this is the intended polarity of the flag.
    df["Description"] = description.where(df["present"] == 0, "no extra description")
    df["product_category"] = _text_column(df["product_category"], "unknown")
    df["unit"] = _text_column(df["unit"], "unknown")

    value = pd.to_numeric(df["value"], errors="coerce")
    df["value"] = value.fillna(value.median())
    df["price"] = pd.to_numeric(df["price"], errors="coerce")
    return df[df["price"].notna()].reset_index(drop=True)

def dense_autoencoder(input_dim: int, latent_dim: int, binary_out: bool, noise=0.05, l2=1e-6):
    """Build a one-hidden-layer denoising autoencoder and its encoder view.

    Architecture: input -> GaussianNoise(noise) -> Dense(relu, L2-regularised)
    -> Dense(latent_dim, linear, named "latent") -> Dense(relu) -> Dense(input_dim).
    Both hidden layers have ``max(64, 2 * latent_dim)`` units.

    Args:
        input_dim: Number of input (and reconstructed) features.
        latent_dim: Width of the bottleneck layer.
        binary_out: If True use a sigmoid output with binary cross-entropy,
            otherwise a linear output with mean squared error.
        noise: Standard deviation given to the GaussianNoise layer.
        l2: L2 factor applied to the first hidden layer's kernel.

    Returns:
        ``(autoencoder, encoder)``; only the autoencoder is compiled (Adam).
        The encoder shares the input and layers up to the bottleneck.
    """
    hidden_units = max(64, latent_dim * 2)
    if binary_out:
        output_activation, reconstruction_loss = "sigmoid", "binary_crossentropy"
    else:
        output_activation, reconstruction_loss = "linear", "mse"

    features_in = layers.Input(shape=(input_dim,))
    corrupted = layers.GaussianNoise(noise)(features_in)
    encoder_hidden = layers.Dense(
        hidden_units, activation="relu", kernel_regularizer=regularizers.l2(l2)
    )(corrupted)
    bottleneck = layers.Dense(latent_dim, activation="linear", name="latent")(encoder_hidden)
    decoder_hidden = layers.Dense(hidden_units, activation="relu")(bottleneck)
    reconstruction = layers.Dense(input_dim, activation=output_activation)(decoder_hidden)

    autoencoder = Model(features_in, reconstruction)
    encoder = Model(features_in, bottleneck)
    autoencoder.compile(optimizer="adam", loss=reconstruction_loss)
    return autoencoder, encoder

def fit_autoencoder(X_train, X_val, latent_dim, binary_out, epochs, batch, patience, noise=0.05, l2=1e-6):
    """Train a denoising autoencoder on ``X_train`` and return only its encoder.

    The autoencoder reconstructs its own input; ``X_val`` is used the same way
    for validation. Training stops early once ``val_loss`` has not improved for
    ``patience`` epochs, and the best weights are restored.

    Args:
        X_train, X_val: 2-D feature matrices with the same number of columns.
        latent_dim, binary_out, noise, l2: Forwarded to ``dense_autoencoder``.
        epochs: Maximum number of training epochs.
        batch: Mini-batch size.
        patience: EarlyStopping patience in epochs.

    Returns:
        The encoder model mapping inputs to the latent layer.
    """
    autoencoder, encoder = dense_autoencoder(X_train.shape[1], latent_dim, binary_out, noise, l2)
    stopper = callbacks.EarlyStopping(
        monitor="val_loss",
        patience=patience,
        restore_best_weights=True,
        verbose=0,
    )
    autoencoder.fit(
        X_train,
        X_train,
        validation_data=(X_val, X_val),
        epochs=epochs,
        batch_size=batch,
        callbacks=[stopper],
        verbose=0,
    )
    return encoder

def build_feature_blocks(train_df, val_df):
    """Fit the raw-feature transformers on the training split and apply them to both splits.

    Blocks produced (all float32, dense):
      * one-hot of ``product_category`` and of ``unit`` (categories unseen in
        training encode as all zeros),
      * TF-IDF of ``Description`` (uni- and bi-grams, ``min_df=2``, at most
        ``TFIDF_MAX_FEATURES`` terms),
      * standardised ``value``.

    Args:
        train_df: Training rows; every transformer is fitted on these only.
        val_df: Validation rows; transformed with the fitted transformers.

    Returns:
        Dict holding the fitted transformers (``ohe_cat``, ``ohe_unit``,
        ``tfidf``, ``val_scaler``) and the matrices ``X_cat_*``, ``X_unit_*``,
        ``X_desc_*``, ``v_*`` with ``tr`` / ``va`` suffixes.
    """
    def _dense_one_hot():
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
        # the old keyword; try the new spelling first, then fall back.
        try:
            return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            return OneHotEncoder(handle_unknown="ignore", sparse=False)

    ohe_cat = _dense_one_hot()
    ohe_unit = _dense_one_hot()
    tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=(1, 2), min_df=2)

    X_cat_tr = ohe_cat.fit_transform(train_df[["product_category"]]).astype("float32")
    X_cat_va = ohe_cat.transform(val_df[["product_category"]]).astype("float32")

    X_unit_tr = ohe_unit.fit_transform(train_df[["unit"]]).astype("float32")
    X_unit_va = ohe_unit.transform(val_df[["unit"]]).astype("float32")

    # Densified because the downstream Keras autoencoder is fed dense arrays.
    X_desc_tr = tfidf.fit_transform(train_df["Description"]).astype("float32").toarray()
    X_desc_va = tfidf.transform(val_df["Description"]).astype("float32").toarray()

    val_scaler = StandardScaler()
    v_tr = val_scaler.fit_transform(train_df[["value"]].astype("float32"))
    v_va = val_scaler.transform(val_df[["value"]].astype("float32"))

    blocks = {
        "ohe_cat": ohe_cat, "ohe_unit": ohe_unit, "tfidf": tfidf, "val_scaler": val_scaler,
        "X_cat_tr": X_cat_tr, "X_cat_va": X_cat_va,
        "X_unit_tr": X_unit_tr, "X_unit_va": X_unit_va,
        "X_desc_tr": X_desc_tr, "X_desc_va": X_desc_va,
        "v_tr": v_tr, "v_va": v_va
    }
    return blocks

def encode_blocks(blocks):
    """Compress each raw feature block with its own autoencoder and stack the results.

    Three encoders are trained in this order: product_category, unit,
    Description. The category/unit latent widths are capped by ``CAT_LATENT`` /
    ``UNIT_LATENT`` and otherwise half the one-hot width (never below 2).

    Args:
        blocks: Dict produced by ``build_feature_blocks``.

    Returns:
        ``(X_tr, X_va, encoders)`` where each X is the float32 horizontal stack
        of [category latent, unit latent, description latent, scaled value] and
        ``encoders`` maps ``enc_cat`` / ``enc_unit`` / ``enc_desc`` to the
        trained encoder models.
    """
    def _capped_latent(cap, train_key):
        return min(cap, max(2, blocks[train_key].shape[1] // 2))

    def _embed(encoder, block_name, batch_size):
        # Train split first, then validation split.
        return [
            encoder.predict(blocks[f"X_{block_name}_{split}"], batch_size=batch_size, verbose=0)
            for split in ("tr", "va")
        ]

    desc_batch = min(512, max(64, blocks["X_desc_tr"].shape[0] // 20))

    enc_cat = fit_autoencoder(
        blocks["X_cat_tr"], blocks["X_cat_va"],
        latent_dim=_capped_latent(CAT_LATENT, "X_cat_tr"),
        binary_out=True, epochs=AE_EPOCHS_SMALL, batch=AE_BATCH,
        patience=AE_PATIENCE, noise=0.02,
    )
    enc_unit = fit_autoencoder(
        blocks["X_unit_tr"], blocks["X_unit_va"],
        latent_dim=_capped_latent(UNIT_LATENT, "X_unit_tr"),
        binary_out=True, epochs=AE_EPOCHS_SMALL, batch=AE_BATCH,
        patience=AE_PATIENCE, noise=0.02,
    )
    enc_desc = fit_autoencoder(
        blocks["X_desc_tr"], blocks["X_desc_va"],
        latent_dim=DESC_LATENT, binary_out=False,
        epochs=AE_EPOCHS_TEXT, batch=desc_batch,
        patience=AE_PATIENCE, noise=0.05, l2=1e-6,
    )

    z_cat_tr, z_cat_va = _embed(enc_cat, "cat", 1024)
    z_unit_tr, z_unit_va = _embed(enc_unit, "unit", 1024)
    z_desc_tr, z_desc_va = _embed(enc_desc, "desc", 256)

    X_tr = np.hstack([z_cat_tr, z_unit_tr, z_desc_tr, blocks["v_tr"]]).astype("float32")
    X_va = np.hstack([z_cat_va, z_unit_va, z_desc_va, blocks["v_va"]]).astype("float32")

    return X_tr, X_va, {"enc_cat": enc_cat, "enc_unit": enc_unit, "enc_desc": enc_desc}

def build_regressor(input_dim: int, trial: optuna.trial.Trial) -> tf.keras.Model:
    """Build and compile an MLP price regressor from hyper-parameters drawn from ``trial``.

    Sampled (in this order): activation, number of layers, first-layer width,
    dropout rate, L2 factor, normalisation type, learning rate. Each hidden
    layer after the first is narrower by a factor of 1.2 per step and is
    followed by dropout and the chosen normalisation layer.

    Args:
        input_dim: Number of input features.
        trial: Optuna trial supplying the ``suggest_*`` values.

    Returns:
        A model compiled with Adam, Huber(delta=1.0) loss and an MAE metric.
    """
    # Reset Keras global state before building a new graph.
    tf.keras.backend.clear_session()

    hp_activation = trial.suggest_categorical("activation", ["relu", "swish", "gelu"])
    hp_depth = trial.suggest_int("layers", 2, 5)
    hp_width = trial.suggest_categorical("width", [128, 192, 256, 384, 512, 768, 1024])
    hp_dropout = trial.suggest_float("dropout", 0.0, 0.35)
    hp_l2 = trial.suggest_float("l2", 1e-7, 5e-4, log=True)
    hp_norm = trial.suggest_categorical("norm", ["batch", "layer"])
    hp_lr = trial.suggest_float("lr", 1e-4, 3e-3, log=True)

    def _dense(units):
        return layers.Dense(units, activation=hp_activation,
                            kernel_regularizer=regularizers.l2(hp_l2))

    net_in = layers.Input(shape=(input_dim,))
    hidden = _dense(hp_width)(net_in)
    hidden = layers.Dropout(hp_dropout)(hidden)
    for shrink_step in range(1, hp_depth):
        hidden = _dense(int(hp_width / (1.2 ** shrink_step)))(hidden)
        hidden = layers.Dropout(hp_dropout)(hidden)
        if hp_norm == "batch":
            normaliser = layers.BatchNormalization()
        else:
            normaliser = layers.LayerNormalization()
        hidden = normaliser(hidden)
    price_out = layers.Dense(1, activation="linear")(hidden)

    regressor = Model(net_in, price_out)
    regressor.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_lr),
        loss=tf.keras.losses.Huber(delta=1.0),
        metrics=["mae"],
    )
    return regressor

def tune_and_train(X_tr, y_tr, X_va, y_va, n_trials=25):
    """Search MLP hyper-parameters with Optuna, then refit the best configuration.

    Every trial builds a regressor via ``build_regressor``, trains it with
    early stopping on ``val_mae`` and is scored by validation MAE. The study
    minimises that score; the best trial's parameters are then used to train a
    final model with a longer epoch budget and patience.

    Args:
        X_tr, y_tr: Training features and targets.
        X_va, y_va: Validation features and targets (used for early stopping
            and for scoring trials).
        n_trials: Number of Optuna trials.

    Returns:
        ``(study, best_model)``.
    """
    def objective(trial):
        model = build_regressor(X_tr.shape[1], trial)
        es = callbacks.EarlyStopping(monitor="val_mae", patience=12, restore_best_weights=True, verbose=0)
        model.fit(X_tr, y_tr, validation_data=(X_va, y_va),
                  epochs=300, batch_size=trial.suggest_categorical("batch", [64, 128, 256, 512]),
                  callbacks=[es], verbose=0)
        pred = model.predict(X_va, batch_size=1024, verbose=0).ravel()
        return mean_absolute_error(y_va, pred)

    # Seed the sampler: without it the search differs between runs even though
    # NumPy / TensorFlow / random are all seeded with SEED.
    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    # Replaying the best (frozen) trial returns its stored parameter values.
    best_model = build_regressor(X_tr.shape[1], study.best_trial)
    es = callbacks.EarlyStopping(monitor="val_mae", patience=16, restore_best_weights=True, verbose=0)
    best_model.fit(X_tr, y_tr, validation_data=(X_va, y_va),
                   epochs=500, batch_size=study.best_trial.params.get("batch", 256),
                   callbacks=[es], verbose=0)
    return study, best_model

def main():
    """Run the full pipeline: load, split, featurise, tune the MLP, report and persist.

    Writes the trained regressor, the three encoders, the validation
    predictions and the fitted preprocessing objects under ``artifacts/``.
    """
    frame = load_and_prepare(DATA_PATH)
    train_df, val_df = train_test_split(frame, test_size=VAL_SIZE, random_state=SEED)
    blocks = build_feature_blocks(train_df, val_df)
    X_tr, X_va, encoders = encode_blocks(blocks)

    y_tr, y_va = (part["price"].values.astype("float32") for part in (train_df, val_df))

    study, model = tune_and_train(X_tr, y_tr, X_va, y_va, n_trials=30)

    va_pred = model.predict(X_va, batch_size=1024, verbose=0).ravel()
    mae = mean_absolute_error(y_va, va_pred)
    print(f"Validation MAE: {mae:,.4f}")
    print("Best params:", study.best_trial.params)

    os.makedirs("artifacts", exist_ok=True)
    keras_artifacts = [
        (model, "artifacts/best_regressor.keras"),
        (encoders["enc_cat"], "artifacts/enc_cat.keras"),
        (encoders["enc_unit"], "artifacts/enc_unit.keras"),
        (encoders["enc_desc"], "artifacts/enc_desc.keras"),
    ]
    for network, target in keras_artifacts:
        network.save(target)

    # Keep the original row index so predictions can be joined back to val_df.
    predictions = pd.Series(va_pred, index=val_df.index, name="price_pred")
    predictions.to_csv("artifacts/val_predictions.csv")

    import joblib
    preprocessing_artifacts = [
        ("ohe_cat", "artifacts/ohe_cat.joblib"),
        ("ohe_unit", "artifacts/ohe_unit.joblib"),
        ("tfidf", "artifacts/tfidf.joblib"),
        ("val_scaler", "artifacts/value_scaler.joblib"),
    ]
    for block_key, target in preprocessing_artifacts:
        joblib.dump(blocks[block_key], target)
    print("Saved artifacts/")


if __name__ == "__main__":
    main()




In [None]:
# Kaggle-ready: run this as a single cell. Assumes DATA_PATH points to a CSV with the columns read by load_and_prepare (present, Description, product_category, unit, value, price).
import os, random, json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras import layers, regularizers, callbacks, Model
import optuna
import xgboost as xgb
import joblib

# ---------------- Config ----------------
# Module-level settings read by the pipeline functions defined below in this cell.
DATA_PATH = "/kaggle/input/training2/train_presentornot.csv"  # CSV handed to load_and_prepare(); adjust if your file lives elsewhere under /kaggle/input/<dataset>/
SEED = 42  # seeds TensorFlow / NumPy / random, the train/validation split and XGBoost's random_state
VAL_SIZE = 0.2  # test_size passed to train_test_split
TFIDF_MAX_FEATURES = 5000  # max_features of the Description TfidfVectorizer
CAT_LATENT = 16  # upper bound on the product_category autoencoder latent width
UNIT_LATENT = 8  # upper bound on the unit autoencoder latent width
DESC_LATENT = 128  # latent width of the Description (TF-IDF) autoencoder
AE_EPOCHS_SMALL = 200  # max epochs for the product_category and unit autoencoders
AE_EPOCHS_TEXT = 200  # max epochs for the Description autoencoder
AE_BATCH = 512  # batch size for the product_category and unit autoencoders
AE_PATIENCE = 12  # EarlyStopping patience shared by all three autoencoders
N_TRIALS = 40  # default number of Optuna trials in tune_and_train_xgb()
# ---------------------------------------

# Runtime setup: quieten TensorFlow's native logging, seed every RNG the
# pipeline touches, and ask TensorFlow to grow GPU memory on demand.
# NOTE(review): tensorflow is imported earlier in this cell, so this
# environment variable is probably set too late to affect native log output;
# consider moving it above the `import tensorflow` line.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
np.random.seed(SEED)
random.seed(SEED)
tf.keras.utils.set_random_seed(SEED)

gpus = tf.config.list_physical_devices("GPU")
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except (RuntimeError, ValueError) as err:
        # Best effort only: memory growth cannot be changed once the runtime is
        # initialised (RuntimeError) or for an invalid device (ValueError).
        # Report it instead of silently swallowing every exception.
        print(f"Could not enable memory growth on {g.name}: {err}")

def load_and_prepare(path: str) -> pd.DataFrame:
    """Read the training CSV and normalise the columns used by the pipeline.

    Steps:
      * ``present``: missing values become 0 and the column is cast to int;
        if the column is absent it is created as all zeros.
      * ``Description``: missing values, and every row whose ``present`` flag
        is non-zero, are replaced by ``"no extra description"``.
      * ``product_category`` / ``unit``: missing values become ``"unknown"``.
      * ``value``: coerced to numeric; unparsable/missing entries are filled
        with the median of the parsable ones.
      * ``price``: coerced to numeric; rows without a usable price are dropped.

    Args:
        path: Location of the CSV file.

    Returns:
        The cleaned frame with a fresh 0..n-1 index.

    Raises:
        KeyError: if one of the required columns (Description,
            product_category, unit, value, price) is missing.
    """
    df = pd.read_csv(path)

    def _text_column(col: pd.Series, default: str) -> pd.Series:
        # Fill missing values BEFORE casting: astype(str) would turn NaN into
        # the literal string "nan", which a later fillna can no longer see.
        col = col.astype(object)
        return col.where(col.notna(), default).astype(str)

    # df.get("present", 0) returns a plain int when the column is absent, which
    # has no .fillna(); handle the missing-column case explicitly.
    if "present" in df.columns:
        df["present"] = df["present"].fillna(0).astype(int)
    else:
        df["present"] = 0

    description = _text_column(df["Description"], "no extra description")
    # NOTE(review): descriptions are kept only where present == 0 — confirm
    # this is the intended polarity of the flag.
    df["Description"] = description.where(df["present"] == 0, "no extra description")
    df["product_category"] = _text_column(df["product_category"], "unknown")
    df["unit"] = _text_column(df["unit"], "unknown")

    # Take the median of the COERCED values: the raw column may hold
    # non-numeric strings, on which Series.median() fails.
    value = pd.to_numeric(df["value"], errors="coerce")
    df["value"] = value.fillna(value.median())
    df["price"] = pd.to_numeric(df["price"], errors="coerce")
    return df[df["price"].notna()].reset_index(drop=True)

def dense_autoencoder(input_dim: int, latent_dim: int, binary_out: bool, noise=0.05, l2=1e-6):
    """Build a one-hidden-layer denoising autoencoder and its encoder view.

    Architecture: input -> GaussianNoise(noise) -> Dense(relu, L2-regularised)
    -> Dense(latent_dim, linear, named "latent") -> Dense(relu) -> Dense(input_dim).
    Both hidden layers have ``max(64, 2 * latent_dim)`` units.

    Args:
        input_dim: Number of input (and reconstructed) features.
        latent_dim: Width of the bottleneck layer.
        binary_out: If True use a sigmoid output with binary cross-entropy,
            otherwise a linear output with mean squared error.
        noise: Standard deviation given to the GaussianNoise layer.
        l2: L2 factor applied to the first hidden layer's kernel.

    Returns:
        ``(autoencoder, encoder)``; only the autoencoder is compiled (Adam).
        The encoder shares the input and layers up to the bottleneck.
    """
    hidden_units = max(64, latent_dim * 2)
    if binary_out:
        output_activation, reconstruction_loss = "sigmoid", "binary_crossentropy"
    else:
        output_activation, reconstruction_loss = "linear", "mse"

    features_in = layers.Input(shape=(input_dim,))
    corrupted = layers.GaussianNoise(noise)(features_in)
    encoder_hidden = layers.Dense(
        hidden_units, activation="relu", kernel_regularizer=regularizers.l2(l2)
    )(corrupted)
    bottleneck = layers.Dense(latent_dim, activation="linear", name="latent")(encoder_hidden)
    decoder_hidden = layers.Dense(hidden_units, activation="relu")(bottleneck)
    reconstruction = layers.Dense(input_dim, activation=output_activation)(decoder_hidden)

    autoencoder = Model(features_in, reconstruction)
    encoder = Model(features_in, bottleneck)
    autoencoder.compile(optimizer="adam", loss=reconstruction_loss)
    return autoencoder, encoder

def fit_autoencoder(X_train, X_val, latent_dim, binary_out, epochs, batch, patience, noise=0.05, l2=1e-6):
    """Train a denoising autoencoder on ``X_train`` and return only its encoder.

    The autoencoder reconstructs its own input; ``X_val`` is used the same way
    for validation. Training stops early once ``val_loss`` has not improved for
    ``patience`` epochs, and the best weights are restored.

    Args:
        X_train, X_val: 2-D feature matrices with the same number of columns.
        latent_dim, binary_out, noise, l2: Forwarded to ``dense_autoencoder``.
        epochs: Maximum number of training epochs.
        batch: Mini-batch size.
        patience: EarlyStopping patience in epochs.

    Returns:
        The encoder model mapping inputs to the latent layer.
    """
    autoencoder, encoder = dense_autoencoder(X_train.shape[1], latent_dim, binary_out, noise, l2)
    stopper = callbacks.EarlyStopping(
        monitor="val_loss",
        patience=patience,
        restore_best_weights=True,
        verbose=0,
    )
    autoencoder.fit(
        X_train,
        X_train,
        validation_data=(X_val, X_val),
        epochs=epochs,
        batch_size=batch,
        callbacks=[stopper],
        verbose=0,
    )
    return encoder

def build_feature_blocks(train_df, val_df):
    """Fit the raw-feature transformers on the training split and apply them to both splits.

    Blocks produced (all float32, dense):
      * one-hot of ``product_category`` and of ``unit`` (categories unseen in
        training encode as all zeros),
      * TF-IDF of ``Description`` (uni- and bi-grams, ``min_df=2``, at most
        ``TFIDF_MAX_FEATURES`` terms),
      * standardised ``value``.

    Args:
        train_df: Training rows; every transformer is fitted on these only.
        val_df: Validation rows; transformed with the fitted transformers.

    Returns:
        Dict holding the fitted transformers (``ohe_cat``, ``ohe_unit``,
        ``tfidf``, ``val_scaler``) and the matrices ``X_cat_*``, ``X_unit_*``,
        ``X_desc_*``, ``v_*`` with ``tr`` / ``va`` suffixes.
    """
    def _dense_one_hot():
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
        # the old keyword; try the new spelling first, then fall back.
        try:
            return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            return OneHotEncoder(handle_unknown="ignore", sparse=False)

    ohe_cat = _dense_one_hot()
    ohe_unit = _dense_one_hot()
    tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=(1, 2), min_df=2)

    X_cat_tr = ohe_cat.fit_transform(train_df[["product_category"]]).astype("float32")
    X_cat_va = ohe_cat.transform(val_df[["product_category"]]).astype("float32")

    X_unit_tr = ohe_unit.fit_transform(train_df[["unit"]]).astype("float32")
    X_unit_va = ohe_unit.transform(val_df[["unit"]]).astype("float32")

    # Densified because the downstream Keras autoencoder is fed dense arrays.
    X_desc_tr = tfidf.fit_transform(train_df["Description"]).astype("float32").toarray()
    X_desc_va = tfidf.transform(val_df["Description"]).astype("float32").toarray()

    val_scaler = StandardScaler()
    v_tr = val_scaler.fit_transform(train_df[["value"]].astype("float32"))
    v_va = val_scaler.transform(val_df[["value"]].astype("float32"))

    return {
        "ohe_cat": ohe_cat, "ohe_unit": ohe_unit, "tfidf": tfidf, "val_scaler": val_scaler,
        "X_cat_tr": X_cat_tr, "X_cat_va": X_cat_va,
        "X_unit_tr": X_unit_tr, "X_unit_va": X_unit_va,
        "X_desc_tr": X_desc_tr, "X_desc_va": X_desc_va,
        "v_tr": v_tr, "v_va": v_va
    }

def encode_blocks(blocks):
    """Compress each raw feature block with its own autoencoder and stack the results.

    Three encoders are trained in this order: product_category, unit,
    Description. The category/unit latent widths are capped by ``CAT_LATENT`` /
    ``UNIT_LATENT`` and otherwise half the one-hot width (never below 2).

    Args:
        blocks: Dict produced by ``build_feature_blocks``.

    Returns:
        ``(X_tr, X_va, encoders)`` where each X is the float32 horizontal stack
        of [category latent, unit latent, description latent, scaled value] and
        ``encoders`` maps ``enc_cat`` / ``enc_unit`` / ``enc_desc`` to the
        trained encoder models.
    """
    def _capped_latent(cap, train_key):
        return min(cap, max(2, blocks[train_key].shape[1] // 2))

    def _embed(encoder, block_name, batch_size):
        # Train split first, then validation split.
        return [
            encoder.predict(blocks[f"X_{block_name}_{split}"], batch_size=batch_size, verbose=0)
            for split in ("tr", "va")
        ]

    desc_batch = min(512, max(64, blocks["X_desc_tr"].shape[0] // 20))

    enc_cat = fit_autoencoder(
        blocks["X_cat_tr"], blocks["X_cat_va"],
        latent_dim=_capped_latent(CAT_LATENT, "X_cat_tr"),
        binary_out=True, epochs=AE_EPOCHS_SMALL, batch=AE_BATCH,
        patience=AE_PATIENCE, noise=0.02,
    )
    enc_unit = fit_autoencoder(
        blocks["X_unit_tr"], blocks["X_unit_va"],
        latent_dim=_capped_latent(UNIT_LATENT, "X_unit_tr"),
        binary_out=True, epochs=AE_EPOCHS_SMALL, batch=AE_BATCH,
        patience=AE_PATIENCE, noise=0.02,
    )
    enc_desc = fit_autoencoder(
        blocks["X_desc_tr"], blocks["X_desc_va"],
        latent_dim=DESC_LATENT, binary_out=False,
        epochs=AE_EPOCHS_TEXT, batch=desc_batch,
        patience=AE_PATIENCE, noise=0.05, l2=1e-6,
    )

    z_cat_tr, z_cat_va = _embed(enc_cat, "cat", 1024)
    z_unit_tr, z_unit_va = _embed(enc_unit, "unit", 1024)
    z_desc_tr, z_desc_va = _embed(enc_desc, "desc", 256)

    X_tr = np.hstack([z_cat_tr, z_unit_tr, z_desc_tr, blocks["v_tr"]]).astype("float32")
    X_va = np.hstack([z_cat_va, z_unit_va, z_desc_va, blocks["v_va"]]).astype("float32")
    return X_tr, X_va, {"enc_cat": enc_cat, "enc_unit": enc_unit, "enc_desc": enc_desc}

def build_xgb_params(trial, use_gpu: bool):
    """Assemble XGBoost regressor parameters, sampling the tunable ones from ``trial``.

    Tunable parameters are drawn in a fixed order: eta, max_depth,
    min_child_weight, subsample, colsample_bytree, alpha, lambda, gamma.
    The fixed entries are the squared-error objective, the MAE eval metric,
    ``random_state=SEED``, ``n_jobs=-1`` and the tree method.

    Args:
        trial: Object exposing Optuna's ``suggest_float`` / ``suggest_int``.
        use_gpu: Selects ``"gpu_hist"`` when True, otherwise ``"hist"``.

    Returns:
        Dict of keyword arguments for ``xgb.XGBRegressor``.
    """
    # (xgboost key, optuna name, kind, low, high, log-scale) in sampling order.
    search_space = (
        ("learning_rate", "eta", "float", 1e-3, 0.3, True),
        ("max_depth", "max_depth", "int", 3, 12, False),
        ("min_child_weight", "min_child_weight", "float", 1e-2, 10.0, True),
        ("subsample", "subsample", "float", 0.6, 1.0, False),
        ("colsample_bytree", "colsample_bytree", "float", 0.6, 1.0, False),
        ("reg_alpha", "alpha", "float", 1e-8, 10.0, True),
        ("reg_lambda", "lambda", "float", 1e-6, 10.0, True),
        ("gamma", "gamma", "float", 0.0, 5.0, False),
    )

    xgb_params = {"objective": "reg:squarederror"}
    for xgb_key, optuna_name, kind, low, high, log_scale in search_space:
        if kind == "int":
            xgb_params[xgb_key] = trial.suggest_int(optuna_name, low, high)
        else:
            xgb_params[xgb_key] = trial.suggest_float(optuna_name, low, high, log=log_scale)

    xgb_params.update({
        "eval_metric": "mae",
        "random_state": SEED,
        "n_jobs": -1,
        "tree_method": "gpu_hist" if use_gpu else "hist",
    })
    return xgb_params

def tune_and_train_xgb(X_tr, y_tr, X_va, y_va, n_trials=N_TRIALS):
    """Search XGBoost hyper-parameters with Optuna, then refit the best configuration.

    Each trial samples booster parameters via ``build_xgb_params`` plus
    ``n_estimators`` and ``early_stopping_rounds``, fits an early-stopped
    regressor against the validation set and is scored by validation MAE.
    The GPU tree method is used when TensorFlow reports a GPU; if XGBoost
    raises an ``XGBoostError`` the fit is retried with ``tree_method="hist"``.

    Args:
        X_tr, y_tr: Training features and targets.
        X_va, y_va: Validation features and targets (used for early stopping
            and for scoring trials).
        n_trials: Number of Optuna trials.

    Returns:
        ``(study, model)`` where ``model`` is refitted with the best trial's
        parameters.
    """
    use_gpu = len(tf.config.list_physical_devices("GPU")) > 0

    def _fit_with_cpu_fallback(params, n_estimators, esr):
        """Fit one early-stopped regressor, retrying on CPU if the GPU method fails."""
        def _new_model(model_params):
            # early_stopping_rounds belongs to the constructor: passing it to
            # fit() was deprecated in xgboost 1.6 and raises TypeError in 2.x.
            return xgb.XGBRegressor(**model_params, n_estimators=n_estimators,
                                    early_stopping_rounds=esr, verbosity=0)

        model = _new_model(params)
        try:
            model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
        except xgb.core.XGBoostError:
            model = _new_model({**params, "tree_method": "hist"})
            model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
        return model

    def objective(trial):
        params = build_xgb_params(trial, use_gpu)
        n_estimators = trial.suggest_int("n_estimators", 300, 3000)
        esr = trial.suggest_int("early_stopping_rounds", 50, 200)
        model = _fit_with_cpu_fallback(params, n_estimators, esr)
        return mean_absolute_error(y_va, model.predict(X_va))

    # Seed the sampler: without it the search differs between runs even though
    # NumPy / TensorFlow / random are all seeded with SEED.
    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    best_trial = study.best_trial
    # Replaying the best (frozen) trial returns its stored parameter values.
    model = _fit_with_cpu_fallback(
        build_xgb_params(best_trial, use_gpu),
        best_trial.params["n_estimators"],
        best_trial.params["early_stopping_rounds"],
    )
    return study, model

def main():
    """Run the full pipeline: load, split, featurise, tune XGBoost, report and persist.

    Writes the XGBoost model, the three encoders, the fitted preprocessing
    objects, the validation predictions and the best Optuna parameters under
    ``artifacts/``.
    """
    frame = load_and_prepare(DATA_PATH)
    train_df, val_df = train_test_split(frame, test_size=VAL_SIZE, random_state=SEED)
    blocks = build_feature_blocks(train_df, val_df)
    X_tr, X_va, encoders = encode_blocks(blocks)
    y_tr, y_va = (part["price"].values.astype("float32") for part in (train_df, val_df))

    study, model = tune_and_train_xgb(X_tr, y_tr, X_va, y_va)
    va_pred = model.predict(X_va)
    mae = mean_absolute_error(y_va, va_pred)
    print(f"Validation MAE: {mae:,.4f}")
    print("Best params:", study.best_trial.params)

    os.makedirs("artifacts", exist_ok=True)
    model.save_model("artifacts/xgb_model.json")

    encoder_artifacts = [
        ("enc_cat", "artifacts/enc_cat.keras"),
        ("enc_unit", "artifacts/enc_unit.keras"),
        ("enc_desc", "artifacts/enc_desc.keras"),
    ]
    for encoder_key, target in encoder_artifacts:
        encoders[encoder_key].save(target)

    preprocessing_artifacts = [
        ("ohe_cat", "artifacts/ohe_cat.joblib"),
        ("ohe_unit", "artifacts/ohe_unit.joblib"),
        ("tfidf", "artifacts/tfidf.joblib"),
        ("val_scaler", "artifacts/value_scaler.joblib"),
    ]
    for block_key, target in preprocessing_artifacts:
        joblib.dump(blocks[block_key], target)

    # Keep the original row index so predictions can be joined back to val_df.
    predictions = pd.Series(va_pred, index=val_df.index, name="price_pred")
    predictions.to_csv("artifacts/val_predictions_xgb.csv")

    with open("artifacts/optuna_best_params.json", "w") as f:
        json.dump(study.best_trial.params, f, indent=2)
    print("Saved to artifacts/")


if __name__ == "__main__":
    main()


