In [None]:
# Si cal:
#pip install pandas numpy scikit-learn lightgbm sentence-transformers

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [None]:
# Load the raw weekly train and test tables.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# Collapse the weekly rows into a single row per product ID.
id_level_aggs = {
    "D_total": ("weekly_demand", "sum"),      # modelling target
    "sales_total": ("weekly_sales", "sum"),
    "first_week": ("num_week_iso", "min"),
    "last_week": ("num_week_iso", "max"),
}
agg_train = train.groupby("ID").agg(**id_level_aggs).reset_index()

# Span between the first and last num_week_iso seen for the ID, both ends included.
agg_train["life_cycle_real"] = agg_train["last_week"].sub(agg_train["first_week"]).add(1)


In [None]:
# Static (non-weekly) attributes carried from the weekly rows to the per-ID tables.
# - "ID" is deliberately NOT listed: it is the groupby key, and selecting it again
#   would keep it both as index and as column, so reset_index() would fail with
#   "cannot insert ID, already exists".
# - "image_embedding" is listed because the image-embedding cell reads it from the
#   per-ID frames (train_id / static_test).
static_cols = [
    "id_season", "aggregated_family", "family", "category", "fabric",
    "color_name", "color_rgb", "length_type", "silhouette_type", "waist_type",
    "sleeve_length_type", "heel_shape_type", "toecap_type", "woven_structure",
    "knit_structure", "print_type", "archetype", "moment", "ocassion",
    "phase_in", "phase_out", "life_cycle_length",
    "num_stores", "num_sizes", "has_plus_size", "price", "year",
    "image_embedding",
]

# One representative row per ID. GroupBy.first() takes, per column, the first
# non-null value inside each week-sorted group (not necessarily the first row).
static_train = (
    train
    .sort_values(["ID", "num_week_iso"])
    .groupby("ID")[static_cols]
    .first()
    .reset_index()
)

# Per-ID modelling table: aggregated target/sales plus the static attributes.
train_id = agg_train.merge(static_train, on="ID", how="left")

In [None]:
# Per-ID static attributes for the test set, built the same way as for train.
# The grouping key "ID" is filtered out of the selected columns: selecting it
# again would keep it both as index and as column, and reset_index() would then
# raise "cannot insert ID, already exists".
static_test = (
    test
    .sort_values(["ID", "num_week_iso"])
    .groupby("ID")[[col for col in static_cols if col != "ID"]]
    .first()
    .reset_index()
)

In [None]:
def build_text_description(row):
    """Build a plain-text product description from its categorical attributes.

    Parameters
    ----------
    row : mapping-like
        Any object exposing ``.get(key, default)`` (e.g. a ``pandas.Series``
        or a ``dict``). Attributes that are absent are simply skipped.

    Returns
    -------
    str
        The non-missing attribute values, in a fixed order, joined by single
        spaces. Empty string when nothing is available.
    """
    text_fields = (
        "aggregated_family", "family", "category", "silhouette_type",
        "length_type", "waist_type", "sleeve_length_type", "fabric",
        "print_type", "color_name", "moment", "ocassion", "archetype",
    )

    parts = []
    for field in text_fields:
        value = row.get(field, "")
        # Skip real missing values (None, NaN, pd.NA, NaT) *before* stringifying;
        # otherwise tokens such as "None" or "<NA>" would leak into the text.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value)
        # "nan" is still filtered for values that were stringified upstream.
        if text and text != "nan":
            parts.append(text)
    return " ".join(parts)

# Attach the generated description to both per-ID frames.
for frame in (train_id, static_test):
    frame["text_desc"] = frame.apply(build_text_description, axis=1)


In [None]:
# Sentence-embedding model used to encode the text descriptions.
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode train first, then test, with identical settings.
train_text_emb, test_text_emb = (
    text_model.encode(
        frame["text_desc"].tolist(),
        batch_size=128,
        show_progress_bar=True,
    )
    for frame in (train_id, static_test)
)


In [None]:
# Project the text embeddings onto 50 principal components; the projection is
# fitted on train only and then applied unchanged to test.
pca_text = PCA(n_components=50, random_state=42)
train_text_pca = pca_text.fit_transform(train_text_emb)
test_text_pca = pca_text.transform(test_text_emb)

# Wrap the components in DataFrames that share the index of the per-ID frames,
# so they can be concatenated column-wise later on.
n_text_components = train_text_pca.shape[1]
text_cols = [f"text_emb_{i}" for i in range(n_text_components)]
df_train_text = pd.DataFrame(data=train_text_pca, index=train_id.index, columns=text_cols)
df_test_text = pd.DataFrame(data=test_text_pca, index=static_test.index, columns=text_cols)


In [None]:
import ast

def parse_emb(x):
    """Convert one stored embedding into a NumPy array.

    Strings are parsed as Python literals (e.g. ``"[0.1, 0.2]"``) before the
    conversion; any other input is handed to ``np.array`` as is.
    """
    parsed = ast.literal_eval(x) if isinstance(x, str) else x
    return np.array(parsed)

# Parse every stored image embedding and stack them row-wise, one matrix per split.
img_emb_train = np.vstack([parse_emb(raw) for raw in train_id["image_embedding"]])
img_emb_test = np.vstack([parse_emb(raw) for raw in static_test["image_embedding"]])

# Optional PCA step: fitted on train, then applied to test.
pca_img = PCA(n_components=50, random_state=42)
train_img_pca = pca_img.fit_transform(img_emb_train)
test_img_pca = pca_img.transform(img_emb_test)

# DataFrames aligned on the index of the per-ID frames for the later concat.
n_img_components = train_img_pca.shape[1]
img_cols = [f"img_emb_{i}" for i in range(n_img_components)]
df_train_img = pd.DataFrame(data=train_img_pca, index=train_id.index, columns=img_cols)
df_test_img = pd.DataFrame(data=test_img_pca, index=static_test.index, columns=img_cols)


In [None]:
# Category-by-year aggregates computed from the weekly train rows.
# NOTE(review): cat_demand_total sums weekly_demand, the same column the per-ID
# target D_total is built from, so using it as a feature for the same
# (category, year) may leak the target — confirm this is intended.
cat_year = (
    train
    .groupby(["category", "year"])
    .agg(
        cat_demand_total=("weekly_demand", "sum"),
        cat_price_mean=("price", "mean"),
        cat_num_products=("ID", "nunique"),
    )
    .reset_index()
    # Explicit ordering so that shift(1) below looks at the preceding year row.
    .sort_values(["category", "year"])
)

# Year-over-year growth per category. "Previous" is the preceding row of the
# same category, i.e. the previous year present in the data for that category.
cat_year["cat_demand_total_prev"] = (
    cat_year.groupby("category")["cat_demand_total"].shift(1)
)
cat_growth_raw = cat_year["cat_demand_total"] / cat_year["cat_demand_total_prev"] - 1

# The first year of a category has no previous value (NaN), and a previous
# total of zero makes the ratio +/-inf. fillna() alone would let the infinities
# through, so they are mapped to NaN first; both cases end up as 0 growth.
cat_year["cat_demand_growth"] = (
    cat_growth_raw.replace([np.inf, -np.inf], np.nan).fillna(0)
)

# More features (rolling statistics, slope, etc.) could be added later.


In [None]:
# Category/year context features attached to both per-ID frames.
cat_feature_cols = ["cat_demand_total", "cat_demand_growth", "cat_price_mean"]
cat_year_features = cat_year[["category", "year"] + cat_feature_cols]

# Dropping any previous copy of the feature columns keeps this cell idempotent:
# without it, running the cell twice would yield "_x"/"_y" suffixed duplicates.
# validate="many_to_one" makes the merge fail loudly if (category, year) were
# ever duplicated on the right side; duplicated rows would break the row-wise
# alignment with the embedding frames that were indexed on these frames.
train_id = (
    train_id
    .drop(columns=cat_feature_cols, errors="ignore")
    .merge(
        cat_year_features,
        on=["category", "year"],
        how="left",
        validate="many_to_one",
    )
)

static_test = (
    static_test
    .drop(columns=cat_feature_cols, errors="ignore")
    .merge(
        cat_year_features,
        on=["category", "year"],
        how="left",
        validate="many_to_one",
    )
)


In [None]:
# Join the per-ID attributes with the text and image components, column-wise.
train_feat = pd.concat([train_id, df_train_text, df_train_img], axis=1)
test_feat = pd.concat([static_test, df_test_text, df_test_img], axis=1)

# Target vector, taken before D_total is removed from the feature frame.
y = train_feat["D_total"].values

# Columns that must not reach the model as features.
drop_cols = [
    "D_total", "sales_total", "first_week", "last_week", "life_cycle_real",
    "text_desc", "image_embedding",  # in case they are still present
    "weekly_sales", "weekly_demand", "Production",  # in case they slipped in somehow
]

# errors="ignore" skips any name that is absent from a given frame.
train_feat = train_feat.drop(columns=drop_cols, errors="ignore")
test_feat = test_feat.drop(columns=drop_cols, errors="ignore")

# Keep the IDs aside for the submission file ...
train_ids, test_ids = train_feat["ID"], test_feat["ID"]

# ... and remove them from the features.
train_feat = train_feat.drop(columns="ID")
test_feat = test_feat.drop(columns="ID")


In [None]:
# Object-dtype columns (as seen in train) are treated as categoricals.
cat_cols = list(train_feat.select_dtypes(include=["object"]).columns)

# Encode train and test together so both end up with the same dummy columns.
n_train_rows = len(train_feat)
full = pd.get_dummies(
    pd.concat([train_feat, test_feat], axis=0),
    columns=cat_cols,
    dummy_na=True,
)

# Split back by position: the first n_train_rows rows are train, the rest test.
train_X = full.iloc[:n_train_rows].reset_index(drop=True)
test_X = full.iloc[n_train_rows:].reset_index(drop=True)


In [None]:
# Year of each train row; train_id keeps the row order train_feat had before encoding.
train_years = train_id.loc[:, "year"].values


In [None]:
# Hold out the most recent year as the validation fold.
last_year = train_years.max()
is_valid = train_years == last_year
is_train = ~is_valid

X_train, y_train = train_X.loc[is_train], y[is_train]
X_valid, y_valid = train_X.loc[is_valid], y[is_valid]


In [None]:
def train_lgb_quantile(X_tr, y_tr, X_va, y_va, alpha,
                       num_boost_round=2000, early_stopping_rounds=100,
                       log_period=100):
    """Train a LightGBM quantile-regression model with early stopping.

    Parameters
    ----------
    X_tr, y_tr : training features and target.
    X_va, y_va : validation features and target, used for early stopping.
    alpha : float
        Quantile level passed to the ``quantile`` objective.
    num_boost_round : int, default 2000
        Maximum number of boosting iterations.
    early_stopping_rounds : int, default 100
        Stop when the validation metric has not improved for this many rounds.
    log_period : int, default 100
        Print the evaluation metrics every ``log_period`` iterations.

    Returns
    -------
    lightgbm.Booster
        The trained model.
    """
    params = {
        "objective": "quantile",
        "alpha": alpha,
        "learning_rate": 0.05,
        "num_leaves": 63,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "metric": "quantile",
        "verbose": -1,
        "seed": 42,
    }

    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    # The validation set is tied to the training set so both share the same binning.
    lgb_valid = lgb.Dataset(X_va, label=y_va, reference=lgb_train)

    # The early_stopping_rounds= / verbose_eval= keyword arguments of lgb.train()
    # were removed in LightGBM 4.0; callbacks are the supported equivalent.
    # Newer releases expose log_evaluation; older ones only have print_evaluation.
    make_log_callback = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
    callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        make_log_callback(period=log_period),
    ]

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=num_boost_round,
        # The training set is listed only so its metric gets logged; the
        # early-stopping callback does not use the training data for stopping.
        valid_sets=[lgb_train, lgb_valid],
        callbacks=callbacks,
    )
    return model

# Candidate quantile levels: one model per level, each scored on the validation fold.
quantiles = [0.5, 0.7, 0.75, 0.8, 0.85, 0.9]
models, pred_valid = {}, {}

for q in quantiles:
    print(f"Entrenando modelo para quantil {q}")
    models[q] = train_lgb_quantile(X_train, y_train, X_valid, y_valid, alpha=q)
    pred_valid[q] = models[q].predict(X_valid)


In [None]:
# Actual total demand (D_total) on the validation fold.
valid_demand_real = y_valid

# For each quantile: mean share of the produced units that is covered by demand.
# NOTE(review): per item the ratio equals 1 whenever production <= demand, so
# this score never penalises under-production and will tend to favour the lowest
# predictions — confirm it is the intended selection metric.
var_scores = {}
for q in quantiles:
    # Production is floored at 1 so the ratio never divides by zero.
    production = np.clip(pred_valid[q], 1, None)
    var_scores[q] = np.mean(np.minimum(valid_demand_real, production) / production)

var_scores


In [None]:
# Quantile with the highest validation VAR (ties resolve to the first one seen).
best_q, best_var = max(var_scores.items(), key=lambda item: item[1])
print("Mejor quantil:", best_q, "con VAR:", best_var)

In [None]:
# Predict the test set with the model of the selected quantile.
best_model = models[best_q]
test_pred = best_model.predict(test_X)

# Floor at 1 unit, then cast to int (astype(int) truncates the fractional part).
test_prod = np.clip(test_pred, 1, None).astype(int)

# Submission file: one row per test ID with its production quantity.
submission = pd.DataFrame({"ID": test_ids, "Production": test_prod})
submission.to_csv("submission.csv", index=False)