In [7]:
# =============================================================
#   ELINET OPTIMITZAT — VERSIÓ COMPLETA I FUNCIONAL
#   (LightGBM + Log-transform + PCA 64/64 + Scaling + Features)
# =============================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
import torch
import ast
import warnings
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# =============================================================
#   CONFIGURACIÓ GLOBAL
# =============================================================
PATH_TRAIN = "train.csv"
PATH_TEST = "test.csv"
SUBMISSION_FILE_OUTPUT = "submission_lgbm_optimitzat.csv"

TEXT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
IMAGE_EMBED_DIM = 512
TEXT_EMBED_DIM = 384

N_COMPONENTS_IMG = 64     # PCA agressiu
N_COMPONENTS_TEXT = 64

TARGET_QUANTILE = 0.82    # Quantil òptim per VAR

# =============================================================
#   HELPERS
# =============================================================

def combine_attributes_jerarquica(row):
    """Combina valors text per embeddings."""
    cols = [
        'aggregated_family','family','category','fabric','color_name',
        'length_type','silhouette_type','waist_type','neck_lapel_type',
        'sleeve_length_type','heel_shape_type','toecap_type',
        'woven_structure','knit_structure','print_type','archetype','moment'
    ]
    out = []
    for c in cols:
        if c in row and pd.notna(row[c]):
            out.append(str(row[c]).strip())
    return " , ".join(out)


def parse_embedding_string(s):
    """Converteix un string a vector float32."""
    try:
        return np.fromstring(s.strip('[]'), sep=",", dtype=np.float32)
    except:
        return np.zeros(IMAGE_EMBED_DIM, dtype=np.float32)


# =============================================================
#   PAS 2: FEATURE ENGINEERING TABULAR
# =============================================================

def create_features(df_train_path, df_test_path):
    print("=== PAS 2: Features Tabulars ===")

    df_train_raw = pd.read_csv(df_train_path, delimiter=';', encoding='utf-8-sig')
    df_test_raw  = pd.read_csv(df_test_path,  delimiter=';', encoding='utf-8-sig')

    agg_funcs = {
        'weekly_demand':'sum','weekly_sales':'sum',
        'Production':'first','id_season':'first','family':'first','category':'first',
        'price':'first','num_stores':'first','num_sizes':'first','life_cycle_length':'first',
        'image_embedding':'first','aggregated_family':'first','fabric':'first','color_name':'first',
        'length_type':'first','silhouette_type':'first','waist_type':'first','neck_lapel_type':'first',
        'sleeve_length_type':'first','heel_shape_type':'first','toecap_type':'first',
        'woven_structure':'first','knit_structure':'first','print_type':'first','archetype':'first',
        'moment':'first'
    }

    agg_cols = {k:v for k,v in agg_funcs.items() if k in df_train_raw.columns}
    df_train = df_train_raw.groupby('ID').agg(agg_cols).reset_index()
    df_train = df_train.rename(columns={'weekly_demand':'total_demand'})

    df_test = df_test_raw.drop_duplicates('ID').reset_index(drop=True)

    # Tendència
    stats = df_train.groupby(['id_season','category'])['total_demand'].mean().reset_index()
    stats = stats.rename(columns={'total_demand':'category_demand_mean'})
    stats['category_demand_last_season'] = stats.groupby('category')['category_demand_mean'].shift(1)
    global_med = stats['category_demand_last_season'].median()

    df_train['is_train'] = 1
    df_test['is_train'] = 0
    df_full = pd.concat([df_train, df_test], ignore_index=True)

    df_full = df_full.merge(
        stats[['id_season','category','category_demand_last_season']],
        on=['id_season','category'], how='left'
    )

    df_full['category_demand_last_season'] = df_full['category_demand_last_season'].fillna(global_med)

    for col in ['price','num_stores','num_sizes','life_cycle_length']:
        if col in df_full:
            df_full[col] = df_full[col].fillna(df_full[col].median())

    # Interaccions
    df_full['price_x_num_stores'] = df_full['price'] * df_full['num_stores']
    df_full['price_per_lifecycle'] = df_full['price'] / (df_full['life_cycle_length']+1)
    df_full['price_vs_trend'] = df_full['price'] / (df_full['category_demand_last_season']+1)
    df_full['stores_per_size'] = df_full['num_stores'] / (df_full['num_sizes']+1)
    df_full['stores_vs_trend'] = df_full['num_stores'] * df_full['category_demand_last_season']
    df_full['trend_x_lifecycle'] = df_full['category_demand_last_season'] * df_full['life_cycle_length']

    # Noves features potents
    df_full['season_index'] = df_full['id_season'] % 4

    cat_sizes = df_train_raw.groupby('category')['ID'].nunique().to_dict()
    df_full['category_scale'] = df_full['category'].map(cat_sizes)
    df_full['category_scale'] = df_full['category_scale'].fillna(df_full['category_scale'].median())

    # Separar train/test
    df_train_proc = df_full[df_full['is_train']==1].copy()
    df_test_proc  = df_full[df_full['is_train']==0].copy()

    return df_train_proc, df_test_proc, df_train_proc['total_demand']


# =============================================================
#   PAS 3: EMBEDDINGS + PCA
# =============================================================

def create_embedding_features(df_train_proc, df_test_proc):
    print("=== PAS 3: Embeddings + PCA 64/64 ===")

    df_train_ids = df_train_proc[['ID']].copy(); df_train_ids['is_train'] = 1
    df_test_ids  = df_test_proc[['ID']].copy(); df_test_ids['is_train']  = 0
    df_base = pd.concat([df_train_ids, df_test_ids], ignore_index=True)

    text_cols = [
        'ID','aggregated_family','family','category','fabric','color_name',
        'length_type','silhouette_type','waist_type','neck_lapel_type','sleeve_length_type',
        'heel_shape_type','toecap_type','woven_structure','knit_structure','print_type',
        'archetype','moment'
    ]

    df_all = pd.concat([
        df_train_proc[text_cols],
        df_test_proc[text_cols]
    ]).drop_duplicates('ID')

    df_base = df_base.merge(df_all, on='ID', how='left')
    df_base['attributes_string'] = df_base.apply(combine_attributes_jerarquica, axis=1)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    text_model = SentenceTransformer(TEXT_MODEL_NAME).to(device)

    text_raw = text_model.encode(df_base['attributes_string'].tolist(),
                                 batch_size=128, show_progress_bar=True)
    if isinstance(text_raw, torch.Tensor):
        text_raw = text_raw.cpu().numpy()

    img_df = pd.concat([
        df_train_proc[['ID','image_embedding']],
        df_test_proc[['ID','image_embedding']]
    ]).drop_duplicates('ID')

    df_base = df_base.merge(img_df, on='ID', how='left')
    img_raw = np.stack(df_base['image_embedding'].apply(parse_embedding_string))

    # PCA
    idx_tr = df_base['is_train']==1
    idx_te = df_base['is_train']==0

    scaler_t = StandardScaler()
    scaler_i = StandardScaler()

    pca_t = PCA(n_components=N_COMPONENTS_TEXT, random_state=42)
    pca_i = PCA(n_components=N_COMPONENTS_IMG , random_state=42)

    text_tr = scaler_t.fit_transform(text_raw[idx_tr])
    text_te = scaler_t.transform(text_raw[idx_te])

    img_tr  = scaler_i.fit_transform(img_raw[idx_tr])
    img_te  = scaler_i.transform(img_raw[idx_te])

    text_pca_tr = pca_t.fit_transform(text_tr)
    text_pca_te = pca_t.transform(text_te)

    img_pca_tr  = pca_i.fit_transform(img_tr)
    img_pca_te  = pca_i.transform(img_te)

    text_cols_pca = [f"text_pca_{i}" for i in range(N_COMPONENTS_TEXT)]
    img_cols_pca  = [f"img_pca_{i}"  for i in range(N_COMPONENTS_IMG)]

    df_train_emb = pd.concat([
        df_base[idx_tr][['ID']].reset_index(drop=True),
        pd.DataFrame(text_pca_tr, columns=text_cols_pca),
        pd.DataFrame(img_pca_tr,  columns=img_cols_pca)
    ], axis=1)

    df_test_emb = pd.concat([
        df_base[idx_te][['ID']].reset_index(drop=True),
        pd.DataFrame(text_pca_te, columns=text_cols_pca),
        pd.DataFrame(img_pca_te,  columns=img_cols_pca)
    ], axis=1)

    return df_train_emb, df_test_emb, text_cols_pca, img_cols_pca


# =============================================================
#   PAS 4: MODEL LGBM OPTIMITZAT
# =============================================================

def train_model(
    X_train_final, y_target_log, X_test_final,
    numerical_features, categorical_features,
    text_pca_cols, img_pca_cols
):
    print("=== PAS 4: Entrenament LGBM Optimitzat ===")

    features = numerical_features + categorical_features + text_pca_cols + img_pca_cols

    # Escalar numèriques
    scaler = StandardScaler()
    X_train_final[numerical_features] = scaler.fit_transform(X_train_final[numerical_features])
    X_test_final[numerical_features]  = scaler.transform(X_test_final[numerical_features])

    params = {
        'objective':'quantile',
        'metric':'quantile',
        'alpha':TARGET_QUANTILE,
        'learning_rate':0.01,
        'n_estimators':2000,
        'num_leaves':128,
        'min_child_samples':50,
        'feature_fraction':0.7,
        'bagging_fraction':0.7,
        'bagging_freq':1,
        'lambda_l1':0.1,
        'lambda_l2':1.0,
        'random_state':42,
        'n_jobs':-1
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train_final[features], y_target_log,
              categorical_feature=categorical_features)

    pred_log = model.predict(X_test_final[features])
    pred = np.expm1(pred_log)
    pred[pred < 0] = 0

    sub = pd.DataFrame({
        'ID': X_test_final['ID'],
        'Production': pred.astype(int)
    })

    sub.to_csv(SUBMISSION_FILE_OUTPUT, index=False, sep=',')
    print("Primers resultats:")
    print(sub.head())


# =============================================================
#   MAIN
# =============================================================

if __name__ == "__main__":

    df_train_proc, df_test_proc, y_target = create_features(PATH_TRAIN, PATH_TEST)
    df_train_emb, df_test_emb, text_pca_cols, img_pca_cols = create_embedding_features(df_train_proc, df_test_proc)

    X_train_final = df_train_proc.merge(df_train_emb, on="ID", how="left")
    X_test_final  = df_test_proc.merge(df_test_emb,  on="ID", how="left")

    categorical_features = [
        'id_season','family','category','aggregated_family','fabric','color_name',
        'length_type','silhouette_type','waist_type','neck_lapel_type','sleeve_length_type',
        'heel_shape_type','toecap_type','woven_structure','knit_structure','print_type',
        'archetype','moment'
    ]

    numerical_features = [
        'price','num_stores','num_sizes','life_cycle_length','category_demand_last_season',
        'price_x_num_stores','price_per_lifecycle','price_vs_trend',
        'stores_per_size','stores_vs_trend','trend_x_lifecycle',
        'season_index','category_scale'
    ]

    categorical_features = [c for c in categorical_features if c in X_train_final.columns]
    numerical_features   = [c for c in numerical_features   if c in X_train_final.columns]

    # Encoding categòric
    for col in categorical_features:
        le = LabelEncoder()
        combined = pd.concat([X_train_final[col], X_test_final[col]]).astype(str)
        le.fit(combined)
        X_train_final[col] = le.transform(X_train_final[col].astype(str))
        X_test_final[col]  = le.transform(X_test_final[col].astype(str))

    # Log-transform del target
    y_target_log = np.log1p(y_target)

    train_model(
        X_train_final, y_target_log, X_test_final,
        numerical_features, categorical_features,
        text_pca_cols, img_pca_cols
    )


=== PAS 2: Features Tabulars ===
=== PAS 3: Embeddings + PCA 64/64 ===


Batches: 100%|██████████| 95/95 [02:31<00:00,  1.59s/it]


=== PAS 4: Entrenament LGBM Optimitzat ===
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34620
[LightGBM] [Info] Number of data points in the train set: 9843, number of used features: 157
[LightGBM] [Info] Start training from score 9.823535
Primers resultats:
    ID  Production
0   90        1135
1   16        7655
2   65       11126
3  138         810
4  166         363
