# Импорты и подготовка данных

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt

from preprocessing import metadata_preprocessing
import metrics

In [2]:
# Input locations, relative to the notebook's working directory.
TRAIN_DATA_PATH = r'data/input/train.parquet'
TEST_DATA_PATH = r'data/input/test.parquet'

# Read both raw frames with the pyarrow engine; nothing is transformed here.
data_train, data_test = (
    pd.read_parquet(parquet_path, engine="pyarrow")
    for parquet_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

In [3]:
# Instantiate the project's multi-step preprocessing pipeline. The same instance is
# fitted on the training frame and later reused to transform the test frame.
preprocessor = metadata_preprocessing.MultiStepPreprocessor()

In [4]:
# Fit the preprocessor on the raw training frame. The call returns four objects,
# unpacked here as train/validation features and train/validation targets.
# NOTE(review): the split strategy and any target transformation live inside
# MultiStepPreprocessor and are not visible in this notebook — confirm before
# comparing metrics across runs.
X_train, X_val, y_train, y_val = preprocessor.fit_transform(data_train)

Processing 250048 texts...
  Processed 10000/250048
  Processed 20000/250048
  Processed 30000/250048
  Processed 40000/250048
  Processed 50000/250048
  Processed 60000/250048
  Processed 70000/250048
  Processed 80000/250048
  Processed 90000/250048
  Processed 100000/250048
  Processed 110000/250048
  Processed 120000/250048
  Processed 130000/250048
  Processed 140000/250048
  Processed 150000/250048
  Processed 160000/250048
  Processed 170000/250048
  Processed 180000/250048
  Processed 190000/250048
  Processed 200000/250048
  Processed 210000/250048
  Processed 220000/250048
  Processed 230000/250048
  Processed 240000/250048
  Processed 250000/250048


Генерация эмбеддингов:   0%|          | 0/250048 [00:00<?, ?it/s]

Processing 62513 texts...
  Processed 10000/62513
  Processed 20000/62513
  Processed 30000/62513
  Processed 40000/62513
  Processed 50000/62513
  Processed 60000/62513


Генерация эмбеддингов:   0%|          | 0/62513 [00:00<?, ?it/s]

In [5]:
# Apply the already-fitted preprocessor to the raw test frame (transform only, no refit).
# NOTE(review): presumably yields the same feature columns as X_train plus item_id —
# verify against MultiStepPreprocessor.transform.
test = preprocessor.transform(data_test)

Processing 70274 texts...
  Processed 10000/70274
  Processed 20000/70274
  Processed 30000/70274
  Processed 40000/70274
  Processed 50000/70274
  Processed 60000/70274
  Processed 70000/70274


Генерация эмбеддингов:   0%|          | 0/70274 [00:00<?, ?it/s]

In [12]:
# Names of the four regression targets, in the order used when indexing target columns.
target_cols = [
    'real_weight',
    'real_height',
    'real_length',
    'real_width',
]

# Keep the identifiers aside, then build feature frames without the id column.
train_ids, val_ids = X_train.item_id, X_val.item_id
X_train_clean = X_train.drop(columns='item_id', errors='ignore')
X_val_clean = X_val.drop(columns='item_id', errors='ignore')

# Обучение моделей

## Линейные модели

In [14]:
# Ridge baseline: MultiOutputRegressor fits one independent Ridge model per target column.
ridge = MultiOutputRegressor(Ridge(alpha=1.0, random_state=42)).fit(X_train_clean, y_train)

# Floor the validation predictions at zero before scoring.
y_val_pred_ridge = np.maximum(0, ridge.predict(X_val_clean))

print(f'ridge logmae macro {metrics.logmae_macro(y_val, y_val_pred_ridge)}')
print(f'ridge logmae by cat {metrics.logmae_bycat(y_val, y_val_pred_ridge)}')


ridge logmae macro 0.3656903431379472
ridge logmae by cat [0.35251901 0.51079791 0.29682336 0.30262108]


In [15]:
# Lasso baseline: one independent L1-penalised linear model per target column.
lasso = MultiOutputRegressor(
    Lasso(alpha=0.1, random_state=42, max_iter=2000)
).fit(X_train_clean, y_train)

# Floor the validation predictions at zero before scoring.
y_val_pred_lasso = np.maximum(0, lasso.predict(X_val_clean))

print(f'lasso logmae macro {metrics.logmae_macro(y_val, y_val_pred_lasso)}')
print(f'lasso logmae by cat {metrics.logmae_bycat(y_val, y_val_pred_lasso)}')

lasso logmae macro 0.3727224672745333
lasso logmae by cat [0.36071448 0.52177589 0.30024228 0.30815723]


In [16]:
# ElasticNet baseline: equal mix of L1 and L2 penalties (l1_ratio=0.5), one model per target.
elastic = MultiOutputRegressor(
    ElasticNet(
        alpha=0.1,
        l1_ratio=0.5,
        random_state=42,
        max_iter=2000,
    )
).fit(X_train_clean, y_train)

# Floor the validation predictions at zero before scoring.
y_val_pred_elastic = np.maximum(0, elastic.predict(X_val_clean))

print(f'elasticnet logmae macro {metrics.logmae_macro(y_val, y_val_pred_elastic)}')
print(f'elasticnet logmae by cat {metrics.logmae_bycat(y_val, y_val_pred_elastic)}')

elasticnet logmae macro 0.37001687542488193
elasticnet logmae by cat [0.35469141 0.51931804 0.29928383 0.30677423]


## Случайный лес

In [17]:
# Random forest: MultiOutputRegressor trains one forest per target, one after another;
# the parallelism (n_jobs=-1) is inside each forest.
rf_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rf = MultiOutputRegressor(RandomForestRegressor(**rf_params)).fit(X_train_clean, y_train)

# Floor the validation predictions at zero before scoring.
y_val_pred_rf = np.maximum(0, rf.predict(X_val_clean))

print(f'rf logmae macro {metrics.logmae_macro(y_val, y_val_pred_rf)}')
print(f'rf logmae by cat {metrics.logmae_bycat(y_val, y_val_pred_rf)}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 96.4min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 82.6min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 33.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 81.7min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 39.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 96.5min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Pa

rf logmae macro 0.3535384603474445
rf logmae by cat [0.31731115 0.50734433 0.29065628 0.29884209]


[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.3s finished


## CatBoost

In [35]:
# CatBoost: one gradient-boosted regressor per target, trained independently and
# collected in `catboost_models` in the same order as `target_cols`.
catboost_models = []

# Explicit float buffer. np.zeros_like(y_val) would inherit the dtype of the targets,
# which silently truncates predictions if they are integer-typed and produces an
# object array if the target columns have mixed dtypes.
y_val_pred_catboost = np.zeros(y_val.shape, dtype=float)

# NOTE(review): 'GPU' requires a CUDA-capable device; switch to 'CPU' otherwise.
task_type = 'GPU'
for i, target in enumerate(target_cols):
    print(f"\n[{i+1}/{len(target_cols)}] Training CatBoost for {target}...")

    cb_model = CatBoostRegressor(
        iterations=2000,
        learning_rate=0.1,
        depth=8,
        task_type=task_type,
        random_state=42,
        verbose=100,  # log every 100 iterations (was redundantly repeated in fit)
    )

    # Targets are selected by position, so this assumes the columns of y_train / y_val
    # are ordered like target_cols — TODO confirm against the preprocessor.
    # NOTE(review): the validation split drives early stopping here and is also the
    # split the model is scored on afterwards, so the reported metric is optimistic.
    cb_model.fit(
        X_train_clean,
        y_train.iloc[:, i],
        eval_set=(X_val_clean, y_val.iloc[:, i]),
        early_stopping_rounds=50,
    )

    catboost_models.append(cb_model)
    # Floor at zero, consistent with how the linear and random-forest predictions
    # are post-processed before scoring.
    y_val_pred_catboost[:, i] = np.maximum(0, cb_model.predict(X_val_clean))


[1/4] Training CatBoost for real_weight...
0:	learn: 2.5252264	test: 2.5074661	best: 2.5074661 (0)	total: 48.8ms	remaining: 1m 37s
100:	learn: 1.8295532	test: 1.9162641	best: 1.9162641 (100)	total: 4.94s	remaining: 1m 32s
200:	learn: 1.7418956	test: 1.8914280	best: 1.8914280 (200)	total: 9.56s	remaining: 1m 25s
300:	learn: 1.6805086	test: 1.8811087	best: 1.8811087 (300)	total: 14.1s	remaining: 1m 19s
400:	learn: 1.6262118	test: 1.8713577	best: 1.8713577 (400)	total: 18.7s	remaining: 1m 14s
500:	learn: 1.5814335	test: 1.8668792	best: 1.8668792 (500)	total: 23.2s	remaining: 1m 9s
600:	learn: 1.5397830	test: 1.8623283	best: 1.8623283 (600)	total: 27.6s	remaining: 1m 4s
700:	learn: 1.5037133	test: 1.8593546	best: 1.8593546 (700)	total: 32.2s	remaining: 59.7s
800:	learn: 1.4708300	test: 1.8569293	best: 1.8569148 (789)	total: 36.7s	remaining: 54.9s
900:	learn: 1.4400009	test: 1.8547595	best: 1.8546880 (893)	total: 41.4s	remaining: 50.4s
1000:	learn: 1.4095924	test: 1.8530037	best: 1.8530037

In [36]:
# Validation scores for the CatBoost models: macro average first, then per target.
# The per-target line was previously mislabelled "rf" (copy-paste from the
# random-forest cell).
print(f'cb logmae macro {metrics.logmae_macro(y_val, y_val_pred_catboost)}')
print(f'cb logmae by cat {metrics.logmae_bycat(y_val, y_val_pred_catboost)}')

cb logmae macro 0.3407821558793056
rf logmae by cat [0.28966317 0.4956147  0.28506702 0.29278374]


In [45]:
# Split the identifier off the test features. `test` is overwritten without the
# id column, so the extraction is guarded: re-running this cell keeps the
# previously extracted `test_ids` instead of failing on the missing column.
# NOTE(review): the displayed output suggests the test frame carries two columns
# labelled item_id, in which case `test_ids` is a DataFrame rather than a Series.
if 'item_id' in test.columns:
    test_ids = test['item_id']
    test = test.drop(columns='item_id')

Unnamed: 0,item_id,item_id.1
0,163755,0
1,1339648,1
2,21095,2
3,925424,3
4,780125,4
...,...,...
70269,1207676,70269
70270,1614448,70270
70271,1787906,70271
70272,897587,70272


In [53]:
# Build the submission file from the per-target CatBoost models.
submission_cols = ['weight', 'height', 'length', 'width']
assert len(catboost_models) == len(submission_cols), (
    "expected one trained CatBoost model per submission column"
)

# One column of predictions per model, in training order (same order as target_cols).
test_pred = np.zeros((test.shape[0], len(catboost_models)))
for i, cb_model in enumerate(catboost_models):
    # Floor at zero, consistent with how the other models' predictions are
    # post-processed in this notebook.
    test_pred[:, i] = np.maximum(0, cb_model.predict(test))

test_stepik = pd.DataFrame(test_pred, columns=submission_cols)

# `test_ids` is a DataFrame when the test frame had duplicate item_id columns
# (the first one holds the real ids) and a Series otherwise; support both.
item_ids = test_ids.iloc[:, 0] if isinstance(test_ids, pd.DataFrame) else test_ids
test_stepik.insert(0, 'item_id', item_ids.to_numpy())
test_stepik.to_csv('solution_improved_catboost.csv', index=False)