In [1]:
# Switch the working directory to the parent folder (presumably the repository
# root) so that the relative "data/..." paths and the `src` package import used
# in later cells resolve. NOTE: re-running this cell moves up one more level.
%cd ..

/home/den/dev/git/ozon-e-cup-2025


In [2]:
import json

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

import src.preprocessing as preproc

In [4]:
# Load the saved id arrays that define the train / validation split.
train_ids, val_ids = (
    np.load(ids_path) for ids_path in ("data/train_ids.npy", "data/val_ids.npy")
)

# Preprocess meta features

In [5]:
# Read the train and test meta-feature tables from CSV.
meta_train, meta_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/1__train_with_flags.csv", "data/1__test_with_flags.csv")
)

In [6]:
# Store SellerID with pandas' "string" dtype in both tables.
for frame in (meta_train, meta_test):
    frame["SellerID"] = frame["SellerID"].astype("string")

In [7]:
# Chain the project's numeric and categorical preprocessing pipelines:
# the numeric stage runs first, then the categorical stage.
preproc_steps = [
    ("num_preproc", preproc.NUMERIC_PREP_PIPELINE),
    ("cat_preproc", preproc.CATEGORIAL_PREP_PIPELINE),
]
preproc_pipe = Pipeline(steps=preproc_steps)

In [8]:
# Fit the preprocessing only on the rows selected by `train_ids`, so the
# validation rows of `meta_train` do not influence the fitted statistics.
train_rows = meta_train.iloc[train_ids]
preproc_pipe = preproc_pipe.fit(train_rows)

[Pipeline] ... (step 1 of 37) Processing rating_1_count, total=   0.1s
[Pipeline] ... (step 2 of 37) Processing rating_2_count, total=   0.1s
[Pipeline] ... (step 3 of 37) Processing rating_3_count, total=   0.1s
[Pipeline] ... (step 4 of 37) Processing rating_4_count, total=   0.1s
[Pipeline] ... (step 5 of 37) Processing rating_5_count, total=   0.2s
[Pipeline]  (step 6 of 37) Processing comments_published_count, total=   0.2s
[Pipeline]  (step 7 of 37) Processing photos_published_count, total=   0.2s
[Pipeline]  (step 8 of 37) Processing videos_published_count, total=   0.1s
[Pipeline] .. (step 9 of 37) Processing PriceDiscounted, total=   0.1s
[Pipeline] . (step 10 of 37) Processing item_time_alive, total=   0.2s
[Pipeline]  (step 11 of 37) Processing item_count_fake_returns7, total=   0.2s
[Pipeline]  (step 12 of 37) Processing item_count_fake_returns30, total=   0.2s
[Pipeline]  (step 13 of 37) Processing item_count_fake_returns90, total=   0.2s
[Pipeline]  (step 14 of 37) Proces

In [9]:
# Apply the fitted pipeline to each split; the is_train / is_val flags are
# passed through to the pipeline's transform call.
#
# The train and validation inputs are explicit copies. A positional selection
# with `.iloc[...]` is tracked by pandas as a possible slice of `meta_train`,
# and a previous run of this cell emitted SettingWithCopyWarning for an
# assignment made inside the preprocessing code (presumably triggered by that
# tracking -- confirm against src.preprocessing). Handing over independent
# frames makes the ownership unambiguous and leaves `meta_train` untouched.
meta_train_preproc = preproc_pipe.transform(
    meta_train.iloc[train_ids].copy(), is_train=True, is_val=False
)
meta_val_preproc = preproc_pipe.transform(
    meta_train.iloc[val_ids].copy(), is_train=False, is_val=True
)
meta_test_preproc = preproc_pipe.transform(meta_test, is_train=False, is_val=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[random_mask] = self._new_cat_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[random_mask] = self._new_cat_name


In [10]:
# Stack the preprocessed train and validation rows back into one table
# (train rows first) and rebind the test name to its preprocessed version.
preprocessed_parts = [meta_train_preproc, meta_val_preproc]
meta_train = pd.concat(preprocessed_parts, axis=0)
meta_test = meta_test_preproc

In [11]:
# Remove the raw text columns and ItemID from both tables, in place.
for frame in (meta_train, meta_test):
    frame.drop(columns=["description", "name_rus", "ItemID"], inplace=True)

In [12]:
# Names of the meta-feature columns: every column of `meta_train` except the
# "id" key (raises ValueError if "id" is absent).
meta_feats_cols = list(meta_train.columns)
meta_feats_cols.remove("id")

# Merge with basic text features

In [13]:
# Read the precomputed basic text-feature tables for train and test.
text_feats_train, text_feats_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/X_train_text_features_basic.csv",
        "data/X_test_text_features_basic.csv",
    )
)

In [None]:
# Names of the text-feature columns: every column of `text_feats_train` except
# the "id" key (raises ValueError if "id" is absent).
text_feats_cols = list(text_feats_train.columns)
text_feats_cols.remove("id")

In [15]:
# Display the column names shared by the meta and text tables (train, test);
# the merge key should be the only overlap.
(
    set(meta_train.columns).intersection(text_feats_train.columns),
    set(meta_test.columns).intersection(text_feats_test.columns),
)

({'id'}, {'id'})

In [None]:
# Left-join the text features onto the meta features by "id", keeping every
# meta row. validate="many_to_one" makes pandas raise MergeError when the
# text-feature table contains repeated ids, which would otherwise silently
# duplicate rows in the merged result.
train = pd.merge(
    meta_train, text_feats_train, on="id", how="left", validate="many_to_one"
)
test = pd.merge(
    meta_test, text_feats_test, on="id", how="left", validate="many_to_one"
)

In [None]:
# Sanity check: apart from the "id" key, the merged train table must contain
# exactly the meta-feature columns plus the text-feature columns.
expected_feature_count = len(meta_feats_cols) + len(text_feats_cols)
assert train.shape[1] - 1 == expected_feature_count

In [None]:
# Write the merged train and test tables to CSV without the index column.
for merged_frame, out_path in (
    (train, "data/train_meta_text_basic_feats.csv"),
    (test, "data/test_meta_text_basic_feats.csv"),
):
    merged_frame.to_csv(out_path, index=False)

In [None]:
# Save the two feature-name groups to data/cols.json.
# The context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle to be closed whenever it was garbage-collected.
with open("data/cols.json", "w") as cols_file:
    json.dump(
        {"meta_feats": meta_feats_cols, "text_feats": text_feats_cols},
        cols_file,
    )