In [1]:
# Move the working directory one level up so that the relative "data/..." paths
# used by the cells below resolve (presumably to the repository root — confirm).
# NOTE: not idempotent — re-running this cell moves up one more directory.
%cd ..

/home/den/dev/git/ozon-e-cup-2025


In [None]:
import json

import numpy as np
import pandas as pd

In [None]:
# Read the train and test TF-IDF feature tables from CSV.
train_tfidf, test_tfidf = map(
    pd.read_csv,
    ("data/X_train_text_features_1k_tfidf.csv", "data/X_test_text_features_1k_tfidf.csv"),
)

# Show (rows, columns) of each table.
tuple(frame.shape for frame in (train_tfidf, test_tfidf))

((197198, 2022), (22760, 2022))

In [None]:
# Load the saved id arrays for the train, validation and test splits.
train_ids, val_ids, test_ids = (
    np.load(path) for path in ("data/train_ids.npy", "data/val_ids.npy", "data/test_ids.npy")
)

# Number of ids in each split.
tuple(map(len, (train_ids, val_ids, test_ids)))

(155979, 41219, 22760)

# Drop unnecessary columns

In [None]:
# Load the column-group definitions (group name -> collection of column names).
# The context manager guarantees the file handle is closed as soon as the JSON
# has been parsed instead of leaving it to the garbage collector.
with open("data/cols.json") as f:
    cols = json.load(f)

# Print each group name together with the number of columns it lists.
for k in cols:
    print(k, len(cols[k]))

meta 76
basic_text 28


In [None]:
# Columns to discard: every "basic_text" column except the "id" key, which the
# later cells still use to select and index rows.
basic_text_cols = set(cols["basic_text"]) - {"id"}

# Drop in place from both tables; errors="ignore" skips names that are absent.
for frame in (train_tfidf, test_tfidf):
    frame.drop(columns=basic_text_cols, inplace=True, errors="ignore")

In [None]:
# Verify the column cleanup on both tables: the "id" key must survive, while
# "ItemID" and "resolution" must be absent (presumably identifier / label
# columns that must not end up among the features — confirm).
frames = (train_tfidf, test_tfidf)

assert all("id" in frame.columns for frame in frames)

assert not any("ItemID" in frame.columns for frame in frames)

assert not any("resolution" in frame.columns for frame in frames)

# Split into train/val, create id2idx mappings and convert to numpy

In [None]:
# Partition the table loaded from the train CSV by id membership. The validation
# subset must be taken first because `train_tfidf` is rebound to the training
# subset on the next line. `test_tfidf` needs no filtering and is used as-is.
val_tfidf = train_tfidf[train_tfidf["id"].isin(val_ids)]
train_tfidf = train_tfidf[train_tfidf["id"].isin(train_ids)]

# Fail loudly if an id is missing or duplicated, or if this cell is re-run after
# `train_tfidf` was already narrowed (val_tfidf would otherwise silently be empty).
assert len(train_tfidf) == len(train_ids), "train split does not match train_ids"
assert len(val_tfidf) == len(val_ids), "val split does not match val_ids"

train_tfidf.shape, val_tfidf.shape, test_tfidf.shape

((155979, 2001), (41219, 2001), (22760, 2001))

In [None]:
# Map every item id to its row position within its own split, so rows can still
# be located by id once the "id" column has been removed from the matrices.
train_id2idx, val_id2idx, test_id2idx = (
    {item_id: row for row, item_id in enumerate(frame["id"].tolist())}
    for frame in (train_tfidf, val_tfidf, test_tfidf)
)

In [None]:
# Ids must be unique within each split: a repeated id would collapse into a
# single dict key and leave the mapping shorter than its table.
for mapping, frame in (
    (train_id2idx, train_tfidf),
    (val_id2idx, val_tfidf),
    (test_id2idx, test_tfidf),
):
    assert len(mapping) == len(frame)

In [None]:
# Convert each table to a plain numpy matrix, leaving out the "id" column
# (row lookup by id is covered by the id2idx mappings built earlier).
train_tfidf, val_tfidf, test_tfidf = (
    np.array(frame.drop(columns="id")) for frame in (train_tfidf, val_tfidf, test_tfidf)
)

In [None]:
# Shapes of the three feature matrices after the "id" column was removed.
tuple(matrix.shape for matrix in (train_tfidf, val_tfidf, test_tfidf))

((155979, 2000), (41219, 2000), (22760, 2000))

# Save full vectors

In [None]:
# Persist the id -> row-index mappings. Each file is written inside a context
# manager so it is flushed and closed deterministically.
# NOTE: JSON object keys are always strings, so non-string ids are read back as
# `str` by json.load and must be converted by whoever consumes these files.
for path, mapping in (
    ("data/id2idx_tfidf_train.json", train_id2idx),
    ("data/id2idx_tfidf_val.json", val_id2idx),
    ("data/id2idx_tfidf_test.json", test_id2idx),
):
    with open(path, "w") as f:
        json.dump(mapping, f)

# Persist the full-width feature matrices; row order matches the mappings above.
np.save("data/tfidf_train.npy", train_tfidf)
np.save("data/tfidf_val.npy", val_tfidf)
np.save("data/tfidf_test.npy", test_tfidf)

# Apply TruncatedSVD

In [None]:
# Reload the saved full-width matrices from disk, so the SVD section can start
# from the .npy files.
train_tfidf, val_tfidf, test_tfidf = (
    np.load(path) for path in ("data/tfidf_train.npy", "data/tfidf_val.npy", "data/tfidf_test.npy")
)

tuple(matrix.shape for matrix in (train_tfidf, val_tfidf, test_tfidf))

((155979, 2000), (41219, 2000), (22760, 2000))

In [None]:
# Percentage of exactly-zero entries in each matrix, rounded to two decimals.
tuple(
    ((matrix == 0).sum() / matrix.size * 100).round(2)
    for matrix in (train_tfidf, val_tfidf, test_tfidf)
)

(np.float64(90.81), np.float64(90.51), np.float64(88.29))

In [None]:
from scipy.sparse import csr_matrix

# Re-encode the dense matrices in compressed sparse row format before they are
# handed to TruncatedSVD.
train_tfidf, val_tfidf, test_tfidf = map(csr_matrix, (train_tfidf, val_tfidf, test_tfidf))

In [None]:
from sklearn.decomposition import TruncatedSVD

# Fit a 100-component truncated SVD on the training matrix only; the other
# splits are projected with this same fitted model. random_state pins the seed.
# fit() returns the estimator itself, so it can be bound directly.
tsvd = TruncatedSVD(n_components=100, random_state=42).fit(train_tfidf)
tsvd

0,1,2
,n_components,100
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42
,tol,0.0


In [None]:
# Total explained-variance ratio summed over all fitted components.
np.sum(tsvd.explained_variance_ratio_)

np.float64(0.5803790073897127)

In [None]:
# Project every split with the fitted SVD (nothing is re-fitted here).
train_tfidf_100, val_tfidf_100, test_tfidf_100 = map(
    tsvd.transform, (train_tfidf, val_tfidf, test_tfidf)
)

In [None]:
# Shapes of the reduced matrices: row counts unchanged, one column per component.
tuple(reduced.shape for reduced in (train_tfidf_100, val_tfidf_100, test_tfidf_100))

((155979, 100), (41219, 100), (22760, 100))

In [15]:
# Write the SVD-reduced matrices to disk, one .npy file per split.
for path, reduced in (
    ("data/tfidf_train_100.npy", train_tfidf_100),
    ("data/tfidf_val_100.npy", val_tfidf_100),
    ("data/tfidf_test_100.npy", test_tfidf_100),
):
    np.save(path, reduced)