In [58]:
import pandas as pd
import numpy as np

# Load the source table from a CSV located in the working directory.
METADATA_PATH = "metadata.csv"
metadata = pd.read_csv(METADATA_PATH)

In [59]:
# Chronological hold-out: rows strictly before split_date form the training
# split, rows at or after it form the validation split.
split_date = pd.Timestamp("2025-04-01")

# Parse the timestamp column and order the table by time.
metadata = (
    metadata
    .assign(timestamp=pd.to_datetime(metadata["timestamp"]))
    .sort_values("timestamp")
)

# Two separate comparisons (rather than a mask and its negation) so that rows
# with a missing timestamp end up in neither split.
event_time = metadata["timestamp"]
train_df = metadata.loc[event_time.lt(split_date)]
valid_df = metadata.loc[event_time.ge(split_date)]

In [60]:
# Cold items: editions that occur in the validation split but never in train.
train_items = set(train_df["edition_id"].unique())
valid_items = set(valid_df["edition_id"].unique())
cold_items = valid_items.difference(train_items)

n_valid_items = len(valid_items)
n_cold_items = len(cold_items)
print("Всего книг в valid:", n_valid_items)
print("Холодных книг в valid:", n_cold_items)
print("Доля холодных:", n_cold_items / n_valid_items)

Всего книг в valid: 10339
Холодных книг в valid: 3185
Доля холодных: 0.3080568720379147


In [61]:
# Item popularity = number of train rows per edition_id.
item_pop = train_df.groupby("edition_id").size().reset_index(name="item_popularity")

# Attach the count to both splits. Editions with no train rows (possible in
# valid) have no match in item_pop and are filled with 0.
train_df = (
    train_df
    .merge(item_pop, on="edition_id", how="left")
    .fillna({"item_popularity": 0})
)
valid_df = (
    valid_df
    .merge(item_pop, on="edition_id", how="left")
    .fillna({"item_popularity": 0})
)

# Per-user genre share, computed from train rows only:
#   user_genre_freq = (train rows of the user in the genre) / (train rows of the user)
genre_keys = ["user_id", "genre_id"]

user_genre_cnt = train_df.groupby(genre_keys).size().reset_index(name="user_genre_cnt")
user_total = train_df.groupby("user_id").size().reset_index(name="user_total")

user_genre_freq = user_genre_cnt.merge(user_total, on="user_id")
user_genre_freq = user_genre_freq.assign(
    user_genre_freq=user_genre_freq["user_genre_cnt"] / user_genre_freq["user_total"]
)[genre_keys + ["user_genre_freq"]]

# Attach the share to both splits; (user, genre) pairs without a match in the
# train-derived table are filled with 0.
train_df, valid_df = (
    frame.merge(user_genre_freq, on=genre_keys, how="left").fillna({"user_genre_freq": 0})
    for frame in (train_df, valid_df)
)
# Feature: has this user already interacted with this author?
#
# The lookup table below is built from train_df itself, so merging it back
# onto train_df would match every train row with non-missing keys and make the
# flag a constant 1 in training, while it varies in validation. To give the
# train column real signal, a train row is flagged only when the same
# (user_id, author_id) pair occurs on an earlier row in timestamp order.
# Validation rows keep the original meaning: the pair occurs anywhere in train.
author_keys = ["user_id", "author_id"]

# Distinct (user, author) pairs observed in train. Rows with a missing key are
# excluded so they can never match in the validation merge below.
user_author_seen = (
    train_df[author_keys]
    .dropna()
    .drop_duplicates()
    .sort_values(author_keys)
    .assign(user_author_seen=1)
    .reset_index(drop=True)
)

# Train: 1.0 when the pair was already seen on an earlier row, else 0.0.
# The index is reset first so the flags can be aligned back by label safely.
train_df = train_df.reset_index(drop=True)
train_in_time_order = train_df.sort_values("timestamp", kind="stable")
repeated_pair = train_in_time_order.duplicated(subset=author_keys, keep="first")
has_both_keys = train_in_time_order[author_keys].notna().all(axis=1)
train_df["user_author_seen"] = (repeated_pair & has_both_keys).astype(float)

# Valid: 1.0 when the pair occurs in train, else 0.0.
valid_df = valid_df.merge(
    user_author_seen,
    on=author_keys,
    how="left"
)
valid_df["user_author_seen"] = valid_df["user_author_seen"].fillna(0)

In [None]:
# Display valid_df as the cell output to inspect it after the feature merges.
valid_df

In [63]:
# Sanity check of the split: row counts first, then the timestamp range
# (min, max) covered by each split.
labelled_splits = (("train:", train_df), ("valid:", valid_df))

for label, frame in labelled_splits:
    print(label, len(frame))

for _, frame in labelled_splits:
    split_time = frame["timestamp"]
    print(split_time.min(), split_time.max())

train: 434696
valid: 30048
2024-10-14 18:48:56 2025-03-31 23:59:46
2025-04-01 00:00:55 2025-04-12 12:44:23


In [64]:
# Remove the "Unnamed: 0" column (presumably an index column saved into
# metadata.csv — confirm against how that file is produced).
# errors="ignore" keeps this cell safe to re-run and tolerant of a CSV that
# has no such column; reassignment replaces the in-place mutation.
train_df = train_df.drop(columns=["Unnamed: 0"], errors="ignore")
valid_df = valid_df.drop(columns=["Unnamed: 0"], errors="ignore")

In [65]:
# Drop columns that are not used as features and impute missing values.
# The age median comes from the training split and is applied to both splits.
unused_cols = ["rating", "author_name"]
age_median = train_df["age"].median()


def _clean_split(frame):
    """Return `frame` without the unused columns and with description, age and gender imputed."""
    return (
        frame
        .drop(columns=unused_cols)
        .assign(
            description=lambda d: d["description"].fillna("Нет описания"),
            age=lambda d: d["age"].fillna(age_median),
            gender=lambda d: d["gender"].fillna(0),
        )
    )


train_df = _clean_split(train_df)
valid_df = _clean_split(valid_df)

In [77]:
# One row per edition (the first occurrence in `metadata` is kept) with a
# single lower-cased text field: title and description joined by a space,
# missing parts treated as empty strings.
text_source_cols = ["edition_id", "title", "description"]
items = metadata.loc[:, text_source_cols].drop_duplicates(subset="edition_id").copy()

title_part = items["title"].fillna("")
description_part = items["description"].fillna("")
items["text"] = (title_part + " " + description_part).str.lower().str.strip()

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary only on editions that occur in the training split.
train_items = set(train_df["edition_id"].unique())
in_train = items["edition_id"].isin(train_items)
items_train = items.loc[in_train].copy()

tfidf_params = dict(
    max_features=200_000,           # vocabulary cap; 50k-300k is a workable range
    ngram_range=(1, 2),             # unigrams + bigrams
    min_df=2,                       # ignore rare terms
    max_df=0.9,                     # ignore overly frequent terms
    token_pattern=r"(?u)\b\w+\b",
)
tfidf = TfidfVectorizer(**tfidf_params)

X_tfidf_train = tfidf.fit_transform(items_train["text"])


In [79]:
from sklearn.decomposition import TruncatedSVD

# Compress the train TF-IDF matrix with truncated SVD; the seed is fixed.
SVD_COMPONENTS = 100
SVD_SEED = 42

svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SVD_SEED)
X_svd_train = svd.fit_transform(X_tfidf_train)

In [80]:
# Project every edition's text (not only train editions) through the
# already-fitted TF-IDF vectorizer and SVD; nothing is re-fitted here.
all_item_texts = items["text"]
X_tfidf_all = tfidf.transform(all_item_texts)
X_svd_all = svd.transform(X_tfidf_all)

In [81]:
import numpy as np

# Build an edition-level table of SVD text features and join it to both splits.
n_svd_dims = X_svd_all.shape[1]
svd_cols = [f"tfidf_svd_{i}" for i in range(n_svd_dims)]

# edition_id first, followed by one column per SVD component; rows follow the
# order of `items`, which is the order X_svd_all was computed in.
item_text_df = pd.DataFrame(
    {
        "edition_id": items["edition_id"].values,
        **dict(zip(svd_cols, X_svd_all.T)),
    }
)

# Left-join onto each split; any edition without a match gets zeros.
svd_fill = dict.fromkeys(svd_cols, 0.0)
train_df, valid_df = (
    frame.merge(item_text_df, on="edition_id", how="left").fillna(svd_fill)
    for frame in (train_df, valid_df)
)

In [None]:
# Remove the raw text columns from both splits.
raw_text_cols = ["description", "title"]
train_df, valid_df = (
    frame.drop(columns=raw_text_cols) for frame in (train_df, valid_df)
)

In [97]:
# Remove the genre_name column from both splits.
genre_label_cols = ["genre_name"]
train_df = train_df.drop(columns=genre_label_cols)
valid_df = valid_df.drop(columns=genre_label_cols)

In [99]:
# Write both splits to CSV files without the index column.
csv_outputs = (
    (train_df, "train_data.csv"),
    (valid_df, "validation_data.csv"),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)