In [2]:
import pandas as pd
import numpy as np

# Raw string literal: the Windows path is full of backslashes (\H, \p, \d, \m) that
# are invalid escape sequences in a normal string and trigger a SyntaxWarning on
# recent Python versions. The resulting path value is unchanged.
metadata = pd.read_csv(r"E:\Hakaton\participants\participants\data\metadata.csv")

In [3]:
# Parse the timestamps and order the log chronologically before cutting it in time.
metadata = (
    metadata
    .assign(timestamp=pd.to_datetime(metadata["timestamp"]))
    .sort_values("timestamp")
)

# Everything strictly before the split date is training data, the rest is validation.
# Both masks are spelled out explicitly so rows with a missing timestamp land in neither split.
split_date = pd.Timestamp("2025-04-01")
before_split = metadata["timestamp"] < split_date
from_split_on = metadata["timestamp"] >= split_date
train_df = metadata.loc[before_split]
valid_df = metadata.loc[from_split_on]

In [4]:
# "Cold" editions are the ones that occur in validation but never in training.
train_items = set(train_df["edition_id"].unique())
valid_items = set(valid_df["edition_id"].unique())
cold_items = valid_items.difference(train_items)

n_valid_items = len(valid_items)
n_cold_items = len(cold_items)
print("Всего книг в valid:", n_valid_items)
print("Холодных книг в valid:", n_cold_items)
print("Доля холодных:", n_cold_items / n_valid_items)

Всего книг в valid: 10339
Холодных книг в valid: 3185
Доля холодных: 0.3080568720379147


In [7]:
# Item popularity: number of training rows per edition.
item_pop = (
    train_df
    .groupby("edition_id")
    .size()
    .reset_index(name="item_popularity")
)

# Left-join the counts onto both splits; editions without a training count get 0.
train_df = (
    train_df
    .merge(item_pop, on="edition_id", how="left")
    .assign(item_popularity=lambda frame: frame["item_popularity"].fillna(0))
)
valid_df = (
    valid_df
    .merge(item_pop, on="edition_id", how="left")
    .assign(item_popularity=lambda frame: frame["item_popularity"].fillna(0))
)

# Training rows per (user, genre) pair and per user.
user_genre_cnt = (
    train_df
    .groupby(["user_id", "genre_id"])
    .size()
    .reset_index(name="user_genre_cnt")
)
user_total = (
    train_df
    .groupby("user_id")
    .size()
    .reset_index(name="user_total")
)

# Share of a user's training rows that fall into each genre.
user_genre_freq = user_genre_cnt.merge(user_total, on="user_id")
user_genre_freq = user_genre_freq.assign(
    user_genre_freq=user_genre_freq["user_genre_cnt"] / user_genre_freq["user_total"]
)[["user_id", "genre_id", "user_genre_freq"]]

# Attach the share to both splits; (user, genre) pairs absent from training get 0.
train_df = (
    train_df
    .merge(user_genre_freq, on=["user_id", "genre_id"], how="left")
    .assign(user_genre_freq=lambda frame: frame["user_genre_freq"].fillna(0))
)
valid_df = (
    valid_df
    .merge(user_genre_freq, on=["user_id", "genre_id"], how="left")
    .assign(user_genre_freq=lambda frame: frame["user_genre_freq"].fillna(0))
)
# One row per (user, author) pair that occurs in training, flagged with 1.
# The group sizes themselves are not needed, only the existence of the pair.
user_author_seen = (
    train_df
    .groupby(["user_id", "author_id"])
    .size()
    .reset_index(name="user_author_seen")
    .assign(user_author_seen=1)
)

# Attach the flag to both splits; pairs that never occurred in training get 0.
train_df = (
    train_df
    .merge(user_author_seen, on=["user_id", "author_id"], how="left")
    .assign(user_author_seen=lambda frame: frame["user_author_seen"].fillna(0))
)
valid_df = (
    valid_df
    .merge(user_author_seen, on=["user_id", "author_id"], how="left")
    .assign(user_author_seen=lambda frame: frame["user_author_seen"].fillna(0))
)

In [None]:
# Row counts of both splits.
for label, frame in (("train:", train_df), ("valid:", valid_df)):
    print(label, len(frame))

# Time span covered by each split (train first, then valid).
for frame in (train_df, valid_df):
    print(frame["timestamp"].min(), frame["timestamp"].max())

train: 434696
valid: 30048
2024-10-14 18:48:56 2025-03-31 23:59:46
2025-04-01 00:00:55 2025-04-12 12:44:23


In [None]:
"""
FULL-VIBECODE функция генерации негативов
Ебаное говно, надо исправлять 100% и самим разбираться
как делаются негативы, я просто заебался пока сидел воял это

TODO:
Переделать генерацию негативов полностью самостоятельно
"""


import numpy as np
import pandas as pd

def build_stores(train_pos_df):
    """Build de-duplicated lookup tables from the positive training rows.

    Parameters
    ----------
    train_pos_df : pandas.DataFrame
        Frame containing every item, user, ``user_genre_freq`` and
        ``user_author_seen`` column selected below.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(item_store, user_store, user_genre_store, user_author_store)``,
        unique on ``edition_id``, ``user_id``, ``(user_id, genre_id)`` and
        ``(user_id, author_id)`` respectively. For every key the first row
        encountered in ``train_pos_df`` is the one that is kept.
    """
    def first_rows(columns, keys):
        # Project onto the requested columns, then keep the first row per key.
        return train_pos_df[columns].drop_duplicates(subset=keys)

    # Note: "rating" is carried along as an item column, taken from whichever
    # row of the edition happens to come first.
    item_store = first_rows(
        [
            "edition_id", "rating", "book_id", "author_id", "publication_year",
            "age_restriction", "language_id", "publisher_id", "title",
            "description", "genre_id", "genre_name", "author_name",
            "book_age_years", "item_popularity",
        ],
        "edition_id",
    )
    user_store = first_rows(["user_id", "gender", "age"], "user_id")
    user_genre_store = first_rows(
        ["user_id", "genre_id", "user_genre_freq"], ["user_id", "genre_id"]
    )
    user_author_store = first_rows(
        ["user_id", "author_id", "user_author_seen"], ["user_id", "author_id"]
    )

    return item_store, user_store, user_genre_store, user_author_store


def gen_negs(pos_df, item_pool, n_neg=50, seed=42):
    """Sample random negative (user, edition) pairs for every user in ``pos_df``.

    Parameters
    ----------
    pos_df : pandas.DataFrame
        Positive interactions; must contain ``user_id`` and ``edition_id``.
    item_pool : iterable
        Edition ids that negatives may be drawn from.
    n_neg : int, default 50
        Maximum number of negatives per user; fewer are returned when the
        user has fewer unseen editions in the pool.
    seed : int, default 42
        Seed of the NumPy random generator.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``edition_id``, ``event_type`` (0), ``timestamp``
        (NaT), ``year`` (0), ``day`` (0) and ``rating`` (NaN). Users with no
        unseen edition in the pool contribute no rows.
    """
    rng = np.random.default_rng(seed)
    pool = np.array(list(item_pool))

    # Editions each user interacted with inside pos_df; only these are excluded.
    seen_by_user = pos_df.groupby("user_id")["edition_id"].apply(set).to_dict()

    pairs = []
    for user in pos_df["user_id"].unique():
        already_seen = list(seen_by_user.get(user, set()))
        unseen = pool[np.isin(pool, already_seen, invert=True)]
        if len(unseen) == 0:
            continue
        # Sample without replacement, capped by what is actually available.
        sampled = rng.choice(unseen, size=min(n_neg, len(unseen)), replace=False)
        for item in sampled:
            pairs.append((user, int(item)))

    # Placeholder values for the interaction-level columns of a negative row.
    return pd.DataFrame(pairs, columns=["user_id", "edition_id"]).assign(
        event_type=0,
        timestamp=pd.NaT,
        year=0,
        day=0,
        rating=np.nan,
    )


def attach_features(neg_df, item_store, user_store, user_genre_store, user_author_store, age_fill, gender_fill=0):
    """Join item, user and user-item features onto sampled negative pairs.

    Parameters
    ----------
    neg_df : pandas.DataFrame
        Negative pairs with at least ``user_id`` and ``edition_id``.
    item_store, user_store, user_genre_store, user_author_store : pandas.DataFrame
        Lookup tables keyed by ``edition_id``, ``user_id``,
        ``(user_id, genre_id)`` and ``(user_id, author_id)`` respectively.
    age_fill : scalar
        Replacement for a missing ``age``.
    gender_fill : scalar, default 0
        Replacement for a missing ``gender``.

    Returns
    -------
    pandas.DataFrame
        ``neg_df`` left-joined with all four tables, with gaps in the feature
        columns listed below replaced by their defaults.
    """
    # The item table must be joined first: it supplies the genre_id / author_id
    # keys that the last two joins rely on.
    lookups = (
        (item_store, "edition_id"),
        (user_store, "user_id"),
        (user_genre_store, ["user_id", "genre_id"]),
        (user_author_store, ["user_id", "author_id"]),
    )
    enriched = neg_df
    for table, keys in lookups:
        enriched = enriched.merge(table, on=keys, how="left")

    # Default for each feature when the join found no match (or the value was missing).
    defaults = {
        "item_popularity": 0,
        "user_genre_freq": 0,
        "user_author_seen": 0,
        "age": age_fill,
        "gender": gender_fill,
        "description": "Нет описания",
    }
    for column, default in defaults.items():
        enriched[column] = enriched[column].fillna(default)

    return enriched



# Freeze the positive interactions before train_df / valid_df are overwritten below.
# NOTE: this cell rebinds train_df and valid_df, so running it twice in a row
# would treat the already augmented frames as positives.
train_pos_df = train_df.copy()
valid_pos_df = valid_df.copy()

# Lookup tables and the age default all come from the training positives only.
item_store, user_store, user_genre_store, user_author_store = build_stores(train_pos_df)
age_fill = train_pos_df["age"].median()

# Training negatives are drawn from training editions, validation negatives
# from every edition present in the metadata.
train_item_pool = set(train_pos_df["edition_id"].unique())
all_item_pool = set(metadata["edition_id"].unique())


def _negatives_with_features(pos_df, item_pool, seed):
    """Sample up to 50 negatives per user and attach the stored features to them."""
    sampled = gen_negs(pos_df, item_pool, n_neg=50, seed=seed)
    return attach_features(
        sampled, item_store, user_store, user_genre_store, user_author_store, age_fill
    )


train_neg = _negatives_with_features(train_pos_df, train_item_pool, seed=1)
train_df = pd.concat([train_pos_df, train_neg], ignore_index=True)

valid_neg = _negatives_with_features(valid_pos_df, all_item_pool, seed=2)
valid_df = pd.concat([valid_pos_df, valid_neg], ignore_index=True)

In [13]:
# Remove the "Unnamed: 0" column from both frames.
train_df = train_df.drop(columns=["Unnamed: 0"])
valid_df = valid_df.drop(columns=["Unnamed: 0"])

In [14]:
def _tidy_frame(frame, age_median):
    """Drop the unused rating / author-name columns and fill gaps in description, age and gender."""
    tidy = frame.drop(columns=["rating", "author_name", "rating_x", "rating_y"])
    tidy["description"] = tidy["description"].fillna("Нет описания")
    tidy["age"] = tidy["age"].fillna(age_median)
    tidy["gender"] = tidy["gender"].fillna(0)
    return tidy


# Both frames are filled with the median age of the training frame. It is taken
# once, before filling; filling gaps with the median leaves the median unchanged.
train_age_median = train_df["age"].median()
train_df = _tidy_frame(train_df, train_age_median)
valid_df = _tidy_frame(valid_df, train_age_median)

In [15]:
# One row per edition with its text: lower-cased, stripped "title description".
items = (
    metadata[["edition_id", "title", "description"]]
    .drop_duplicates("edition_id")
    .assign(
        text=lambda frame: (
            frame["title"].fillna("") + " " + frame["description"].fillna("")
        ).str.lower().str.strip()
    )
)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer only on editions that occur in the training frame.
train_items = set(train_df["edition_id"].unique())
items_train = items.loc[items["edition_id"].isin(train_items)].copy()

tfidf_params = dict(
    max_features=200_000,          # anywhere in 50k-300k is reasonable
    ngram_range=(1, 2),            # unigrams + bigrams
    min_df=2,                      # ignore rare terms
    max_df=0.9,                    # ignore overly frequent terms
    token_pattern=r"(?u)\b\w+\b",
)
tfidf = TfidfVectorizer(**tfidf_params)

X_tfidf_train = tfidf.fit_transform(items_train["text"])


In [17]:
from sklearn.decomposition import TruncatedSVD

# Compress the sparse TF-IDF matrix of the training editions into 100 dense components.
svd_params = {"n_components": 100, "random_state": 42}
svd = TruncatedSVD(**svd_params)
X_svd_train = svd.fit_transform(X_tfidf_train)

In [18]:
# Project the text of every edition in `items` (not only the training ones)
# through the already fitted vectorizer and SVD; nothing is re-fitted here.
X_tfidf_all = tfidf.transform(items["text"])
X_svd_all = svd.transform(X_tfidf_all)

In [19]:
import numpy as np

# One column per SVD component, with edition_id as the leading key column.
n_text_components = X_svd_all.shape[1]
svd_cols = [f"tfidf_svd_{i}" for i in range(n_text_components)]
item_text_df = pd.DataFrame(X_svd_all, columns=svd_cols)
item_text_df.insert(0, "edition_id", items["edition_id"].values)


def _add_text_features(frame):
    """Left-join the text components onto `frame`; rows without a match get zeros."""
    merged = frame.merge(item_text_df, on="edition_id", how="left")
    merged[svd_cols] = merged[svd_cols].fillna(0.0)
    return merged


train_df = _add_text_features(train_df)
valid_df = _add_text_features(valid_df)

In [20]:
# The raw text columns are dropped once the SVD text components are attached.
text_columns = ["description", "title"]
train_df = train_df.drop(columns=text_columns)
valid_df = valid_df.drop(columns=text_columns)

In [21]:
# Drop the genre_name column from both frames (genre_id stays).
train_df = train_df.drop(columns=["genre_name"])
valid_df = valid_df.drop(columns=["genre_name"])

In [30]:
# Remove the time-related columns from both frames.
time_columns = ["timestamp", "year", "day"]
train_df = train_df.drop(columns=time_columns)
valid_df = valid_df.drop(columns=time_columns)

In [None]:
"""
ЗАМЕТКА:
У многих книг пропущен publication_year (0) из за чего их возраст стал 2025 лет

TODO:
Поправить надо бы
"""

In [34]:
# Persist both prepared frames as CSV without the index column.
for frame, csv_path in ((train_df, "train_data.csv"), (valid_df, "validation_data.csv")):
    frame.to_csv(csv_path, index=False)

In [None]:
"""
Делаем датафреймы с фичами что бы потом на предикте
юзать их. Полная хуйня в реализации, тут мб какая то ошибка есть

TODO:
Придумать что то получше для мерджа данных при предикте
"""

def _export_unique(columns, keys, csv_path):
    """Take `columns` from train_df, keep the first row per `keys`, write it to `csv_path` and return it."""
    table = train_df[columns].drop_duplicates(keys)
    table.to_csv(csv_path, index=False)
    return table


# Item-level features, including every text SVD component column.
item_features = _export_unique(
    [
        "edition_id",
        "book_id", "author_id", "genre_id",
        "publication_year", "age_restriction",
        "language_id", "publisher_id",
        "book_age_years",
        "item_popularity",
    ]
    + [c for c in train_df.columns if c.startswith("tfidf_svd_")],
    "edition_id",
    "item_features.csv",
)

# User-level features.
user_features = _export_unique(
    ["user_id", "age", "gender"],
    "user_id",
    "user_features.csv",
)

# User x genre and user x author features. These rebind the names
# user_genre_freq and user_author_seen to tables derived from train_df.
user_genre_freq = _export_unique(
    ["user_id", "genre_id", "user_genre_freq"],
    ["user_id", "genre_id"],
    "user_genre_freq.csv",
)

user_author_seen = _export_unique(
    ["user_id", "author_id", "user_author_seen"],
    ["user_id", "author_id"],
    "user_author_seen.csv",
)
