In [1]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import math
from typing import List, Optional
from sklearn.utils import shuffle
from sklearn.linear_model import SGDClassifier
# Register tqdm's `progress_apply` / `progress_map` helpers on pandas objects.
tqdm.pandas()

In [2]:
# Read the labeled and unlabeled training tables (parquet) and the category hierarchy (csv).
labeled_train, unlabeled_train = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('labeled_train.parquet', 'unlabeled_train.parquet')
)
categor = pd.read_csv('category_tree.csv')

# Quick look: labeled shape, number of distinct categories, unlabeled shape.
(labeled_train.shape, labeled_train['cat_id'].nunique(), unlabeled_train.shape)

((716552, 4), 776, (784742, 3))

In [3]:
def build_category_tree_path(category_tree):
    """
    Compute the depth and the ancestor chain of every category in the tree.

    :param category_tree: Mapping {cat_id: parent_id}. A root category is marked
        by a parent_id of None or NaN. A parent id that is referenced but is not
        itself a key of the mapping is treated as a root as well.
    :return: Mapping {cat_id: {"level": depth, "ancestors": [root, ..., parent]}},
        where roots have level 1 and an empty ancestor list.
    :raises ValueError: If the parent links contain a cycle.
    """
    category_info = {}

    def is_root_marker(parent_id):
        # None is the documented root marker; NaN is what pandas produces for
        # an empty parent cell. Non-numeric ids (e.g. strings) are never NaN.
        if parent_id is None:
            return True
        try:
            return bool(np.isnan(parent_id))
        except TypeError:
            return False

    for start_id in category_tree:
        # Walk upwards (iteratively, so deep chains cannot exhaust the recursion
        # limit) until we reach a root or a category that is already resolved.
        pending = []
        visited = set()
        current = start_id
        while current not in category_info:
            if current in visited:
                raise ValueError(f"Cycle detected in category tree at category {current!r}")
            visited.add(current)
            pending.append(current)
            parent_id = category_tree.get(current)
            if is_root_marker(parent_id):
                break
            current = parent_id

        # Resolve from the top of the chain downwards so that every parent is
        # already present in category_info when its child is processed.
        for cat_id in reversed(pending):
            parent_id = category_tree.get(cat_id)
            if is_root_marker(parent_id):
                level = 1
                ancestors = []
            else:
                parent_entry = category_info[parent_id]
                level = parent_entry["level"] + 1
                ancestors = parent_entry["ancestors"] + [parent_id]
            category_info[cat_id] = {"level": level, "ancestors": ancestors}

    return category_info


def find_lowest_common_ancestor(true_id, pred_id, category_info):
    """
    Find the deepest category shared by the paths of the true and the predicted category.

    A category counts as part of its own path, so if one id is an ancestor of
    the other, that ancestor is the result.

    :param true_id: The correct category.
    :param pred_id: The predicted category.
    :param category_info: Mapping {cat_id: {"level": ..., "ancestors": [...]}}.
        Ids missing from it are treated as level 0 with no ancestors.
    :return: Tuple (lca_id, lca_level); (None, 0) when the two paths share nothing.
    """
    true_entry = category_info.get(true_id, {"level": 0, "ancestors": []})
    pred_entry = category_info.get(pred_id, {"level": 0, "ancestors": []})

    # Exact hit: the category is its own lowest common ancestor.
    if true_id == pred_id:
        return true_id, true_entry["level"]

    # Root-to-node paths, each including the node itself.
    true_path = set(true_entry["ancestors"])
    true_path.add(true_id)
    pred_path = set(pred_entry["ancestors"])
    pred_path.add(pred_id)

    shared = true_path & pred_path
    if not shared:
        # Different top-level branches: nothing in common.
        return None, 0

    # The deepest shared node is the one with the highest level.
    deepest = max(shared, key=lambda cat: category_info[cat]["level"])
    return deepest, category_info[deepest]["level"]


def hierarchical_accuracy_with_branch_check(predicted_ids, true_ids, category_tree):
    """
    Average hierarchy-aware accuracy based on the lowest common ancestor (LCA).

    Each example scores exp(-d), where d is the number of levels between the
    true category and the LCA of the true and predicted categories. An exact
    hit therefore scores 1, and a prediction sharing no ancestor scores 0.

    :param predicted_ids: Sequence of predicted category ids.
    :param true_ids: Sequence of correct category ids, same length as predicted_ids.
    :param category_tree: Mapping {cat_id: parent_id} describing the hierarchy.
    :return: Mean score over all examples.
    """
    assert len(true_ids) == len(predicted_ids), "Длина списков не совпадает"

    # {cat_id: {"level": level, "ancestors": ancestors}}
    category_info = build_category_tree_path(category_tree)

    total_score = 0
    for true_id, pred_id in zip(true_ids, predicted_ids):
        lca, lca_level = find_lowest_common_ancestor(true_id, pred_id, category_info)

        # No shared ancestor: maximum penalty, the example contributes nothing.
        if lca is None:
            continue

        # Distance is measured from the true category up to the LCA.
        true_level = category_info.get(true_id, {"level": 0})["level"]
        levels_missed = max(0, true_level - lca_level)

        # Exponential discount per missed level.
        total_score += 1 / math.exp(levels_missed)

    return total_score / len(true_ids)

In [4]:
# Distinct non-null parent ids, i.e. categories that have at least one child.
categor_dict = set(categor['parent_id'].dropna().unique())

# {cat_id: parent_id}; a missing parent is stored as NaN, which marks a root category.
# Built column-wise instead of with a row-by-row iterrows() loop, and without
# rebinding `_` or leaking loop variables into the notebook namespace.
category_tree = {
    cat_id: (np.nan if pd.isna(parent_id) else parent_id)
    for cat_id, parent_id in zip(categor['cat_id'], categor['parent_id'])
}
labeled_train.shape, labeled_train['cat_id'].nunique(), len(categor_dict)

((716552, 4), 776, 490)

In [1]:
# NOTE(review): ad-hoc scratch calculation. 776 matches the number of distinct
# cat_id values printed above; where 418 comes from is not visible in this
# notebook — TODO explain or remove. This cell's execution counter is also out
# of sequence with its neighbours.
776-418

358

In [5]:
# Names are compared case-insensitively from here on.
labeled_train['source_name'] = labeled_train['source_name'].str.lower()

# For every distinct name, take the category that occurs most often for it.
most_common_cat_id = (
    labeled_train.groupby('source_name')['cat_id']
    .agg(lambda x: x.value_counts().index[0] if len(x) > 0 else None)
    .reset_index()
    .rename(columns={'cat_id': 'most_common_cat_id'})
)

# Overwrite each row's label with the majority label of its name.
labeled_train = (
    labeled_train
    .merge(most_common_cat_id, on='source_name', how='left')
    .assign(cat_id=lambda frame: frame['most_common_cat_id'])
    .drop(columns=['most_common_cat_id'])
)

labeled_train.shape, labeled_train['cat_id'].nunique()

((716552, 4), 773)

In [6]:
# Pseudo-labelling: an unlabeled item receives the category of every labeled
# item whose lower-cased attribute string is exactly the same.
unlabeled_train_temp = unlabeled_train.copy()
labeled_train_temp = labeled_train.copy()

labeled_train_temp['attributes'] = labeled_train_temp['attributes'].str.lower()
unlabeled_train_temp['attributes'] = unlabeled_train_temp['attributes'].str.lower()

# One row per distinct name on both sides.
labeled_train_temp = labeled_train_temp.drop_duplicates(["source_name"])
unlabeled_train_temp = unlabeled_train_temp.drop_duplicates(["source_name"])

# Drop rows that cannot act as a meaningful join key: the empty-attributes
# marker "[{}]" and missing values. Missing values must be removed explicitly
# because pandas' merge matches null keys to each other, which would copy
# labels between unrelated items that merely both lack attributes.
labeled_train_temp = labeled_train_temp[
    labeled_train_temp['attributes'].notna()
    & (labeled_train_temp['attributes'] != "[{}]")
]
unlabeled_train_temp = unlabeled_train_temp[
    unlabeled_train_temp['attributes'].notna()
    & (unlabeled_train_temp['attributes'] != "[{}]")
]

# Left join, then keep only the unlabeled rows that actually found a label.
unlabeled_train_temp = unlabeled_train_temp.merge(
    labeled_train_temp[['attributes', 'cat_id']],
    on='attributes',
    how='left'
).dropna(subset=['cat_id'])

# NOTE(review): this appends to `labeled_train`, so re-running the cell without
# re-running the cells above it adds the pseudo-labeled rows again.
labeled_train = pd.concat([labeled_train, unlabeled_train_temp], ignore_index=True)

In [7]:
# Lower-case again: rows appended from the unlabeled set have not been normalised yet.
labeled_train['source_name'] = labeled_train['source_name'].str.lower()

# Per name, the modal category (first of the modes if several are tied).
most_common_cat_id = (
    labeled_train.groupby('source_name')['cat_id']
    .agg(lambda x: x.mode().iloc[0])
    .reset_index()
    .rename(columns={'cat_id': 'most_common_cat_id'})
)

# Relabel every row with its name's modal category, then keep one row per name.
labeled_train = (
    labeled_train
    .merge(most_common_cat_id, on='source_name', how='left')
    .assign(cat_id=lambda frame: frame['most_common_cat_id'])
    .drop(columns=['most_common_cat_id'])
    .drop_duplicates(["source_name"])
)
labeled_train.shape, labeled_train['cat_id'].nunique()

((668362, 4), 773)

In [8]:
class_counts = labeled_train['cat_id'].value_counts()

# Keep only rows whose category has more than 10 examples.
frequent_categories = class_counts.loc[class_counts > 10].index
labeled_train_filtered = labeled_train[labeled_train['cat_id'].isin(frequent_categories)]

# Stratified 95/5 split so the hold-out set mirrors the class distribution.
train, test = train_test_split(
    labeled_train_filtered,
    stratify=labeled_train_filtered['cat_id'],
    test_size=0.05,
    random_state=42,
)

# Features are the item names, targets are the category ids.
X_train, X_test = train['source_name'], test['source_name']
y_train, y_test = train['cat_id'], test['cat_id']
labeled_train_filtered['cat_id'].nunique(), labeled_train_filtered['cat_id'].shape

(407, (667348,))

In [9]:
# Text features: word 1-3-gram TF-IDF (top 20000 terms, sublinear tf) reduced
# to 500 dense components with truncated SVD.
vectorizer = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 3),
        sublinear_tf=True,
    )),
    # Fixed seed: the default randomized SVD solver is stochastic, so without it
    # the features (and the pickled vectorizer) differ from run to run.
    ('svd', TruncatedSVD(n_components=500, random_state=42))
])

# Only the fitted state is needed here; the transformed matrix is not used.
# NOTE(review): the pipeline is fitted on every name in `labeled_train`, which
# includes the held-out `test` rows and the rare-class rows filtered out above —
# confirm this is intended when interpreting the validation metric.
vectorizer.fit(labeled_train['source_name'])

X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
# Multinomial-style logistic regression trained incrementally with SGD.
clf = SGDClassifier(
    loss='log_loss',
    penalty='l2',
    alpha=0.0001,
    learning_rate='optimal',
    random_state=42
)

# partial_fit needs the full label set up front, since a single batch may not
# contain every class.
classes = y_train.unique()


def batch_generator(X, y, batch_size=5000):
    """
    Yield consecutive (X, y) slices of at most `batch_size` rows.

    :param X: Sliceable feature container supporting len().
    :param y: Sliceable targets aligned with X.
    :param batch_size: Maximum number of rows per batch.
    :return: Generator of (X_batch, y_batch) pairs, wrapped in a tqdm progress bar.
    """
    for start in tqdm(range(0, len(X), batch_size)):
        yield X[start:start + batch_size], y[start:start + batch_size]


# Dedicated seeded generator: the per-epoch shuffles, and therefore the fitted
# model and the reported metric, are reproducible across runs.
shuffle_rng = np.random.default_rng(42)

# Plain array view of the targets so shuffling and batch slicing are purely
# positional regardless of the Series index.
y_train_values = np.asarray(y_train)

for epoch in range(2):
    print('Эпоха - ', epoch+1)

    # Fresh row order every epoch; features and targets share the permutation.
    indices = shuffle_rng.permutation(len(X_train_tfidf))
    X_train_shuffled = X_train_tfidf[indices]
    y_train_shuffled = y_train_values[indices]

    for X_batch, y_batch in batch_generator(X_train_shuffled, y_train_shuffled, batch_size=5000):
        clf.partial_fit(X_batch, y_batch, classes=classes)

    # Hold-out evaluation after every epoch.
    y_pred = clf.predict(X_test_tfidf)
    hda = hierarchical_accuracy_with_branch_check(y_pred, y_test, category_tree)
    print(f"Epoch {epoch+1} HDA: {hda:.4f}")


# Persist the classifier and the fitted feature pipeline for inference.
with open("baseline_model.pkl", "wb") as f:
    pickle.dump(clf, f)
with open("baseline_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

Эпоха -  1


  0%|          | 0/127 [00:00<?, ?it/s]

Epoch 1 HDA: 0.7572
Эпоха -  2


  0%|          | 0/127 [00:00<?, ?it/s]

Epoch 2 HDA: 0.7611
