# Файл с общим анализом данных и фрагментами функционала

### Загружаем данные

In [2]:
import pandas as pd

categoryes = pd.read_csv("training-data/retail-rocket/category_tree.csv")
events = pd.read_csv("training-data/retail-rocket/events.csv")
properties1 = pd.read_csv("training-data/retail-rocket/item_properties_part1.csv")
properties2 = pd.read_csv("training-data/retail-rocket/item_properties_part2.csv")

events = events.sort_values(by="timestamp")
properties = pd.concat([properties1, properties2])

#data truncation
# categoryes = categoryes[0:1000]
# events = events[0:1000]
# properties = properties1[0:1000]

### Обрабатываем категории

In [3]:
NONE = -1

sorted_categoryes = categoryes.fillna(NONE).sort_values(by="parentid")
sorted_categoryes["parentid"] = sorted_categoryes["parentid"].astype(int)
print(sorted_categoryes.dtypes)
sorted_categoryes.head()

categoryid    int64
parentid      int64
dtype: object


Unnamed: 0,categoryid,parentid
1476,1452,-1
1484,1182,-1
861,1490,-1
589,791,-1
1629,140,-1


#### Представляем категории в виде дерева

In [3]:
from app.modules.categories import Tree, TreeNode


# заполняем словарь parent - childrens
category_map: dict[int, list[int]] = {}
for _, row in sorted_categoryes.iterrows():
    parentid = int(row['parentid'])
    categoryid = int(row['categoryid'])
    
    if parentid not in category_map:
        category_map[parentid] = []
    category_map[parentid].append(categoryid)

# заполняем дерево
categoryes_tree = Tree()

stack: list[TreeNode] = [categoryes_tree._root]
while len(stack) > 0:
    node = stack.pop()
    
    key = node.value if node.value != "" else NONE
    if key not in category_map:
        continue
        
    for child_value in category_map[key]:
        stack.append(categoryes_tree.append_to(node, child_value))



In [None]:
categoryes_tree._root.childrens

### Обрабатывем события

Находим уникальные события, чтобы создать EventEnum

In [4]:
events["event"].unique()

array(['addtocart', 'view', 'transaction'], dtype=object)

находим последовательность действий для всех пользователей

In [4]:
import numpy as np
from app.common.schemas import Event
from app.common.enums import EventEnum


users_events: dict[int, list[Event]] = {}
for _, user_event in events.iterrows():
    pyd_event = Event(
        event=EventEnum(user_event["event"]),
        user_id=user_event["visitorid"],
        item_id=user_event["itemid"],
        transaction_id=user_event["transactionid"] if not np.isnan(user_event["transactionid"]) else None,
        timestamp=user_event["timestamp"]
    )
    if users_events.get(user_event["visitorid"]) is None:
        users_events[user_event["visitorid"]] = [pyd_event]
    else:
        users_events[user_event["visitorid"]].append(pyd_event)

### Обрабатывем параметры предметов

```python
formalizable_properties: dict[
    int,  # property
    int   # индекс формализуемости: число_использований/число_всех_значений_параметра
]
```

In [6]:
# удаляем параметры присущие только 5-ти процентам предметов
properties_counts = properties["property"].value_counts()
properties_counts = properties_counts[properties_counts > int(len(properties_counts)*0.9)]
properties_counts = properties_counts.index.tolist()

# создаём таблицу формализуемости параметров
stat_properties: dict[int, list[int, list[int]]] = {}
for _, item_property in properties.iterrows():
    if item_property["property"] not in properties_counts:
        continue
    if stat_properties.get(item_property["property"]) is None:
        stat_properties[item_property["property"]] = [1, [item_property["value"]]]
    elif item_property["value"] in stat_properties[item_property["property"]][1]:
        stat_properties[item_property["property"]][0] += 1
    else:
        stat_properties[item_property["property"]][0] += 1
        stat_properties[item_property["property"]][1].append(item_property["value"])

# находим наиболее формализуемые параметры
formalizable_properties = {prop: stat[0]/len(stat[1]) for prop, stat in stat_properties.items()}

# разделяем параметры по популярности
popular_properties = properties_counts[0:int(len(properties_counts)/3)]
standart_properties = properties_counts[int(len(properties_counts)/3):int(len(properties_counts)*2/3)]
unpopular_properties = properties_counts[int(len(properties_counts)*2/3):int(len(properties_counts))]

KeyboardInterrupt: 

сохраняем полученные результаты

In [None]:
import json
from pathlib import Path


with Path("training-data/combined/popular_properties.json").open("w") as f:
    json.dump(popular_properties, f)
with Path("training-data/combined/standart_properties.json").open("w") as f:
    json.dump(standart_properties, f)
with Path("training-data/combined/unpopular_properties.json").open("w") as f:
    json.dump(unpopular_properties, f)
with Path("training-data/combined/formalizable_properties.json").open("w") as f:
    json.dump(formalizable_properties, f)

создаём таблицу со всеми снимками срстояний параметров для каждого параметра
```python
item_properties: dict[
    int, # itemid
    dict[ # все property для itemid
        int, # property
        list[ # список всех состояний для property
            tuple[
                int, # timestamp
                str  # value
            ]
        ]
    ]
]
```

In [7]:
item_properties: dict[int, dict[int, list[tuple[int, str]]]] = {}
item_avalable: dict[int, list[tuple[int, bool]]] = {}
item_category: dict[int, list[tuple[int, int]]] = {}
for _, values in properties.iterrows():
    if values["property"] == "categoryid":
        if item_category.get(values["itemid"]) is None:
            item_category[values["itemid"]] = [(values["timestamp"], values["value"])]
        else:
            item_category[values["itemid"]].append((values["timestamp"], values["value"]))
        continue

    if values["property"] == "available":
        if item_avalable.get(values["itemid"]) is None:
            item_avalable[values["itemid"]] = [(values["timestamp"], values["value"])]
        else:
            item_avalable[values["itemid"]].append((values["timestamp"], values["value"]))
        continue

    if item_properties.get(values["itemid"]) is None:
        item_properties[values["itemid"]] = {
            values["property"]: [(values["timestamp"], values["value"])]
        }
    elif item_properties[values["itemid"]].get(values["property"]) is None:
        item_properties[values["itemid"]][values["property"]] = [(values["timestamp"], values["value"])]
    else:
        item_properties[values["itemid"]][values["property"]].append((values["timestamp"], values["value"]))

сохраняем полученные результаты

In [5]:
import json
import itertools
from pathlib import Path
from typing import Any

In [6]:
with Path("training-data/combined/item_properties.json").open("w") as f:
    json.dump(item_properties, f)
with Path("training-data/combined/item_avalable.json").open("w") as f:
    json.dump(item_avalable, f)
with Path("training-data/combined/item_category.json").open("w") as f:
    json.dump(item_category, f)

NameError: name 'item_properties' is not defined

In [None]:
f1 = {
    key: [
        {
            'event': str(value.event),
            'user_id': value.user_id,
            'item_id': value.item_id,
            'transaction_id': value.transaction_id,
            'timestamp': value.timestamp.timestamp()
        } for value in values
    ] for key, values in users_events.items()
}
with Path("training-data/combined/users_events.json").open("w") as f:
    json.dump(f1, f)
short_users_events:dict[int, list[dict[str, Any]]]=dict(itertools.islice(f1.items(), int(len(f1)/500)))
with Path("training-data/combined/short_users_events.json").open("w") as f:
    json.dump(short_users_events, f)
    
    

In [None]:
with Path("training-data/combined/item_properties.json").open("r") as f:
    item_properties=dict(json.loads(f.read()))
with Path("training-data/combined/formalizable_properties.json").open("r") as f:
    formalizable_properties=dict(json.loads(f.read()))
with Path("training-data/combined/item_category.json").open("r") as f:
    item_category=dict(json.loads(f.read()))
with Path("training-data/combined/short_users_events.json").open("r") as f:
    short_users_events=dict(json.loads(f.read()))

In [None]:
short_item_properties: dict[int, dict[int, list[tuple[int, str]]]] = {}
short_formalizable_properties: dict[int, float] = {}
short_item_category: dict[int, list[tuple[int, int]]] = {}
for events in short_users_events.values():
    for event in events:
        if short_item_properties.get(event["item_id"]) is None and item_properties.get(str(event["item_id"])) is not None:
            short_item_properties[event["item_id"]] = item_properties[str(event["item_id"])]
        if short_item_category.get(event["item_id"]) is None and item_category.get(str(event["item_id"])) is not None:
            short_item_category[event["item_id"]] = item_category[str(event["item_id"])]
for props in short_item_properties.values():
    for prop in props.keys():
        if short_formalizable_properties.get(prop) is None and formalizable_properties.get(str(prop)) is not None:
            short_formalizable_properties[prop] = formalizable_properties[str(prop)]

In [None]:
with Path("training-data/combined/short_item_properties.json").open("w") as f:
    json.dump(short_item_properties, f)
with Path("training-data/combined/short_item_category.json").open("w") as f:
    json.dump(short_item_category, f) 
with Path("training-data/combined/short_formalizable_properties.json").open("w") as f:
    json.dump(short_formalizable_properties, f)

In [None]:
short_item_category: dict[int, list[tuple[int, int]]] = {}
for events in short_users_events.values():
    for event in events:
        if (
            short_item_category.get(event["item_id"]) is None
            and short_item_category.get(event["item_id"]) is not None
            and formalizable_properties.get(event["item_id"]) is not None
        ):
            short_item_category[event["item_id"]] = item_category[event["item_id"]]
with Path("training-data/combined/short_item_category.json").open("w") as f:
    json.dump(short_item_category, f)

Используем сохраненные данные, очищая ОЗУ от предыдущих значений (нужно перезапустить ipynb)

In [12]:
import json
from pathlib import Path
from app.common.schemas import Event

with Path("training-data/combined/short_users_events.json").open("r") as f:
    short_users_events=dict(json.loads(f.read()))
users_events: dict[int, list[Event]] = {
    int(visitor_id): [
        Event.model_validate(event) for event in user_events
    ] for visitor_id, user_events in short_users_events.items()
}

with Path("training-data/combined/short_item_properties.json").open("r") as f:
    item_properties=dict(json.loads(f.read()))
with Path("training-data/combined/short_item_category.json").open("r") as f:
    item_category=dict(json.loads(f.read()))
with Path("training-data/combined/short_formalizable_properties.json").open("r") as f:
    short_formalizable_properties=dict(json.loads(f.read()))

        


In [13]:
len(users_events)

1407580

In [None]:
from app.common.schemas import TrainingEvent
from app.common.enums import SeasonEnum

training_events: list[list[TrainingEvent]] =[]
for user_events in users_events.values():
    prev_event: Event | None = None
    training_user_events = []
    for user_event in sorted(user_events, key=lambda event: event.timestamp):
        training_event = TrainingEvent(
            event=user_event.event,
            season=SeasonEnum((user_event.timestamp.month % 12) // 3),
            timedelta_days=(user_event.timestamp - prev_event.timestamp).days if prev_event is not None else 0,
        )
        
        actual_timestamp: int = 0
        actual_category: int = 0
        if item_category.get(str(user_event.item_id)) is not None:
            for timestamp, category in item_category[str(user_event.item_id)]:
                if user_event.timestamp.timestamp() - timestamp < user_event.timestamp.timestamp() - actual_timestamp and user_event.timestamp.timestamp() - timestamp >= 0:
                    actual_timestamp = timestamp
                    actual_category = category
            training_event.item_category = actual_category
        
        form_factor = 0
        if item_properties.get(str(user_event.item_id)) is not None:
            for item_property, property_values in item_properties[str(user_event.item_id)].items():
                if short_formalizable_properties.get(item_property) is not None and short_formalizable_properties[item_property] > form_factor:
                    actual_timestamp: int = 0
                    actual_value: str = ""
                    for timestamp, values in property_values:
                        if user_event.timestamp.timestamp() - timestamp/1000 < user_event.timestamp.timestamp() - actual_timestamp and user_event.timestamp.timestamp() - timestamp/1000 >= 0:
                            actual_timestamp = int(timestamp/1000)
                            actual_value = values
                    if actual_value != "":
                        training_event.item_property = item_property
                        training_event.property_value = actual_value
                        form_factor = short_formalizable_properties[item_property]
        
        training_user_events.append(training_event)
        prev_event = user_event
    
    training_events.append(training_user_events)


In [26]:
import random
training_events[random.randint(0, len(training_events)-1)]

[TrainingEvent(event=<EventEnum.VIEW: 'view'>, item_category=0, item_property=0, property_value='', season=<SeasonEnum.SPRING: 1>, timedelta_days=0)]

In [None]:
from app.common.enums import EventEnum

props: list[int | None] = []
values: list[str | None] = []
categories: list[int | None] = []

for user_training_event in training_events:
    for training_event in user_training_event:
        if training_event.item_property not in props:
            props.append(training_event.item_property)
        if training_event.property_value not in values:
            values.append(training_event.property_value)
        if training_event.item_category not in categories:
            categories.append(training_event.item_category)
    

num_propeties = len(props)
num_propet_values = len(values)
num_categories = len(categories)
num_actions = len(EventEnum)

In [None]:
from app.modules.ml.model import prepare_data

all_actions: list[list[int]] = []
all_properties: list[list[int]] = []
all_values: list[list[str]] = []
all_categories: list[list[int]] = []
all_deltas: list[list[int]] = []
all_seasons: list[list[int]] = []

all_target_actions: list[list[int]] = []
all_target_properties: list[list[int]] = []
all_target_values: list[list[str]] = []
all_target_categories: list[list[int]] = []
for training_event in training_events:
    data = prepare_data(training_event)
    if data is not None:
        (
            actions,
            properties,
            values,
            categories,
            deltas,
            seasons,
            target_actions,
            target_properties,
            target_values,
            target_categories
        ) = data
        all_actions.extend(actions)
        all_properties.extend(properties)
        all_values.extend(values)
        all_categories.extend(categories)
        all_deltas.extend(deltas)
        all_seasons.extend(seasons)
        all_target_actions.extend(target_actions)
        all_target_properties.extend(target_properties)
        all_target_values.extend(target_values)
        all_target_categories.extend(target_categories)

In [32]:
from keras.layers import StringLookup

# Преобразуем значения в строки (заменяем None на пустую строку)
all_values_str = []
for values in all_values:
    values_str = [str(v) if v is not None else "" for v in values]
    all_values_str.append(values_str)

all_target_values_str = []
for target_values in all_target_values:
    target_values_str = [str(v) if v is not None else "" for v in target_values]
    all_target_values_str.append(target_values_str)

# Создаем vocabulary, исключая пустую строку (которая будет mask_token)
vocab_set = set()
for values in all_values_str + all_target_values_str:
    for value in values:
        if value != "":  # Исключаем пустую строку
            vocab_set.add(value)

vocab = sorted(list(vocab_set))

# Создаем lookup слой без указания mask_token в vocabulary
lookup_layer = StringLookup(vocabulary=vocab, output_mode="int", mask_token="")

# Теперь преобразуем данные
vectorized_values = []
for values in all_values_str:
    # Преобразуем None/пустые строки в маскирующий токен
    encoded = lookup_layer(values)
    vectorized_values.append(encoded)

target_vectorized_values = []
for target_values in all_target_values_str:
    encoded = lookup_layer(target_values)
    target_vectorized_values.append(encoded)

In [8]:
import numpy as np

# Преобразуем в numpy массивы
all_actions = np.array(all_actions, dtype=np.int32)
all_properties = np.array(all_properties, dtype=np.int32)
vectorized_values = np.array(vectorized_values, dtype=np.int32)
all_categories = np.array(all_categories, dtype=np.int32)
all_deltas = np.array(all_deltas, dtype=np.float32)
all_seasons = np.array(all_seasons, dtype=np.int32)

all_target_actions = np.array(all_target_actions, dtype=np.int32)
all_target_properties = np.array(all_target_properties, dtype=np.int32)
target_vectorized_values = np.array(target_vectorized_values, dtype=np.int32)
all_target_categories = np.array(all_target_categories, dtype=np.int32)

# Добавляем размерность для временных дельт
all_deltas = np.expand_dims(all_deltas, axis=-1)

In [11]:
# Преобразуем train_targets и val_targets в one-hot формат
def convert_to_onehot(targets_list, num_classes_list):
    """Преобразует список целочисленных меток в one-hot формат"""
    onehot_list = []
    for i, targets in enumerate(targets_list):
        # targets: (n_samples, SEQ_LENGTH)
        # Нужно преобразовать в (n_samples, SEQ_LENGTH, num_classes)
        n_samples, seq_len = targets.shape
        num_classes = num_classes_list[i]
        
        # Преобразуем в one-hot
        onehot = np.zeros((n_samples, seq_len, num_classes), dtype=np.float32)
        for s in range(n_samples):
            for t in range(seq_len):
                idx = targets[s, t]
                if idx < num_classes:  # Проверка на выход за границы
                    onehot[s, t, idx] = 1.0
        onehot_list.append(onehot)
    return onehot_list

# Определяем количество классов для каждого выхода
num_classes_list = [
    num_actions,        # для action_output
    num_propeties,      # для property_output  
    num_propet_values,  # для value_output
    num_categories      # для category_output
]

# Преобразуем целевые данные в one-hot
train_targets_onehot = convert_to_onehot(train_targets, num_classes_list)
val_targets_onehot = convert_to_onehot(val_targets, num_classes_list)

In [None]:
from keras.utils import to_categorical
# После получения train_inputs, val_inputs, train_targets, val_targets
# train_inputs имеет форму (6, n_samples, SEQ_LENGTH), но нужно (n_samples, SEQ_LENGTH) для каждого входа

# Распаковываем train_inputs
train_inputs_unpacked = [
    train_inputs[0],  # all_actions
    train_inputs[1],  # all_properties
    train_inputs[2],  # vectorized_values
    train_inputs[3],  # all_categories
    train_inputs[4],  # all_deltas (уже с добавленной размерностью)
    train_inputs[5],  # all_seasons
]

val_inputs_unpacked = [
    val_inputs[0],  # all_actions
    val_inputs[1],  # all_properties
    val_inputs[2],  # vectorized_values
    val_inputs[3],  # all_categories
    val_inputs[4],  # all_deltas
    val_inputs[5],  # all_seasons
]

# Преобразуем train_targets и val_targets в one-hot формат
train_targets_onehot = []
val_targets_onehot = []

num_classes_list = [
    num_actions,
    num_propeties,
    num_propet_values,
    num_categories
]

# Проверяем, что у нас есть данные для обучения
if len(train_targets) > 0 and len(train_targets[0]) > 0:
    for i in range(4):  # 4 целевых выхода
        n_samples = train_targets[i].shape[0]
        seq_len = train_targets[i].shape[1] if len(train_targets[i].shape) > 1 else 1
        num_classes = num_classes_list[i]
        
        # Преобразуем в one-hot
        if len(train_targets[i].shape) == 2:
            train_target_flat = train_targets[i].reshape(-1)
            train_onehot_flat = to_categorical(train_target_flat, num_classes=num_classes)
            train_onehot = train_onehot_flat.reshape(n_samples, seq_len, num_classes)
        else:
            train_onehot = to_categorical(train_targets[i], num_classes=num_classes)
            
        train_targets_onehot.append(train_onehot)
        
        # Аналогично для val_targets
        n_samples_val = val_targets[i].shape[0]
        if len(val_targets[i].shape) == 2:
            val_target_flat = val_targets[i].reshape(-1)
            val_onehot_flat = to_categorical(val_target_flat, num_classes=num_classes)
            val_onehot = val_onehot_flat.reshape(n_samples_val, seq_len, num_classes)
        else:
            val_onehot = to_categorical(val_targets[i], num_classes=num_classes)
            
        val_targets_onehot.append(val_onehot)

# Убедимся, что все массивы имеют правильную форму
print("Проверка форм данных:")
for i, inp in enumerate(train_inputs_unpacked):
    print(f"train_inputs[{i}].shape = {inp.shape}")

for i, tar in enumerate(train_targets_onehot):
    print(f"train_targets_onehot[{i}].shape = {tar.shape}")

IndexError: index 3 is out of bounds for axis 1 with size 3

In [None]:
all_sequences = []
seq_length = 10

for i in range(len(all_actions)):
    all_sequences.append(
        {
            "actions": all_actions[i],
            "params": [int(param) if param != '' else 0 for param in all_properties[i]],
            "values": [int(val.replace(" ", "")) if val != '' else 0 for val in all_values[i]],
            "categories": all_categories[i],
            "days_since_prev": all_deltas[i],
            "seasons": all_seasons[i],
        }
    )


In [47]:
from app.modules.ml.sasrec_model import ComplexSASRec
# Разделяем на train/val
train_size = int(0.8 * len(all_sequences))
train_sequences = all_sequences[:train_size]
val_sequences = all_sequences[train_size:]

# Определяем размеры словарей

# Создаем модель
print("Инициализация модели...")
model = ComplexSASRec(
    n_actions=num_actions,
    n_params=num_propeties,
    n_values=num_propet_values,
    n_categories=num_categories,
    max_seq_length=11,
    embedding_dim=32,
    num_blocks=2,
    num_heads=2,
    dropout_rate=0.1,
    learning_rate=1e-3,
    batch_size=16,
    num_epochs=5,
)

# Обучаем модель
print("Обучение модели...")
model.train_on_sequences(train_sequences, val_sequences)

# Тестируем предсказание
print("\nТестирование предсказаний...")
test_sequence = {
    "actions": [1, 2, 3, 1, 2, 3, 1, 2],
    "params": [1, 3, 2, 1, 3, 2, 1, 3],
    "values": [5, 10, 7, 5, 10, 7, 5, 10],
    "categories": [1, 1, 2, 1, 2, 1, 2, 1],
    "days_since_prev": [0.0, 1.0, 0.5, 2.0, 0.0, 1.5, 0.5, 2.0],
    "seasons": [1, 1, 2, 2, 3, 3, 4, 4],
}

prediction = model.predict_next_event(test_sequence)

print("\nПредсказание следующего события:")
for key, value in prediction.items():
    print(f"{key}: {value}")

Инициализация модели...
Обучение модели...


IndexError: index out of range in self

In [12]:
history = model.fit(
    train_inputs,
    {
        "action_output": train_targets_onehot[0],
        "property_output": train_targets_onehot[1],
        "value_output": train_targets_onehot[2],
        "category_output": train_targets_onehot[3]
    },
    epochs=50,
    batch_size=32,
    validation_data=(
        val_inputs,
        {
            "action_output": val_targets_onehot[0],
            "property_output": val_targets_onehot[1],
            "value_output": val_targets_onehot[2],
            "category_output": val_targets_onehot[3]
        }
    ),
)

Epoch 1/50


ValueError: Attr 'Toutput_types' of 'OptionalFromValue' Op passed list of length 0 less than minimum 1.