# The Autocoder for music recommendtion system

In [101]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import torch
import torch.nn as nn
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datasets import Dataset, DatasetDict, load_dataset
from typing import Literal
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.impute import SimpleImputer

In [3]:
music_dataset = load_dataset("yandex/yambda", "flat-50m", streaming=True)

In [4]:
class YambdaDataset:
    INTERACTIONS = frozenset([
        "likes", "listens", "multi_event", "dislikes", "unlikes", "undislikes"
    ])

    def __init__(
        self,
        dataset_type: Literal["flat", "sequential"] = "flat",
        dataset_size: Literal["50m", "500m", "5b"] = "50m"
    ):
        self.dataset_type = dataset_type
        self.dataset_size = dataset_size

    def interaction(self, event_type: Literal[
        "likes", "listens", "multi_event", "dislikes", "unlikes", "undislikes"
    ]) -> Dataset:
        return self._download(f"{self.dataset_type}/{self.dataset_size}", f"{event_type}.parquet")

    def audio_embeddings(self) -> Dataset:
        return self._download("", "embeddings.parquet")

    def album_item_mapping(self) -> Dataset:
        return self._download("", "album_item_mapping.parquet")

    def artist_item_mapping(self) -> Dataset:
        return self._download("", "artist_item_mapping.parquet")

    def genre_item_mapping(self) -> Dataset:
        return self._download("", "genre_item_mapping.parquet")

    def user_item_mapping(self) -> Dataset:
        return self._download("", "user_item_mapping.parquet")

    @staticmethod
    def _download(data_dir: str, file: str) -> Dataset:
        data = load_dataset("yandex/yambda", data_dir=data_dir, data_files=file)
        return data["train"]



In [5]:
# Инициализация загрузчика
dataset = YambdaDataset("flat", "50m")

listens_ds = dataset.interaction("listens").to_pandas()
likes_ds = dataset.interaction("likes").to_pandas()
dislikes_ds = dataset.interaction("dislikes").to_pandas()
unlikes_ds = dataset.interaction("unlikes").to_pandas()
undislikes_ds = dataset.interaction("undislikes").to_pandas()
# embeddings_ds = dataset.audio_embeddings()

In [6]:
# Вывод шапок
print(listens_ds.head())
print(likes_ds.head())
print(dislikes_ds.head())
print(unlikes_ds.head())
print(undislikes_ds.head())

   uid  timestamp  item_id  is_organic  played_ratio_pct  track_length_seconds
0  100      39420  8326270           0               100                   170
1  100      39420  1441281           0               100                   105
2  100      39625   286361           0               100                   185
3  100      40110   732449           0               100                   240
4  100      40360  3397170           0                46                   130
   uid  timestamp  item_id  is_organic
0  100      44755   732449           1
1  100    1155860  6568592           0
2  100    1259125  5411243           1
3  100    1260005  7371186           0
4  100    1263935  4943655           0
   uid  timestamp  item_id  is_organic
0  100    3087560  9170134           1
1  100    3936560  8661238           1
2  100   20432825  4927727           1
3  400    2980800   399811           1
4  400    3157520  1575108           1
   uid  timestamp  item_id  is_organic
0  100    3087555  

In [7]:
# Создаём  таблицу только с нужными колонками и флагом
listens = listens_ds[['uid', 'item_id', 'played_ratio_pct']].copy()

likes = likes_ds[['uid', 'item_id']].copy()
likes['liked'] = True

dislikes = dislikes_ds[['uid', 'item_id']].copy()
dislikes['disliked'] = True

In [8]:
listens = listens.merge(
    likes,
    on=['uid', 'item_id'],
    how='left'
)

listens['liked'] = listens['liked'].fillna(False)

listens = listens.merge(
    dislikes,
    on=['uid', 'item_id'],
    how='left'
)

listens['disliked'] = listens['disliked'].fillna(False)

print(listens.head())

   uid  item_id  played_ratio_pct  liked  disliked
0  100  8326270               100  False     False
1  100  1441281               100  False     False
2  100   286361               100  False     False
3  100   732449               100   True     False
4  100  3397170                46  False     False


In [11]:
# Поиск NAN, выбросов и дубликатов
print(listens.isnull().sum())
listens[listens.duplicated(keep=False)]

uid                 0
item_id             0
played_ratio_pct    0
liked               0
disliked            0
dtype: int64


Unnamed: 0,uid,item_id,played_ratio_pct,liked,disliked
0,100,8326270,100,False,False
2,100,286361,100,False,False
3,100,732449,100,True,False
5,100,7849270,100,False,False
6,100,1449307,100,False,False
...,...,...,...,...,...
47166492,1000000,1578810,99,False,False
47166520,1000000,4952290,100,False,False
47166531,1000000,6320264,100,False,False
47166550,1000000,1464852,99,False,False


In [14]:
# Create interaction column
listens['interaction'] = listens['liked'] * 1 + listens['disliked'] * -1

# Drop liked and disliked columns
listens = listens.drop(columns=['liked', 'disliked'])

# Display the transformed DataFrame
print(listens)

              uid  item_id  played_ratio_pct  interaction
0             100  8326270               100            0
1             100  1441281               100            0
2             100   286361               100            0
3             100   732449               100            1
4             100  3397170                46            0
...           ...      ...               ...          ...
47166555  1000000  3369589                99            0
47166556  1000000  8120372                99            0
47166557  1000000  1578810                99            0
47166558  1000000  3732104               100            0
47166559  1000000  2978154                74            0

[47166560 rows x 4 columns]


In [16]:
stats_df = listens.describe()
stats_df.to_excel("statistic.xlsx")

In [17]:
# Calculate total number of elements in the table
total_elements = listens.size

# Count zero or empty values
zero_or_empty_count = (listens == 0).sum().sum() + listens.isnull().sum().sum()

# Calculate sparsity
sparsity = zero_or_empty_count / total_elements

print(f"Sparsity of the listens table: {sparsity:.4f}")

Sparsity of the listens table: 0.2128


In [18]:
# Calculate standard sparsity: proportion of zero or empty values to total elements
# For boolean columns, False is considered sparse
num_elements = listens.size
num_sparse_elements = (listens == False).sum().sum() + listens.isnull().sum().sum()
standard_sparsity = num_sparse_elements / num_elements

# Calculate logical sparsity: rows where both liked and disliked are False
logical_sparse_rows = listens[(listens['interaction'] == 0) & (listens['played_ratio_pct'] < 70)]
logical_sparsity = len(logical_sparse_rows) / len(listens)

print(f"Standard Sparsity: {standard_sparsity:.2f}")
print(f"Logical Sparsity: {logical_sparsity:.2f}")


Standard Sparsity: 0.21
Logical Sparsity: 0.30


In [86]:
encoder = LabelEncoder()
listens['uid_enc'] = encoder.fit_transform(listens['uid'])
listens['item_id_enc'] = encoder.fit_transform(listens['item_id'])

In [87]:
# Удаляем треки, которые слушали менее 25 раз
track_counts = listens.groupby('item_id_enc').size()
valid_tracks = track_counts[track_counts >= 25].index
listens = listens[listens['item_id_enc'].isin(valid_tracks)]

# Удаляем пользователей, которые не поставили ни одного лайка
user_likes = listens.groupby('uid')['interaction'].sum()
valid_users = user_likes[user_likes > 0].index
listens = listens[listens['uid'].isin(valid_users)]

# Display the transformed
print(listens)

              uid  item_id  played_ratio_pct  interaction  uid_enc   
0             100  8326270               100            0        0  \
1             100  1441281               100            0        0   
2             100   286361               100            0        0   
3             100   732449               100            1        0   
4             100  3397170                46            0        0   
...           ...      ...               ...          ...      ...   
47166555  1000000  3369589                99            0     7125   
47166556  1000000  8120372                99            0     7125   
47166557  1000000  1578810                99            0     7125   
47166558  1000000  3732104               100            0     7125   
47166559  1000000  2978154                74            0     7125   

          item_id_enc  item_id_freq  
0              128364          5014  
1               22169            57  
2                4463          1464  
3      

In [102]:
# Масштабируем поведенческие признаки
scaler_behavior = StandardScaler()
behavior_scaled = scaler_behavior.fit_transform(listens[['played_ratio_pct', 'interaction']])

# Масштабируем категориальные признаки (если не используешь Embedding)
scaler_uid = MinMaxScaler()
scaler_item = MinMaxScaler()

uid_scaled = scaler_uid.fit_transform(listens[['uid_enc']])
item_scaled = scaler_item.fit_transform(listens[['item_id_enc']])
combined_array = np.hstack([behavior_scaled, uid_scaled, item_scaled])

combined_df = pd.DataFrame(
    combined_array,
    columns=['played_ratio_scaled', 'interaction_scaled', 'uid_scaled', 'item_scaled']
)

stats = combined_df.describe()
print(stats)


       played_ratio_scaled  interaction_scaled    uid_scaled   item_scaled
count         3.951092e+07        3.951092e+07  3.951092e+07  3.951092e+07
mean         -1.336530e-16        7.333292e-17  4.972041e-01  5.024049e-01
std           1.000000e+00        1.000000e+00  2.879984e-01  2.876649e-01
min          -1.454058e+00       -2.821177e+00  0.000000e+00  0.000000e+00
25%          -1.318251e+00       -5.571720e-01  2.474386e-01  2.554118e-01
50%           8.093802e-01       -5.571720e-01  4.936140e-01  5.020597e-01
75%           8.093802e-01        1.706833e+00  7.454035e-01  7.510506e-01
max           2.144809e+00        1.706833e+00  1.000000e+00  1.000000e+00


In [103]:
X_train, X_test = train_test_split(combined_df, test_size=0.2, random_state=42)

In [104]:
input_dim = combined_df.shape[1]
encoding_dim = 2

In [105]:
# Размер входного вектора
input_layer = Input(shape=(input_dim,))

# ЭНКОДЕР
x = Dense(128)(input_layer)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.3)(x)

x = Dense(64)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.3)(x)

encoded = Dense(32, activation='relu')(x)  # Боттлнек

# ДЕКОДЕР (зеркально)
x = Dense(64)(encoded)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.3)(x)

x = Dense(128)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.3)(x)

# Выходной слой
decoded = Dense(input_dim, activation='linear')(x)

# Компиляция модели
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.0001), loss='mse')

# Структура модели
autoencoder.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 4)]               0         
                                                                 
 dense_36 (Dense)            (None, 128)               640       
                                                                 
 batch_normalization_17 (Bat  (None, 128)              512       
 chNormalization)                                                
                                                                 
 activation_16 (Activation)  (None, 128)               0         
                                                                 
 dropout_22 (Dropout)        (None, 128)               0         
                                                                 
 dense_37 (Dense)            (None, 64)                8256      
                                                           

In [106]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

checkpoint = ModelCheckpoint(filepath='best_autoencoder.h5', monitor='val_loss', save_best_only=True, verbose=1)

autoencoder.fit(X_train, X_train, epochs=10, batch_size=2024, shuffle=True, 
                validation_data=(X_test, X_test), callbacks=[early_stop, checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.00308, saving model to best_autoencoder.h5
Epoch 2/10
Epoch 2: val_loss improved from 0.00308 to 0.00210, saving model to best_autoencoder.h5
Epoch 3/10
Epoch 3: val_loss did not improve from 0.00210
Epoch 4/10
Epoch 4: val_loss did not improve from 0.00210
Epoch 5/10
   21/15617 [..............................] - ETA: 8:21 - loss: 0.0150

KeyboardInterrupt: 

In [107]:
loss = autoencoder.evaluate(X_test, X_test)
print(f"Test reconstruction loss: {loss:.4f}")

# Получаем реконструированные данные
X_pred = autoencoder.predict(X_test)

Test reconstruction loss: 0.0022


In [108]:
# Выбираем один пример для отображения
example_index = np.random.randint(0, X_test.shape[0])
original = X_test[example_index]
reconstructed = X_pred[example_index]

# Строим график
plt.figure(figsize=(12, 6))
plt.plot(original, label='Оригинальные данные', color='blue', marker="o" )
plt.plot(reconstructed, label='Реконструкция автокодера', color='green', linestyle='dashed', marker="o")
plt.title('Сравнение оригинала и реконструкции')
plt.xlabel('Признаки / временные шаги')
plt.ylabel('Значение')
plt.legend()
plt.grid(True)
plt.show()

KeyError: 2632515

In [None]:
# # Сохраняем модель в формате TensorFlow SavedModel
# autoencoder.save('Models/Autoencoder_Music_Recommendation', save_format="keras")
autoencoder.save('Models/Autoencoder_Music_Recommendation_low_loss', save_format="keras")

In [36]:
# Для формата SavedModel
# autoencoder = tf.keras.models.load_model('Models/Autoencoder_Music_Recommendation.keras')

autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2)]               0         
                                                                 
 dense_1 (Dense)             (None, 32)                96        
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 2)                 34        
                                                                 
Total params: 658
Trainable params: 658
Non-trainable params:

In [109]:
# Получаем реконструкции
# X_pred = autoencoder.predict(X_test)

# Вычисляем ошибку реконструкции для каждого примера
errors = np.mean(np.square(X_test - X_pred), axis=1)

stats_error = pd.Series(errors).describe()
stats_error.to_excel("statistic_error.xlsx")

# Статистика
print(f"Средняя ошибка реконструкции: {np.mean(errors):.6f}")
print(f"Медианная ошибка: {np.median(errors):.6f}")
print(f"Максимальная ошибка: {np.max(errors):.6f}")
print(f"Минимальная ошибка: {np.min(errors):.6f}")

# # Гистограмма ошибок
# plt.figure(figsize=(10, 5))
# plt.hist(errors, bins=100, color='teal', alpha=0.7)
# plt.title('Распределение ошибки реконструкции')
# plt.xlabel('Ошибка (MSE)')
# plt.ylabel('Количество примеров')
# plt.grid(True)
# plt.show()

Средняя ошибка реконструкции: 0.002224
Медианная ошибка: 0.000688
Максимальная ошибка: 0.566495
Минимальная ошибка: 0.000000
