In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
from statsmodels.distributions.empirical_distribution import ECDF
import matplotlib.pyplot as plt
from torch.utils.data import Dataset

  import pandas.util.testing as tm


In [5]:
users = pd.read_csv(
    "users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    encoding='latin-1',
    engine='python'
)

ratings = pd.read_csv(
    "ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    encoding='latin-1',
    engine='python'
)

movies = pd.read_csv(
    "movies.dat", sep="::", names=["movie_id", "title", "genres"],
    encoding='latin-1',
    engine='python'
)

In [3]:
users[:2]

Unnamed: 0,user_id,sex,age_group,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072


In [79]:
ratings[:2]

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


In [80]:
movies[:2]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [6]:
ratings_df = pd.merge(ratings, movies)[['user_id', 'title', 'rating', 'unix_timestamp']]
ratings_df["user_id"] = ratings_df["user_id"].astype(str)

In [82]:
ratings_per_user = ratings_df.groupby('user_id').rating.count()
ratings_per_item = ratings_df.groupby('title').rating.count()

print(f"Total No. of users: {len(ratings_df.user_id.unique())}")
print(f"Total No. of items: {len(ratings_df.title.unique())}")
print("\n")

print(f"Max observed rating: {ratings_df.rating.max()}")
print(f"Min observed rating: {ratings_df.rating.min()}")
print("\n")

print(f"Max no. of user ratings: {ratings_per_user.max()}")
print(f"Min no. of user ratings: {ratings_per_user.min()}")
print(f"Median no. of ratings per user: {ratings_per_user.median()}")
print("\n")

print(f"Max no. of item ratings: {ratings_per_item.max()}")
print(f"Min no. of item ratings: {ratings_per_item.min()}")
print(f"Median no. of ratings per item: {ratings_per_item.median()}")

Total No. of users: 6040
Total No. of items: 3706


Max observed rating: 5
Min observed rating: 1


Max no. of user ratings: 2314
Min no. of user ratings: 20
Median no. of ratings per user: 96.0


Max no. of item ratings: 3428
Min no. of item ratings: 1
Median no. of ratings per item: 123.5


In [83]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   user_id         1000209 non-null  object
 1   title           1000209 non-null  object
 2   rating          1000209 non-null  int64 
 3   unix_timestamp  1000209 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 38.2+ MB


In [7]:
# Определим функцию, которая будет получать n оценок для каждого пользователя:
def get_last_n_ratings_by_user(
    df, n, min_ratings_per_user=1, user_colname="user_id", timestamp_colname="unix_timestamp"
):
    return (
        df.groupby(user_colname)
        .filter(lambda x: len(x) >= min_ratings_per_user)
        .sort_values(timestamp_colname)
        .groupby(user_colname)
        .tail(n)
        .sort_values(user_colname)
    )

In [8]:
get_last_n_ratings_by_user(ratings_df, 1)

Unnamed: 0,user_id,title,rating,unix_timestamp
28501,1,Pocahontas (1995),5,978824351
482398,10,Hero (1992),5,980638688
800008,100,Apocalypse Now (1979),2,977594963
496041,1000,"Streetcar Named Desire, A (1951)",5,975042421
305563,1001,Austin Powers: The Spy Who Shagged Me (1999),2,1028605534
...,...,...,...,...
767773,995,French Kiss (1995),3,975099776
573889,996,Almost Famous (2000),5,1001227064
76463,997,Gladiator (2000),4,978915132
998801,998,See the Sea (Regarde la mer) (1997),5,975192573


In [9]:
# функция, которая будет помечать n оценок на пользователя так, чтобы они 
# попадали бы в проверочную выборку. Делается это с использованием столбца is_valid:
def mark_last_n_ratings_as_validation_set(
    df, n, min_ratings=1, user_colname="user_id", timestamp_colname="unix_timestamp"
):
    """
    Отмечает n последних по времени оценок, что включает их в проверочную выборку.
    Делается это путём добавления дополнительного столбца 'is_valid' в df.
    :param df: объект DataFrame, содержащий оценки, данные пользователем
    :param n: количество оценок, которые надо включить в проверочную выборку
    :param min_ratings: включать лишь пользователей, имеющих более этого количества оценок
    :param user_id_colname: имя столбца, содержащего идентификатор пользователя
    :param timestamp_colname: имя столбца, содержащего отметку времени
    :return: тот же df, в который добавлен дополнительный столбец 'is_valid'
    """
    df["is_valid"] = False
    df.loc[
        get_last_n_ratings_by_user(
            df,
            n,
            min_ratings,
            user_colname=user_colname,
            timestamp_colname=timestamp_colname,
        ).index,
        "is_valid",
    ] = True
    return df

In [10]:
mark_last_n_ratings_as_validation_set(ratings_df, 1)

Unnamed: 0,user_id,title,rating,unix_timestamp,is_valid
0,1,One Flew Over the Cuckoo's Nest (1975),5,978300760,False
1,2,One Flew Over the Cuckoo's Nest (1975),5,978298413,False
2,12,One Flew Over the Cuckoo's Nest (1975),4,978220179,False
3,15,One Flew Over the Cuckoo's Nest (1975),4,978199279,False
4,17,One Flew Over the Cuckoo's Nest (1975),5,978158471,False
...,...,...,...,...,...
1000204,5949,Modulations (1998),5,958846401,False
1000205,5675,Broken Vessels (1998),3,976029116,False
1000206,5780,White Boys (1999),1,958153068,False
1000207,5851,One Little Indian (1973),5,957756608,False


In [11]:
train_df = ratings_df[ratings_df.is_valid==False]
valid_df = ratings_df[ratings_df.is_valid==True]

In [12]:
median_rating = train_df.rating.median(); median_rating

4.0

In [13]:
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error

predictions = np.array([median_rating]* len(valid_df))

mae = mean_absolute_error(valid_df.rating, predictions)
mse = mean_squared_error(valid_df.rating, predictions)
rmse = math.sqrt(mse)

print(f'mae: {mae}')
print(f'mse: {mse}')
print(f'rmse: {rmse}')

mae: 0.91158940397351
mse: 1.5304635761589405
rmse: 1.2371190630488806


In [14]:
# Кодируем юзеров и фильмы
user_lookup = {v: i+1 for i, v in enumerate(ratings_df['user_id'].unique())}
movie_lookup = {v: i+1 for i, v in enumerate(ratings_df['title'].unique())}

In [15]:
class UserItemRatingDataset(Dataset):
    def __init__(self, df, movie_lookup, user_lookup):
        self.df = df
        self.movie_lookup = movie_lookup
        self.user_lookup = user_lookup

    def __getitem__(self, index):
        row = self.df.iloc[index]
        user_id = self.user_lookup[row.user_id]
        movie_id = self.movie_lookup[row.title]
        
        rating = torch.tensor(row.rating, dtype=torch.float32)
        
        return (user_id, movie_id), rating

    def __len__(self):
        return len(self.df)

In [16]:
train_dataset = UserItemRatingDataset(train_df, movie_lookup, user_lookup)
valid_dataset = UserItemRatingDataset(valid_df, movie_lookup, user_lookup)

In [17]:
import torch
from torch import nn

# Определим модель
class MfDotBias(nn.Module):

    def __init__(
        self, n_factors, n_users, n_items, ratings_range=None, use_biases=True
    ):
        super().__init__()
        self.bias = use_biases
        self.y_range = ratings_range
        self.user_embedding = nn.Embedding(n_users+1, n_factors, padding_idx=0)
        self.item_embedding = nn.Embedding(n_items+1, n_factors, padding_idx=0)

        if use_biases:
            self.user_bias = nn.Embedding(n_users+1, 1, padding_idx=0)
            self.item_bias = nn.Embedding(n_items+1, 1, padding_idx=0)

    def forward(self, inputs):
        users, items = inputs
        dot = self.user_embedding(users) * self.item_embedding(items)
        result = dot.sum(1)
        if self.bias:
            result = (
                result + self.user_bias(users).squeeze() + self.item_bias(items).squeeze()
            )

        if self.y_range is None:
            return result
        else:
            return (
                torch.sigmoid(result) * (self.y_range[1] - self.y_range[0])
                + self.y_range[0]
            )

In [18]:
from functools import partial
!pip install pytorch_accelerated
!pip install torchmetrics
from pytorch_accelerated import Trainer, notebook_launcher 
from pytorch_accelerated.trainer import TrainerPlaceholderValues, DEFAULT_CALLBACKS
from pytorch_accelerated.callbacks import EarlyStoppingCallback, SaveBestModelCallback, TrainerCallback, StopTrainingError
import torchmetrics

Collecting pytorch_accelerated
  Downloading pytorch_accelerated-0.1.22-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 622 kB/s 
Collecting accelerate~=0.5.1
  Downloading accelerate-0.5.1-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 5.4 MB/s 
Installing collected packages: accelerate, pytorch-accelerated
Successfully installed accelerate-0.5.1 pytorch-accelerated-0.1.22
Collecting torchmetrics
  Downloading torchmetrics-0.7.2-py3-none-any.whl (397 kB)
[K     |████████████████████████████████| 397 kB 4.3 MB/s 
Collecting pyDeprecate==0.3.*
  Downloading pyDeprecate-0.3.2-py3-none-any.whl (10 kB)
Installing collected packages: pyDeprecate, torchmetrics
Successfully installed pyDeprecate-0.3.2 torchmetrics-0.7.2


In [19]:
class RecommenderMetricsCallback(TrainerCallback):
    def __init__(self):
        self.metrics = torchmetrics.MetricCollection(
            {
                "mse": torchmetrics.MeanSquaredError(),
                "mae": torchmetrics.MeanAbsoluteError(),
            }
        )

    def _move_to_device(self, trainer):
        self.metrics.to(trainer.device)

    def on_training_run_start(self, trainer, **kwargs):
        self._move_to_device(trainer)

    def on_evaluation_run_start(self, trainer, **kwargs):
        self._move_to_device(trainer)

    def on_eval_step_end(self, trainer, batch, batch_output, **kwargs):
        preds = batch_output["model_outputs"]
        self.metrics.update(preds, batch[1])

    def on_eval_epoch_end(self, trainer, **kwargs):
        metrics = self.metrics.compute()
        
        mse = metrics["mse"].cpu()
        trainer.run_history.update_metric("mae", metrics["mae"].cpu())
        trainer.run_history.update_metric("mse", mse)
        trainer.run_history.update_metric("rmse",  math.sqrt(mse))

        self.metrics.reset()

In [20]:
def train_mf_model():
    model = MfDotBias(
        120, len(user_lookup), len(movie_lookup), ratings_range=[0.5, 5.5]
    )
    loss_func = torch.nn.MSELoss()

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)

    create_sched_fn = partial(
        torch.optim.lr_scheduler.OneCycleLR,
        max_lr=0.01,
        epochs=TrainerPlaceholderValues.NUM_EPOCHS,
        steps_per_epoch=TrainerPlaceholderValues.NUM_UPDATE_STEPS_PER_EPOCH,
    )

    trainer = Trainer(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        callbacks=(
            RecommenderMetricsCallback,
            *DEFAULT_CALLBACKS,
            SaveBestModelCallback(watch_metric="mae"),
            EarlyStoppingCallback(
                early_stopping_patience=2,
                early_stopping_threshold=0.001,
                watch_metric="mae",
            ),
        ),
    )

    trainer.train(
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        num_epochs=30,
        per_device_batch_size=512,
        create_scheduler_fn=create_sched_fn,
    )

In [100]:
notebook_launcher(train_mf_model, num_processes=2)

Launching training on CPU.


  0%|          | 0/1942 [03:48<?, ?it/s]



Starting training run

Starting epoch 1


 23%|██▎       | 448/1942 [00:50<02:03, 12.14it/s]

KeyboardInterrupt: ignored

In [45]:
ratings_df.head()

Unnamed: 0,user_id,title,rating,unix_timestamp,is_valid
0,1,One Flew Over the Cuckoo's Nest (1975),5,978300760,False
1,2,One Flew Over the Cuckoo's Nest (1975),5,978298413,False
2,12,One Flew Over the Cuckoo's Nest (1975),4,978220179,False
3,15,One Flew Over the Cuckoo's Nest (1975),4,978199279,False
4,17,One Flew Over the Cuckoo's Nest (1975),5,978158471,False


In [None]:
# Использование трансформеров для выдачи прогнозов, основанных на последовательностях оценок:

In [21]:
grouped_ratings = ratings_df.sort_values(by='unix_timestamp').groupby('user_id').agg(tuple).reset_index()
grouped_ratings['num_ratings'] = grouped_ratings['rating'].apply(lambda row: len(row))

In [102]:
grouped_ratings[:3]

Unnamed: 0,user_id,title,rating,unix_timestamp,is_valid,num_ratings
0,1,"(Girl, Interrupted (1999), Cinderella (1950), ...","(4, 5, 4, 5, 3, 5, 4, 4, 5, 4, 5, 3, 4, 4, 4, ...","(978300019, 978300055, 978300055, 978300055, 9...","(False, False, False, False, False, False, Fal...",53
1,10,"(Godfather, The (1972), Pretty Woman (1990), S...","(3, 4, 3, 4, 4, 3, 5, 5, 5, 3, 3, 4, 5, 4, 4, ...","(978224375, 978224375, 978224375, 978224400, 9...","(False, False, False, False, False, False, Fal...",401
2,100,"(Starship Troopers (1997), Star Wars: Episode ...","(3, 4, 4, 3, 4, 3, 1, 1, 5, 4, 4, 3, 4, 2, 3, ...","(977593595, 977593595, 977593607, 977593624, 9...","(False, False, False, False, False, False, Fal...",76


In [103]:
# sequence_length = 10

In [22]:
def create_sequences(values, sequence_length=10):
    sequences = []
    for i, v in enumerate(values):
        seq = values[:i+1]
        if len(seq) > sequence_length:
            seq = seq[i-sequence_length+1:i+1]
        elif len(seq) < sequence_length:
            seq =(*(['[PAD]'] * (sequence_length - len(seq))), *seq)
       
        sequences.append(seq)
    return sequences

In [23]:
# Применение ко всему df
grouped_cols = ['title', 'rating', 'unix_timestamp', 'is_valid'] 
for col in grouped_cols:
    grouped_ratings[col] = grouped_ratings[col].apply(lambda x: create_sequences(x)) #sequence_length

In [24]:
# Сейчас у нас имеется одна строка, содержащая все последовательности для конкретного пользователя. 
# надо преобразовать данные таким образом, чтобы каждая последовательность располагалась бы в 
# собственной строке, но, в то же время, сохраняла бы связь с ID пользователя
exploded_ratings = grouped_ratings[['user_id', 'title']].explode('title', ignore_index=True)
dfs = [grouped_ratings[[col]].explode(col, ignore_index=True) for col in grouped_cols[1:]]
seq_df = pd.concat([exploded_ratings, *dfs], axis=1)

In [23]:
seq_df

Unnamed: 0,user_id,title,rating,unix_timestamp,is_valid
0,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA..."
1,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA..."
2,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA..."
3,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], Gir...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 4, ...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 978...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], Fal..."
4,1,"([PAD], [PAD], [PAD], [PAD], [PAD], Girl, Inte...","([PAD], [PAD], [PAD], [PAD], [PAD], 4, 5, 4, 5...","([PAD], [PAD], [PAD], [PAD], [PAD], 978300019,...","([PAD], [PAD], [PAD], [PAD], [PAD], False, Fal..."
...,...,...,...,...,...
1000204,999,"(General's Daughter, The (1999), Powder (1995)...","(3, 3, 2, 1, 3, 2, 3, 2, 4, 3)","(975364681, 975364717, 975364717, 975364717, 9...","(False, False, False, False, False, False, Fal..."
1000205,999,"(Powder (1995), We're No Angels (1989), Out of...","(3, 2, 1, 3, 2, 3, 2, 4, 3, 3)","(975364717, 975364717, 975364717, 975364743, 9...","(False, False, False, False, False, False, Fal..."
1000206,999,"(We're No Angels (1989), Out of Africa (1985),...","(2, 1, 3, 2, 3, 2, 4, 3, 3, 3)","(975364717, 975364717, 975364743, 975364743, 9...","(False, False, False, False, False, False, Fal..."
1000207,999,"(Out of Africa (1985), Instinct (1999), Corrup...","(1, 3, 2, 3, 2, 4, 3, 3, 3, 2)","(975364717, 975364743, 975364743, 975364784, 9...","(False, False, False, False, False, False, Fal..."


In [25]:
# Для "is_valid" оставляем только последнее значение, для отбора примеров для прогноза
def get_last_entry(sequence):
    return sequence[-1]

seq_df['is_valid'] = seq_df['is_valid'].apply(get_last_entry)

In [26]:
#чтобы облегчить доступ к оценке, которую мы пытаемся спрогнозировать, выделим её в отдельный столбец:
seq_df['target_rating'] = seq_df['rating'].apply(get_last_entry)
seq_df['previous_ratings'] = seq_df['rating'].apply(lambda seq: seq[:-1])
seq_df.drop(columns=['rating'], inplace=True)

In [27]:
# Чтобы модель, при вычислении показателей внутреннего внимания, не использовала бы вспомогательные токены,
# мы можем предоставить трансформеру маску для механизма внутреннего внимания. Маска должна содержать значение 
# True для вспомогательного токена, а для других данных — False. Построим маску для каждой из строк, а так же 
# создадим столбец, содержащий сведения о количестве вспомогательных токенов:

seq_df['pad_mask'] = seq_df['title'].apply(lambda x: (np.array(x) == '[PAD]'))
seq_df['num_pads'] = seq_df['pad_mask'].apply(sum)
seq_df['pad_mask'] = seq_df['pad_mask'].apply(lambda x: x.tolist())

In [27]:
seq_df[:5]

Unnamed: 0,user_id,title,unix_timestamp,is_valid,target_rating,previous_ratings,pad_mask,num_pads
0,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",False,4,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","[True, True, True, True, True, True, True, Tru...",9
1,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",False,5,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","[True, True, True, True, True, True, True, Tru...",8
2,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",False,4,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","[True, True, True, True, True, True, True, Fal...",7
3,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], Gir...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 978...",False,5,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 4, ...","[True, True, True, True, True, True, False, Fa...",6
4,1,"([PAD], [PAD], [PAD], [PAD], [PAD], Girl, Inte...","([PAD], [PAD], [PAD], [PAD], [PAD], 978300019,...",False,3,"([PAD], [PAD], [PAD], [PAD], [PAD], 4, 5, 4, 5)","[True, True, True, True, True, False, False, F...",5


In [28]:
train_seq_df = seq_df[seq_df.is_valid == False]
valid_seq_df = seq_df[seq_df.is_valid == True]

In [29]:
#прежде чем мы сможем передать эти данные модели, нужно создать поисковую таблицу для кодирования сведений о 
# пользователях и фильмах. Но в этот раз, при создании поисковой таблицы фильмов, нам нужно учесть наличие в 
# данных вспомогательного токена:
user_lookup = {v: i+1 for i, v in enumerate(ratings_df['user_id'].unique())}
def create_feature_lookup(df, feature):
    lookup = {v: i+1 for i, v in enumerate(df[feature].unique())}
    lookup['[PAD]'] = 0
    return lookup
  
movie_lookup = create_feature_lookup(ratings_df, 'title')

In [30]:
seq_df[:1]

Unnamed: 0,user_id,title,unix_timestamp,is_valid,target_rating,previous_ratings,pad_mask,num_pads
0,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",False,4,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","[True, True, True, True, True, True, True, Tru...",9


In [31]:
# Теперь мы работаем с последовательностями оценок, а не с отдельными оценками. 
# Поэтому нужно создать новый класс Dataset, содержащий обработанный DataFrame:
class MovieSequenceDataset(Dataset):
    def __init__(self, df, movie_lookup, user_lookup):
        super().__init__()
        self.df = df
        self.movie_lookup = movie_lookup
        self.user_lookup = user_lookup

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        data = self.df.iloc[index]
        user_id = self.user_lookup[str(data.user_id)]
        movie_ids = torch.tensor([self.movie_lookup[title] for title in data.title])

        previous_ratings = torch.tensor(
            [rating if rating != "[PAD]" else 0 for rating in data.previous_ratings]
        )

        attention_mask = torch.tensor(data.pad_mask)
        target_rating = data.target_rating
        encoded_features = {
            "user_id": user_id,
            "movie_ids": movie_ids,
            "ratings": previous_ratings,
        }

        return (encoded_features, attention_mask), torch.tensor(
            target_rating, dtype=torch.float32
        )

In [32]:
train_dataset = MovieSequenceDataset(train_seq_df, movie_lookup, user_lookup)
valid_dataset = MovieSequenceDataset(valid_seq_df, movie_lookup, user_lookup)

In [33]:
features
# position_embeddings = nn.Embedding(sequence_length, 120)
# positions = torch.arange( 0, 10, 1, dtype=int, ) #device=features["movie_ids"].device)        )     
# position_embeddings(positions)[1]

NameError: ignored

In [33]:
# Опишем модель, основанную на трансформере. Для начала, учитывая то, что модель матричной факторизации способна 
# достичь хороших результатов, используя лишь идентификаторы пользователя и фильма, пока включим сюда лишь эти данные:
class BstTransformer(nn.Module):
    def __init__(
        self,
        movies_num_unique,
        users_num_unique,
        sequence_length=10,
        embedding_size=120,
        num_transformer_layers=1,
        ratings_range=(0.5, 5.5),
    ):
        super().__init__()
        self.sequence_length = sequence_length
        self.embedding_size = embedding_size
        self.y_range = ratings_range
        self.movies_embeddings = nn.Embedding(
            movies_num_unique + 1, embedding_size, padding_idx=0
        )
        self.user_embeddings = nn.Embedding(users_num_unique + 1, embedding_size)
        self.position_embeddings = nn.Embedding(sequence_length, embedding_size)
        self.encoder = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=self.embedding_size,
                nhead=12,
                dropout=0.1,
                batch_first=True,
                activation="gelu",
            ),
            num_layers=num_transformer_layers,
        )
        self.linear = nn.Sequential(
            nn.Linear(
                embedding_size + (embedding_size * sequence_length),
                1024,
            ),
            nn.BatchNorm1d(1024),
            nn.Mish(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Mish(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.Mish(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )
    def forward(self, inputs):
        features, mask = inputs
        encoded_user_id = self.user_embeddings(features["user_id"])
        user_features = encoded_user_id
        encoded_movies = self.movies_embeddings(features["movie_ids"])
        positions = torch.arange(
            0, self.sequence_length, 1, dtype=int, device=features["movie_ids"].device
        )
        positions = self.position_embeddings(positions)
        transformer_features = encoded_movies + positions
        transformer_output = self.encoder(
            transformer_features, src_key_padding_mask=mask
        )
        transformer_output = torch.flatten(transformer_output, start_dim=1)
        combined_output = torch.cat((transformer_output, user_features), dim=1)
        rating = self.linear(combined_output)
        rating = rating.squeeze()
        if self.y_range is None:
            return rating
        else:
            return rating * (self.y_range[1] - self.y_range[0]) + self.y_range[0]

In [36]:
def train_seq_model():
    model = BstTransformer(
        len(movie_lookup), len(user_lookup) ) #, sequence_length, embedding_size=120
        
    loss_func = torch.nn.MSELoss()

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)

    create_sched_fn = partial(
        torch.optim.lr_scheduler.OneCycleLR,
        max_lr=0.01,
        epochs=TrainerPlaceholderValues.NUM_EPOCHS,
        steps_per_epoch=TrainerPlaceholderValues.NUM_UPDATE_STEPS_PER_EPOCH,
    )

    trainer = Trainer(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        callbacks=(
            RecommenderMetricsCallback,
            *DEFAULT_CALLBACKS,
            SaveBestModelCallback(watch_metric="mae"),
            EarlyStoppingCallback(
                early_stopping_patience=2,
                early_stopping_threshold=0.001,
                watch_metric="mae",
            ),
        ),
    )

    trainer.train(
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        num_epochs=10,
        per_device_batch_size=512,
        create_scheduler_fn=create_sched_fn,
    )

In [4]:
# notebook_launcher(train_seq_model, num_processes=2)

In [1]:
# Добавляем последовательность оценок фильмов пользователями

In [41]:
class BstTransformer(nn.Module):
    def __init__(
        self,
        movies_num_unique,
        users_num_unique,
        sequence_length=10,
        embedding_size=120,
        num_transformer_layers=1,
        ratings_range=(0.5, 5.5),
    ):
        super().__init__()
        self.sequence_length = sequence_length
        self.y_range = ratings_range
        self.movies_embeddings = nn.Embedding(
            movies_num_unique + 1, embedding_size, padding_idx=0
        )
        self.user_embeddings = nn.Embedding(users_num_unique + 1, embedding_size)
        self.ratings_embeddings = nn.Embedding(6, embedding_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(sequence_length, embedding_size)
        self.encoder = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=embedding_size,
                nhead=12,
                dropout=0.1,
                batch_first=True,
                activation="gelu",
            ),
            num_layers=num_transformer_layers,
        )
        self.linear = nn.Sequential(
            nn.Linear(
                embedding_size + (embedding_size * sequence_length),
                1024,
            ),
            nn.BatchNorm1d(1024),
            nn.Mish(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Mish(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.Mish(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )
    def forward(self, inputs):
        features, mask = inputs
        encoded_user_id = self.user_embeddings(features["user_id"])
        user_features = encoded_user_id
        movie_history = features["movie_ids"][:, :-1]
        target_movie = features["movie_ids"][:, -1]
        ratings = self.ratings_embeddings(features["ratings"])
        encoded_movies = self.movies_embeddings(movie_history)
        encoded_target_movie = self.movies_embeddings(target_movie)
        positions = torch.arange(
            0,
            self.sequence_length - 1,
            1,
            dtype=int,
            device=features["movie_ids"].device,
        )
        positions = self.position_embeddings(positions)
        encoded_sequence_movies_with_position_and_rating = (
            encoded_movies + ratings + positions
        )
        encoded_target_movie = encoded_target_movie.unsqueeze(1)
        transformer_features = torch.cat(
            (encoded_sequence_movies_with_position_and_rating, encoded_target_movie),
            dim=1,
        )
        transformer_output = self.encoder(
            transformer_features, src_key_padding_mask=mask
        )
        transformer_output = torch.flatten(transformer_output, start_dim=1)
        combined_output = torch.cat((transformer_output, user_features), dim=1)
        rating = self.linear(combined_output)
        rating = rating.squeeze()
        if self.y_range is None:
            return rating
        else:
            return rating * (self.y_range[1] - self.y_range[0]) + self.y_range[0]

In [1]:
notebook_launcher(train_seq_model, num_processes=2)

In [94]:
# Добавление признаков пользователей

In [95]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
users['sex_encoded'] = le.fit_transform(users.sex)
users['age_group_encoded'] = le.fit_transform(users.age_group)
users["user_id"] = users["user_id"].astype(str)


In [98]:
seq_with_user_features = pd.merge(seq_df, users)

In [99]:
train_df = seq_with_user_features[seq_with_user_features.is_valid == False]
valid_df = seq_with_user_features[seq_with_user_features.is_valid == True]

In [100]:
# ДОбавление новых признаков в датасет
class MovieSequenceDataset(Dataset):
    def __init__(self, df, movie_lookup, user_lookup):
        super().__init__()
        self.df = df
        self.movie_lookup = movie_lookup
        self.user_lookup = user_lookup

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        data = self.df.iloc[index]
        user_id = self.user_lookup[str(data.user_id)]
        movie_ids = torch.tensor([self.movie_lookup[title] for title in data.title])

        previous_ratings = torch.tensor(
            [rating if rating != "[PAD]" else 0 for rating in data.previous_ratings]
        )

        attention_mask = torch.tensor(data.pad_mask)
        target_rating = data.target_rating
        encoded_features = {
            "user_id": user_id,
            "movie_ids": movie_ids,
            "ratings": previous_ratings,
            "age_group": data["age_group_encoded"],
            "sex": data["sex_encoded"],
            "occupation": data["occupation"],
        }

        return (encoded_features, attention_mask), torch.tensor(
            target_rating, dtype=torch.float32
        )

In [101]:
train_dataset = MovieSequenceDataset(train_df, movie_lookup, user_lookup)
valid_dataset = MovieSequenceDataset(valid_df, movie_lookup, user_lookup)

In [1]:
# for features, mask in train_dataset:
#   features , mask

In [111]:
features[0]

{'age_group': 4,
 'movie_ids': tensor([ 295,  896,  320,  892,  201,  677,  531, 2004,  418,  415]),
 'occupation': 17,
 'ratings': tensor([2, 4, 3, 4, 4, 4, 4, 3, 3]),
 'sex': 1,
 'user_id': 2094}

In [2]:
class BstTransformer(nn.Module):
    def __init__(
        self,
        movies_num_unique,
        users_num_unique,
        sequence_length=10,
        embedding_size=120,
        num_transformer_layers=1,
        ratings_range=(0.5, 5.5),
    ):
        super().__init__()
        self.sequence_length = sequence_length
        self.y_range = ratings_range
        self.movies_embeddings = nn.Embedding(
            movies_num_unique + 1, embedding_size, padding_idx=0
        )
        self.user_embeddings = nn.Embedding(users_num_unique + 1, embedding_size)
        self.ratings_embeddings = nn.Embedding(6, embedding_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(sequence_length, embedding_size)

        self.sex_embeddings = nn.Embedding(
            3,
            2,
        )
        self.occupation_embeddings = nn.Embedding(
            22,
            11,
        )
        self.age_group_embeddings = nn.Embedding(
            8,
            4,
        )

        self.encoder = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=embedding_size,
                nhead=12,
                dropout=0.1,
                batch_first=True,
                activation="gelu",
            ),
            num_layers=num_transformer_layers,
        )

        self.linear = nn.Sequential(
            nn.Linear(
                embedding_size + (embedding_size * sequence_length) + 4 + 11 + 2,
                1024,
            ),
            nn.BatchNorm1d(1024),
            nn.Mish(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Mish(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.Mish(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )

    def forward(self, inputs):
        features, mask = inputs

        user_id = self.user_embeddings(features["user_id"])

        age_group = self.age_group_embeddings(features["age_group"])
        sex = self.sex_embeddings(features["sex"])
        occupation = self.occupation_embeddings(features["occupation"])

        user_features = user_features = torch.cat(
            (user_id, sex, age_group, occupation), 1
        )

        movie_history = features["movie_ids"][:, :-1]
        target_movie = features["movie_ids"][:, -1]

        ratings = self.ratings_embeddings(features["ratings"])

        encoded_movies = self.movies_embeddings(movie_history)
        encoded_target_movie = self.movies_embeddings(target_movie)

        positions = torch.arange(
            0,
            self.sequence_length - 1,
            1,
            dtype=int,
            device=features["movie_ids"].device,
        )
        positions = self.position_embeddings(positions)

        encoded_sequence_movies_with_position_and_rating = (
            encoded_movies + ratings + positions
        )
        encoded_target_movie = encoded_target_movie.unsqueeze(1)

        transformer_features = torch.cat(
            (encoded_sequence_movies_with_position_and_rating, encoded_target_movie),
            dim=1,
        )
        transformer_output = self.encoder(
            transformer_features, src_key_padding_mask=mask
        )
        transformer_output = torch.flatten(transformer_output, start_dim=1)

        combined_output = torch.cat((transformer_output, user_features), dim=1)

        rating = self.linear(combined_output)
        rating = rating.squeeze()
        if self.y_range is None:
            return rating
        else:
            return rating * (self.y_range[1] - self.y_range[0]) + self.y_range[0]

NameError: ignored

In [None]:
notebook_launcher(train_seq_model, num_processes=2)