In [1]:
import pandas as pd
import numpy as np
import warnings
import gc 


pd.set_option('display.max_columns', 50)
warnings.filterwarnings('ignore')

In [2]:
# открываем csv-шники
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

display(movies_df.head(2), ratings_df.head(2))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817


In [3]:
# реиндекс фильмов
movies_ids_dict = {x:i for i, x in enumerate(movies_df['movieId'].unique())}

movies_df['movieId'] = movies_df['movieId'].apply(lambda x: movies_ids_dict[x])
movies_df.head()

Unnamed: 0,movieId,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,Jumanji (1995),Adventure|Children|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df['movieId'] = ratings_df['movieId'].apply(lambda x: movies_ids_dict[x])

In [5]:
# реиндекс пользователей

# реиндекс фильмов
users_ids_dict = {x:i for i, x in enumerate(ratings_df['userId'].unique())}

ratings_df['userId'] = ratings_df['userId'].apply(lambda x: users_ids_dict[x])


In [6]:
# проверка на NaN и дубликаты
movies_df.isna().sum().sum(), ratings_df.isna().sum().sum(), movies_df.duplicated().sum().sum(), ratings_df.duplicated().sum().sum()

(0, 0, 0, 0)

In [7]:
# конвертирую timestamp в datetime и получаю временные признаки
ratings_df['datetime'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
ratings_df = ratings_df.drop(columns=['timestamp'])

ratings_df['year'] = ratings_df['datetime'].dt.year.astype(np.int32)
ratings_df['month'] = ratings_df['datetime'].dt.month.astype(np.int8)
ratings_df['day'] = ratings_df['datetime'].dt.day.astype(np.int16)
ratings_df['hour'] = ratings_df['datetime'].dt.hour.astype(np.int8)
ratings_df['minute'] = ratings_df['datetime'].dt.minute.astype(np.int8)
ratings_df['second'] = ratings_df['datetime'].dt.second.astype(np.int8)
ratings_df['day_of_week'] = ratings_df['datetime'].dt.dayofweek.astype(np.int8)  # Monday=0, Sunday=6
ratings_df['is_weekend'] = (ratings_df['datetime'].dt.dayofweek > 4).astype(np.int8)

ratings_df = ratings_df.drop(columns=['datetime'])

ratings_df.head()

Unnamed: 0,userId,movieId,rating,year,month,day,hour,minute,second,day_of_week,is_weekend
0,0,292,5.0,2006,5,17,15,34,4,2,0
1,0,302,3.5,2006,5,17,12,26,57,2,0
2,0,303,5.0,2006,5,17,12,27,8,2,0
3,0,654,5.0,2006,5,17,15,13,40,2,0
4,0,878,3.5,2006,5,17,12,21,50,2,0


In [8]:
def convert_genres(s):
    s = s.split('|')
    return s

movies_df['genres'] = movies_df['genres'].apply(convert_genres)
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,2,Grumpier Old Men (1995),"[Comedy, Romance]"


In [9]:
all_genres = list(map(np.array, np.unique(np.array(movies_df['genres']))))
p = []
for x in all_genres:
    for y in x:
        p.append(y)

all_genres = np.unique(p)

del p
gc.collect()

all_genres

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype='<U18')

In [10]:
# векторизация жанров
ndim = len(all_genres)
genres_dict = {
    c: i for i, c in enumerate(all_genres)
}

movies_df['genres'] = movies_df['genres'].apply(lambda o: [genres_dict[x] for x in o])

# векторизация
def vectorize_genres(lst):
    vector = np.zeros(ndim)
    for x in lst:
        vector[x] = 1
    
    return vector

movies_df['genres'] = movies_df['genres'].apply(vectorize_genres)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,0,Toy Story (1995),"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
1,1,Jumanji (1995),"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,Grumpier Old Men (1995),"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,3,Waiting to Exhale (1995),"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,4,Father of the Bride Part II (1995),"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [11]:
for i in range(ndim):
    movies_df[f'genre_{i}'] = movies_df['genres'].apply(lambda x: x[i]).astype(np.int8)

movies_df = movies_df.drop(columns=['genres'])
movies_df.head(3)

Unnamed: 0,movieId,title,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,0,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [12]:
# мерджу рейтинги и фильмы
meta_df = ratings_df.merge(movies_df.drop(columns='title'), how='left', on='movieId')
meta_df.head()

Unnamed: 0,userId,movieId,rating,year,month,day,hour,minute,second,day_of_week,is_weekend,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,0,292,5.0,2006,5,17,15,34,4,2,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
1,0,302,3.5,2006,5,17,12,26,57,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,303,5.0,2006,5,17,12,27,8,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,654,5.0,2006,5,17,15,13,40,2,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,0,878,3.5,2006,5,17,12,21,50,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0


In [13]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 31 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   year         int32  
 4   month        int8   
 5   day          int16  
 6   hour         int8   
 7   minute       int8   
 8   second       int8   
 9   day_of_week  int8   
 10  is_weekend   int8   
 11  genre_0      int8   
 12  genre_1      int8   
 13  genre_2      int8   
 14  genre_3      int8   
 15  genre_4      int8   
 16  genre_5      int8   
 17  genre_6      int8   
 18  genre_7      int8   
 19  genre_8      int8   
 20  genre_9      int8   
 21  genre_10     int8   
 22  genre_11     int8   
 23  genre_12     int8   
 24  genre_13     int8   
 25  genre_14     int8   
 26  genre_15     int8   
 27  genre_16     int8   
 28  genre_17     int8   
 29  genre_18     int8   
 30  genre_19     int8   
dtypes: float64(1), int16(1

## Моделировние

### Подготовка датасета

In [14]:
gc.collect()

16

In [15]:
meta_df.head(2)

Unnamed: 0,userId,movieId,rating,year,month,day,hour,minute,second,day_of_week,is_weekend,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,0,292,5.0,2006,5,17,15,34,4,2,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
1,0,302,3.5,2006,5,17,12,26,57,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(meta_df)

In [17]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

user_cols = ['year', 'month', 'hour', 'second', 'day_of_week', 'is_weekend']
movie_cols = ['genre_'+str(i) for i in range(20)]

class MovieRatingsDataset(Dataset):
    def __init__(self, df, is_train=True):
        self.users = torch.tensor(df['userId'].values).to(device)
        self.movies = torch.tensor(df['movieId'].values).to(device)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32).to(device)
        self.user_features = torch.tensor(df[user_cols].values, dtype=torch.float32).to(device)
        self.movie_features = torch.tensor(df[movie_cols].values, dtype=torch.float32).to(device)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return (
            self.users[idx],
            self.movies[idx],
            self.user_features[idx],
            self.movie_features[idx],
            self.ratings[idx]
        )

In [19]:
train_dataset = MovieRatingsDataset(train_df)
valid_dataset = MovieRatingsDataset(valid_df)

train_loader = DataLoader(train_dataset, batch_size=50000)
valid_loader = DataLoader(valid_dataset, batch_size=50000)

sample_batch = next(iter(train_dataset))

In [20]:
class RecommenderNet(nn.Module):
    def __init__(self, num_users, num_movies, user_feature_dim, movie_feature_dim, embedding_size):
        super(RecommenderNet, self).__init__()

        # Embeddings for user and movie IDs
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)

        # Additional layers for user and movie features
        self.user_feature_layer = nn.Linear(user_feature_dim, embedding_size)
        self.movie_feature_layer = nn.Linear(movie_feature_dim, embedding_size)

        # Calculate the total features after concatenation
        # Here, we assume the output size of feature layers is also 'embedding_size'
        total_feature_size = embedding_size * 4  # 2 embeddings + 2 feature layers

        # Example final layer - Adjust the input size here
        self.fc = nn.Linear(total_feature_size, 1)

    def forward(self, user_ids, movie_ids, user_features, movie_features):
        user_embedded = self.user_embedding(user_ids)
        movie_embedded = self.movie_embedding(movie_ids)

        user_feature_embedded = self.user_feature_layer(user_features)
        movie_feature_embedded = self.movie_feature_layer(movie_features)

        # Combine all features
        features = torch.cat([user_embedded, movie_embedded, user_feature_embedded, movie_feature_embedded], dim=1)

        # Pass through the final layer
        rating = self.fc(features)

        return rating.squeeze()

In [21]:
# конфиг
num_users = ratings_df['userId'].nunique()
num_movies = movies_df['movieId'].nunique()
user_feature_dim = len(user_cols)
movie_feature_dim = len(movie_cols)
embedding_size = 64

num_epochs = 20

In [22]:
import wandb

wandb.login()
wandb.init()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [23]:
from torch.optim.lr_scheduler import StepLR



def train_and_validate(model, train_loader, test_loader, epochs=5):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    best_metric = 2

    scheduler = StepLR(optimizer=optimizer, step_size=2, gamma=0.3)

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for user_ids, movie_ids, user_feats, movie_feats, ratings in train_loader:
            optimizer.zero_grad()
            predictions = model(user_ids, movie_ids, user_feats, movie_feats)
            loss = loss_fn(predictions, ratings)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        

        scheduler.step()

        # Validation phase
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for user_ids, movie_ids, user_feats, movie_feats, ratings in test_loader:
                predictions = model(user_ids, movie_ids, user_feats, movie_feats)
                loss = loss_fn(predictions, ratings)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(test_loader)

        if avg_val_loss < best_metric:
            torch.save(model.state_dict(), './best_model')
            best_metric = avg_val_loss

        wandb.log({'train loss': avg_train_loss})
        wandb.log({'val loss': avg_val_loss})

        print(f'Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')


In [24]:
# Initialize your model
model = RecommenderNet(num_users, num_movies, user_feature_dim, movie_feature_dim, embedding_size).to(device)

# Train and validate the model
train_and_validate(model, train_loader, valid_loader, epochs=num_epochs)

Epoch 1: Train Loss: 238.7271, Validation Loss: 1.7452
Epoch 2: Train Loss: 1.4996, Validation Loss: 1.2821
Epoch 3: Train Loss: 1.2250, Validation Loss: 1.1740
Epoch 4: Train Loss: 1.1258, Validation Loss: 1.0843
Epoch 5: Train Loss: 1.0695, Validation Loss: 1.0602
Epoch 6: Train Loss: 1.0455, Validation Loss: 1.0364
Epoch 7: Train Loss: 1.0298, Validation Loss: 1.0291
Epoch 8: Train Loss: 1.0222, Validation Loss: 1.0213
Epoch 9: Train Loss: 1.0169, Validation Loss: 1.0187
Epoch 10: Train Loss: 1.0142, Validation Loss: 1.0159
Epoch 11: Train Loss: 1.0123, Validation Loss: 1.0149
Epoch 12: Train Loss: 1.0113, Validation Loss: 1.0138
Epoch 13: Train Loss: 1.0106, Validation Loss: 1.0135
Epoch 14: Train Loss: 1.0102, Validation Loss: 1.0130
Epoch 15: Train Loss: 1.0099, Validation Loss: 1.0129
Epoch 16: Train Loss: 1.0097, Validation Loss: 1.0127
Epoch 17: Train Loss: 1.0096, Validation Loss: 1.0127
Epoch 18: Train Loss: 1.0095, Validation Loss: 1.0126
Epoch 19: Train Loss: 1.0095, Valid

# Задачи

## Основная задача

In [25]:
import torch

model = RecommenderNet(num_users, num_movies, user_feature_dim, movie_feature_dim, embedding_size)
model.load_state_dict(torch.load('./best_model'))
model.eval()  

RecommenderNet(
  (user_embedding): Embedding(162541, 64)
  (movie_embedding): Embedding(62423, 64)
  (user_feature_layer): Linear(in_features=6, out_features=64, bias=True)
  (movie_feature_layer): Linear(in_features=20, out_features=64, bias=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [26]:
some_movie_id = 42
user_id = 42

top_n = 10

orig_title = movies_df[movies_df['movieId'] == some_movie_id]['title']

some_movie_id = movies_ids_dict[some_movie_id]
user_id = users_ids_dict[user_id]

movie_id = some_movie_id

print(orig_title)

42    Restoration (1995)
Name: title, dtype: object


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_movies(movie_id, embeddings, top_n=10):
    # Получаем эмбеддинг интересующего фильма

    movie_embedding = torch.tensor(embeddings[movie_id]).unsqueeze(0)

    # Вычисляем косинусное сходство между этим фильмом и всеми остальными
    similarities = cosine_similarity(movie_embedding, embeddings)[0]

    # Находим индексы фильмов с наивысшим сходством
    most_similar = similarities.argsort()[-top_n:]

    #list(map(lambda x: reverse_movies_ids[x], most_similar))

    return most_similar 

# Предполагая, что модель уже обучена
movie_embeddings = model.movie_embedding.weight.data.cpu().numpy()
similar_movies = find_similar_movies(some_movie_id, movie_embeddings)

In [28]:
for idx in similar_movies:
    print(*movies_df[movies_df['movieId'] == idx]['title'])

Pennies from Heaven (1936)
The Pee-Wee Herman Show on Broadway (2011)
Dinosaur Island (2014)
Final Option, The (Who Dares Wins) (1982)
Kedi (2016)
Tough Kids (1983)
Sierra (1950)
Machines (2016)
Pistol Opera (Pisutoru opera) (2001)
Dead Presidents (1995)


### Доп. задание 1

In [29]:
def find_similar_movies(movie_id, embeddings, top_n=10):

    movie_embedding = torch.tensor(embeddings[movie_id]).unsqueeze(0)
    similarities = cosine_similarity(movie_embedding, embeddings)[0]

    # Получение индексов и значений для top_n наиболее похожих фильмов
    most_similar_indices = similarities.argsort()[-top_n:][::-1]

    most_similar_values = similarities[most_similar_indices]

    #most_similar_indices = list(map(lambda x: reverse_movies_ids[x], most_similar_indices))

    return list(zip(most_similar_indices, most_similar_values))

# Получение эмбеддингов
movie_embeddings = model.movie_embedding.weight.data.cpu().numpy()
similar_movies = find_similar_movies(some_movie_id, movie_embeddings, top_n=top_n)

In [30]:
for idx, similarity in similar_movies:
    movie_title = movies_df[movies_df['movieId'] == idx]['title']
    #print(f"{movie_title}: {similarity:.3f}")
    print(*movie_title, end=' ')
    print(similarity)

Dead Presidents (1995) 1.0000001
Pistol Opera (Pisutoru opera) (2001) 0.52448535
Machines (2016) 0.4961232
Sierra (1950) 0.49328798
Tough Kids (1983) 0.47015893
Kedi (2016) 0.46420833
Final Option, The (Who Dares Wins) (1982) 0.46415567
Dinosaur Island (2014) 0.46093518
The Pee-Wee Herman Show on Broadway (2011) 0.4591334
Pennies from Heaven (1936) 0.4566235


### Доп. задание 2

#### Получение Эмбеддингов Пользователя и Фильмов

In [31]:
user_embeddings = model.user_embedding.weight.data.cpu().numpy()
movie_embeddings = model.movie_embedding.weight.data.cpu().numpy()

In [32]:
def find_recommended_movies(user_id, user_embeddings, movie_embeddings, ratings_df, movies_df, top_n=10):
    # Получение эмбеддинга пользователя

    user_id = users_ids_dict[user_id]

    user_embedding = user_embeddings[user_id]

    # Вычисление сходства между эмбеддингом пользователя и эмбеддингами всех фильмов
    similarities = np.dot(movie_embeddings, user_embedding)

    # Получение индексов фильмов с наивысшим сходством
    top_movie_indices = similarities.argsort()[::-1]

    # Исключение фильмов, которые пользователь уже смотрел
    watched_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
    recommended_indices = [idx for idx in top_movie_indices if idx not in watched_movies][:top_n]

    # Получение названий фильмов
    recommended_movies = movies_df.loc[movies_df['movieId'].isin(recommended_indices)]
    return recommended_movies[['movieId', 'title']]


In [33]:
recommended_movies = find_recommended_movies(user_id, user_embeddings, movie_embeddings, ratings_df, movies_df, top_n=top_n)
display(recommended_movies)


Unnamed: 0,movieId,title
8794,8794,In Praise of Older Women (1978)
12727,12727,Tyler Perry's The Family That Preys (2008)
15968,15968,Up in Arms (1944)
17769,17769,When Love Is Not Enough: The Lois Wilson Story...
20687,20687,"Life and Adventures of Santa Claus, The (1985)"
29689,29689,Night Train to Terror (1985)
40512,40512,Gooby (2009)
40676,40676,Prey (1978)
45574,45574,Ithaca (2015)
52604,52604,WWE: Greatest Wrestling Stars of the '90s (2009)


### Доп. задание 3

#### Определение Функции Для Объяснения Рекомендаций

In [36]:
def explain_recommendations(user_id, recommended_movies, ratings_df, movies_df):
    explanations = []
    genre_columns = [col for col in movies_df.columns if col.startswith('genre_')]

    # Получаем фильмы, которые пользователь уже смотрел и высоко оценил
    user_ratings = ratings_df[ratings_df['userId'] == user_id]
    user_high_ratings = user_ratings[user_ratings['rating'] > 4]
    user_high_ratings = user_high_ratings.merge(movies_df, on='movieId')

    # Добавляем жанровые профили к рекомендованным фильмам
    recommended_with_genres = recommended_movies.merge(movies_df[genre_columns + ['movieId']], on='movieId')

    for _, recommended_movie in recommended_with_genres.iterrows():
        # Берем жанровый профиль рекомендованного фильма
        recommended_genre_profile = recommended_movie[genre_columns].values

        # Ищем схожие фильмы среди высоко оцененных
        user_high_ratings['similarity'] = user_high_ratings[genre_columns].apply(
            lambda row: np.dot(row.values, recommended_genre_profile), axis=1
        )
        similar_movies = user_high_ratings[user_high_ratings['similarity'] > 0]

        if not similar_movies.empty:
            highest_rated_movie = similar_movies.sort_values(by='similarity', ascending=False).iloc[0]
            explanation = f"Потому что вы посмотрели и оценили фильм '{highest_rated_movie['title']}' на {highest_rated_movie['rating']} баллов, который имеет схожие жанровые характеристики."
            explanations.append((recommended_movie['title'], explanation))

    return explanations



In [37]:
explanations = explain_recommendations(user_id, recommended_movies, ratings_df, movies_df)

for title, explanation in explanations:
    print(f"Рекомендация фильма '{title}': {explanation}")

Рекомендация фильма 'Night Train to Terror (1985)': Потому что вы посмотрели и оценили фильм 'Silence of the Lambs, The (1991)' на 5.0 баллов, который имеет схожие жанровые характеристики.
Рекомендация фильма 'Prey (1978)': Потому что вы посмотрели и оценили фильм 'Silence of the Lambs, The (1991)' на 5.0 баллов, который имеет схожие жанровые характеристики.
