In [45]:
import pandas as pd
import numpy as np
import warnings
import gc 


pd.set_option('display.max_columns', 50)
warnings.filterwarnings('ignore')

In [46]:
# открываем csv-шники
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

display(movies_df.head(2), ratings_df.head(2))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817


In [47]:
# проверка на NaN и дубликаты
movies_df.isna().sum().sum(), ratings_df.isna().sum().sum(), movies_df.duplicated().sum().sum(), ratings_df.duplicated().sum().sum()

(0, 0, 0, 0)

In [48]:
# конвертирую timestamp в datetime и получаю временные признаки
ratings_df['datetime'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
ratings_df = ratings_df.drop(columns=['timestamp'])

ratings_df['year'] = ratings_df['datetime'].dt.year.astype(np.int32)
ratings_df['month'] = ratings_df['datetime'].dt.month.astype(np.int8)
ratings_df['day'] = ratings_df['datetime'].dt.day.astype(np.int16)
ratings_df['hour'] = ratings_df['datetime'].dt.hour.astype(np.int8)
ratings_df['minute'] = ratings_df['datetime'].dt.minute.astype(np.int8)
ratings_df['second'] = ratings_df['datetime'].dt.second.astype(np.int8)
ratings_df['day_of_week'] = ratings_df['datetime'].dt.dayofweek.astype(np.int8)  # Monday=0, Sunday=6
ratings_df['is_weekend'] = (ratings_df['datetime'].dt.dayofweek > 4).astype(np.int8)

ratings_df = ratings_df.drop(columns=['datetime'])

ratings_df.head()

Unnamed: 0,userId,movieId,rating,year,month,day,hour,minute,second,day_of_week,is_weekend
0,1,296,5.0,2006,5,17,15,34,4,2,0
1,1,306,3.5,2006,5,17,12,26,57,2,0
2,1,307,5.0,2006,5,17,12,27,8,2,0
3,1,665,5.0,2006,5,17,15,13,40,2,0
4,1,899,3.5,2006,5,17,12,21,50,2,0


In [49]:
def convert_genres(s):
    s = s.split('|')
    return s

movies_df['genres'] = movies_df['genres'].apply(convert_genres)
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"


In [50]:
#movies_df['year'] = movies_df['title'].apply(lambda x: if int(x.split('(')[-1][:-2]))
movies_df['title'] = movies_df['title'].apply(lambda x: x.split('(')[0])

movies_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji,"[Adventure, Children, Fantasy]"


In [51]:
all_genres = list(map(np.array, np.unique(np.array(movies_df['genres']))))
p = []
for x in all_genres:
    for y in x:
        p.append(y)

all_genres = np.unique(p)

del p
gc.collect()

all_genres

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype='<U18')

In [52]:
# векторизация жанров
ndim = len(all_genres)
genres_dict = {
    c: i for i, c in enumerate(all_genres)
}

movies_df['genres'] = movies_df['genres'].apply(lambda o: [genres_dict[x] for x in o])

# векторизация
def vectorize_genres(lst):
    vector = np.zeros(ndim)
    for x in lst:
        vector[x] = 1
    
    return vector

movies_df['genres'] = movies_df['genres'].apply(vectorize_genres)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
1,2,Jumanji,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Grumpier Old Men,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,4,Waiting to Exhale,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,5,Father of the Bride Part II,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [53]:
for i in range(ndim):
    movies_df[f'genre_{i}'] = movies_df['genres'].apply(lambda x: x[i]).astype(np.int8)

movies_df = movies_df.drop(columns=['genres'])
movies_df.head(3)

Unnamed: 0,movieId,title,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,1,Toy Story,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [54]:
# мерджу рейтинги и фильмы
meta_df = ratings_df.merge(movies_df.drop(columns='title'), how='left', on='movieId')
meta_df.head()

Unnamed: 0,userId,movieId,rating,year,month,day,hour,minute,second,day_of_week,is_weekend,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,1,296,5.0,2006,5,17,15,34,4,2,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
1,1,306,3.5,2006,5,17,12,26,57,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,307,5.0,2006,5,17,12,27,8,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,665,5.0,2006,5,17,15,13,40,2,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,1,899,3.5,2006,5,17,12,21,50,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0


In [55]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 31 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   year         int32  
 4   month        int8   
 5   day          int16  
 6   hour         int8   
 7   minute       int8   
 8   second       int8   
 9   day_of_week  int8   
 10  is_weekend   int8   
 11  genre_0      int8   
 12  genre_1      int8   
 13  genre_2      int8   
 14  genre_3      int8   
 15  genre_4      int8   
 16  genre_5      int8   
 17  genre_6      int8   
 18  genre_7      int8   
 19  genre_8      int8   
 20  genre_9      int8   
 21  genre_10     int8   
 22  genre_11     int8   
 23  genre_12     int8   
 24  genre_13     int8   
 25  genre_14     int8   
 26  genre_15     int8   
 27  genre_16     int8   
 28  genre_17     int8   
 29  genre_18     int8   
 30  genre_19     int8   
dtypes: float64(1), int16(1

## Моделировние

### Подготовка датасета

In [56]:
gc.collect()

16

In [57]:
meta_df.head(2)

Unnamed: 0,userId,movieId,rating,year,month,day,hour,minute,second,day_of_week,is_weekend,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,1,296,5.0,2006,5,17,15,34,4,2,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
1,1,306,3.5,2006,5,17,12,26,57,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [19]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(meta_df)

In [20]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [21]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

user_cols = ['year', 'month', 'hour', 'second', 'day_of_week', 'is_weekend']
movie_cols = ['genre_'+str(i) for i in range(20)]

class MovieRatingsDataset(Dataset):
    def __init__(self, df, is_train=True):
        self.users = torch.tensor(df['userId'].values).to(device)
        self.movies = torch.tensor(df['movieId'].values).to(device)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32).to(device)
        self.user_features = torch.tensor(df[user_cols].values, dtype=torch.float32).to(device)
        self.movie_features = torch.tensor(df[movie_cols].values, dtype=torch.float32).to(device)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return (
            self.users[idx],
            self.movies[idx],
            self.user_features[idx],
            self.movie_features[idx],
            self.ratings[idx]
        )

In [22]:
train_dataset = MovieRatingsDataset(train_df)
valid_dataset = MovieRatingsDataset(valid_df)

train_loader = DataLoader(train_dataset, batch_size=50000)
valid_loader = DataLoader(valid_dataset, batch_size=50000)

sample_batch = next(iter(train_dataset))

In [23]:
class RecommenderNet(nn.Module):
    def __init__(self, num_users, num_movies, user_feature_dim, movie_feature_dim, embedding_size):
        super(RecommenderNet, self).__init__()

        # Embeddings for user and movie IDs
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)

        # Additional layers for user and movie features
        self.user_feature_layer = nn.Linear(user_feature_dim, embedding_size)
        self.movie_feature_layer = nn.Linear(movie_feature_dim, embedding_size)

        # Calculate the total features after concatenation
        # Here, we assume the output size of feature layers is also 'embedding_size'
        total_feature_size = embedding_size * 4  # 2 embeddings + 2 feature layers

        # Example final layer - Adjust the input size here
        self.fc = nn.Linear(total_feature_size, 1)

    def forward(self, user_ids, movie_ids, user_features, movie_features):
        user_embedded = self.user_embedding(user_ids)
        movie_embedded = self.movie_embedding(movie_ids)

        user_feature_embedded = self.user_feature_layer(user_features)
        movie_feature_embedded = self.movie_feature_layer(movie_features)

        # Combine all features
        features = torch.cat([user_embedded, movie_embedded, user_feature_embedded, movie_feature_embedded], dim=1)

        # Pass through the final layer
        rating = self.fc(features)

        return rating.squeeze()



In [24]:
# конфиг
num_users = meta_df['userId'].max() + 1
num_movies = meta_df['movieId'].max() + 1
user_feature_dim = len(user_cols)
movie_feature_dim = len(movie_cols)
embedding_size = 64

num_epochs = 20

In [43]:
import wandb

wandb.login()
wandb.init()



In [45]:
from torch.optim.lr_scheduler import StepLR



def train_and_validate(model, train_loader, test_loader, epochs=5):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    best_metric = 2

    scheduler = StepLR(optimizer=optimizer, step_size=2, gamma=0.3)

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for user_ids, movie_ids, user_feats, movie_feats, ratings in train_loader:
            optimizer.zero_grad()
            predictions = model(user_ids, movie_ids, user_feats, movie_feats)
            loss = loss_fn(predictions, ratings)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        

        scheduler.step()

        # Validation phase
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for user_ids, movie_ids, user_feats, movie_feats, ratings in test_loader:
                predictions = model(user_ids, movie_ids, user_feats, movie_feats)
                loss = loss_fn(predictions, ratings)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(test_loader)

        if avg_val_loss < best_metric:
            torch.save(model.state_dict(), './best_model')
            best_metric = avg_val_loss

        wandb.log({'train loss': avg_train_loss})
        wandb.log({'val loss': avg_val_loss})

        print(f'Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')


In [46]:
# Initialize your model
model = RecommenderNet(num_users, num_movies, user_feature_dim, movie_feature_dim, embedding_size).to(device)

# Train and validate the model
train_and_validate(model, train_loader, valid_loader, epochs=num_epochs)

Epoch 1: Train Loss: 829.5831, Validation Loss: 2.4864
Epoch 2: Train Loss: 2.0908, Validation Loss: 1.7400
Epoch 3: Train Loss: 1.6640, Validation Loss: 1.5758
Epoch 4: Train Loss: 1.5081, Validation Loss: 1.4312
Epoch 5: Train Loss: 1.4170, Validation Loss: 1.3900
Epoch 6: Train Loss: 1.3745, Validation Loss: 1.3468
Epoch 7: Train Loss: 1.3457, Validation Loss: 1.3327
Epoch 8: Train Loss: 1.3306, Validation Loss: 1.3168
Epoch 9: Train Loss: 1.3197, Validation Loss: 1.3114
Epoch 10: Train Loss: 1.3138, Validation Loss: 1.3051
Epoch 11: Train Loss: 1.3095, Validation Loss: 1.3029
Epoch 12: Train Loss: 1.3071, Validation Loss: 1.3002
Epoch 13: Train Loss: 1.3052, Validation Loss: 1.2993
Epoch 14: Train Loss: 1.3042, Validation Loss: 1.2982
Epoch 15: Train Loss: 1.3035, Validation Loss: 1.2978
Epoch 16: Train Loss: 1.3031, Validation Loss: 1.2974
Epoch 17: Train Loss: 1.3028, Validation Loss: 1.2973
Epoch 18: Train Loss: 1.3026, Validation Loss: 1.2971
Epoch 19: Train Loss: 1.3025, Valid

# Задачи

## Основная задача

In [58]:
import torch

model = RecommenderNet(num_users, num_movies, user_feature_dim, movie_feature_dim, embedding_size)
model.load_state_dict(torch.load('./best_model'))
model.eval()  

RecommenderNet(
  (user_embedding): Embedding(162542, 64)
  (movie_embedding): Embedding(209172, 64)
  (user_feature_layer): Linear(in_features=6, out_features=64, bias=True)
  (movie_feature_layer): Linear(in_features=20, out_features=64, bias=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [131]:
movies_df.shape

(62423, 22)

In [169]:
some_movie_id = 42

In [170]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_movies(movie_id, embeddings, top_n=10):
    # Получаем эмбеддинг интересующего фильма

    movie_embedding = torch.tensor(embeddings[movie_id]).unsqueeze(0)

    # Вычисляем косинусное сходство между этим фильмом и всеми остальными
    similarities = cosine_similarity(movie_embedding, embeddings)[0]

    # Находим индексы фильмов с наивысшим сходством
    most_similar = similarities.argsort()[-top_n:]
    #p = []
    
    #for e in most_similar:
        #if len(movies_df[movies_df['movieId'] == e]['title']) > 0:
            #p.append(e)

    return most_similar

# Предполагая, что модель уже обучена
movie_embeddings = model.movie_embedding.weight.data.cpu().numpy()
similar_movies = find_similar_movies(some_movie_id, movie_embeddings)

In [171]:
for idx in similar_movies:
    print(*movies_df[movies_df['movieId'] == idx]['title'])

Prosecuting Evil: The Extraordinary World of Ben Ferencz 


Delam Mikhad 

Afonya 


Falling in Love 
Dead Presidents 


### Доп. задание 1

In [172]:
def find_similar_movies(movie_id, embeddings, top_n=10):

    movie_embedding = torch.tensor(embeddings[movie_id]).unsqueeze(0)
    similarities = cosine_similarity(movie_embedding, embeddings)[0]

    # Получение индексов и значений для top_n наиболее похожих фильмов
    most_similar_indices = similarities.argsort()[-top_n:][::-1]

    most_similar_values = similarities[most_similar_indices]

    return list(zip(most_similar_indices, most_similar_values))

# Получение эмбеддингов
movie_embeddings = model.movie_embedding.weight.data.cpu().numpy()
similar_movies = find_similar_movies(some_movie_id, movie_embeddings, top_n=10)

In [179]:
for idx, similarity in similar_movies:
    movie_title = movies_df[movies_df['movieId'] == idx]['title']
    #print(f"{movie_title}: {similarity:.3f}")
    print(*movie_title, end=' ')
    print(similarity)

Dead Presidents  1.0
Falling in Love  0.5729036
 0.5321094
 0.51268554
Afonya  0.51128185
 0.4969297
Delam Mikhad  0.48930246
 0.4775353
 0.4736694
Prosecuting Evil: The Extraordinary World of Ben Ferencz  0.47296485


### Доп. задание 2

#### Получение Эмбеддингов Пользователя и Фильмов

In [199]:
user_embeddings = model.user_embedding.weight.data.cpu().numpy()
movie_embeddings = model.movie_embedding.weight.data.cpu().numpy()

In [200]:
def find_recommended_movies(user_id, user_embeddings, movie_embeddings, ratings_df, movies_df, top_n=10):
    # Получение эмбеддинга пользователя
    user_embedding = user_embeddings[user_id]

    # Вычисление сходства между эмбеддингом пользователя и эмбеддингами всех фильмов
    similarities = np.dot(movie_embeddings, user_embedding)

    # Получение индексов фильмов с наивысшим сходством
    top_movie_indices = similarities.argsort()[::-1]

    # Исключение фильмов, которые пользователь уже смотрел
    watched_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
    recommended_indices = [idx for idx in top_movie_indices if idx not in watched_movies][:top_n]

    # Получение названий фильмов
    recommended_movies = movies_df.loc[movies_df['movieId'].isin(recommended_indices)]
    return recommended_movies[['movieId', 'title']]


In [210]:
# Предполагая, что модель уже обучена и user_id - это индекс, а не исходный ID
user_id = 1  # Пример пользователя

recommended_movies = find_recommended_movies(user_id, user_embeddings, movie_embeddings, ratings_df, movies_df, top_n=10)
display(recommended_movies)


Unnamed: 0,movieId,title
27753,130073,Cinderella
41216,161185,Helix


### Доп. задание 3

#### Определение Функции Для Объяснения Рекомендаций

In [211]:
def explain_recommendations(user_id, recommended_movies, ratings_df, movies_df):
    explanations = []
    genre_columns = [col for col in movies_df.columns if col.startswith('genre_')]

    # Получаем фильмы, которые пользователь уже смотрел и высоко оценил
    user_ratings = ratings_df[ratings_df['userId'] == user_id]
    user_high_ratings = user_ratings[user_ratings['rating'] > 4]
    user_high_ratings = user_high_ratings.merge(movies_df, on='movieId')

    # Добавляем жанровые профили к рекомендованным фильмам
    recommended_with_genres = recommended_movies.merge(movies_df[genre_columns + ['movieId']], on='movieId')

    for _, recommended_movie in recommended_with_genres.iterrows():
        # Берем жанровый профиль рекомендованного фильма
        recommended_genre_profile = recommended_movie[genre_columns].values

        # Ищем схожие фильмы среди высоко оцененных
        user_high_ratings['similarity'] = user_high_ratings[genre_columns].apply(
            lambda row: np.dot(row.values, recommended_genre_profile), axis=1
        )
        similar_movies = user_high_ratings[user_high_ratings['similarity'] > 0]

        if not similar_movies.empty:
            highest_rated_movie = similar_movies.iloc[0]
            explanation = f"Потому что вы посмотрели и оценили фильм '{highest_rated_movie['title']}' на {highest_rated_movie['rating']} баллов."
            explanations.append((recommended_movie['title'], explanation))

    return explanations


In [212]:
explanations = explain_recommendations(user_id, recommended_movies, ratings_df, movies_df)

for title, explanation in explanations:
    print(f"Рекомендация фильма '{title}': {explanation}")


Рекомендация фильма 'Cinderella ': Потому что вы посмотрели и оценили фильм 'Pulp Fiction ' на 5.0 баллов.
Рекомендация фильма 'Helix ': Потому что вы посмотрели и оценили фильм 'Saragossa Manuscript, The ' на 5.0 баллов.
