In [1]:
%load_ext autoreload
%autoreload 2

In [62]:
import numpy as np
import pytorch_lightning as pl
import torch
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from autoencoder.model import UserAutoEncoder
from data import get_movielens_1m

### Prepare users features

In [3]:
movies_df, users_df, ratings_df = get_movielens_1m()

In [4]:
genres = set()

for genre in movies_df.Genres.unique():
    all_parts = genre.split('|')
    genres.update(all_parts)

In [5]:
dataset = users_df.copy(deep=True)
dataset['female'] = dataset['Gender'].apply(lambda x: 1 if x == 'F' else 0)
dataset['male'] = dataset['Gender'].apply(lambda x: 1 if x == 'M' else 0)
dataset.drop(columns=['Gender'], inplace=True)

for genre in genres:
    dataset[f'avg_{genre}'] = 0

In [6]:
train_ratings, test_ratings = train_test_split(
    ratings_df.sort_values(by=['Timestamp']),
    shuffle=False,
    test_size=0.1
)

train_ratings = train_ratings.merge(movies_df, on='MovieID')

In [7]:
for idx, user in tqdm(dataset.iterrows(), total=len(dataset)):
    ratings_by_user = train_ratings[train_ratings['UserID'] == user.UserID]
    for genre in genres:
        ratings_by_genre = ratings_by_user[ratings_by_user['Genres'].apply(lambda x: genre in x)]
        if len(ratings_by_genre) > 0:
            dataset.loc[idx, f'avg_{genre}'] = ratings_by_genre['Rating'].mean()

  0%|          | 0/6040 [00:00<?, ?it/s]

In [8]:
dataset

Unnamed: 0,UserID,Age,Occupation,Zip-code,female,male,avg_Thriller,avg_Horror,avg_Children's,avg_Mystery,...,avg_Sci-Fi,avg_Drama,avg_Fantasy,avg_Comedy,avg_Documentary,avg_War,avg_Musical,avg_Action,avg_Romance,avg_Film-Noir
0,1,1,10,48067,1,0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2,56,16,70072,0,1,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,3,25,15,55117,0,1,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,4,45,7,02460,0,1,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,5,25,20,55455,0,1,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,6036,25,15,32603,1,0,3.142857,2.986486,3.444444,3.411765,...,2.834320,3.505376,3.00,3.203065,3.909091,3.785714,3.709677,3.000000,3.352459,4.058824
6036,6037,45,1,76006,1,0,3.705882,4.111111,3.666667,3.692308,...,3.692308,3.877551,4.25,3.576271,4.000000,4.000000,4.000000,3.642857,3.681818,3.444444
6037,6038,56,1,14706,1,0,0.000000,2.500000,3.000000,0.000000,...,4.000000,3.888889,0.00,3.833333,0.000000,4.000000,0.000000,3.000000,4.166667,0.000000
6038,6039,45,0,01060,1,0,4.142857,4.000000,3.529412,4.176471,...,4.250000,4.000000,3.60,3.723077,0.000000,4.111111,3.690476,4.000000,3.800000,4.500000


### Train embeddings

In [9]:
X = dataset.drop(columns=['UserID', 'Zip-code']).to_numpy().astype(np.float32)
model = UserAutoEncoder(n_features=X.shape[1])
x_dataloader = DataLoader(X, batch_size=100, num_workers=4)
trainer = pl.Trainer(max_epochs=100)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores


In [10]:
trainer.fit(model, x_dataloader)


  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 2.8 K 
1 | decoder | Sequential | 2.8 K 
---------------------------------------
5.7 K     Trainable params
0         Non-trainable params
5.7 K     Total params
0.023     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

1

In [16]:
embeddings = model(torch.Tensor(X)).detach().numpy()
embeddings.shape

(6040, 64)

### Use KNN to recommend (predict ratings)

In [60]:
K = 50

In [19]:
knn = NearestNeighbors()
knn.fit(embeddings)

NearestNeighbors()

In [49]:
def get_prediction(uid: int, mid: int):
    neighbours = knn.kneighbors(embeddings[uid - 1].reshape(1, -1), n_neighbors=K+1, return_distance=False).flatten()[1:]
    neighbours = neighbours + 1  # users indexing starts at 1

    ratings_of_movie = train_ratings[train_ratings['MovieID'] == mid]
    ratings_of_movie = ratings_of_movie[ratings_of_movie['UserID'].isin(neighbours)]

    return ratings_of_movie['Rating'].mean()

In [53]:
test_ratings['prediction'] = None

for idx, row in tqdm(test_ratings.iterrows(), total=len(test_ratings)):
    user_id = row['UserID']
    movie_id = row['MovieID']

    test_ratings.loc[idx, 'prediction'] = get_prediction(user_id, movie_id)

  0%|          | 0/100021 [00:00<?, ?it/s]

In [67]:
ratings_for_rmse = test_ratings.dropna()

In [68]:
y_true = ratings_for_rmse['Rating'].to_numpy()
y_pred = ratings_for_rmse['prediction'].to_numpy()

In [69]:
rmse = mean_squared_error(y_true, y_pred, squared=False)
rmse

1.080310957188026

In [70]:
mse = mean_squared_error(y_true, y_pred)
mse

1.1670717642205088