Implement the GNN from https://github.com/MiladGhorbaniG/GNN-for-Recommendation-System.

In [253]:
# import required modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch_geometric.utils import degree
from sklearn.preprocessing import MultiLabelBinarizer

import torch
from torch import nn, optim

In [254]:
# Read the major datasets
user_movie = pd.read_csv("../data/raw/ml-100k/u.data", delimiter="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"])
user_movie.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [255]:
user_movie.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [256]:
user_dataset = pd.read_csv("../data/raw/ml-100k/u.user", delimiter="|", header=None, names=["id", "age", "gender", "occupation", "zip_code"])
user_dataset.head()

Unnamed: 0,id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [257]:
genres = pd.read_csv("../data/raw/ml-100k/u.genre", header=None, delimiter='|', names=["genre_name", "genre_id"])
genre_list = list(genres['genre_name'])
genre_list

['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [258]:
item_dataset = pd.read_csv("../data/raw/ml-100k/u.item", delimiter="|", header=None, names=["movie_id", "movie_title", "release_date", "video_release_date", "IMDB_URL", *genre_list], encoding='latin')
item_dataset.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDB_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


All the ids in the datasets are consecutive numbers starting from 1, so there is no need to re-encode the columns to avoid out of bounds issue when indexing embeddings.

In [259]:
counts = user_movie.rating.value_counts()
counts

rating
4    34174
3    27145
5    21201
2    11370
1     6110
Name: count, dtype: int64

As we can see, a significant part of the movie ratings is greater than or equal to 3.

In [260]:
user_movie[user_movie['rating'].isin([4, 5])]

Unnamed: 0,user_id,item_id,rating,timestamp
5,298,474,4,884182806
7,253,465,5,891628467
11,286,1014,5,879781125
12,200,222,5,876042340
16,122,387,5,879270459
...,...,...,...,...
99988,421,498,4,892241344
99989,495,1091,4,888637503
99990,806,421,4,882388897
99991,676,538,4,892685437


In [261]:
total_count = counts.sum()
filtered_ratings = user_movie[user_movie['rating'].isin([4, 5])]

print(f"Proportion of 4s and 5s combined: {len(filtered_ratings)/total_count}.")

Proportion of 4s and 5s combined: 0.55375.


Defining model for predictions:

In [263]:
class MovieLensNet(nn.Module):
    def __init__(self, num_movies, num_users, num_genres_encoded,
                 embedding_size, hidden_dim):
        super(MovieLensNet, self).__init__()
        self.num_movies = num_movies
        self.num_users = num_users
        self.num_genres_encoded = num_genres_encoded
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.fc1 = nn.Linear(embedding_size * 2 + num_genres_encoded, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, movie_id, user_id, genre_id):
        genre_id = torch.unsqueeze(genre_id, dim=2)
        if genre_id.size() != (movie_id.size(0), self.num_genres_encoded, 1):
            raise ValueError(f"Expected genre_id to have size ({movie_id.size(0)}, {self.num_genres_encoded}, 1)")
        movie_emb = self.movie_embedding(movie_id)
        user_emb = self.user_embedding(user_id)
        movie_emb = torch.unsqueeze(movie_emb, dim=2)
        user_emb = torch.unsqueeze(user_emb, dim=2)
        x = torch.cat([movie_emb, user_emb, genre_id.float()], dim=1)
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [264]:
class MovieLensDataset(torch.utils.data.Dataset):
    def __init__(self, data, movies, genres_encoded, mlb, max_genre_count, num_users):
        self.data = data
        self.movies = movies
        self.genres_encoded = genres_encoded
        self.mlb = mlb
        self.max_genre_count = max_genre_count
        self.num_users = num_users

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]
        movie_id = torch.tensor(row["item_id"], dtype=torch.long)
        user_id = torch.tensor(row["user_id"], dtype=torch.long)
        if user_id.min() < 0 or user_id.max() > self.num_users:
            print('self.num_users = ', self.num_users)
            raise ValueError(f"Invalid user ID: {user_id}")
        movie_genres = self.movies.loc[self.movies['movie_id'] == row['item_id'], 'genres'].iloc[0]
        genre_indices = []
        for genre in movie_genres.split('|'):
            if genre in self.mlb.classes_:
                genre_indices.append(np.where(self.mlb.classes_ == genre)[0][0])
        if len(genre_indices) == 0:
            genre_indices.append(0)
        genre_id = torch.tensor(genre_indices, dtype=torch.long)
        genre_id = torch.flatten(genre_id)[:self.max_genre_count]
        genre_pad = torch.zeros(self.max_genre_count - genre_id.shape[0], dtype=torch.long)
        genre_id = torch.cat([genre_id, genre_pad])
        rating = torch.tensor(row["rating"], dtype=torch.float)
        return {"movie_id": movie_id, "user_id": user_id, "genre_id": genre_id, "rating": rating}

Now let us work on making sure the datasets comform to the model.

In [265]:
item_dataset["release_date"] = pd.to_datetime(item_dataset["release_date"], format=r"%d-%b-%Y")

A check to ensure that all rated movies are present in the movie dataset.

In [266]:
valid_movies = set(item_dataset["movie_id"])
user_movie = user_movie[user_movie['item_id'].isin(valid_movies)]

A check for NaN values:

In [267]:
user_movie.isnull().sum()

user_id      0
item_id      0
rating       0
timestamp    0
dtype: int64

In [268]:
item_dataset.isnull().sum()

movie_id                 0
movie_title              0
release_date             1
video_release_date    1682
IMDB_URL                 3
unknown                  0
Action                   0
Adventure                0
Animation                0
Children's               0
Comedy                   0
Crime                    0
Documentary              0
Drama                    0
Fantasy                  0
Film-Noir                0
Horror                   0
Musical                  0
Mystery                  0
Romance                  0
Sci-Fi                   0
Thriller                 0
War                      0
Western                  0
dtype: int64

Video release date is meaningless information in user preferences, so is the IMDB_URL, so those columns can be safely dropped. We are only missing a single release date, so I will fill it in manually.

In [269]:
item_dataset.drop(["video_release_date", "IMDB_URL"], axis=1, inplace=True)

In [270]:
item_dataset[item_dataset.isnull().any(axis=1)]

Unnamed: 0,movie_id,movie_title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
266,267,unknown,NaT,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


So the line with a NaN date is in fact a fallback line for a non-existant movie. I will replace the release date with the earliest real date present in the column.

In [271]:
item_dataset.at[266, 'release_date'] = item_dataset['release_date'].min()

In [272]:
item_dataset.isnull().sum()

movie_id        0
movie_title     0
release_date    0
unknown         0
Action          0
Adventure       0
Animation       0
Children's      0
Comedy          0
Crime           0
Documentary     0
Drama           0
Fantasy         0
Film-Noir       0
Horror          0
Musical         0
Mystery         0
Romance         0
Sci-Fi          0
Thriller        0
War             0
Western         0
dtype: int64

In [273]:
user_dataset.isnull().sum()

id            0
age           0
gender        0
occupation    0
zip_code      0
dtype: int64

Collating the information about genres into a single vector:

In [275]:
item_dataset["genre_id"] = item_dataset.loc[:, genre_list].values.tolist()

In [276]:
item_dataset['genres'] = ''

for i in range(len(item_dataset)):
    item_dataset.at[i, "genres"] = '|'.join([genre_list[j] for j in range(len(item_dataset.at[i, "genre_id"]))
                                    if item_dataset.at[i, "genre_id"][j] == 1])

In [277]:
item_dataset.head()

Unnamed: 0,movie_id,movie_title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,genre_id,genres
0,1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Animation|Children's|Comedy
1,2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Action|Adventure|Thriller
2,3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Thriller
3,4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",Action|Comedy|Drama
4,5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",Crime|Drama|Thriller


Dropping all the columns that are not directly helpful:

In [278]:
item_dataset.drop([*genre_list, "release_date", "movie_title"], axis=1, inplace=True)

Redefining movie and user id as categorical variables:

In [279]:
item_dataset['movie_id'] = item_dataset['movie_id'].astype('category')
item_dataset['movie_id'] = item_dataset['movie_id'].cat.as_ordered()

In [280]:
user_movie['user_id'] = user_movie['user_id'].astype('category')
user_movie['user_id'] = user_movie['user_id'].cat.codes

In [None]:
item_dataset.to_csv("../benchmark/data/item_dataset.csv")

Now we create a content-based filtering algorithm based on movie genres

In [281]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=lambda x: x.split('|'))
movie_genres = vectorizer.fit_transform(item_dataset['genres'])
mlb = MultiLabelBinarizer(sparse_output=True)
genres_encoded = mlb.fit_transform(list(vectorizer.vocabulary_.keys()))
genres_encoded = genres_encoded.astype(np.float32)



In [282]:
user_movie.drop('timestamp', axis=1, inplace=True)

In [283]:
train_data, test_data = train_test_split(user_movie, test_size=0.2, random_state=36)
num_users = user_movie['user_id'].max() + 1
num_movies = max(user_movie['item_id'].max(), user_movie['item_id'].max()) + 1
num_genres = len(genre_list)

In [284]:
train_dataset = MovieLensDataset(train_data, item_dataset, genres_encoded, mlb, max_genre_count=num_genres, num_users=num_users)
test_dataset = MovieLensDataset(test_data, item_dataset, genres_encoded, mlb, max_genre_count=num_genres, num_users=num_users)

In [285]:
# Create a list of indices for the train and test sets
num_train = len(train_data)
train_indices = list(range(num_train))
num_test = len(test_data)
test_indices = list(range(num_test))

In [286]:
from torch.utils.data import DataLoader

# Create the train and test DataLoader objects with the subsets
train_loader = DataLoader(train_dataset, batch_size= 1024, shuffle=True, drop_last=True)
# test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False, drop_last=True)

In [291]:
from tqdm import tqdm

# Set the hyperparameters and train the model
embedding_dim = 16
number_epochs = 10
hidden_dim = 32
dropout_p = 0.5
print(f'num_movies: {num_movies}')
print(f'num_users: {num_users}')
print(f'num_genres: {num_genres}')
model = MovieLensNet(num_movies,num_users, num_genres, embedding_size=32, hidden_dim=64)
num_train_samples = len(train_loader.dataset)
print(f"Number of training samples: {num_train_samples}")
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in tqdm(range(number_epochs)):
    running_loss = 0.0
    for i, batch in tqdm(enumerate(train_loader)):
        movie_id = batch['movie_id']
        user_id = batch['user_id']
        genre_id = batch['genre_id']
        rating = batch['rating']
        # print(f'movie_id shape: {movie_id.shape}')
        # print(f'user_id shape: {user_id.shape}')
        # print(f'genre_id shape: {genre_id.shape}')
        # print('user_id' , user_id)
        # print('model.num_users = ' , model.num_users)
        output = model(movie_id, user_id, genre_id)
        # print(f'output shape: {output.shape}')
        loss = criterion(output, rating.squeeze())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if (i + 1) % 10 == 0:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

torch.save(model.state_dict(), '../models/model_parameters.pth')

num_movies: 1683
num_users: 943
num_genres: 19
Number of training samples: 80000


  0%|          | 0/10 [00:00<?, ?it/s]

[1,    10] loss: 1.232




[1,    20] loss: 0.995




[1,    30] loss: 0.666




[1,    40] loss: 0.335




[1,    50] loss: 0.188




[1,    60] loss: 0.169




[1,    70] loss: 0.144


78it [00:31,  2.50it/s]
 10%|█         | 1/10 [00:31<04:40, 31.20s/it]

[2,    10] loss: 0.142




[2,    20] loss: 0.140




[2,    30] loss: 0.138




[2,    40] loss: 0.141




[2,    50] loss: 0.138




[2,    60] loss: 0.135




[2,    70] loss: 0.137


78it [00:31,  2.47it/s]
 20%|██        | 2/10 [01:02<04:11, 31.46s/it]

[3,    10] loss: 0.136




[3,    20] loss: 0.133




[3,    30] loss: 0.135




[3,    40] loss: 0.135




[3,    50] loss: 0.134




[3,    60] loss: 0.135




[3,    70] loss: 0.134


78it [00:31,  2.48it/s]
 30%|███       | 3/10 [01:34<03:40, 31.48s/it]

[4,    10] loss: 0.133




[4,    20] loss: 0.132




[4,    30] loss: 0.135




[4,    40] loss: 0.133




[4,    50] loss: 0.131




[4,    60] loss: 0.132




[4,    70] loss: 0.130


78it [00:31,  2.45it/s]
 40%|████      | 4/10 [02:06<03:09, 31.61s/it]

[5,    10] loss: 0.130




[5,    20] loss: 0.133




[5,    30] loss: 0.128




[5,    40] loss: 0.130




[5,    50] loss: 0.132




[5,    60] loss: 0.130




[5,    70] loss: 0.130


78it [00:32,  2.38it/s]
 50%|█████     | 5/10 [02:38<02:40, 32.02s/it]

[6,    10] loss: 0.129




[6,    20] loss: 0.131




[6,    30] loss: 0.131




[6,    40] loss: 0.130




[6,    50] loss: 0.128




[6,    60] loss: 0.130




[6,    70] loss: 0.129


78it [00:32,  2.37it/s]
 60%|██████    | 6/10 [03:11<02:09, 32.34s/it]

[7,    10] loss: 0.128




[7,    20] loss: 0.131




[7,    30] loss: 0.130




[7,    40] loss: 0.128




[7,    50] loss: 0.128




[7,    60] loss: 0.129




[7,    70] loss: 0.128


78it [00:32,  2.41it/s]
 70%|███████   | 7/10 [03:44<01:37, 32.37s/it]

[8,    10] loss: 0.130




[8,    20] loss: 0.128




[8,    30] loss: 0.130




[8,    40] loss: 0.129




[8,    50] loss: 0.131




[8,    60] loss: 0.125




[8,    70] loss: 0.127


78it [00:32,  2.40it/s]
 80%|████████  | 8/10 [04:16<01:04, 32.41s/it]

[9,    10] loss: 0.127




[9,    20] loss: 0.126




[9,    30] loss: 0.128




[9,    40] loss: 0.129




[9,    50] loss: 0.128




[9,    60] loss: 0.127




[9,    70] loss: 0.130


78it [00:32,  2.38it/s]
 90%|█████████ | 9/10 [04:49<00:32, 32.51s/it]

[10,    10] loss: 0.126




[10,    20] loss: 0.126




[10,    30] loss: 0.129




[10,    40] loss: 0.129




[10,    50] loss: 0.128




[10,    60] loss: 0.126




[10,    70] loss: 0.128


78it [00:32,  2.39it/s]
100%|██████████| 10/10 [05:22<00:00, 32.21s/it]


In [293]:
test_data.to_csv("../benchmark/data/test_dataset.csv")

In [294]:
item_dataset.to_csv("../benchmark/data/item_dataset.csv")