# Movie Recommendation System

This Jupyter Notebook presents a movie recommendation system using PyTorch. The dataset contains movie ratings and movie details including genres, and we utilize temporal information to enhance the model performance.

In [22]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

In [24]:
import zipfile

# Unpack every member of the downloaded archive under ./data
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [25]:
# Load the two MovieLens tables extracted into ./data.
# Later cells read movieId/title/genres from movies_df and
# userId/movieId/rating/timestamp from ratings_df.
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [None]:
# Report the (rows, columns) shape of both tables as a quick check that the CSVs loaded
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

In [None]:
# Take a look at movies_df: preview its first rows (movie id, title and pipe-separated genres)
movies_df.head()

In [None]:
# Take a look at ratings_df: preview its first rows (one row per user/movie rating, with a timestamp)
ratings_df.head()

In [None]:
# Movie ID to movie name mapping
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# The (dense) rating matrix would have one row per distinct user and one
# column per distinct rated movie.
n_users = ratings_df['userId'].nunique()
n_items = ratings_df['movieId'].nunique()
n_cells = n_users * n_items

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / n_cells * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The matrix size is the product of both dimensions, not "n*2".
print("And... as you can imagine, as the number of users and products grow, the number of elements grows as n_users * n_items")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

## Dataset Class
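The `MovieDataset` class extends the PyTorch `Dataset` class to handle our movie data. Its preprocessing adds temporal features (hour of day and day of week) derived from the rating timestamps and multi-hot encodes each movie's genres, so a movie tagged with several genres gets a 1 in each matching genre column.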

In [30]:
class MovieDataset(Dataset):
    """Ratings joined with temporal and genre features, split into train/validation.

    Each item is a ``(features, target)`` pair of float32 tensors where
    ``features`` is ``[user_idx, movie_idx, hour, day_of_week, *genre_flags]``
    and ``target`` is a one-element tensor holding the rating.

    Args:
        ratings_df: DataFrame with ``userId``, ``movieId``, ``rating`` and
            ``timestamp`` (seconds since the epoch) columns.
        movies_df: DataFrame with ``movieId`` and pipe-separated ``genres``.
        train: If True keep the 80% training sample; otherwise keep the
            remaining rows as the validation set.

    Attributes:
        n_genre_features: Number of genre indicator columns.
        userid2idx / movieid2idx: Raw id -> contiguous index, built from the
            full ratings table so both splits share one index space.
        idx2movieid: Reverse of ``movieid2idx``.
    """

    def __init__(self, ratings_df, movies_df, train=True):
        ratings = ratings_df.copy()

        # Temporal features. Scale by the fixed ranges (hour 0-23, weekday 0-6)
        # rather than fitting a scaler per split, so the train and validation
        # sets are guaranteed to use the same scaling.
        timestamps = pd.to_datetime(ratings['timestamp'], unit='s')
        ratings['datetime'] = timestamps
        ratings['hour'] = timestamps.dt.hour / 23.0
        ratings['day_of_week'] = timestamps.dt.dayofweek / 6.0

        # Multi-hot encode genres: one 0/1 column per genre name.
        genre_dummies = movies_df['genres'].str.get_dummies('|')
        genre_columns = list(genre_dummies.columns)
        self.n_genre_features = len(genre_columns)
        movie_features = pd.concat([movies_df[['movieId']], genre_dummies], axis=1)

        # Attach the genre flags to every rating. A rated movie missing from
        # movies_df would get NaN flags from the left merge; treat it as
        # having no genres so NaN never reaches the model.
        ratings = ratings.merge(movie_features, on='movieId', how='left')
        ratings[genre_columns] = ratings[genre_columns].fillna(0)

        # Create continuous IDs (order of first appearance in the ratings).
        self.userid2idx = {raw: i for i, raw in enumerate(ratings['userId'].unique())}
        self.movieid2idx = {raw: i for i, raw in enumerate(ratings['movieId'].unique())}
        ratings['user_idx'] = ratings['userId'].map(self.userid2idx)
        ratings['movie_idx'] = ratings['movieId'].map(self.movieid2idx)
        # Reverse mapping for movie IDs
        self.idx2movieid = {i: raw for raw, i in self.movieid2idx.items()}

        # Split train/validation. The validation set is the complement of the
        # training sample: drawing a second sample with the same seed would
        # return rows that are all already in the training set.
        train_rows = ratings.sample(frac=0.8, random_state=42)
        self.ratings = train_rows if train else ratings.drop(train_rows.index)

        # Prepare features and target once, as float32 arrays.
        feature_columns = ['user_idx', 'movie_idx', 'hour', 'day_of_week'] + genre_columns
        self.features = self.ratings[feature_columns].to_numpy(dtype=np.float32)
        self.targets = self.ratings['rating'].to_numpy(dtype=np.float32)

    def __len__(self):
        """Return the number of ratings in this split."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return ``(features, target)`` float32 tensors for row ``idx``."""
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor([self.targets[idx]], dtype=torch.float32)
        return features, target


## Recommender Model
The `Recommender` class is the core of our recommendation system. We utilize embeddings to represent users and movies, and a neural network to predict ratings based on these embeddings coupled with temporal and genre features.

In [31]:
class Recommender(torch.nn.Module):
    """Rating predictor built from user/movie embeddings and side features.

    The input to ``forward`` is a 2-D float tensor whose columns are
    ``[user_idx, movie_idx, temporal_1, temporal_2, *genre_flags]``. The two
    index columns are looked up in embedding tables; the result is
    concatenated with the remaining columns and fed through an MLP that
    outputs one value per row.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the movie embedding table.
        n_factors: Width of each embedding vector.
        n_genres: Number of genre columns expected after the first four.
    """

    def __init__(self, n_users, n_items, n_factors=50, n_genres=20):
        super().__init__()

        # Latent factor tables
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # MLP input = user vector + movie vector + 2 temporal values + genre flags
        input_width = n_factors * 2 + 2 + n_genres
        layers = [
            torch.nn.Linear(input_width, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 1),
        ]
        self.nn = torch.nn.Sequential(*layers)

        # Re-initialise every Linear layer (embeddings keep their default init)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise Linear weights and zero their biases."""
        if not isinstance(module, torch.nn.Linear):
            return
        torch.nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            module.bias.data.zero_()

    def forward(self, features):
        """Return one prediction per row of ``features`` (shape ``(batch, 1)``)."""
        # Columns 0 and 1 carry the indices as floats; cast them for the lookup.
        user_ids = features[:, 0].long()
        movie_ids = features[:, 1].long()

        pieces = (
            self.user_factors(user_ids),
            self.item_factors(movie_ids),
            features[:, 2:4],   # temporal features
            features[:, 4:],    # genre features
        )
        return self.nn(torch.cat(pieces, dim=1))


In [None]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Prepare dataset (built before the model so the model can be sized from it)
train_set = MovieDataset(ratings_df, movies_df, train=True)
val_set = MovieDataset(ratings_df, movies_df, train=False)

# Data loaders: shuffle only the training data
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
val_loader = DataLoader(val_set, batch_size=128, shuffle=False)

# Take the genre width from the dataset instead of hard-coding 20, so the
# first Linear layer always matches the number of genre columns produced.
model = Recommender(n_users, n_items, n_factors=50, n_genres=train_set.n_genre_features)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizer
# NOTE: train_model creates its own criterion and optimizer; these two objects
# are not passed to it.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

## Training the Model
We define a `train_model` function that trains the `Recommender` model and evaluates it on the validation data after every epoch. Training uses learning-rate scheduling (`ReduceLROnPlateau` driven by the validation loss) and saves the weights of the best-performing model to `best_model.pt`.

In [12]:
def train_model(model, train_loader, val_loader, epochs=50, lr=0.001):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    Uses AdamW (weight decay 0.01), MSE loss, gradient-norm clipping at 1.0 and
    a ``ReduceLROnPlateau`` scheduler stepped on the validation loss. Whenever
    the validation loss improves, the model's ``state_dict`` is written to
    ``best_model.pt`` in the working directory. One line per epoch is printed.

    Args:
        model: Module mapping a feature batch to predictions shaped like the targets.
        train_loader: Iterable of ``(features, targets)`` training batches.
        val_loader: Iterable of ``(features, targets)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: Initial learning rate.

    Returns:
        The lowest validation loss seen (``inf`` if ``epochs`` is 0).

    Raises:
        ZeroDivisionError: If a loader yields no samples.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)
    criterion = torch.nn.MSELoss()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    best_val_loss = float('inf')
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss, train_count = 0.0, 0
        for features, targets in train_loader:
            features, targets = features.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Weight each batch mean by its size so a short final batch does
            # not count as much as a full one.
            batch_size = targets.size(0)
            train_loss += loss.item() * batch_size
            train_count += batch_size

        # Validation
        model.eval()
        val_loss, val_count = 0.0, 0
        with torch.no_grad():
            for features, targets in val_loader:
                features, targets = features.to(device), targets.to(device)
                outputs = model(features)
                batch_size = targets.size(0)
                val_loss += criterion(outputs, targets).item() * batch_size
                val_count += batch_size

        # Per-sample mean losses for the epoch
        train_loss /= train_count
        val_loss /= val_count

        # Learning rate scheduling
        scheduler.step(val_loss)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pt')

        print(f'Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}')

    return best_val_loss


In [None]:
# Train for num_epochs epochs; train_model prints the losses each epoch and
# writes the best weights (lowest validation loss) to 'best_model.pt'.
train_model(model, train_loader, val_loader, epochs=num_epochs, lr=1e-3)

In [None]:
# By training the model, we will have tuned latent factors for movies and users.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Read the two embedding tables by attribute. Walking named_parameters() and
# overwriting `iw` on every later parameter left it holding the last parameter
# of the network (the final layer's bias) instead of the item factors.
uw = model.user_factors.weight.data
iw = model.item_factors.weight.data

In [15]:
# Copy the movie embedding table to a NumPy array on the CPU; row i is the
# latent vector of the movie with index i.
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [None]:
# Number of rows in the embedding matrix (the n_items the model was built with)
len(trained_movie_embeddings) # unique movie factor weights

In [33]:
from sklearn.cluster import KMeans

# Partition the movies into 10 groups using only their learned embedding vectors
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings)

In [None]:
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# Count the ratings of every movie once, instead of filtering ratings_df again
# for each movie inside the loop.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Find movie indices belonging to the current cluster
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts.get(movid, 0)))
    # Sort movies by rating count in descending order and print top 10
    for title, _ in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", title)