<a href="https://colab.research.google.com/github/AMakarova/BotPlay/blob/main/Bot_Play_FM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment

In [42]:
from google.colab import drive
drive.mount('/content/drive')

%cd 'drive/MyDrive/The Movie Dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/MyDrive/The Movie Dataset'
/content/drive/MyDrive/The Movie Dataset


# Build Dataset

## Process ratings

In [43]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Optional
import random
import os.path
import ast

In [44]:
# Input CSV files, given relative to the current working directory.
RATINGS_PATH = "ratings.csv"
# Cache file that process_ratings checks for and loads instead of recomputing.
RECODED_RATINGS_PATH = 'ratings_recoded.csv'
METADATA_PATH = "movies_metadata.csv"
LINKS_PATH = "links.csv"

# Defaults for subset_ratings: how many of the most-rated movies to keep, and
# how many ratings >= 4 a user must have (among those movies) to be kept.
TOP_MOVIE_COUNT = 100
MIN_RATINGS = 10

In [45]:
def load_dataset(
    path: str = RATINGS_PATH,
    cols: Optional[list] = None,
    ):
  '''
  Reads a CSV file into a DataFrame, optionally keeping only some columns.

  :param path: location of the CSV file to read
  :param cols: column names to keep; a falsy value (None or an empty list)
      returns every column
  :return: the loaded (and possibly column-filtered) DataFrame
  '''
  print(f"Loading {path} dataset...")
  frame = pd.read_csv(path, low_memory=False)
  return frame[cols] if cols else frame

In [46]:
def print_frequencies(ratings: pd.DataFrame):
  '''
  Reports how many distinct users and movies appear in the ratings frame.

  :param ratings: DataFrame with 'userId' and 'movieId' columns
  '''
  n_users, n_movies = (
      len(ratings[column].unique()) for column in ('userId', 'movieId')
  )
  print(f"Dataset contains {n_users} unique users and {n_movies} unique movies")

In [47]:
def subset_ratings(
    ratings: pd.DataFrame,
    top_movie_count: int = TOP_MOVIE_COUNT,
    min_ratings: int = MIN_RATINGS
    ) -> pd.DataFrame:
  '''
  Restricts the ratings to the most-rated movies and to users with enough
  high ratings among those movies.

  :param ratings: DataFrame with 'userId', 'movieId' and 'rating' columns
  :param top_movie_count: number of movies (ranked by rating count) to keep
  :param min_ratings: minimum number of ratings >= 4 a user needs to be kept
  :return: the filtered ratings
  '''
  print("Subsetting ratings dataset...")

  # Step 1: keep only the movies with the largest number of ratings.
  counts_by_movie = ratings.groupby('movieId').count().sort_values('rating', ascending=False)
  popular_movies = counts_by_movie.index[:top_movie_count]
  popular_ratings = ratings[ratings['movieId'].isin(popular_movies)]

  # Step 2: keep only users with at least `min_ratings` ratings of 4 or more.
  liked = popular_ratings[popular_ratings['rating']>=4]
  likes_per_user = liked.groupby('userId').count()['rating']
  active_users = likes_per_user[likes_per_user>=min_ratings].index
  subset = popular_ratings[popular_ratings['userId'].isin(active_users)]

  print_frequencies(subset)
  return subset

In [48]:
def binarise_dataset(ratings: pd.DataFrame, max_users: Optional[int] = 50000) -> pd.DataFrame:
  '''
  Modify the dataset by:
  * recoding movies with rating >=4 as positive class (rating = 1)
  * recoding movies with rating <4 as negative class (rating = 0)
  * sampling additional movies for the negative class from the set of movies
    the user has not rated, until the classes are balanced or the pool of
    unrated movies is exhausted

  :param ratings: DataFrame with 'userId', 'movieId' and 'rating' columns
  :param max_users: process only the first `max_users` users (in order of
      first appearance); None processes every user. The default keeps the
      run short enough for a free Colab runtime.
  :return: binarised ratings; the input frame is not modified
  '''
  print(f"Binarising ratings dataset and balancing classes.")

  # Loop-invariant lookups are computed once instead of once per user.
  all_movies = ratings['movieId'].unique()
  user_ids = ratings['userId'].unique()
  if max_users is not None:
    user_ids = user_ids[:max_users]
  selected = ratings[ratings['userId'].isin(user_ids)]

  # Collect the pieces in a list and concatenate once: growing a DataFrame
  # with pd.concat inside the loop is quadratic in the number of users.
  frames = []
  # sort=False keeps the users in order of first appearance.
  for u, user_subset in tqdm(selected.groupby('userId', sort=False), total=len(user_ids)):
    # .assign returns a new frame, so nothing is written into a slice of `ratings`.
    positive_class = user_subset[user_subset['rating']>=4].assign(rating=1)
    negative_class = user_subset[user_subset['rating']<4].assign(rating=0)
    frames.extend(part for part in (positive_class, negative_class) if not part.empty)

    len_diff = positive_class.shape[0] - negative_class.shape[0]
    if len_diff > 0:
      # Membership must be tested against the VALUES of the column; `in` on a
      # Series tests its index labels instead.
      rated_movies = set(user_subset['movieId'])
      movie_pool = [m for m in all_movies if m not in rated_movies]
      # random.sample raises if asked for more items than the pool contains.
      unrated_movie_sample = random.sample(movie_pool, min(len_diff, len(movie_pool)))
      if unrated_movie_sample:
        # Built fresh for every user so rows never leak from a previous user.
        frames.append(pd.DataFrame({
            'movieId': unrated_movie_sample,
            'userId': u,
            'rating': 0,
        }))

  # pd.concat rejects an empty list, so fall back to an empty frame with the
  # input's columns when there was nothing to process.
  recoded_ratings = pd.concat(frames) if frames else ratings.iloc[:0].copy()
  print_frequencies(recoded_ratings)
  return recoded_ratings

In [49]:
def process_ratings(use_cache: bool = True) -> pd.DataFrame:
  '''
  Produces the binarised ratings dataset, re-using the cached copy if allowed.

  :param use_cache: when True and RECODED_RATINGS_PATH exists, load it instead
      of recomputing; otherwise rebuild from RATINGS_PATH and refresh the cache
  :return: DataFrame with 'userId', 'movieId' and 'rating' columns
  '''
  columns = ['userId', 'movieId', 'rating']
  if use_cache and os.path.exists(RECODED_RATINGS_PATH):
    return load_dataset(RECODED_RATINGS_PATH, cols=columns)

  ratings = load_dataset(path=RATINGS_PATH, cols=columns)
  print_frequencies(ratings)
  ratings = subset_ratings(ratings)
  # A fresh index makes the computed frame look the same as the cached one.
  ratings = binarise_dataset(ratings).reset_index(drop=True)
  # Write to the same constant the cache check reads, and leave the index out
  # so the cache holds only the three data columns.
  ratings.to_csv(RECODED_RATINGS_PATH, index=False)
  return ratings

In [50]:
def parse_genres(metadata: pd.DataFrame, max_genres: Optional[int] = 20) -> pd.DataFrame:
  '''
  Reformat genre data in columnar format: the 'genres' column (a string holding
  a Python-literal list of dicts with a 'name' key) is replaced by one 0/1
  indicator column per genre.

  :param metadata: DataFrame with a 'genres' column
  :param max_genres: keep only the first `max_genres` distinct genre names in
      order of first appearance (the original notebook uses 20 to remove the
      erroneous genres); None keeps all of them
  :return: new DataFrame without 'genres' and with the indicator columns
      appended; the input frame is not modified
  '''
  # Parse every row once and re-use the result for both passes below.
  parsed_rows = [
      [genre['name'] for genre in ast.literal_eval(line)]
      for line in tqdm(metadata['genres'])
  ]

  # dict.fromkeys de-duplicates while keeping first-appearance order.
  genres = list(dict.fromkeys(name for names in parsed_rows for name in names))
  genres = genres[:max_genres]
  column_of = {name: position for position, name in enumerate(genres)}

  genre_matrix = np.zeros([len(parsed_rows), len(genres)], dtype=int)
  for row_index, names in enumerate(parsed_rows):
    for name in names:
      genre_index = column_of.get(name)
      if genre_index is not None:  # names beyond the cut-off are ignored
        genre_matrix[row_index, genre_index] = 1

  # Re-use the caller's index: pd.concat(axis=1) aligns on index labels, so a
  # default RangeIndex here would misalign rows for a filtered/re-indexed input.
  genre_matrix = pd.DataFrame(genre_matrix, columns=genres, index=metadata.index)
  return pd.concat([metadata.drop(columns='genres'), genre_matrix], axis=1)

In [51]:
def merge_in_metadata(ratings: pd.DataFrame) -> pd.DataFrame:
  '''
  Attaches the movie title and the one-hot genre columns to every rating row.

  Metadata is keyed by 'id' while ratings are keyed by 'movieId'; the links
  file (columns 'movieId' and 'tmdbId') is used to translate between the two.

  :param ratings: DataFrame with a 'movieId' column
  :return: ratings left-joined with title and genre columns; values missing
      after the join are filled with 0
  '''
  metadata = load_dataset(path=METADATA_PATH, cols=['id', 'title', 'genres'])
  # Non-numeric ids become NaN; .assign avoids writing into the loaded slice.
  metadata = metadata.assign(id=pd.to_numeric(metadata['id'], errors='coerce'))
  metadata = parse_genres(metadata)

  links = load_dataset(path=LINKS_PATH, cols=['movieId', 'tmdbId'])
  links = links.rename(columns={'tmdbId':'id'})

  metadata = (
      metadata
      # pandas matches NaN keys with each other, so rows without a usable id
      # must be removed on both sides before joining.
      .dropna(subset=['id'])
      .merge(links.dropna(subset=['id']), how='inner', on='id')
      .drop(columns=['id'])
      # One metadata row per movie, otherwise the left join below would
      # duplicate rating rows.
      .drop_duplicates(subset='movieId')
  )
  return ratings.merge(metadata, how='left', on='movieId').fillna(0)

In [53]:
# Build the binarised ratings table (loaded from the cache file when it exists),
# then join the movie title and genre columns onto it.
ratings = process_ratings()
ratings = merge_in_metadata(ratings)

Loading ratings_recoded.csv dataset...
Loading movies_metadata.csv dataset...


100%|██████████| 45466/45466 [00:01<00:00, 23883.00it/s]


Loading links.csv dataset...


# Recommendation module

## Training / validation split

In [54]:
from sklearn.model_selection import train_test_split
# Fraction of the rows held out for validation.
VAL_SIZE = 0.2

# Features are every column except the target ('rating') and 'title';
# random_state is fixed so repeated runs produce the same split.
X_train, X_val, y_train, y_val = train_test_split(
        ratings.drop(['rating', 'title'], axis=1),
        ratings['rating'],
        test_size=VAL_SIZE,
        random_state=13,
)

## Model training code

In [55]:
import copy
import logging
from typing import Optional, Tuple, Union
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from torch.utils.data import DataLoader, TensorDataset

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Array-like inputs accepted by the scikit-learn style fit/predict interface.
SklearnInput = Union[list, np.ndarray, pd.DataFrame, pd.Series]


class BinaryClassifierModel():
    """
    Scikit-learn style wrapper (fit / predict) around a PyTorch module that
    outputs one logit per row.
    """

    def __init__(
        self,
        model: nn.Module,
        batch_size: int,
        learning_rate: float = 0.0001,
        epochs: int = 1000,
        optimiser: type = optim.Adam,
        # NOTE: this default is a single module instance created when the class
        # is defined and therefore shared by every model using the default.
        loss: nn.modules.loss._Loss = torch.nn.BCEWithLogitsLoss(),
        silent: bool = False
    ):
        """
        Initialises the PyTorch based binary classifier object.

        :param model: PyTorch model object
        :param batch_size: batch size for model training
        :param learning_rate: training learning rate
        :param epochs: max number of training epochs
        :param optimiser: optimiser class, called as optimiser(params, lr=...)
        :param loss: loss object
        :param silent: when True the training progress bar is disabled
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.loss = loss.to(self.device)
        self.optimiser = optimiser(self.model.parameters(), lr=learning_rate)
        self.batch_size = batch_size
        self.epochs = epochs
        self.loss_history = {}
        self.silent = silent

    def fit(
        self,
        X: SklearnInput,
        y: SklearnInput,
        eval_set: Optional[Tuple[SklearnInput, SklearnInput]] = None,
        early_stopping_rounds: int = 10,
        save_weights: bool = False,
        epochs: Optional[int] = None,
    ) -> "BinaryClassifierModel":
        """
        Fits the binary classifier.

        The monitored loss is the validation loss when `eval_set` is given and
        the training loss otherwise. When it has not improved for
        `early_stopping_rounds` consecutive epochs, training stops and the
        weights from the best epoch are loaded back into the model.

        :param X: feature dataset
        :param y: output dataset
        :param eval_set: optional validation feature and output datasets
        :param early_stopping_rounds: for how many epochs to train model after loss stops improving
        :param save_weights: when True, after every epoch the current weights
            are written to 'fm_weights' and the loss history to
            'fm_loss_history.pickle' in the working directory
        :param epochs: max number of training epochs (overrides the value given
            to the constructor when truthy)
        :return: self
        """
        train_iterator = self._dataset_iterator(X, y)

        val_iterator = None
        if eval_set:
            (X_val, y_val) = eval_set
            val_iterator = self._dataset_iterator(X_val, y_val)

        if epochs:
            self.epochs = epochs

        best_loss = float("inf")
        best_state = None
        # A counter (instead of "epoch of best loss") also copes with a loss
        # that never improves, e.g. NaN from the first epoch on.
        epochs_since_best = 0

        for epoch in range(self.epochs):

            train_loss = self._train(train_iterator)
            self.loss_history[epoch] = {'train': train_loss}

            if val_iterator is not None:
                valid_loss = self._evaluate(val_iterator)
                self.loss_history[epoch]['val'] = valid_loss
                monitored_loss = valid_loss
            else:
                monitored_loss = train_loss

            if monitored_loss < best_loss:
                best_loss = monitored_loss
                # A deep copy of the state dict is a real snapshot; copy.copy
                # of the module would keep sharing the parameter tensors that
                # later epochs go on to modify.
                best_state = copy.deepcopy(self.model.state_dict())
                epochs_since_best = 0
            else:
                epochs_since_best += 1

            if val_iterator is not None:
                print(f"\tVal. loss: {valid_loss:.3f}")

            if save_weights:
                # Save the wrapped model itself rather than a notebook global.
                torch.save(self.model.state_dict(), 'fm_weights')
                with open('fm_loss_history.pickle', 'wb') as handle:
                    pickle.dump(self.loss_history, handle, protocol=pickle.HIGHEST_PROTOCOL)

            if epochs_since_best >= early_stopping_rounds:
                if best_state is not None:
                    # Loading into the existing module keeps the optimiser
                    # pointing at the parameters it was created with.
                    self.model.load_state_dict(best_state)
                break

        return self


    def predict(self, X: SklearnInput) -> pd.DataFrame:
        """
        Returns predictions as a DataFrame.

        :param X: feature dataset
        :return: a DataFrame with a single 'predicted_rating' column holding
            the sigmoid of the model output for each row
        """
        self.model.eval()
        features = torch.Tensor(np.array(X, dtype="float64"))

        batches = []
        with torch.no_grad():
            # Predict in batches and on the model's device; results are moved
            # back to the CPU because .numpy() does not work on CUDA tensors.
            for batch in torch.split(features, self.batch_size):
                batches.append(self.model(batch.to(self.device)).sigmoid().cpu())

        return pd.DataFrame(
            torch.cat(batches).numpy(),
            columns=['predicted_rating'],
        )

    def param_count(self) -> int:
        """
        Returns number of trainable parameters in the model.

        :return: number of trainable parameters
        """
        return sum(p.numel() for p in self.model.parameters() if p.requires_grad)

    def _dataset_iterator(
        self,
        X: SklearnInput,
        y: SklearnInput,
    ) -> DataLoader[object]:
        """
        Converts scikit-learn style dataset into PyTorch DataLoader.

        :param X: feature dataset
        :param y: output dataset
        :return: DataLoader iterator object (rows are served in their given order)
        """

        dataset = TensorDataset(
            torch.Tensor(np.array(X, dtype="float64")),
            # Targets are reshaped to one column to match the model output.
            torch.Tensor(np.array(y, dtype="int").reshape(-1, 1))
        )

        return DataLoader(dataset, batch_size=self.batch_size)

    def _train(self, iterator: DataLoader[object]) -> float:
        """
        Performs forward and backpropagation, updates model params
        based on the loss.

        :param iterator: dataset iterator
        :return: average epoch loss
        """
        epoch_loss = 0.0
        self.model.train()

        t = tqdm.tqdm(iterator, position=0, leave=True, disable=self.silent)

        for (i, (x, y)) in enumerate(t):
            x = x.to(self.device)
            y = y.to(self.device)
            self.optimiser.zero_grad()
            y_pred = self.model(x)
            loss = self.loss(y_pred, y)
            loss.backward()
            self.optimiser.step()
            epoch_loss += loss.item()
            # Running mean of the batch losses seen so far in this epoch.
            t.set_description(f"Train loss = {epoch_loss / (i+1):.3f}")

        return epoch_loss / len(iterator)

    def _evaluate(self, iterator: DataLoader[object]) -> float:
        """
        Performs forward propagation only and calculates loss
        without updating model params.

        :param iterator: dataset iterator
        :return: average epoch loss
        """
        epoch_loss = 0.0
        self.model.eval()

        with torch.no_grad():
            for (x, y) in iterator:
                x = x.to(self.device)
                y = y.to(self.device)
                y_pred = self.model(x)
                loss = self.loss(y_pred, y)
                epoch_loss += loss.item()

        return epoch_loss / len(iterator)

## Model architecture

In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [57]:
class TorchFM(nn.Module):
    """
    Factorisation machine: a linear term plus pairwise feature interactions
    expressed through a latent factor matrix V.

    Categorical columns are one-hot encoded inside `forward`; every other
    column is used as a continuous feature.
    """

    def __init__(self, categorical_vars, X, k=None):
        """
        :param categorical_vars: names of the columns of X holding integer ids
        :param X: DataFrame with the same column layout as the tensors later
            passed to `forward`; dictionary sizes are taken from its maxima
        :param k: number of latent factors (columns of V); required
        """
        super().__init__()
        if k is None:
            raise ValueError("k (the number of latent factors) must be provided")

        # unpacking categorical vars
        self.cat_cols = list(categorical_vars)
        # Remember WHERE each column sits so forward() does not have to assume
        # that the categorical columns come first and in the given order.
        columns = list(X.columns)
        self.cat_idx = [columns.index(col) for col in self.cat_cols]
        self.cont_idx = [i for i in range(len(columns)) if i not in self.cat_idx]
        # One-hot width per categorical column: largest id + 1 (ids start at 0).
        self.dict_size = [int(size) for size in (X[self.cat_cols].max() + 1)]
        continuous_dim = len(self.cont_idx)
        categorical_dim = sum(self.dict_size)
        n = continuous_dim + categorical_dim

        # Initially we fill V with random values sampled from Gaussian distribution
        # NB: use nn.Parameter to compute gradients
        self.V = nn.Parameter(torch.randn(n, k),requires_grad=True)
        self.lin = nn.Linear(n, 1)


    def forward(self, x):
        """
        :param x: 2-D tensor whose columns follow the layout of the DataFrame
            given to the constructor
        :return: tensor with one raw (pre-sigmoid) score per row
        """

        # one-hot transformations
        x_transformed = []

        for col, idx, size in zip(self.cat_cols, self.cat_idx, self.dict_size):
            x_i = F.one_hot(x[:, idx].long(), size).to(x.dtype)
            if col == 'movieId':
                # movieId 0 gets an all-zero encoding.
                x_i[:, 0] = 0
            x_transformed.append(x_i)
        # Layout of the encoded row: all one-hot blocks first, then the
        # continuous columns in their original order.
        x_transformed.append(x[:, self.cont_idx])
        x_transformed = torch.cat(x_transformed, 1)

        # FM calculations
        out_1 = torch.matmul(x_transformed, self.V).pow(2).sum(1, keepdim=True) #S_1^2
        # Kept as an attribute because the original code exposes the last
        # encoded batch this way.
        self.x_transformed = x_transformed
        out_2 = torch.matmul(x_transformed.pow(2), self.V.pow(2)).sum(1, keepdim=True) # S_2
        out_inter = 0.5*(out_1 - out_2)
        out_lin = self.lin(x_transformed)
        out = out_inter + out_lin

        return out

In [58]:
# Training settings handed to BinaryClassifierModel.
EPOCHS = 100
BATCH_SIZE = 16

# Columns that TorchFM one-hot encodes.
categorical_vars = ['userId', 'movieId']

In [59]:
# The full feature frame (not just the training split) is passed so that the
# one-hot sizes are derived from the largest ids in the whole dataset.
# k is the number of latent factors (columns of V).
fm = TorchFM(categorical_vars, ratings.drop(['rating', 'title'], axis=1), k=20)

model = BinaryClassifierModel(
          model=fm,
          epochs=EPOCHS,
          batch_size=BATCH_SIZE,
)

In [None]:
# Train with the validation split as eval_set and an early-stopping window of
# 5 epochs; save_weights=True writes the weights and the loss history to the
# working directory after every epoch.
model.fit(
      X_train,
      y_train,
      eval_set=(X_val, y_val),
      early_stopping_rounds=5,
      save_weights=True
)

Train loss = 5.001:   1%|          | 1245/129969 [01:40<2:49:51, 12.63it/s]