<a href="https://colab.research.google.com/github/AMakarova/BotPlay/blob/main/Bot_Play_FM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment

In [42]:
from google.colab import drive
drive.mount('/content/drive')

%cd 'drive/MyDrive/The Movie Dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/MyDrive/The Movie Dataset'
/content/drive/MyDrive/The Movie Dataset


# Build Dataset

## Process ratings

In [43]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Optional
import random
import os.path
import ast

In [44]:
# Dataset files, given relative to the current working directory
RATINGS_PATH = "ratings.csv"
RECODED_RATINGS_PATH = "ratings_recoded.csv"  # cache checked by process_ratings
METADATA_PATH = "movies_metadata.csv"
LINKS_PATH = "links.csv"

# Default thresholds for subset_ratings
TOP_MOVIE_COUNT = 100  # number of most-rated movies to keep
MIN_RATINGS = 10  # minimum count of ratings >= 4 a user needs to be kept

In [45]:
def load_dataset(
    path: str = RATINGS_PATH,
    cols: Optional[list] = None,
    ):
  '''
  Read the CSV file at `path` into a DataFrame.

  If `cols` is a non-empty list, only those columns are returned (in the
  order given); otherwise the complete frame is returned.
  '''
  print(f"Loading {path} dataset...")
  frame = pd.read_csv(path, low_memory=False)
  return frame[cols] if cols else frame

In [46]:
def print_frequencies(ratings: pd.DataFrame):
  '''
  Report how many distinct users and distinct movies appear in `ratings`
  (read from its 'userId' and 'movieId' columns)
  '''
  n_users, n_movies = (len(ratings[col].unique()) for col in ('userId', 'movieId'))
  print(f"Dataset contains {n_users} unique users and {n_movies} unique movies")

In [47]:
def subset_ratings(
    ratings: pd.DataFrame,
    top_movie_count: int = TOP_MOVIE_COUNT,
    min_ratings: int = MIN_RATINGS
    ) -> pd.DataFrame:
  '''
  Narrow the ratings down in two steps:
  * keep only the `top_movie_count` movies with the most ratings
  * of what remains, keep only users with at least `min_ratings` ratings >= 4
  Prints the resulting user/movie counts and returns the filtered frame.
  '''
  print(f"Subsetting ratings dataset...")

  # Movies ordered from most to least rated
  counts_by_movie = ratings.groupby('movieId').count().sort_values('rating', ascending=False)
  most_rated_movies = counts_by_movie.index[:top_movie_count]
  popular = ratings[ratings['movieId'].isin(most_rated_movies)]

  # Per-user number of high (>= 4) ratings within the popular movies
  high_ratings = popular[popular['rating'] >= 4]
  high_counts_by_user = high_ratings.groupby('userId').count()['rating']
  qualifying_users = high_counts_by_user[high_counts_by_user >= min_ratings].index

  subset = popular[popular['userId'].isin(qualifying_users)]
  print_frequencies(subset)
  return subset

In [48]:
def binarise_dataset(ratings: pd.DataFrame, max_users: Optional[int] = 50000) -> pd.DataFrame:
  '''
  Modify the dataset by:
  * recoding movies with rating >=4 as positive class (rating = 1)
  * recoding movies with rating <4 as negative class (rating = 0)
  * sampling additional movies for the negative class from the set of unrated movies

  For every user with fewer negatives than positives, the shortfall is filled
  with movies drawn (via `random.sample`) from the movies present in `ratings`
  that this user has not rated. If the user has fewer unrated movies than the
  shortfall, all of them are used and the user stays slightly unbalanced.

  Parameters:
    ratings: frame with 'userId', 'movieId' and 'rating' columns
    max_users: process only the first `max_users` users in order of first
      appearance; None processes everyone. Defaults to 50000, the limit
      previously hard-coded because the free runtime seems to disconnect
      beyond that point.

  Returns:
    A new frame with binary 'rating' values and a fresh 0..n-1 index.
  '''
  print(f"Binarising ratings dataset and balancing classes.")
  # Sampling pool is taken from the full input, independent of max_users
  all_movies = ratings['movieId'].dropna().unique().tolist()

  user_ids = ratings['userId'].unique()
  if max_users is not None:
    user_ids = user_ids[:max_users]
  selected = ratings[ratings['userId'].isin(user_ids)]

  # Collect per-user pieces and concatenate once at the end; growing a frame
  # with pd.concat inside the loop is quadratic in the number of users.
  frames = []
  # sort=False keeps users in order of first appearance
  grouped = selected.groupby('userId', sort=False)
  for u, user_subset in tqdm(grouped, total=grouped.ngroups):
    # .assign returns a copy, so the caller's frame is never written to
    positive_class = user_subset[user_subset['rating'] >= 4].assign(rating=1)
    negative_class = user_subset[user_subset['rating'] < 4].assign(rating=0)
    frames.append(positive_class)
    frames.append(negative_class)

    len_diff = positive_class.shape[0] - negative_class.shape[0]
    if len_diff > 0:
      # Membership must be tested against the movie ids themselves:
      # `m in series` checks the Series index, not its values.
      rated_movies = set(user_subset['movieId'].tolist())
      movie_pool = [m for m in all_movies if m not in rated_movies]
      # random.sample raises ValueError when asked for more than the pool holds
      sample_size = min(len_diff, len(movie_pool))
      if sample_size > 0:
        frames.append(pd.DataFrame({
            'userId': u,
            'movieId': random.sample(movie_pool, sample_size),
            'rating': 0,
        }))

  if frames:
    recoded_ratings = pd.concat(frames, ignore_index=True)
  else:
    # No users to process: return an empty frame with the input's columns
    recoded_ratings = ratings.iloc[0:0].copy()
  print_frequencies(recoded_ratings)
  return recoded_ratings

In [49]:
def process_ratings(use_cache: bool = True) -> pd.DataFrame:
  '''
  Produce the binarised ratings dataset.

  If `use_cache` is true and RECODED_RATINGS_PATH exists, the previously
  processed ratings are loaded from it. Otherwise the raw ratings at
  RATINGS_PATH are loaded, subset and binarised, and the result is written
  to RECODED_RATINGS_PATH for later runs.

  Returns:
    DataFrame with 'userId', 'movieId' and 'rating' columns.
  '''
  rating_cols = ['userId', 'movieId', 'rating']
  if use_cache and os.path.exists(RECODED_RATINGS_PATH):
    return load_dataset(RECODED_RATINGS_PATH, cols=rating_cols)

  ratings = load_dataset(path=RATINGS_PATH, cols=rating_cols)
  print_frequencies(ratings)
  ratings = subset_ratings(ratings)
  ratings = binarise_dataset(ratings)
  # Write to the same constant the cache check reads, so the two cannot drift
  # apart; index=False avoids storing the row index as an extra unnamed column.
  ratings.to_csv(RECODED_RATINGS_PATH, index=False)
  return ratings

In [50]:
def parse_genres(metadata: pd.DataFrame, max_genres: int = 20) -> pd.DataFrame:
  '''
  Reformat genre data in columnar format.

  Each value of the 'genres' column is a string holding a Python-literal list
  of dicts with a 'name' key. The column is replaced by one 0/1 indicator
  column per genre name.

  Parameters:
    metadata: frame with a 'genres' column
    max_genres: only the first `max_genres` distinct names, in order of first
      appearance, become columns; later names are treated as erroneous and
      ignored. Defaults to 20.

  Returns:
    A new frame (the input is not modified) with the same index, the
    'genres' column removed and the indicator columns appended.
  '''
  # Parse every row once; the parsed names are reused for both passes below
  parsed_genres = [
      [genre['name'] for genre in ast.literal_eval(line)]
      for line in tqdm(metadata['genres'])
  ]

  # Map genre name -> column position, capped at max_genres
  genre_columns = {}
  for names in parsed_genres:
    for name in names:
      if name not in genre_columns and len(genre_columns) < max_genres:
        genre_columns[name] = len(genre_columns)

  genre_matrix = np.zeros([metadata.shape[0], len(genre_columns)], dtype=int)
  for row_index, names in enumerate(parsed_genres):
    for name in names:
      genre_index = genre_columns.get(name)
      if genre_index is not None:
        genre_matrix[row_index, genre_index] = 1

  # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
  # default RangeIndex here would misalign rows of a filtered/re-indexed frame.
  genre_matrix = pd.DataFrame(genre_matrix, columns=list(genre_columns), index=metadata.index)
  return pd.concat([metadata.drop(columns='genres'), genre_matrix], axis=1)

In [51]:
def merge_in_metadata(ratings: pd.DataFrame) -> pd.DataFrame:
  '''
  Attach movie title and genre indicator columns to each rating.

  Metadata rows are keyed by 'id', while ratings use 'movieId'; the links
  file maps 'movieId' to 'tmdbId', which is joined against the metadata 'id'.
  Ratings whose movie has no metadata are kept, with missing values filled
  with 0.

  Parameters:
    ratings: frame with a 'movieId' column

  Returns:
    `ratings` left-joined with the title and genre columns.
  '''
  metadata = load_dataset(path=METADATA_PATH, cols=['id', 'title', 'genres'])
  # Ids that cannot be parsed as numbers become NaN
  metadata = metadata.assign(id=pd.to_numeric(metadata['id'], errors='coerce'))
  metadata = parse_genres(metadata)
  links = load_dataset(path=LINKS_PATH, cols=['movieId', 'tmdbId'])
  links = links.rename(columns={'tmdbId': 'id'})

  # pandas merges match NaN keys with each other, so rows with a missing key
  # on either side must be removed or they would be linked to one another.
  metadata = metadata.dropna(subset=['id'])
  links = links.dropna(subset=['id'])
  metadata = metadata.merge(links, how='inner', on='id')
  # Keep one metadata row per movieId; duplicates would otherwise multiply
  # the rating rows in the left merge below.
  metadata = metadata.drop_duplicates(subset='movieId')

  return ratings.merge(metadata.drop(columns=['id']), how='left', on='movieId').fillna(0)

In [53]:
# Build the ratings table: processed ratings joined with movie title and genre columns
ratings = merge_in_metadata(process_ratings())

Loading ratings_recoded.csv dataset...
Loading movies_metadata.csv dataset...


100%|██████████| 45466/45466 [00:01<00:00, 23883.00it/s]


Loading links.csv dataset...
