<a href="https://colab.research.google.com/github/AMakarova/BotPlay/blob/main/Bot_Play_FM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd 'drive/MyDrive/The Movie Dataset'

Mounted at /content/drive
/content/drive/MyDrive/The Movie Dataset


# Build Dataset

## Process ratings

In [None]:
# import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Optional
from joblib import Memory
import random

In [None]:
# CSV file paths, relative to the current working directory.
RATINGS_PATH = "ratings.csv"
METADATA_PATH = "movies_metadata.csv"

# Defaults for subset_ratings(): how many of the most-rated movies to keep, and
# the minimum number of ratings >= 4 a user needs among those movies to be kept.
TOP_MOVIE_COUNT = 100
MIN_RATINGS = 5

# joblib Memory object rooted at the relative 'cache' directory.
# NOTE(review): presumably used to cache expensive steps later in the notebook — confirm.
cachedir = 'cache'
memory = Memory(cachedir, verbose=0)

In [None]:
def load_dataset(
    path: str = RATINGS_PATH,
    cols: Optional[list] = None,
    ):
  '''
  Reads the CSV file at `path` into a DataFrame.

  When `cols` is a non-empty list, only those columns are returned, in the
  order given. When `cols` is None or empty, every column is returned.
  '''
  frame = pd.read_csv(path, low_memory=False)
  # Guard clause: no column selection requested, hand back the full frame.
  if not cols:
    return frame
  return frame[cols]

In [None]:
def print_frequencies(ratings: pd.DataFrame):
  '''
  Reports how many distinct users and movies appear in `ratings`.

  Reads the 'userId' and 'movieId' columns and prints a one-line summary;
  nothing is returned.
  '''
  # dropna=False so a missing value counts as one distinct value.
  n_users, n_movies = (
      ratings[column].nunique(dropna=False) for column in ('userId', 'movieId')
  )
  print(f"Dataset contains {n_users} unique users and {n_movies} unique movies")

In [None]:
def subset_ratings(
    ratings: pd.DataFrame,
    top_movie_count: int = TOP_MOVIE_COUNT,
    min_ratings: int = MIN_RATINGS,
    min_liked_rating: float = 4,
    ) -> pd.DataFrame:
  '''
  Restricts ratings to the most-rated movies and to users who liked enough of them.

  Two filters are applied in order:
    1. Keep only rows whose movie is among the `top_movie_count` movies with
       the largest number of ratings.
    2. Among those rows, keep only users who gave at least `min_ratings`
       ratings with a value >= `min_liked_rating`.

  All rows of a surviving user for the kept movies are returned, including
  ratings below `min_liked_rating`.

  Args:
    ratings: DataFrame with 'userId', 'movieId' and 'rating' columns.
    top_movie_count: number of most-rated movies to keep.
    min_ratings: minimum count of high ratings a user needs to be kept.
    min_liked_rating: rating value at or above which a rating counts towards
      `min_ratings`. Defaults to 4, the previously hard-coded threshold.

  Returns:
    A filtered DataFrame; the input frame is not modified.
  '''
  # Only the 'rating' column needs counting. It is kept as a one-column frame so
  # the sort (and therefore the order of ties at the cutoff) is unchanged.
  movie_frequencies = (
      ratings.groupby('movieId')[['rating']]
      .count()
      .sort_values('rating', ascending=False)
  )
  top_movie_ids = movie_frequencies.index[:top_movie_count]
  top_movie_ratings = ratings[ratings['movieId'].isin(top_movie_ids)]

  # Count, per user, the ratings at or above the threshold within the kept movies.
  liked = top_movie_ratings[top_movie_ratings['rating'] >= min_liked_rating]
  liked_counts = liked.groupby('userId')['rating'].count()
  kept_user_ids = liked_counts[liked_counts >= min_ratings].index

  return top_movie_ratings[top_movie_ratings['userId'].isin(kept_user_ids)]

In [None]:
# Load only the three columns that print_frequencies() and subset_ratings() read,
# then report user/movie counts before and after subsetting.
ratings = load_dataset(path=RATINGS_PATH, cols=['userId', 'movieId', 'rating'])
print_frequencies(ratings)
# `ratings` is rebound to the filtered subset here; the unfiltered frame is no
# longer reachable under this name after this line.
ratings = subset_ratings(ratings)
print_frequencies(ratings)

Dataset contains 270896 unique users and 45115 unique movies
Dataset contains 147208 unique users and 100 unique movies
