In [None]:
import re
import os
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings("ignore")

## Loading Datasets

In [None]:
# Root folder that holds the datasets; the CSV paths below are joined onto it.
# NOTE(review): looks like a Colab Google Drive mount — adjust when running elsewhere.
drive_dir = '/content/drive/MyDrive/ML Datasets/'

In [None]:
# MovieLens "latest small" dataset, obtained from
# http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
ratings_path = os.path.join(drive_dir, 'ml-latest-small/ratings.csv')
movies_path = os.path.join(drive_dir, 'ml-latest-small/movies.csv')

# one row per (user, movie) rating
ratings_df = pd.read_csv(ratings_path)
# one row per movie: id, title and genres
movies_df = pd.read_csv(movies_path)

In [None]:
# report (rows, columns) for each loaded table
for label, frame in (("Ratings: ", ratings_df), ("Movies: ", movies_df)):
  print(label, frame.shape)

Ratings:  (100836, 4)
Movies:  (9742, 3)


In [None]:
# preview the first 10 rows of the ratings table
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [None]:
# preview the first 10 rows of the movies table
movies_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
# count missing values per column in the ratings table
ratings_df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [None]:
# count missing values per column in the movies table
movies_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

## Preparing Data

In [None]:
# convert movieId to str type; the IDs are later used as the tokens of the
# sequences passed to Word2Vec and are looked up in the model as strings
ratings_df['movieId']= ratings_df['movieId'].astype(str)

In [None]:
# distinct user IDs as a plain Python list
users = ratings_df['userId'].unique().tolist()
# number of distinct users
len(users)

610

Splitting the ratings by user into training and validation sets

In [None]:
# Shuffle the userIds with a fixed seed so the train/validation split is
# reproducible after a kernel restart. Sorting first makes the cell idempotent:
# `users` is still reordered in place, but re-running the cell always produces
# the same permutation instead of reshuffling an already shuffled list.
SPLIT_SEED = 14
users.sort()
random.Random(SPLIT_SEED).shuffle(users)

# keep the first 90% of the shuffled userIds for training
n_train_users = round(0.9 * len(users))
users_train = users[:n_train_users]

# split the ratings by user, so all ratings of one user land in the same split
# (only the ratings table is split; movies_df is left untouched)
is_train_user = ratings_df['userId'].isin(users_train)
ratings_train = ratings_df[is_train_user]
ratings_val = ratings_df[~is_train_user]
print("Ratings:", ratings_train.shape)

Ratings: (90266, 4)


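Create the sequence of movies watched by each user for both the train and val sets. The training sequences keep only movies with rating >= 4; the validation sequences keep every rated movie.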

In [None]:
# Build one movieId sequence per user. The ratings are grouped once per split
# instead of re-scanning the whole frame with a boolean mask for every user;
# the resulting lists are the same, with movieIds kept in row order.

# training: only movies the user rated >= 4
liked_train = ratings_train[ratings_train['rating'] >= 4]
liked_by_user = liked_train.groupby('userId')['movieId'].apply(list).to_dict()

# list to capture movies watched by users for training, ordered like
# users_train; a user with no rating >= 4 still contributes an empty sequence
watched_train = []
for user_id in tqdm(users_train, position=0, leave=True):
  # Open question kept from the original: should each sequence be shuffled to
  # separate movies from those around similar release years? Left unshuffled.
  watched_train.append(liked_by_user.get(user_id, []))

# validation: every movie the user rated, with no rating threshold
rated_by_user = ratings_val.groupby('userId')['movieId'].apply(list).to_dict()

# list to capture movies watched by users for validation
watched_val = []
for user_id in tqdm(ratings_val['userId'].unique(), position=0, leave=True):
  watched_val.append(rated_by_user[user_id])

100%|██████████| 549/549 [00:00<00:00, 586.24it/s]
100%|██████████| 61/61 [00:00<00:00, 1286.98it/s]


## Building Word2vec Embeddings

In [None]:
# train word2vec model on the per-user movie sequences
# NOTE(review): a seed alone may not make training deterministic when several
# worker threads are used — confirm against the gensim documentation.
w2v_params = dict(
    iter=5,
    min_count=5,
    sg=1,             # using skip-gram so 1
    hs=0,             # using negative sampling
    negative=5,       # for negative sampling
    alpha=0.03,
    min_alpha=0.0007,
    seed=14,
    window=9999999,   # large window size as used in item2vec
)
model = Word2Vec(sentences=watched_train, **w2v_params)

In [None]:
# Since we are not planning to train the model any further, we call
# init_sims(replace=True) here, which makes the model much more memory-efficient.
# NOTE(review): presumably this keeps only the normalised vectors, and init_sims
# is deprecated in newer gensim releases — confirm for the installed version.
model.init_sims(replace=True)

In [None]:
# show the summary of the trained model
print(model)

Word2Vec(vocab=1816, size=100, alpha=0.03)


## Start Recommendation

In [None]:
# creating a dict that maps each movieId (str) seen in the training ratings to
# [full title, lower-cased short name],
# e.g. '1' -> ['Toy Story (1995)', 'toy story']

# Build the movieId -> title lookup once instead of filtering movies_df for
# every movie; setdefault keeps the first title should a movieId ever repeat.
title_by_id = {}
for movie_id, title in zip(movies_df['movieId'], movies_df['title']):
  title_by_id.setdefault(movie_id, title)

reference = ratings_train['movieId'].unique().tolist()
reference_dict = defaultdict(list)
for i in tqdm(reference, position=0, leave=True):
  # ratings movieIds were cast to str while movies_df keeps integers;
  # an ID missing from movies_df raises KeyError here
  movie_title = title_by_id[int(i)]
  # short name = text before the first '(' (drops the year and anything after it)
  movie_name = movie_title.split('(')[0].strip().lower()
  if not movie_name:
    # titles that start with '(' (e.g. "(500) Days of Summer (2009)") would
    # otherwise get an empty, unsearchable name; strip only the trailing year
    movie_name = re.sub(r'\s*\(\d{4}\)\s*$', '', movie_title).strip().lower()
  reference_dict[i] = [movie_title, movie_name]

100%|██████████| 9035/9035 [00:05<00:00, 1688.00it/s]


In [None]:
def get_movie_id(d, val):
  """Return the first movie ID in `d` whose short name equals `val`.

  Args:
    d: mapping of movie ID -> [full title, short name].
    val: short movie name to look for (compared exactly).

  Returns:
    The key of the first matching entry, or None when no entry has that name.
  """
  for k, v in d.items():
    # Skip entries without a name slot, e.g. the empty list a defaultdict(list)
    # inserts when it is indexed with a missing key, instead of raising
    # IndexError on v[1].
    if len(v) > 1 and v[1] == val:
      return k
  return None

In [None]:
# testing the dict
movie_name = 'toy story'
# .get() is used instead of indexing: reference_dict is a defaultdict, so
# indexing it with the None returned for an unknown name would insert a stray
# empty entry under the key None
reference_dict.get(get_movie_id(reference_dict, movie_name))

['Toy Story (1995)', 'toy story']

In [None]:
def similar_products(val, n = 10):
  """Return the `n` movies most similar to the movie named `val`.

  Args:
    val: short movie name as stored in reference_dict (e.g. 'iron man').
    n: number of similar movies to return.

  Returns:
    A list of (full title, similarity score) tuples, in the order produced by
    most_similar_cosmul.

  Raises:
    KeyError: if `val` is not a known movie name, or the movie has no vector
      in the model.
  """
  movie_id = get_movie_id(reference_dict, val)
  if movie_id is None:
    # previously str(None) was looked up in the model, which failed with a
    # confusing error about the key 'None'
    raise KeyError("unknown movie name: {!r}".format(val))
  movie_id = str(movie_id)

  # get vector for input movie name; go through model.wv, like the similarity
  # query below, rather than indexing the model object directly
  try:
    v = model.wv[movie_id]
  except KeyError:
    raise KeyError(
        "movie {!r} (id {}) has no vector in the model".format(val, movie_id)) from None

  # extract most similar products for the input vector; one extra result is
  # requested because the query movie itself comes back as a match
  ms = model.wv.most_similar_cosmul(positive=[v], topn= n+1)

  # drop the query movie by ID rather than by position, then pair each
  # remaining movie's full title with its similarity score
  new_ms = [(reference_dict[key][0], score) for key, score in ms if key != movie_id]

  return new_ms[:n]

In [None]:
# example query: the 10 movies most similar to 'iron man'
similar_products('iron man')

[('Avengers, The (2012)', 0.8827745318412781),
 ('300 (2007)', 0.8811231851577759),
 ('Avatar (2009)', 0.8782851099967957),
 ('Casino Royale (2006)', 0.8634577393531799),
 ('In Bruges (2008)', 0.861535906791687),
 ('Toy Story 3 (2010)', 0.8549548983573914),
 ('Bourne Identity, The (2002)', 0.8530797958374023),
 ('V for Vendetta (2006)', 0.853043258190155),
 ('Sin City (2005)', 0.848569393157959),
 ('Bourne Ultimatum, The (2007)', 0.8467860221862793)]