# Spotlight Recommender Systems

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Reading the Data

In [2]:
ratings_data = pd.read_csv('./data/ratings_small.csv.zip')
metadata = pd.read_csv('./data/movies_metadata.csv.zip')
links_data = pd.read_csv('./data/links.csv')
ratings_data.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [3]:
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [5]:
links_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  45843 non-null  int64  
 1   imdbId   45843 non-null  int64  
 2   tmdbId   45624 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.0 MB


## Preprocessing the Data

In [6]:
ratings_data['userId'] = ratings_data['userId'].astype('int32')

In [7]:
metadata = metadata[metadata['imdb_id'].notna()]

In [8]:
def remove_characters(string):
    
    return ''.join(filter(str.isdigit, string))

In [9]:
metadata['imdb_id'] = metadata['imdb_id'].apply(lambda x: int(remove_characters(str(x))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
full_metadata = pd.merge(metadata, links_data, left_on='imdb_id', right_on='imdbId')
full_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45383 entries, 0 to 45382
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45383 non-null  object 
 1   belongs_to_collection  4486 non-null   object 
 2   budget                 45383 non-null  object 
 3   genres                 45383 non-null  object 
 4   homepage               7765 non-null   object 
 5   id                     45383 non-null  object 
 6   imdb_id                45383 non-null  int64  
 7   original_language      45372 non-null  object 
 8   original_title         45383 non-null  object 
 9   overview               44433 non-null  object 
 10  popularity             45380 non-null  object 
 11  poster_path            45005 non-null  object 
 12  production_companies   45380 non-null  object 
 13  production_countries   45380 non-null  object 
 14  release_date           45302 non-null  object 
 15  re

## Exploratory Data Analysis

In [11]:
from spotlight.interactions import Interactions

dataset = Interactions(user_ids=ratings_data['userId'].values,
                       item_ids=ratings_data['movieId'].values,
                       ratings=ratings_data['rating'].values,
                       timestamps=ratings_data['timestamp'].values)

## Training a Matrix Factorization Model

In [25]:
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=10)
model.fit(train, verbose=True)

rmse = rmse_score(model, test)
print('RMSE = ', rmse)

Epoch 0: loss 4.494929069874945
Epoch 1: loss 0.8425834600011973
Epoch 2: loss 0.5420750372064997
Epoch 3: loss 0.38652444562064103
Epoch 4: loss 0.30954678428190163
Epoch 5: loss 0.26690390673145314
Epoch 6: loss 0.24580617306721325
Epoch 7: loss 0.23303465699786075
Epoch 8: loss 0.2235499506040965
Epoch 9: loss 0.2163570392770579
RMSE = 1.1101374661355057


In [None]:
model.fit(dataset)

## Generating Predictions From the Matrix Factorization Model

In [13]:
model.predict(user_ids=1)

array([0.42891726, 2.2079964 , 1.6789076 , ..., 0.24747998, 0.36188596,
       1.658421  ], dtype=float32)

In [14]:
def get_metadata(movie_id, metadata):
    
    movie_data = metadata[metadata['movieId'] == movie_id]
    return movie_data[['original_title', 'release_date', 'genres']].to_dict(orient='records')

def recommend_movies(user_id, metadata, model, n_movies=5):
     
    pred = model.predict(user_ids=user_id)
    indices = np.argpartition(pred, -n_movies)[-n_movies:]
    best_movie_ids = indices[np.argsort(pred[indices])]
    
    return [get_metadata(movie_id + 1, metadata) for movie_id in best_movie_ids]
    

In [15]:
get_metadata(1, full_metadata)

[{'original_title': 'Toy Story',
  'release_date': '1995-10-30',
  'genres': "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"}]

In [16]:
recommend_movies(1, full_metadata, model)

[[{'original_title': 'Hustler White',
   'release_date': '1996-07-19',
   'genres': "[{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}]"}],
 [{'original_title': 'Cape Fear',
   'release_date': '1962-04-12',
   'genres': "[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}]"}],
 [{'original_title': 'A Face in the Crowd',
   'release_date': '1957-10-17',
   'genres': "[{'id': 18, 'name': 'Drama'}]"}],
 [],
 [{'original_title': 'Magnolia',
   'release_date': '1999-12-08',
   'genres': "[{'id': 18, 'name': 'Drama'}]"}]]

## Training a Sequential Model

In [17]:
from spotlight.sequence.implicit import ImplicitSequenceModel
from spotlight.evaluation import sequence_mrr_score
from spotlight.cross_validation import user_based_train_test_split

train, test = user_based_train_test_split(dataset)

train = train.to_sequence()
test = test.to_sequence()

model = ImplicitSequenceModel(n_iter=10,
                              representation='cnn',
                              loss='bpr')

model.fit(train, verbose=True)

mrr_score = sequence_mrr_score(model, test)
print(mrr_score)



Epoch 0: loss 0.23515926493752387
Epoch 1: loss 0.027926203284052112
Epoch 2: loss 0.015699874850050095
Epoch 3: loss 0.01316860222047375
Epoch 4: loss 0.012089601147078698
Epoch 5: loss 0.011510106464547495
Epoch 6: loss 0.011428503169407767
Epoch 7: loss 0.011324393653100537
Epoch 8: loss 0.010540511308898848
Epoch 9: loss 0.010737932812903197
[0.00100402 0.00277778 0.00194932 ... 0.00110619 0.0005305  0.00471698]


In [None]:
model.fit(dataset.to_sequence(), verbose=True)

## Generating Predictions from the Sequential Model

In [18]:
model.predict(sequences=np.array([1, 2, 3, 4, 5]))

array([ 0.      , 16.237215, 11.529311, ..., -2.713985, -2.403066,
       -3.747315], dtype=float32)

In [19]:
import difflib

def get_movie_id(movie_title, metadata):
    
    existing_titles = list(metadata['original_title'].values)
    closest_titles = difflib.get_close_matches(movie_title, existing_titles)
    movie_id = metadata[metadata['original_title'] == closest_titles[0]]['movieId'].values[0]
    return movie_id

def recommend_next_movies(movies, metadata, model, n_movies=5):
    
    movie_ids = [get_movie_id(movie, metadata) for movie in movies]
    pred = model.predict(sequences=np.array(movie_ids))
    indices = np.argpartition(pred, -n_movies)[-n_movies:]
    best_movie_ids = indices[np.argsort(pred[indices])]
    return [get_metadata(movie_id + 1, metadata) for movie_id in best_movie_ids]

In [24]:
movies = ['Shallow Grave', 'Twilight', 'Star Wars', 'Harry Potter']
recommend_next_movies(movies, full_metadata, model, n_movies=5)



[[{'original_title': 'Azúcar amarga',
   'release_date': '1996-02-10',
   'genres': "[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]"}],
 [{'original_title': 'The American President',
   'release_date': '1995-11-17',
   'genres': "[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]"}],
 [{'original_title': 'Jaws 2',
   'release_date': '1978-06-16',
   'genres': "[{'id': 27, 'name': 'Horror'}, {'id': 53, 'name': 'Thriller'}]"}],
 [{'original_title': 'Robin Hood',
   'release_date': '1973-11-08',
   'genres': "[{'id': 16, 'name': 'Animation'}, {'id': 10751, 'name': 'Family'}]"}],
 [{'original_title': 'Touch of Evil',
   'release_date': '1958-04-23',
   'genres': "[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}, {'id': 80, 'name': 'Crime'}]"}]]