## Summaries

1. Non-personalized
2. Content based
3. Collaborative filtering
4. Hybrid

Conclusion:



In [1]:
import os
import json
from time import time
from ast import literal_eval

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, KNNBasic, evaluate
from surprise.model_selection import cross_validate, train_test_split
import warnings; warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import re
%matplotlib inline
import seaborn as sns

## Added functions

In [2]:
def make_keyword(string):
    return re.sub('[^a-z0-9]+', '', string.lower())

def print_soup(df, title):
    print('Soup for "{}": {}'.format(title, df[df['title'] == title]['soup'].values[0]))
    
def print_description(df, title):
    print('Description for "{}": {}'.format(title, df[df['title'] == title]['description'].values[0]))

In [3]:
# load data
meta_df = pd.read_csv('movies_metadata.csv')
# parse genre feature
meta_df['genres'] = meta_df['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
# parse date
meta_df['year'] = pd.to_datetime(meta_df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
# stack genre and add it to dataframe again
stacked_genre_df = meta_df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
stacked_genre_df.name = 'genre'
stacked_genre_df = meta_df.drop('genres', axis=1).join(stacked_genre_df)
stacked_genre_df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Family


## Non-personalized recommendations

__ToDo__: Implement non-personalized recommendations which will return top 10 movies for a genre.
Come up with specific average ratio, and use it to rank videos.
(Use video_count, video_average features from meta_df dataframe)


IMDB weighted rating is used:

Weighted Rating = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$

where,
* *v* is the number of votes for the movie
* *m* is the minimum votes required to be listed in the chart
* *R* is the average rating of the movie
* *C* is the mean vote across the whole report

In [4]:
def get_weighted_rating(v, m, R, C):
    return (v / (v + m) * R) + (m / (v + m) * C)

def get_genre_nonpersonalized_recommendations(df, genre, percentile=0.85):
    genre_df = df[df['genre'] == genre].copy()
    C = genre_df['vote_average'].mean()
    m = genre_df['vote_count'].quantile(percentile)
    genre_df = genre_df[genre_df['vote_count'] > m]
    genre_df['weighted_rating'] = genre_df.apply(
        lambda x: get_weighted_rating(x['vote_count'], m, x['vote_average'], C), axis=1)
    return genre_df.nlargest(10, 'weighted_rating')[['title', 'year']]

In [5]:
get_genre_nonpersonalized_recommendations(stacked_genre_df, 'Comedy')

Unnamed: 0,title,year
10309,Dilwale Dulhania Le Jayenge,1995
2211,Life Is Beautiful,1997
351,Forrest Gump,1994
18465,The Intouchables,2011
1225,Back to the Future,1985
22841,The Grand Budapest Hotel,2014
22131,The Wolf of Wall Street,2013
30315,Inside Out,2015
40882,La La Land,2016
732,Dr. Strangelove or: How I Learned to Stop Worr...,1964


In [6]:
get_genre_nonpersonalized_recommendations(stacked_genre_df, 'Animation')

Unnamed: 0,title,year
5481,Spirited Away,2001
40251,Your Name.,2016
9698,Howl's Moving Castle,2004
2884,Princess Mononoke,1997
359,The Lion King,1994
30315,Inside Out,2015
5553,Grave of the Fireflies,1988
5833,My Neighbor Totoro,1988
13724,Up,2009
12704,WALL·E,2008


In [7]:
get_genre_nonpersonalized_recommendations(stacked_genre_df, 'Family')

Unnamed: 0,title,year
5481,Spirited Away,2001
1225,Back to the Future,1985
359,The Lion King,1994
30315,Inside Out,2015
17437,Harry Potter and the Deathly Hallows: Part 2,2011
13724,Up,2009
12704,WALL·E,2008
24455,Big Hero 6,2014
5833,My Neighbor Totoro,1988
7725,Harry Potter and the Prisoner of Azkaban,2004


## Item-item content based recommendations

__ToDo__: implement functions to perform item-item description based recommendations

In [8]:
# load ID from smaller set
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
# drop rows with broken ID values
meta_df = meta_df.drop([19730, 29503, 35587])
# parse movie ID to int
meta_df['id'] = meta_df['id'].astype('int')
# create small dataframe
small_meta_df = meta_df[meta_df['id'].isin(links_small)]
small_meta_df.shape

(9099, 25)

In [9]:
# create descriptions
small_meta_df['tagline'] = small_meta_df['tagline'].fillna('')
small_meta_df['description'] = small_meta_df['overview'] + small_meta_df['tagline']
small_meta_df['description'] = small_meta_df['description'].fillna('')
small_meta_df['description'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: description, dtype: object

In [10]:
def create_cosine_matrix(df):
    tfidf = TfidfVectorizer()
    descriptions = tfidf.fit_transform(df['description']).todense()
    
    return linear_kernel(descriptions, descriptions)

def get_item_content_recommendations(df, cosine_sim, title, top_n=None):
    if top_n == None:
        top_n = len(cosine_sim)
    top_n = min(top_n,len(cosine_sim))

    np.fill_diagonal(cosine_sim, 0)
    index = np.where((df['title'] == title).values)
    similarity_indexes_sorted_asc = np.argsort(np.squeeze(cosine_sim[index,:]))
    
    top_indexes_desc = np.flip(similarity_indexes_sorted_asc[-top_n:], axis=0)
     
    return df.iloc[top_indexes_desc]['title']

In [11]:
cosine_matrix = create_cosine_matrix(small_meta_df)

In [12]:
recommendations = get_item_content_recommendations(small_meta_df, cosine_matrix, 'Toy Story', top_n=20)
recommendations

15348                    Toy Story 3
2997                     Toy Story 2
10301         The 40 Year Old Virgin
3057                 Man on the Moon
1071           Rebel Without a Cause
6435          What's Up, Tiger Lily?
11606                   Factory Girl
11399         For Your Consideration
1199                       Manhattan
1932                       Condorman
448                For Love or Money
6944                Rivers and Tides
7254                  Africa Screams
485                           Malice
10585                    Match Point
9963                   Life Is Sweet
2157               Indecent Proposal
3756     The Ballad of Ramblin' Jack
21190     Woody Allen: A Documentary
2635                      Radio Days
Name: title, dtype: object

#### Recommendation for "Toy Story" does not exactly match what was expected. It is mainly used name Andy to get similar movies.
#### Which resulted in suggestion "The 40 Year Old Virgin" which is unappropriate

In [13]:
print_description(small_meta_df, 'Toy Story')
for recommendation_title in recommendations.values[:5]:
    print_description(small_meta_df, recommendation_title)

Description for "Toy Story": Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
Description for "Toy Story 3": Woody, Buzz, and the rest of Andy's toys haven't been played with in years. With Andy about to go to college, the gang find themselves accidentally left at a nefarious day care center. The toys must band together to escape and return home to Andy.No toy gets left behind.
Description for "Toy Story 2": Andy heads off to Cowboy Camp, leaving his toys to their own devices. Things shift into high gear when an obsessive toy collector named Al McWhiggen, owner of Al's Toy Barn kidnaps Woody. Andy's toys mount a daring rescue mission, Buzz Lightyear meets his match and Woody has to decide where he and his heart truly belong.The toys ar

In [14]:
recommendations = get_item_content_recommendations(small_meta_df, cosine_matrix, 'Africa Screams', top_n=20)
recommendations

565         Wide Eyed and Legless
3500            The Twelve Chairs
22676                Wonder Woman
3791                    Girlfight
6178                      The Wiz
0                       Toy Story
3371         The Son of the Sheik
7341             In This Our Life
6159                        Holes
1841     In the Heat of the Night
2516                     Besieged
12449               The Love Guru
22497             The Other Shore
8259                  The Bellboy
920                My Man Godfrey
39       Cry, the Beloved Country
1004        The Fox and the Hound
5831                    My Girl 2
907           Father of the Bride
20619                    The Call
Name: title, dtype: object

In [15]:
print_description(small_meta_df, 'Africa Screams')
for recommendation_title in recommendations.values[:5]:
    print_description(small_meta_df, recommendation_title)

Description for "Africa Screams": When bookseller Buzz cons Diana into thinking that his friend Stanley knows all there is to know about Africa, they are abducted and ordered to lead Diana and her henchmen to an African tribe in search of a fortune in jewels.A Zany, Hilarious Romp!
Description for "Wide Eyed and Legless": Diana and Deric have an ideal marriage: they thrive in each other's company, they're funny, and they enjoy their two grown children and Deric's dotty mother; the trouble is, Diana can no longer walk and her malady defies medical diagnosis. To care for Diana, Deric is letting his business slide, but at a civic luncheon, he is seated next to Aileen Armitage, a novelist who is blind. They have a nice time, and on the sly, Diana contacts Aileen to made an odd request. Diana's declining health and her resolve bring this triangle of unlikely friends to a surprising place.
Description for "The Twelve Chairs": A treasure hunt. An aging ex-nobleman of the Czarist regime has fi

__ToDo__: implement functions to perform item-item keywords based recommendations

In [16]:
# load credits and keywords data
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
# parse ID
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
meta_df['id'] = meta_df['id'].astype('int')
# merge existing dataframe with credits and keywords
meta_df = meta_df.merge(credits, on='id')
meta_df = meta_df.merge(keywords, on='id')
# take only small subset
small_meta_df = meta_df[meta_df['id'].isin(links_small)]
small_meta_df.shape

(9219, 28)

### <font color='green'>Used a proper keyword-making function so that they are consistent (for cast, director, etc.)</font>

In [17]:
# convert parse to json and keep top 3 from cast
small_meta_df['cast'] = small_meta_df['cast'].apply(literal_eval)
small_meta_df['cast'] = small_meta_df['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
small_meta_df['cast'] = small_meta_df['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

# join cast name and surname
small_meta_df['cast'] = small_meta_df['cast'].apply(lambda cast: [make_keyword(x) for x in cast])

# parse crew
small_meta_df['crew'] = small_meta_df['crew'].apply(literal_eval)

# measure cast and crew sizes
small_meta_df['cast_size'] = small_meta_df['cast'].apply(lambda x: len(x))
small_meta_df['crew_size'] = small_meta_df['crew'].apply(lambda x: len(x))

### <font color='green'>Fixed a bug</font> 
In get director() -- in list comprehension director of the first movie was always taken

In [18]:
# find director
def get_director(crew):
    names = [x['name'] for x in crew if x['job']=='Director']
    return np.nan if not names else names[0]

small_meta_df['director'] = small_meta_df['crew'].apply(get_director)

small_meta_df['director'] = small_meta_df['director'].astype('str').apply(make_keyword)
small_meta_df['director'] = small_meta_df['director'].apply(lambda x: [x, x, x])

### <font color='green'>Fixed the bugs:
1. In filtering keywords -- intersection was done on words which were a result of value_counts(). Should be: words = words.index.values
2. Stemmer was never initialized (it was done after the usage actually) and it did not crash because keywords were
always empty ([stemmer.stem(i) for i in x])</font> 

In [19]:
def filter_keywords(x):
    return list(set(x).intersection(words))

small_meta_df['keywords'] = small_meta_df['keywords'].apply(literal_eval)
small_meta_df['keywords'] = small_meta_df['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# keep only frequent words
words = small_meta_df.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
words.name = 'keyword'
words = words.value_counts()
words = words[words > 1]
words = words.index.values

# create stemmer
stemmer = SnowballStemmer('english')

# filter keywords
small_meta_df['keywords'] = small_meta_df['keywords'].apply(filter_keywords)
small_meta_df['keywords'] = small_meta_df['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
small_meta_df['keywords'] = small_meta_df['keywords'].apply(lambda x: [make_keyword(i) for i in x])

In [20]:
small_meta_df['soup'] = small_meta_df['keywords'] + small_meta_df['cast'] + small_meta_df['director'] + small_meta_df['genres']
small_meta_df['soup'] = small_meta_df['soup'].apply(lambda x: ' '.join(x))

In [21]:
def create_cosine_matrix_for_words(df):
    # use CountVectorizer and cosine_similarity
    stemmed = df['soup'].apply(lambda sentence: ' '.join([stemmer.stem(word) for word in (sentence).split()]))
    vectorizer = CountVectorizer()
    count_vectorized = vectorizer.fit_transform(stemmed)
    cosine_matrix = cosine_similarity(count_vectorized, count_vectorized)
    return cosine_matrix

In [22]:
cosine_matrix = create_cosine_matrix_for_words(small_meta_df)

In [23]:
recommendations = get_item_content_recommendations(small_meta_df, cosine_matrix, 'Toy Story', top_n=20)
recommendations

3024                          Toy Story 2
10754                            Luxo Jr.
17551                              Cars 2
2262                         A Bug's Life
11074                                Cars
15519                         Toy Story 3
22126                Toy Story of Terror!
5394                      Stuart Little 2
3336                    Creature Comforts
4797                       Monsters, Inc.
5303     Spirit: Stallion of the Cimarron
22915                      The Lego Movie
1034                   That Thing You Do!
11836                  Meet the Robinsons
17484                        Larry Crowne
9883                   Once Upon a Forest
589                             Pinocchio
30525                          Inside Out
11553                     Charlotte's Web
21580                        The Smurfs 2
Name: title, dtype: object

### Here we have much better suggestion of kids movies. All thanks to 'soup'!

In [25]:
print_soup(small_meta_df, 'Toy Story')
for recommendation_title in recommendations.values[:5]:
    print_soup(small_meta_df, recommendation_title)

Soup for "Toy Story": boynextdoor rivalri newtoy friend friendship toy toycomestolif boy jealousi tomhanks timallen donrickles johnlasseter johnlasseter johnlasseter Animation Comedy Family
Soup for "Toy Story 2": museum rescueteam prosecut teamwork identitycrisi personif garagesal airplan friendship duringcreditssting toycomestolif collector inanimateobjectscomingtolif tomhanks timallen joancusack johnlasseter johnlasseter johnlasseter Animation Comedy Family
Soup for "Luxo Jr.": short johnlasseter johnlasseter johnlasseter Animation
Soup for "Cars 2": sequel comedi bestfriend duringcreditssting anthropomorph carrac owenwilson larrythecableguy michaelcaine johnlasseter johnlasseter johnlasseter Animation Family Adventure Comedy
Soup for "A Bug's Life": invent grass fight anthil duringcreditssting kidsandfamili collector ant winter kevinspacey julialouisdreyfus haydenpanettiere johnlasseter johnlasseter johnlasseter Adventure Animation Comedy Family
Soup for "Cars": carjourney ruralset

In [26]:
recommendations = get_item_content_recommendations(small_meta_df, cosine_matrix, 'Africa Screams', top_n=20)
recommendations

3831          Abbott and Costello Meet Frankenstein
1008                                 The Shaggy Dog
4678             Abbott and Costello Meet the Mummy
9694     Abbott and Costello Meet the Invisible Man
2708                                     Funny Farm
6330                   Animals Are Beautiful People
1145                                Mina Tannenbaum
4964                                       Blankman
14705               Did You Hear About the Morgans?
23617                                       Blended
28885                                   The Cobbler
20517                                      Movie 43
7528                          School for Scoundrels
9360                                   A Dog's Will
6582                               The Pink Panther
1072                                 The Great Race
18                   Ace Ventura: When Nature Calls
5341                                        Caveman
4616                                        Hatari!
5404        

In [27]:
print_soup(small_meta_df, 'Africa Screams')
for recommendation_title in recommendations.values[:5]:
    print_soup(small_meta_df, recommendation_title)

Soup for "Africa Screams": africa slapstick budabbott loucostello clydebeatty charlesbarton charlesbarton charlesbarton Comedy
Soup for "Abbott and Costello Meet Frankenstein": dracula frankenstein horrorspoof budabbott loucostello lonchaneyjr charlesbarton charlesbarton charlesbarton Comedy Horror
Soup for "The Shaggy Dog": dog magic fredmacmurray jeanhagen tommykirk charlesbarton charlesbarton charlesbarton Comedy Family
Soup for "Abbott and Costello Meet the Mummy": mummi slapstick budabbott loucostello mariewindsor charleslamont charleslamont charleslamont Comedy
Soup for "Abbott and Costello Meet the Invisible Man": invis horrorspoof budabbott loucostello nancyguild charleslamont charleslamont charleslamont Comedy Horror
Soup for "Funny Farm": citycountrycontrast slapstick chevychase madolynsmithosborne kevinomorrison georgeroyhill georgeroyhill georgeroyhill Comedy


## Collaborative filtering / Matrix factorization

In [80]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [81]:
links_small = pd.read_csv('links_small.csv')
ratings = ratings.merge(links_small, how='left', on='movieId')
ratings.drop(ratings[ratings['tmdbId'].isnull()].index.values, inplace=True)
ratings['tmdbId'] = 

In [82]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,31,2.5,1260759144,112792,9909.0
1,1,1029,3.0,1260759179,33563,11360.0
2,1,1061,3.0,1260759182,117665,819.0
3,1,1129,2.0,1260759185,82340,1103.0
4,1,1172,4.0,1260759205,95765,11216.0


In [29]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [30]:
def run_complex_model(ratings_df, model_class, train_on_all_ratings=False):
    # use everything imported from surprise library at the beginning
    # if train_on_all_ratings=True - train on all ratings
    # if train_on_all_ratings=False - split data on 5 folds and do evaluation
    
    if model_class == 'SVD':
        algo = SVD()
    elif model_class == 'KNN':
        algo = KNNBasic()
    else: 
        assert False, f'Algorithm {model_class} is not supported'
        
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

    if train_on_all_ratings:
        data = data.build_full_trainset()
        algo.fit(data)
    else:
        cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
        
    return algo

In [31]:
model = run_complex_model(ratings, 'SVD')

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8990  0.8879  0.8956  0.8972  0.9027  0.8965  0.0049  
MAE (testset)     0.6918  0.6820  0.6905  0.6900  0.6963  0.6901  0.0046  
Fit time          7.97    8.03    8.14    7.86    5.76    7.55    0.90    
Test time         0.41    0.41    0.27    0.26    0.22    0.31    0.08    


In [32]:
model = run_complex_model(ratings, 'KNN')

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9627  0.9823  0.9608  0.9665  0.9749  0.9694  0.0080  
MAE (testset)     0.7419  0.7539  0.7408  0.7425  0.7481  0.7454  0.0049  
Fit time          0.23    0.26    0.35    0.46    0.62    0.38    0.14    
Test time         2.37    2.81    2.89    2.51    2.07    2.53    0.30    


In [33]:
model = run_complex_model(ratings, 'KNN', train_on_all_ratings=True)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [34]:
ratings[ratings['userId'] == 10].head(10)

Unnamed: 0,userId,movieId,rating,timestamp
744,10,50,5.0,942766420
745,10,152,4.0,942766793
746,10,318,4.0,942766515
747,10,344,3.0,942766603
748,10,345,4.0,942766603
749,10,592,3.0,942767328
750,10,735,4.0,942766974
751,10,1036,3.0,942767258
752,10,1089,3.0,942766420
753,10,1101,2.0,942767328


In [35]:
model.predict(10, 50)

Prediction(uid=10, iid=50, r_ui=None, est=4.6958005317347427, details={'actual_k': 40, 'was_impossible': False})

In [36]:
model.predict(10, 152)

Prediction(uid=10, iid=152, r_ui=None, est=3.8628556780312744, details={'actual_k': 3, 'was_impossible': False})

In [37]:
model.predict(10, 40)

Prediction(uid=10, iid=40, r_ui=None, est=3.9718878202972632, details={'actual_k': 6, 'was_impossible': False})

### Examine what suggestions we can expect for the given user

In [64]:
# ratings[ratings['userId'] == 10]
# min(ratings['movieId'].values)
np.isin(ratings['movieId'].values, meta_df['id'].values).sum()/len(ratings)

0.44987200511979519

## Hybrid recommendations

In [55]:
small_meta_df[['id', 'title', 'genres', 'budget', 'popularity', 'vote_average']].head(3)

Unnamed: 0,id,title,genres,budget,popularity,vote_average
0,862,Toy Story,"[Animation, Comedy, Family]",30000000,21.9469,7.7
1,8844,Jumanji,"[Adventure, Fantasy, Family]",65000000,17.0155,6.9
2,15602,Grumpier Old Men,"[Romance, Comedy]",0,11.7129,6.5


In [None]:
ratings.head(3)

In [None]:
COLDSTART_THRESHOLD = 5
def get_hybrid_recommendations(small_meta_df, ratings, userId, title):
    recs = get_item_content_recommendations(small_meta_df, cosine_matrix, title, top_n=100)
    print(recs)
#     watched_movie_ids = np.unique(ratings[ratings['userId'] == userId]['movieId'].values)
#     not_watched_movie_ids = np.unique(ratings[~ratings['movieId'].isin(watched_movie_ids)]['movieId'].values)
#     if len(watched_movie_ids) > COLDSTART_THRESHOLD:
#         preds = []
#         for movie_id in not_watched_movie_ids:
#             pred = model.predict(userId, movie_id).est
#             preds.extend([pred])
#         preds = np.array(preds)
#         preds_sorted_indexes = preds.argsort()[-20:]
#         print(preds[preds_sorted_indexes])

In [None]:
userId = 10
model.predict(10, 50).est.dtype

In [None]:
get_hybrid_recommendations(small_meta_df, ratings, 10, 'Central Intelligence')

In [None]:
get_hybrid_recommendations(small_meta_df, ratings, 10, 'Assassins')

In [None]:
get_hybrid_recommendations(small_meta_df, ratings, 101, 'Central Intelligence')

In [None]:
get_hybrid_recommendations(small_meta_df, ratings, 101, 'Assassins')

In [None]:
(small_meta_df['original_title'] == 'Assassins').any()