## Summaries

1. Non-personalized
2. Content based
3. Collaborative filtering
4. Hybrid

Conclusion:



In [3]:
import os
import json
from time import time
from ast import literal_eval

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, KNNBasic, evaluate
import warnings; warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [4]:
# load data
meta_df = pd.read_csv('movies_metadata.csv')
# parse genre feature
meta_df['genres'] = meta_df['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
# parse date
meta_df['year'] = pd.to_datetime(meta_df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
# stack genre and add it to dataframe again
stacked_genre_df = meta_df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
stacked_genre_df.name = 'genre'
stacked_genre_df = meta_df.drop('genres', axis=1).join(stacked_genre_df)
stacked_genre_df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Family


## Non-personalized recommendations

__ToDo__: Implement non-personalized recommendations which will return top 10 movies for a genre.
Come up with specific average ratio, and use it to rank videos.
(Use video_count, video_average features from meta_df dataframe)


IMDB weighted rating is used:

Weighted Rating = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$

where,
* *v* is the number of votes for the movie
* *m* is the minimum votes required to be listed in the chart
* *R* is the average rating of the movie
* *C* is the mean vote across the whole report

In [5]:
def get_weighted_rating(v, m, R, C):
    return (v / (v + m) * R) + (m / (v + m) * C)

def get_genre_nonpersonalized_recommendations(df, genre, percentile=0.85):
    genre_df = df[df['genre'] == genre].copy()
    C = genre_df['vote_average'].mean()
    m = genre_df['vote_count'].quantile(percentile)
    genre_df = genre_df[genre_df['vote_count'] > m]
    genre_df['weighted_rating'] = genre_df.apply(
        lambda x: get_weighted_rating(x['vote_count'], m, x['vote_average'], C), axis=1)
    return genre_df.nlargest(10, 'weighted_rating')[['title', 'year']]

In [6]:
get_genre_nonpersonalized_recommendations(stacked_genre_df, 'Comedy')

Unnamed: 0,title,year
10309,Dilwale Dulhania Le Jayenge,1995
2211,Life Is Beautiful,1997
351,Forrest Gump,1994
18465,The Intouchables,2011
1225,Back to the Future,1985
22841,The Grand Budapest Hotel,2014
22131,The Wolf of Wall Street,2013
30315,Inside Out,2015
40882,La La Land,2016
732,Dr. Strangelove or: How I Learned to Stop Worr...,1964


In [7]:
get_genre_nonpersonalized_recommendations(stacked_genre_df, 'Animation')

Unnamed: 0,title,year
5481,Spirited Away,2001
40251,Your Name.,2016
9698,Howl's Moving Castle,2004
2884,Princess Mononoke,1997
359,The Lion King,1994
30315,Inside Out,2015
5553,Grave of the Fireflies,1988
5833,My Neighbor Totoro,1988
13724,Up,2009
12704,WALL·E,2008


In [8]:
get_genre_nonpersonalized_recommendations(stacked_genre_df, 'Family')

Unnamed: 0,title,year
5481,Spirited Away,2001
1225,Back to the Future,1985
359,The Lion King,1994
30315,Inside Out,2015
17437,Harry Potter and the Deathly Hallows: Part 2,2011
13724,Up,2009
12704,WALL·E,2008
24455,Big Hero 6,2014
5833,My Neighbor Totoro,1988
7725,Harry Potter and the Prisoner of Azkaban,2004


## Item-item content based recommendations

__ToDo__: implement functions to perform item-item description based recommendations

In [9]:
# load ID from smaller set
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
# drop rows with broken ID values
meta_df = meta_df.drop([19730, 29503, 35587])
# parse movie ID to int
meta_df['id'] = meta_df['id'].astype('int')
# create small dataframe
small_meta_df = meta_df[meta_df['id'].isin(links_small)]
small_meta_df.shape

(9099, 25)

In [10]:
# create descriptions
small_meta_df['tagline'] = small_meta_df['tagline'].fillna('')
small_meta_df['description'] = small_meta_df['overview'] + small_meta_df['tagline']
small_meta_df['description'] = small_meta_df['description'].fillna('')
small_meta_df['description'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: description, dtype: object

In [11]:
def create_cosine_matrix(df):
    tfidf = TfidfVectorizer()
    descriptions = tfidf.fit_transform(df['description']).todense()
    
    return linear_kernel(descriptions, descriptions)

def get_item_content_recommendations(df, cosine_sim, title):
    np.fill_diagonal(cosine_sim, 0)
    index = df[df['title'] == title].index.values
    similarity_indexes_sorted_asc = np.argsort(np.squeeze(cosine_sim[index,:]))
    top_20_indexes_desc = np.flip(similarity_indexes_sorted_asc[-20:], axis=0)
     
    return df.iloc[top_20_indexes_desc]['title']

In [12]:
cosine_matrix = create_cosine_matrix(small_meta_df)

In [13]:
get_item_content_recommendations(small_meta_df, cosine_matrix, 'Toy Story')

15348                    Toy Story 3
2997                     Toy Story 2
10301         The 40 Year Old Virgin
3057                 Man on the Moon
1071           Rebel Without a Cause
6435          What's Up, Tiger Lily?
11606                   Factory Girl
11399         For Your Consideration
1199                       Manhattan
1932                       Condorman
448                For Love or Money
6944                Rivers and Tides
7254                  Africa Screams
485                           Malice
10585                    Match Point
9963                   Life Is Sweet
2157               Indecent Proposal
3756     The Ballad of Ramblin' Jack
21190     Woody Allen: A Documentary
2635                      Radio Days
Name: title, dtype: object

In [14]:
get_item_content_recommendations(small_meta_df, cosine_matrix, 'Africa Screams')

11306    Borat: Cultural Learnings of America for Make ...
3790             The Broken Hearts Club: A Romantic Comedy
3617                                   Trouble in Paradise
365                    The Naked Gun 33⅓: The Final Insult
11339                             The Elementary Particles
32388                                        Almost Normal
6090                                  The Talk of the Town
3309                                     Creature Comforts
11965                                 The Bourne Ultimatum
12320                                                [REC]
9846                                    Call Northside 777
8857                              The Times of Harvey Milk
34447                                               Plan B
16127                            The First Beautiful Thing
14497                                     Everybody's Fine
6095                                        Born Yesterday
7641                                        Educating Ri

__ToDo__: implement functions to perform item-item keywords based recommendations

In [15]:
# load credits and keywords data
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
# parse ID
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
meta_df['id'] = meta_df['id'].astype('int')
# merge existing dataframe with credits and keywords
meta_df = meta_df.merge(credits, on='id')
meta_df = meta_df.merge(keywords, on='id')
# take only small subset
small_meta_df = meta_df[meta_df['id'].isin(links_small)]
small_meta_df.shape

(9219, 28)

In [16]:
# convert parse to json and keep top 3 from cast
small_meta_df['cast'] = small_meta_df['cast'].apply(literal_eval)
small_meta_df['cast'] = small_meta_df['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
small_meta_df['cast'] = small_meta_df['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

# parse crew
small_meta_df['crew'] = small_meta_df['crew'].apply(literal_eval)

# measure cast and crew sizes
small_meta_df['cast_size'] = small_meta_df['cast'].apply(lambda x: len(x))
small_meta_df['crew_size'] = small_meta_df['crew'].apply(lambda x: len(x))

In [17]:
# find director
def get_director(x):
    names = [x['name'] for x in small_meta_df['crew'][0] if x['job']=='Director']
    return np.nan if not names else names[0]

small_meta_df['director'] = small_meta_df['crew'].apply(get_director)
small_meta_df['director'] = small_meta_df['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
small_meta_df['director'] = small_meta_df['director'].apply(lambda x: [x, x, x])

In [18]:
def filter_keywords(x):
    return list(set(x).intersection(words))

small_meta_df['keywords'] = small_meta_df['keywords'].apply(literal_eval)
small_meta_df['keywords'] = small_meta_df['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# keep only frequent words
words = small_meta_df.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
words.name = 'keyword'
words = words.value_counts()
words = words[words > 1]

# filter keywords
small_meta_df['keywords'] = small_meta_df['keywords'].apply(filter_keywords)
small_meta_df['keywords'] = small_meta_df['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
small_meta_df['keywords'] = small_meta_df['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [19]:
# create stemmer
stemmer = SnowballStemmer('english')
small_meta_df['soup'] = small_meta_df['keywords'] + small_meta_df['cast'] + small_meta_df['director'] + small_meta_df['genres']
small_meta_df['soup'] = small_meta_df['soup'].apply(lambda x: ' '.join(x))

In [20]:
def create_cosine_matrix_for_words(df):
    # use CountVectorizer and cosine_similarity
    vectorizer = CountVectorizer()
    vectorizer.fit_transform(df['soup'])
    print(vectorizer.get_feature_names())
    pass

In [22]:
stemmer.
small_meta_df['soup']

0        Tom Hanks Tim Allen Don Rickles johnlasseter j...
1        Robin Williams Jonathan Hyde Kirsten Dunst joh...
2        Walter Matthau Jack Lemmon Ann-Margret johnlas...
3        Whitney Houston Angela Bassett Loretta Devine ...
4        Steve Martin Diane Keaton Martin Short johnlas...
5        Al Pacino Robert De Niro Val Kilmer johnlasset...
6        Harrison Ford Julia Ormond Greg Kinnear johnla...
7        Jonathan Taylor Thomas Brad Renfro Rachael Lei...
8        Jean-Claude Van Damme Powers Boothe Dorian Har...
9        Pierce Brosnan Sean Bean Izabella Scorupco joh...
10       Michael Douglas Annette Bening Michael J. Fox ...
11       Leslie Nielsen Mel Brooks Amy Yasbeck johnlass...
12       Kevin Bacon Bob Hoskins Bridget Fonda johnlass...
13       Anthony Hopkins Joan Allen Powers Boothe johnl...
14       Geena Davis Matthew Modine Frank Langella john...
15       Robert De Niro Sharon Stone Joe Pesci johnlass...
16       Kate Winslet Emma Thompson Hugh Grant johnlass.

In [21]:
cosine_matrix = create_cosine_matrix_for_words(small_meta_df)

['50', 'aaliyah', 'aalto', 'aamir', 'aaran', 'aarne', 'aaron', 'aasif', 'abatantuono', 'abate', 'abbas', 'abbass', 'abbie', 'abbott', 'abbrescia', 'abby', 'abdalla', 'abdolrahman', 'abdul', 'abdullah', 'abdullrahman', 'abe', 'abed', 'abedini', 'abel', 'abelanski', 'abercrombie', 'abhin', 'abigail', 'able', 'abo', 'abraham', 'abrams', 'abril', 'accorsi', 'achbar', 'acheche', 'ackland', 'ackles', 'ackman', 'acovone', 'action', 'ad', 'adam', 'adames', 'adamo', 'adamová', 'adams', 'adamson', 'adar', 'adcock', 'addy', 'adebimpe', 'adel', 'adelaide', 'adelman', 'aden', 'adet', 'adewale', 'adi', 'adjani', 'adkins', 'adl', 'adler', 'adley', 'adlon', 'adolf', 'adolfo', 'adolph', 'adolphe', 'adorf', 'adrian', 'adriana', 'adrianne', 'adriano', 'adrien', 'adrienne', 'adrián', 'adsit', 'adventure', 'adèle', 'ae', 'aernouts', 'afemo', 'affleck', 'afshin', 'agapova', 'agar', 'agata', 'agbaje', 'agee', 'aghdashloo', 'agnes', 'agnew', 'agnès', 'agosto', 'agron', 'aguilar', 'aguilera', 'agutter', 'ah', 

In [182]:
get_item_content_recommendations(small_meta_df, cosine_matrix, 'Toy Story')

15348                   Toy Story 3
2997                    Toy Story 2
10301        The 40 Year Old Virgin
3057                Man on the Moon
6435         What's Up, Tiger Lily?
1071          Rebel Without a Cause
11606                  Factory Girl
11399        For Your Consideration
1199                      Manhattan
1932                      Condorman
6944               Rivers and Tides
448               For Love or Money
485                          Malice
7254                 Africa Screams
9963                  Life Is Sweet
2157              Indecent Proposal
10585                   Match Point
3145           White Men Can't Jump
1045                        Sleeper
21190    Woody Allen: A Documentary
Name: title, dtype: object

In [21]:
get_item_content_recommendations(small_meta_df, cosine_matrix, 'Africa Screams')

3762                           The Eyes of Tammy Faye
8153                                         Salesman
5554                                       Powaqqatsi
6200                                           Stevie
438                              The Endless Summer 2
748                   Maya Lin: A Strong Clear Vision
774                        The Gate of Heavenly Peace
2237            Hands on a Hard Body: The Documentary
3190                          The Brandon Teena Story
3212     Black Tar Heroin: The Dark End of the Street
3514                                 Regret to Inform
3582                                           Baraka
4072                    Long Night's Journey Into Day
4210                                      Startup.com
5530                    The Trials of Henry Kissinger
6262                                       Spellbound
6848                                         Girlhood
6863                                     My Architect
7974                        

## Collaborative filtering / Matrix factorization

In [22]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [23]:
def run_complex_model(ratings_df, model_class, train_on_all_ratings=False):
    # use everything imported from surprise library at the beginning
    # if train_on_all_ratings=True - train on all ratings
    # if train_on_all_ratings=False - split data on 5 folds and do evaluation
    pass

In [24]:
run_complex_model(ratings, 'SVD')

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9006
MAE:  0.6935
------------
Fold 2
RMSE: 0.8932
MAE:  0.6870
------------
Fold 3
RMSE: 0.8925
MAE:  0.6875
------------
Fold 4
RMSE: 0.9027
MAE:  0.6940
------------
Fold 5
RMSE: 0.8925
MAE:  0.6886
------------
------------
Mean RMSE: 0.8963
Mean MAE : 0.6901
------------
------------
Average runtime per fold:  3.7757  seconds


In [25]:
run_complex_model(ratings, 'KNN')

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9724
MAE:  0.7473
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9679
MAE:  0.7460
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9593
MAE:  0.7374
------------
Fold 4
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9733
MAE:  0.7467
------------
Fold 5
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9694
MAE:  0.7448
------------
------------
Mean RMSE: 0.9685
Mean MAE : 0.7444
------------
------------
Average runtime per fold:  1.3753  seconds


In [26]:
model = run_complex_model(ratings, 'KNN', train_on_all_ratings=True)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [27]:
ratings[ratings['userId'] == 10].head(10)

Unnamed: 0,userId,movieId,rating,timestamp
744,10,50,5.0,942766420
745,10,152,4.0,942766793
746,10,318,4.0,942766515
747,10,344,3.0,942766603
748,10,345,4.0,942766603
749,10,592,3.0,942767328
750,10,735,4.0,942766974
751,10,1036,3.0,942767258
752,10,1089,3.0,942766420
753,10,1101,2.0,942767328


In [28]:
model.predict(10, 50)

Prediction(uid=10, iid=50, r_ui=None, est=4.695800531734743, details={'was_impossible': False, 'actual_k': 40})

In [29]:
model.predict(10, 152)

Prediction(uid=10, iid=152, r_ui=None, est=3.8628556780312744, details={'was_impossible': False, 'actual_k': 3})

In [30]:
model.predict(10, 40)

Prediction(uid=10, iid=40, r_ui=None, est=3.971887820297263, details={'was_impossible': False, 'actual_k': 6})

## Hybrid recommendations

In [31]:
small_meta_df[['id', 'title', 'genres', 'budget', 'popularity', 'vote_average']].head(3)

Unnamed: 0,id,title,genres,budget,popularity,vote_average
0,862,Toy Story,"[Animation, Comedy, Family]",30000000,21.9469,7.7
1,8844,Jumanji,"[Adventure, Fantasy, Family]",65000000,17.0155,6.9
2,15602,Grumpier Old Men,"[Romance, Comedy]",0,11.7129,6.5


In [32]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [33]:
def get_hybrid_recommendations(small_meta_df, ratings, userId, title):
    pass

In [34]:
get_hybrid_recommendations(small_meta_df, ratings, 10, 'Central Intelligence')

In [35]:
get_hybrid_recommendations(small_meta_df, ratings, 10, 'Assassins')

In [36]:
get_hybrid_recommendations(small_meta_df, ratings, 101, 'Central Intelligence')

In [37]:
get_hybrid_recommendations(small_meta_df, ratings, 101, 'Assassins')