In [373]:
import numpy as np
import pandas as pd

In [375]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [376]:
creditdf = pd.read_csv('credits.csv')
keyworddf = pd.read_csv('keywords.csv')
linkdf = pd.read_csv('links_small.csv')
moviedf = pd.read_csv('movies_metadata.csv')
ratingdf = pd.read_csv('ratings_small.csv')

In [377]:
linkdf.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


# Recommender Based on Overview and tagline

In [378]:
linkdf = linkdf[linkdf['tmdbId'].notnull()]['tmdbId'].astype('int')
linkdf

0          862
1         8844
2        15602
3        31357
4        11862
         ...  
9120    402672
9121    315011
9122    391698
9123    137608
9124    410803
Name: tmdbId, Length: 9112, dtype: int32

## The id in these index is not in the right format. It is an object but in the datetime format that's why we will drop these row

In [379]:
row = moviedf.iloc[[19730, 29503, 35587]]
row

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [380]:
moviedf = moviedf.drop([19730, 29503, 35587])

In [381]:
moviedf['id'] = moviedf['id'].astype('int')
moviedf['id']

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45461    439050
45462    111109
45463     67758
45464    227506
45465    461257
Name: id, Length: 45463, dtype: int32

In [382]:
linkdf.shape

(9112,)

In [383]:
moviedf['id'].shape

(45463,)

We are taking only that movie that our available in linkdf dataset because of computing power we will make our model on small dataset. At first we have 45463 movies in our dataframe and now in our new dataframe we have 9099 movies.

In [384]:
smallmoviedf = moviedf[moviedf['id'].isin(linkdf)]
smallmoviedf.shapeS

(9099, 24)

In [385]:
smallmoviedf.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [386]:
smallmoviedf.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [387]:
smallmoviedf['tagline'].isnull().sum()

2066

In [388]:
smallmoviedf['tagline'] = smallmoviedf['tagline'].fillna('')

In [389]:
smallmoviedf['tagline'].isnull().sum()

0

In [390]:
smallmoviedf['description'] = smallmoviedf['overview']+smallmoviedf['tagline']
smallmoviedf['description'] = smallmoviedf['description'].fillna('')

In [391]:
smallmoviedf['description']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
40224    From the mind behind Evangelion comes a hit la...
40503    The band stormed Europe in 1963, and, in 1964,...
44821    When Molly Hale's sadness of her father's disa...
44826    All your favorite Pokémon characters are back,...
45265    While holidaying in the French Alps, a Swedish...
Name: description, Length: 9099, dtype: object

In [392]:
import re

def clean_description(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]"," ",text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

smallmoviedf['clean_description'] = smallmoviedf['description'].apply(lambda x: clean_description(x))
smallmoviedf['clean_description'][0]

'led by woody andys toys live happily in his room until andys birthday brings buzz lightyear onto the scene afraid of losing his place in andys heart woody plots against buzz but when circumstances separate buzz and woody from their owner the duo eventually learns to put aside their differences'

In [393]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smallmoviedf['clean_description'])

In [394]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [395]:
cosine_sim[1]

array([0.0066973 , 1.        , 0.01520129, ..., 0.00376753, 0.00651863,
       0.        ])

In [396]:
smallmoviedf = smallmoviedf.reset_index()

In [397]:
smallmoviedf

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description,clean_description
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",led by woody andys toys live happily in his ro...
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,When siblings Judy and Peter discover an encha...,when siblings judy and peter discover an encha...
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,A family wedding reignites the ancient feud be...,a family wedding reignites the ancient feud be...
3,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",cheated on mistreated and stepped on the women...
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Just when George Banks has recovered from his ...,just when george banks has recovered from his ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9094,40224,False,,15000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,315011,tt4262980,ja,シン・ゴジラ,...,120.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,A god incarnate. A city doomed.,Shin Godzilla,False,6.6,152.0,From the mind behind Evangelion comes a hit la...,from the mind behind evangelion comes a hit la...
9095,40503,False,,0,"[{'id': 99, 'name': 'Documentary'}, {'id': 104...",http://www.thebeatlesliveproject.com/,391698,tt2531318,en,The Beatles: Eight Days a Week - The Touring Y...,...,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The band you know. The story you don't.,The Beatles: Eight Days a Week - The Touring Y...,False,7.6,92.0,"The band stormed Europe in 1963, and, in 1964,...",the band stormed europe in and in they conquer...
9096,44821,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,...,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,144.0,When Molly Hale's sadness of her father's disa...,when molly hales sadness of her fathers disapp...
9097,44826,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,...,75.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0,"All your favorite Pokémon characters are back,...",all your favorite pok mon characters are back ...


In [398]:
titles = smallmoviedf['title']
titles

0                                               Toy Story
1                                                 Jumanji
2                                        Grumpier Old Men
3                                       Waiting to Exhale
4                             Father of the Bride Part II
                              ...                        
9094                                        Shin Godzilla
9095    The Beatles: Eight Days a Week - The Touring Y...
9096                        Pokémon: Spell of the Unknown
9097          Pokémon 4Ever: Celebi - Voice of the Forest
9098                                        Force Majeure
Name: title, Length: 9099, dtype: object

In [399]:
indices = pd.Series(smallmoviedf.index, index=smallmoviedf['title'])
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
Shin Godzilla                                         9094
The Beatles: Eight Days a Week - The Touring Years    9095
Pokémon: Spell of the Unknown                         9096
Pokémon 4Ever: Celebi - Voice of the Forest           9097
Force Majeure                                         9098
Length: 9099, dtype: int64

In [400]:
def get_recommendations(title):
    idx = indices[title]
    
    #It will get the title with its index
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    
    # [2502, 7535, 4702, 889, 437]
    movie_indices = [i[0] for i in sim_scores]
    
    return titles.iloc[movie_indices]

In [401]:
title_with_index = get_recommendations("Toy Story")

In [402]:
title_with_index

2502               Toy Story 2
7535               Toy Story 3
4702    What's Up, Tiger Lily?
889      Rebel Without a Cause
437                     Malice
Name: title, dtype: object

In [403]:
title_with_index = title_with_index.reset_index()
title_with_index

Unnamed: 0,index,title
0,2502,Toy Story 2
1,7535,Toy Story 3
2,4702,"What's Up, Tiger Lily?"
3,889,Rebel Without a Cause
4,437,Malice


In [404]:
for i in title_with_index['title']:
    print(i)

Toy Story 2
Toy Story 3
What's Up, Tiger Lily?
Rebel Without a Cause
Malice


# Keyword, Genre, Cast & Crew based Recommender

In [405]:
keyworddf.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [406]:
keyworddf['keywords'][0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [407]:
creditdf.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [408]:
creditdf['cast'][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [409]:
creditdf['crew'][0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [410]:
keyworddf['id'] = keyworddf['id'].astype('int')
creditdf['id'] = creditdf['id'].astype('int')
moviedf['id'] = moviedf['id'].astype('int')

In [411]:
moviedf = moviedf.merge(creditdf, on='id')
moviedf = moviedf.merge(keyworddf, on='id')


In [412]:
moviedf.cast[0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

Just to make our dataframe small because of computing power

In [413]:
smoviedf = moviedf[moviedf['id'].isin(linkdf)]
smoviedf.shape

(9219, 27)

# Now we will clean the cast, crew and keyword column

In [414]:
smoviedf['cast'] = smoviedf['cast'].apply(literal_eval)
smoviedf['crew'] = smoviedf['crew'].apply(literal_eval)
smoviedf['keywords'] = smoviedf['keywords'].apply(literal_eval)
smoviedf['genres'] = smoviedf['genres'].apply(literal_eval)

In [415]:
smoviedf.cast[0]

[{'cast_id': 14,
  'character': 'Woody (voice)',
  'credit_id': '52fe4284c3a36847f8024f95',
  'gender': 2,
  'id': 31,
  'name': 'Tom Hanks',
  'order': 0,
  'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'},
 {'cast_id': 15,
  'character': 'Buzz Lightyear (voice)',
  'credit_id': '52fe4284c3a36847f8024f99',
  'gender': 2,
  'id': 12898,
  'name': 'Tim Allen',
  'order': 1,
  'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'},
 {'cast_id': 16,
  'character': 'Mr. Potato Head (voice)',
  'credit_id': '52fe4284c3a36847f8024f9d',
  'gender': 2,
  'id': 7167,
  'name': 'Don Rickles',
  'order': 2,
  'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'},
 {'cast_id': 17,
  'character': 'Slinky Dog (voice)',
  'credit_id': '52fe4284c3a36847f8024fa1',
  'gender': 2,
  'id': 12899,
  'name': 'Jim Varney',
  'order': 3,
  'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'},
 {'cast_id': 18,
  'character': 'Rex (voice)',
  'credit_id': '52fe4284c3a36847f8024fa5',
  'gender': 2,
  'id': 12900,
 

## We are fetching only director from crew and after fetching the crew column is of no use.

In [416]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [417]:
smoviedf['director'] = smoviedf['crew'].apply(get_director)

In [418]:
smoviedf['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

In [419]:
smoviedf['director'] = smoviedf['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [420]:
smoviedf['director'].head()

0      johnlasseter
1       joejohnston
2      howarddeutch
3    forestwhitaker
4      charlesshyer
Name: director, dtype: object

We are doing this to convert the string into list so that we can merge them afterward easily

In [421]:
smoviedf['director'] = smoviedf['director'].apply(lambda x: [x])

In [422]:
smoviedf['director'].head()

0      [johnlasseter]
1       [joejohnston]
2      [howarddeutch]
3    [forestwhitaker]
4      [charlesshyer]
Name: director, dtype: object

## We are fetching the first three cast from the movie because they are famous

In [423]:
smoviedf['cast'] = smoviedf['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smoviedf['cast'] = smoviedf['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [424]:
smoviedf['cast'].head()

0                  [Tom Hanks, Tim Allen, Don Rickles]
1       [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2           [Walter Matthau, Jack Lemmon, Ann-Margret]
3    [Whitney Houston, Angela Bassett, Loretta Devine]
4           [Steve Martin, Diane Keaton, Martin Short]
Name: cast, dtype: object

In [425]:
smoviedf['cast'] = smoviedf['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [426]:
smoviedf.cast.head()

0                  [tomhanks, timallen, donrickles]
1       [robinwilliams, jonathanhyde, kirstendunst]
2          [waltermatthau, jacklemmon, ann-margret]
3    [whitneyhouston, angelabassett, lorettadevine]
4           [stevemartin, dianekeaton, martinshort]
Name: cast, dtype: object

## We had the keywords with id so we are fetching only keywords from it.

In [427]:
smoviedf['keywords'] = smoviedf['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [428]:
smoviedf['keywords'].head()

0    [jealousy, toy, boy, friendship, friends, riva...
1    [board game, disappearance, based on children'...
2    [fishing, best friend, duringcreditsstinger, o...
3    [based on novel, interracial relationship, sin...
4    [baby, midlife crisis, confidence, aging, daug...
Name: keywords, dtype: object

In [429]:
stemmer = SnowballStemmer('english')
smoviedf['keywords'] = smoviedf['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smoviedf['keywords'][0]

['jealousi',
 'toy',
 'boy',
 'friendship',
 'friend',
 'rivalri',
 'boy next door',
 'new toy',
 'toy comes to lif']

In [430]:
smoviedf['keywords'] = smoviedf['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smoviedf['keywords'][0]

['jealousi',
 'toy',
 'boy',
 'friendship',
 'friend',
 'rivalri',
 'boynextdoor',
 'newtoy',
 'toycomestolif']

## We had a genres with id so we are fetching only genres from there.

In [431]:
moviedf['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [432]:
smoviedf['genres'] = smoviedf['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [433]:
smoviedf['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

## Combining Dataset 

In [435]:
smoviedf['combine'] = smoviedf['keywords'] + smoviedf['cast'] + smoviedf['director'] + smoviedf['genres']

In [440]:
smoviedf['combine'] = smoviedf['combine'].apply(lambda x: ' '.join(x))

In [441]:
smoviedf['combine'][0]

'jealousi toy boy friendship friend rivalri boynextdoor newtoy toycomestolif tomhanks timallen donrickles johnlasseter Animation Comedy Family'

In [442]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smoviedf['combine'])

In [447]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [449]:
cosine_sim[0]

array([1.        , 0.03456506, 0.03919309, ..., 0.        , 0.        ,
       0.        ])

In [450]:
smoviedf = smoviedf.reset_index()
titles = smoviedf['title']
indices = pd.Series(smoviedf.index, index=smoviedf['title'])

In [451]:
title_with_index = get_recommendations("Toy Story")

In [452]:
title_with_index

8519                           Toy Story of Terror!
2522                                    Toy Story 2
2751                              Creature Comforts
1432                               Meet the Deedles
4341    The Looney, Looney, Looney Bugs Bunny Movie
Name: title, dtype: object

# Recommendation on similarity with popularity

In [461]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smoviedf.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [462]:
improved_recommendations("Toy Story")

Unnamed: 0,title,vote_count,vote_average,wr
3833,"Monsters, Inc.",6150,7,6.921385
7629,Toy Story 3,4710,7,6.900206
2522,Toy Story 2,3914,7,6.882743
8595,The Lego Movie,3127,7,6.858214
2797,The Road to El Dorado,892,7,6.650688
6534,Monster House,912,6,6.065676
6968,Horton Hears a Who!,927,6,6.065045
3016,Chicken Run,1190,6,6.055671
7404,Cloudy with a Chance of Meatballs,1799,6,6.041742
5456,Garfield,851,5,5.495693
