In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [63]:
df = pd.read_csv('movie_dataset.csv')

In [64]:
df.shape

(4803, 24)

In [65]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [66]:
len(df.columns)

24

In [67]:
len(df.loc[df.title == df.original_title])

4542

In [68]:
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [69]:
df.id.isnull().any()

False

In [70]:
important_features = ['title', 'original_title', 'genres', 'keywords', 'overview', 'tagline', 'cast', 'director', 'production_companies']

In [71]:
df[important_features].isnull().sum()

title                     0
original_title            0
genres                   28
keywords                412
overview                  3
tagline                 844
cast                     43
director                 30
production_companies      0
dtype: int64

In [72]:
for feature in important_features:
    df[feature].fillna(value = '', inplace = True)

In [73]:
df[important_features].isnull().sum()

title                   0
original_title          0
genres                  0
keywords                0
overview                0
tagline                 0
cast                    0
director                0
production_companies    0
dtype: int64

In [74]:
def combine_features(df):
    return df['original_title'] + ' ' + df['genres'] + ' ' + df['keywords'] + ' ' + df['overview'] + ' ' + df['tagline'] + ' ' + df['cast'] + ' ' + df['spoken_languages'] + ' ' + df['production_companies'] + ' ' + df['director']

In [75]:
df['final_features'] = combine_features(df)

In [76]:
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
cv = CountVectorizer()

In [78]:
count_matrix = cv.fit_transform(df['final_features'])

In [79]:
from sklearn.metrics.pairwise import cosine_similarity

In [80]:
movie_similarity = cosine_similarity(count_matrix)

In [81]:
movie_similarity = 100*movie_similarity

In [82]:
movie_similarity.shape

(4803, 4803)

In [83]:
def get_similar_movies(customer_preferences, df, movie_similarity):
    
    movie_database = list(df['title'].values)
    total_recommendations = []
    
    for movie in customer_preferences:
        if movie in movie_database:
            movie_id = df.loc[df['title'] == movie, 'index'].values[0]
            #print(movie_id)
        else:
            raise Exception(f'Movie : {movie} not present in our database at the moment')
            
        similar_movies = list(movie_similarity[movie_id, :])
        similar_movie_list = enumerate(similar_movies)
        similar_movie_list = sorted(similar_movie_list, key  = lambda x: x[1], reverse = True)
        similar_movie_list = similar_movie_list[1:]
        #print(similar_movie_list)
        total_recommendations.extend(similar_movie_list)
        
    #print(total_recommendations)
        
    total_recommendations = sorted(total_recommendations, key = lambda x: x[1], reverse = True)
    
    return movie_database, total_recommendations

In [84]:
def recommend(customer_preferences, df, movie_similarity):
    
    movie_database, recommendations = get_similar_movies(customer_preferences, df, movie_similarity)
    
    print('Based on your observed preferences, the following movies might be a good fit.\n')
    movie_ids = []
    scores = []
    movie_scores = []
    
    for movie_id, score in recommendations:
        movie_ids.append(movie_id)
        scores.append(score)
        
    movies_list = [df.loc[df['index'] == movie_id, 'title'].values[0] for movie_id in movie_ids]
    movie_dict = {}
    #print(movies_list)
    
    for t in zip(movies_list, scores):
        movie_scores.append(t)

          
    for k, v in movie_scores:
        if k in movie_dict.keys():
            if v > movie_dict[k]:
                movie_dict[k] = v
            else:
                continue
        else:
            movie_dict[k] = v
    
    final_rec = pd.Series(data = movie_dict)
    
    movie_recommendation_df = pd.DataFrame(final_rec, columns = ['Similarity Scores (%)'])
    movie_recommendation_df.insert(loc = 1, column = 'Genres',
                                   value = [df.loc[df['title'] == k]['genres'] for k, v in list(movie_dict.items())])
    
    
    movie_recommendation_df = movie_recommendation_df.sort_values(by = 'Similarity Scores (%)', ascending = False)
        
    return movie_recommendation_df.head(n = 20)
        

In [90]:
customer_preferences = ['Thor', 'Spider-Man']

In [91]:
recommend(customer_preferences, df, movie_similarity)

Based on your observed preferences, the following movies might be a good fit.



Unnamed: 0,Similarity Scores (%),Genres
Thor: The Dark World,54.97273,"126 Action Adventure Fantasy Name: genres, ..."
Spider-Man 3,54.221541,"5 Fantasy Action Adventure Name: genres, dt..."
Gladiator,48.535521,"274 Action Drama Adventure Name: genres, dt..."
Spider-Man 2,48.483389,"30 Action Adventure Fantasy Name: genres, d..."
Avengers: Age of Ultron,48.308508,7 Action Adventure Science Fiction Name: ge...
The Amazing Spider-Man,46.935693,"20 Action Adventure Fantasy Name: genres, d..."
Little Nicky,46.899858,"471 Comedy Fantasy Romance Name: genres, dt..."
Pirates of the Caribbean: At World's End,46.023132,"1 Adventure Fantasy Action Name: genres, dt..."
Hart's War,45.54987,"599 Drama War Name: genres, dtype: object"
The Legend of Hercules,45.510034,"610 Action Adventure Name: genres, dtype: o..."


In [87]:
df.loc[df['title'] == 'Poseidon']['overview']

104    A packed cruise ship traveling the Atlantic is...
Name: overview, dtype: object