In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#Walking the input directory and printing the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movies-dataset-recommender-system/movie_dataset.csv


In [2]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Loading the movies dataset CSV from the Kaggle input directory into a DataFrame
data = pd.read_csv('/kaggle/input/movies-dataset-recommender-system/movie_dataset.csv')

In [4]:
#Checking the shape of the data as (number of rows, number of columns)
data.shape

(4803, 24)

In [5]:
#Previewing the first two rows of the dataset
data.head(2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


In [6]:
#Listing the names of all columns available in the data
data.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [7]:
#Creating data with important features
#These four text columns are the only inputs used to measure how similar two movies are
imp_features = ['keywords', 'cast', 'genres', 'director']
#Taking an explicit copy so that columns assigned to data1 afterwards modify an independent
#frame instead of a slice of `data` (assigning into a slice triggers SettingWithCopyWarning)
data1 = data[imp_features].copy()
data1.head()

Unnamed: 0,keywords,cast,genres,director
0,culture clash future space war space colony so...,Sam Worthington Zoe Saldana Sigourney Weaver S...,Action Adventure Fantasy Science Fiction,James Cameron
1,ocean drug abuse exotic island east india trad...,Johnny Depp Orlando Bloom Keira Knightley Stel...,Adventure Fantasy Action,Gore Verbinski
2,spy based on novel secret agent sequel mi6,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Action Adventure Crime,Sam Mendes
3,dc comics crime fighter terrorist secret ident...,Christian Bale Michael Caine Gary Oldman Anne ...,Action Crime Drama Thriller,Christopher Nolan
4,based on novel mars medallion space travel pri...,Taylor Kitsch Lynn Collins Samantha Morton Wil...,Action Adventure Science Fiction,Andrew Stanton


In [8]:
#Counting the missing values in each selected feature column
data1.isnull().sum()

keywords    412
cast         43
genres       28
director     30
dtype: int64

In [9]:
#Replacing missing values with empty strings
#isnull().sum().keys() lists every column (not only those with gaps), so the previous
#per-column loop filled the whole frame; a single frame-wide fillna does the same thing
#and rebinds data1 to a new frame instead of assigning column by column into a slice
data1 = data1.fillna('')

In [10]:
#Re-checking the missing value counts after replacing them with empty strings
data1.isnull().sum()

keywords    0
cast        0
genres      0
director    0
dtype: int64

In [11]:
#Joining keywords, cast, genres and director (in that order) into one
#space-separated text per movie
data1['Combined_features'] = data1['keywords'].str.cat(
    data1[['cast', 'genres', 'director']], sep=' '
)

In [12]:
#Previewing the first five rows of the combined features
data1['Combined_features'].head(5)

0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6 Dan...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: Combined_features, dtype: object

In [13]:
#Creating a CountVectorizer object with its default settings
cv = CountVectorizer()

In [14]:
#Fitting the vectorizer on the combined features and building the token count matrix
#(one row per movie, in the same row order as data1)
count_matrix = cv.fit_transform(data1['Combined_features'])

In [15]:
#Computing the pairwise cosine similarity between all rows (movies) of the count matrix
cosin_sim = cosine_similarity(count_matrix)

In [16]:
#Checking the cosine similarity scores of the movie at row position 5 against every movie
#(this shows one full row of the matrix, not the first five values)
cosin_sim[5]

array([0.19245009, 0.25560386, 0.12510865, ..., 0.04      , 0.        ,
       0.        ])

In [17]:
#Defining function for movie recommendation system
def get_recommender(movie_title, top_n=10):
    """Return the movies most similar to ``movie_title``.

    Relies on two notebook-level objects: ``data`` (the movies DataFrame) and
    ``cosin_sim`` (the square similarity matrix whose rows and columns follow
    the row order of ``data``).

    Parameters
    ----------
    movie_title : str
        Title to look up; compared for exact equality with the ``title`` column.
        If several rows share the title, the first one is used.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        A single column, ``'Recommended Movies'``, holding ``original_title``
        values ordered from most to least similar, with a fresh 0..n-1 index.
        The queried movie is part of the ranking and is not filtered out.

    Raises
    ------
    IndexError
        If no row of ``data`` has the given title.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    #Finding the row position of the given movie title; the similarity matrix is
    #addressed by position, so the position is used rather than the 'index' column value
    matches = np.flatnonzero((data['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title {movie_title!r} was not found in the dataset")
    movie_position = matches[0]

    #Ranking every movie by its similarity to the given movie, highest first;
    #the stable sort keeps movies with equal scores in their original row order
    scores = np.asarray(cosin_sim[movie_position])
    ranked_positions = np.argsort(-scores, kind='stable')[:top_n]

    #Extracting the movie names for the best-ranked positions
    movies_name = data['original_title'].iloc[ranked_positions]

    #Building the result from plain values so that the index restarts at 0
    return pd.DataFrame({'Recommended Movies': movies_name.to_numpy()})

In [18]:
#I like the movie Avatar; let's check which movies are recommended as similar to it
get_recommender('Avatar')

Unnamed: 0,Recommended Movies
0,Avatar
1,Guardians of the Galaxy
2,Aliens
3,Star Wars: Clone Wars (Volume 1)
4,Star Trek Into Darkness
5,Star Trek Beyond
6,Alien
7,Lockout
8,Jason X
9,The Helix... Loaded


Please upvote my work if you like it!

Thank You!