# **Movie Recommendation System**

In [3]:
import pandas as pd
import numpy as np

In [8]:
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [9]:
movies=movies.merge(credits,on='title')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [17]:
movies=movies[['movie_id','title','overview','genres','keywords','cast','crew']]
movies.info()
movies.shape

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   title     4806 non-null   object
 2   overview  4806 non-null   object
 3   genres    4806 non-null   object
 4   keywords  4806 non-null   object
 5   cast      4806 non-null   object
 6   crew      4806 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.4+ KB


(4806, 7)

### **Data Cleaning**

In [11]:
movies.isnull().sum()
movies.dropna(inplace=True)
movies.drop_duplicates(inplace=True)
movies.duplicated().sum()

0

### **converting feature's dictionary form into list form**

In [13]:
import ast

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [14]:
#function used to convert ['genres'] attribut's dictionary form into list form
#for that we have used ast library
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [15]:
movies['genres']=movies['genres'].apply(convert)

In [18]:
movies['keywords']=movies['keywords'].apply(convert)

In [19]:
#we are only focusing on top 3 cast of the movie.
def convert_cast(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [20]:
movies['cast']=movies['cast'].apply(convert_cast)

In [22]:
#we are only fetching director's name amongest this big crew team
def convert_crew(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
            break
    return L

In [23]:
movies['crew']=movies['crew'].apply(convert_crew)

In [24]:
#converting overview - string to list

movies['overview']=movies['overview'].apply(lambda x:x.split())

In [25]:
#removing " " in each name for model's better performance
# i.e Sam Worthington, Sam Mendes

movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [26]:
#as our expected format we will concatinate attributes to 'tags' attribute
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [27]:
new_df=movies[['movie_id','title','tags']]

In [30]:
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))


In [31]:
#converting all tags into lower case
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


In [32]:
 !pip install nltk



### **Perfoming Stemming**

In [33]:
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [34]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)


In [35]:
new_df['tags']= new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(stem)


### **Vectorization**

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [37]:
vectors=cv.fit_transform(new_df['tags']).toarray()

In [38]:
 cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

### **Calculating COSINE SIMILARITY**

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
similarity=cosine_similarity(vectors)

**Main Function**

In [42]:
def recommend(movie):
    # Find the index of the given movie
    movie_index = new_df[new_df['title'] == movie].index[0]

    # Retrieve similarity scores for the movie
    distances = similarity[movie_index]

    # Sort the movies based on similarity scores in descending order, excluding the given movie itself
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    # Print the titles of the recommended movies
    recommended_movies = []
    for i in movies_list:
        recommended_movies.append(new_df.iloc[i[0]].title)

    return recommended_movies


In [44]:
recommend('Batman Begins')
#recommend('Bang')

['The Dark Knight', 'Batman', 'Batman', 'The Dark Knight Rises', '10th & Wolf']

**Used for Streamlit**

In [45]:
import pickle

In [46]:
pickle.dump(new_df,open('movies.pkl','wb'))

In [47]:
pickle.dump(new_df.to_dict(),open('mpvie_dict.pkl','wb'))

In [48]:
pickle.dump(similarity,open('similarity.pkl','wb'))