In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two TMDB CSV exports (paths are relative to the working directory).
movies, credits = (
    pd.read_csv(csv_name)
    for csv_name in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [None]:
# Preview the first rows of the movies table.
movies.head(n=5)

In [None]:
# Preview the first rows of the credits table.
credits.head(n=5)

# DATA PREPROCESSING
# removing features that will be of no use in the model.

In [None]:
# Join the credits columns onto the movies table, matching rows by title.
# NOTE(review): if 'title' is not unique in both files, every repeated title
# yields extra cross-matched rows here -- confirm, or join on a movie id instead.
movies = pd.merge(movies, credits, on="title")
movies.head(n=2)

In [None]:
# Removing the features below:
 # budget
 # homepage
 # id
 # original_language
 # original_title
 # popularity
 # production_companies
 # production_countries
 # release_date (not sure)

In [None]:
# Keep only the columns used to build the recommender; all other features
# are removed from the dataframe.
movies = movies[
    [
        'movie_id',
        'title',
        'overview',
        'genres',
        'keywords',
        'cast',
        'crew',
    ]
]
movies.head(n=3)

In [None]:
# Inspect the dtype pandas assigned to each remaining column.
movies.dtypes

In [None]:
import ast



In [None]:
 def formatting(text):
     """Return the 'name' value of every dict in a stringified list of dicts.

     ``text`` is parsed with ``ast.literal_eval``; the names come back as a
     list, in the same order as the parsed entries.
     """
     return [entry['name'] for entry in ast.literal_eval(text)]
#This function extracts the value associated with the key 'name' from each dictionary.

In [None]:
# Drop every row that has a missing value, then renumber the index 0..n-1.
# Without the reset, the surviving rows keep their old labels, so a label taken
# from this frame (or from frames derived from it) no longer equals the row's
# position in arrays that are later built from it row by row.
movies=movies.dropna().reset_index(drop=True)
# Turn the stringified list of genre dicts into a plain list of genre names.
movies['genres']=movies['genres'].apply(formatting)
movies.head(3)

In [None]:
# Replace the stringified keyword dicts with a plain list of keyword names.
movies['keywords'] = movies['keywords'].map(formatting)
movies.head(n=1)

In [None]:
def formatting2(text):
    """Return the 'name' of at most the first three dicts encoded in ``text``.

    ``text`` is parsed with ``ast.literal_eval``; entries after the third are
    never inspected.
    """
    # zip() stops as soon as range(3) is exhausted, which caps the result at
    # three names without needing a manual counter.
    return [entry['name'] for _, entry in zip(range(3), ast.literal_eval(text))]
# This extracts the names of the first 3 cast members of a movie.

In [None]:
# Reduce each cast entry to a list holding at most three names.
movies['cast'] = movies['cast'].map(formatting2)
movies.head(n=3)

In [None]:
def formatting3(text):
    """Return a one-element list with the first 'Director' name found in ``text``.

    ``text`` is parsed with ``ast.literal_eval`` into dicts that carry 'job'
    and 'name' keys. An empty list is returned when no entry has the job
    'Director'.
    """
    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            # Only the first matching entry is kept.
            return [member['name']]
    return []

In [None]:
# Reduce each crew entry to a list holding the director's name (or nothing).
movies['crew'] = movies['crew'].map(formatting3)
movies.head(n=3)

In [None]:
def collapse(text):
    """Return a new list with every space removed from each string in ``text``."""
    return [item.replace(" ", '') for item in text]
# This function removes the spaces inside each element of the list (e.g. 'Sam Worthington' -> 'SamWorthington').

In [None]:
# Strip the spaces inside every genre / crew / cast / keyword entry so that a
# multi-word value becomes a single token.
movies['genres']=movies['genres'].apply(collapse)
movies['crew']=movies['crew'].apply(collapse)
movies['cast']=movies['cast'].apply(collapse)
# The collapsed keywords must be written back to 'keywords': that is the column
# concatenated into 'tags' later on. Writing them only to a misspelled
# 'keyboards' column left the keywords in 'tags' with their spaces intact.
movies['keywords']=movies['keywords'].apply(collapse)
# A later cell drops a column named 'keyboards'; keep it as an alias so that
# drop does not raise a KeyError.
movies['keyboards']=movies['keywords']

In [None]:
# Check the collapsed columns.
movies.head(n=5)

In [None]:
# Split each overview on whitespace so the column holds a list of words.
movies['overview'] = movies['overview'].apply(lambda summary: summary.split())
movies.head(n=1)

In [None]:
# Build the 'tags' column by concatenating, row by row, the cast, crew,
# keywords, overview and genres lists (in that order).
movies['tags'] = (
    movies['cast']
    + movies['crew']
    + movies['keywords']
    + movies['overview']
    + movies['genres']
)


In [None]:
# The individual feature columns are now folded into 'tags'; build a new frame
# without them (and without the 'keyboards' column).
new = movies.drop(
    ['overview', 'genres', 'keywords', 'cast', 'crew', 'keyboards'],
    axis=1,
)
new.head(n=5)

In [None]:
# Collapse each list of tags into one space-separated string.
new['tags'] = new['tags'].apply(" ".join)
new.head(n=5)
# The 'tags' column now contains plain strings.
# With this, the data preprocessing is complete.

# APPLYING COSINE SIMILARITY

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer converts a collection of text documents into a matrix of
# token counts.
#   max_features=5000    -> only the 5000 most frequent tokens form the vocabulary
#   stop_words='english' -> the built-in list of common English words is ignored
#                           during tokenization
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [None]:
# Fit the vocabulary on the 'tags' column and build the document-term matrix:
# each row corresponds to one entry of new['tags'], each column to one token of
# the vocabulary, and each value is the count of that token in that entry.
# fit_transform() returns a sparse matrix (only non-zero elements are stored);
# toarray() converts it to a dense NumPy array.
vector = cv.fit_transform(raw_documents=new['tags']).toarray()

In [None]:

# (rows, columns) of the document-term matrix built from new['tags'].
vector.shape
# The shape is (4806, 5000): 4806 is the number of movies and 5000 is the number of most common words kept from the tags column.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of `vector`; the result is
# returned as a NumPy array.
similarity = cosine_similarity(X=vector)

In [None]:
# Display the pairwise similarity array computed from `vector`.
similarity

In [None]:
def recommend_movies(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new['title']``.
    top_n : int, optional
        Number of recommendations to print (default 5).

    Raises
    ------
    IndexError
        If no row of ``new`` carries that title.
    """
    # Use the row *position* of the first match, not its index label:
    # ``similarity`` is a plain array addressed by position, and the labels of
    # ``new`` stop matching positions as soon as rows have been dropped from
    # the frame without resetting its index.
    title_hits = np.flatnonzero((new['title'] == movie).to_numpy())
    if title_hits.size == 0:
        raise IndexError(f"no movie titled {movie!r} in new['title']")
    movie_pos = int(title_hits[0])
    scores = similarity[movie_pos]

    # Exclude the queried movie explicitly instead of assuming it sorts first;
    # another row with an identical score could otherwise take its place and
    # the queried movie would be recommended to itself.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(scores) if pos != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for pos, _ in ranked[:top_n]:
        print(new['title'].iloc[pos])


In [None]:
# Example query: print the recommendations for 'Avatar'.
recommend_movies(movie='Avatar')

In [None]:
import pickle

In [None]:
# pickle.dump() serializes a Python object and writes it to a file.
# 'movies.pkl' is opened in binary write mode inside a `with` block so the
# file handle is closed as soon as the dump finishes.
# After this cell runs, 'movies.pkl' exists in the current directory and holds
# the serialized representation of the `new` dataframe.
# It can be loaded again later with pickle.load(); only unpickle files you trust.
with open('movies.pkl','wb') as movies_file:
    pickle.dump(new,movies_file)


In [None]:
# Serialize the similarity array to 'similarity.pkl'; the `with` block closes
# the file handle as soon as the dump finishes.
with open('similarity.pkl','wb') as similarity_file:
    pickle.dump(similarity,similarity_file)

In [None]:
# Display the final dataframe (the one written to 'movies.pkl').
new