In [25]:
import pandas as pd

In [26]:
movies_data = pd.read_csv('movies_metadata.csv', low_memory=False)
movies_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [56]:
# Handle missing values
movies_data['overview'] = movies_data['overview'].fillna('')
movies_data['tagline'] = movies_data['tagline'].fillna('')
movies_data['genres'] = movies_data['genres'].fillna('')

# Extract genres from the genres column
def extract_genres(genres_str):
    try:
        genres_list = ast.literal_eval(movies_data.iloc[:]['genres'])
        genre_name = genres_list[0]['name']
        return genre_name
    except:
        return ''

movies_data['genres_str'] = movies_data['genres'].apply(extract_genres)

# Combine genres, overview, and tagline into a single string
movies_data['combined_text'] = movies_data['genres_str'] + ' ' + movies_data['overview'] + ' ' + movies_data['tagline']

# Display the first few rows of the combined text
movies_data[['title', 'combined_text']].head()

Unnamed: 0,title,combined_text
0,Toy Story,"Led by Woody, Andy's toys live happily in his..."
1,Jumanji,When siblings Judy and Peter discover an ench...
2,Grumpier Old Men,A family wedding reignites the ancient feud b...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wo..."
4,Father of the Bride Part II,Just when George Banks has recovered from his...


In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Now we neeed to capture the significance of each word to apply ML algorithms
#initialized vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

#Calculating TF-IDF matrix for the combined text
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data['combined_text'])
tfidf_matrix.shape

(45466, 77132)

In [58]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.01408453, 0.        , ..., 0.        , 0.00570811,
        0.        ],
       [0.01408453, 1.        , 0.04202193, ..., 0.07202831, 0.02044419,
        0.00869903],
       [0.        , 0.04202193, 1.        , ..., 0.        , 0.0130777 ,
        0.        ],
       ...,
       [0.        , 0.07202831, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00570811, 0.02044419, 0.0130777 , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.00869903, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [47]:
sorted(list(enumerate(cosine_sim[0])), reverse=True,key= lambda x: x[1])[1:4]

[(15348, 0.5064092388349712),
 (2997, 0.46939776835860814),
 (24523, 0.2732957026163847)]

In [93]:
def recommend_movie(movie_title):
    # Get the index of the movie from its title
    movie_index = movies_data[movies_data['title'] == movie_title].index[0]
    list_movie = sorted(list(enumerate(cosine_sim[movie_index])), reverse=True,key= lambda x: x[1])[1:6]
    recommended_movies = [movies_data.iloc[i[0]].title for i in list_movie]
    return recommended_movies

In [96]:
recommend_movie('Toy Story')

['Toy Story 3',
 'Toy Story 2',
 'Small Fry',
 'The 40 Year Old Virgin',
 "Andy Hardy's Blonde Trouble"]