<a href="https://colab.research.google.com/github/Adhiksha007/AI-Bootcamp/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
import kagglehub

# Download the latest version of the dataset through kagglehub. The call
# returns the local directory holding the dataset files, which later cells
# use as the base path for reading the CSV.
path = kagglehub.dataset_download("parasharmanas/movie-recommendation-system")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/movie-recommendation-system


In [72]:
# Read the movie catalogue (movies.csv) from the downloaded dataset directory.
data = pd.read_csv(path + "/movies.csv")

In [73]:
# Keep only the first 1000 movies so the pairwise similarity matrix stays small.
# head() returns a slice of the full catalogue; take an explicit copy so that
# later column assignments modify an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
data = data.head(1000).copy()
data

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
995,1018,That Darn Cat! (1965),Children|Comedy|Mystery
996,1019,"20,000 Leagues Under the Sea (1954)",Adventure|Drama|Sci-Fi
997,1020,Cool Runnings (1993),Comedy
998,1021,Angels in the Outfield (1994),Children|Comedy


In [74]:
# Turn the pipe-delimited genres string ("A|B|C") into a comma-separated one.
# regex=False forces "|" to be treated as a literal character on every pandas
# version ("|" is the alternation operator when interpreted as a regex, and
# the default of `regex` has changed between pandas releases).
# assign() builds a new frame rather than writing into a possible slice, and
# the cell stays idempotent: re-running it finds no "|" left to replace.
data = data.assign(genres=data["genres"].str.replace("|", ", ", regex=False))

In [75]:
# Display the frame again to check the reformatted genres column.
data

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy"
1,2,Jumanji (1995),"Adventure, Children, Fantasy"
2,3,Grumpier Old Men (1995),"Comedy, Romance"
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance"
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
995,1018,That Darn Cat! (1965),"Children, Comedy, Mystery"
996,1019,"20,000 Leagues Under the Sea (1954)","Adventure, Drama, Sci-Fi"
997,1020,Cool Runnings (1993),Comedy
998,1021,Angels in the Outfield (1994),"Children, Comedy"


In [76]:
# Summarize the frame: index range, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  1000 non-null   int64 
 1   title    1000 non-null   object
 2   genres   1000 non-null   object
dtypes: int64(1), object(2)
memory usage: 23.6+ KB


In [77]:
# Count the missing values in each column.
data.isnull().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0


In [78]:
# Create an instance of the vectorizer; English stop words are removed
# from the text before the TF-IDF weights are computed.
# NOTE(review): with the default tokenizer, hyphenated genres such as
# "Sci-Fi" are presumably split into separate tokens — confirm this is intended.
model = TfidfVectorizer(stop_words="english")

In [79]:
# Fit the vectorizer on the genres column and transform it in one step:
# each movie's genres string becomes one row of TF-IDF features in a
# sparse matrix.
tfidf_matrix = model.fit_transform(data["genres"])

In [80]:
# Show the sparse matrix summary (dtype, stored elements, shape).
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2109 stored elements and shape (1000, 21)>

In [81]:
# Pairwise cosine similarity between the TF-IDF rows of all movies.
# When the second argument is omitted, scikit-learn compares the rows of
# the matrix with themselves, giving a square movie-by-movie matrix.
cosine_matrix = cosine_similarity(tfidf_matrix)

In [82]:
# function to recommend movies based on cosine similarities
def get_recommendations(movie_title, cosine_matrix=cosine_matrix, data=data, top_n=2):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value of ``data["title"]`` to look up. If the title occurs
        more than once, the first occurrence is used.
    cosine_matrix : 2-D array-like
        Pairwise similarity scores; row ``i`` must hold the scores of the
        ``i``-th row of ``data`` against every row of ``data``. The default
        is the notebook-level matrix as it existed when this cell was run.
    data : pandas.DataFrame
        Frame with a ``title`` column, in the same row order as
        ``cosine_matrix``. The default is the notebook-level frame as it
        existed when this cell was run.
    top_n : int, default 2
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` movies, most similar first. Movies with
        equal scores keep their original row order.

    Raises
    ------
    IndexError
        If ``movie_title`` is not present in ``data["title"]``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Locate the movie by *position* rather than by index label, because the
    # similarity matrix is positional and `data` may not have a 0..n-1 index.
    is_match = (data["title"] == movie_title).tolist()
    if True not in is_match:
        # IndexError is kept for callers that already catch the error the
        # previous implementation raised for unknown titles.
        raise IndexError(f"movie title not found: {movie_title!r}")
    query_pos = is_match.index(True)

    # Drop the queried movie explicitly. Relying on it being first after the
    # sort is wrong when another movie ties with it (identical genres at an
    # earlier row) or when its own similarity is not the maximum.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_matrix[query_pos])
        if pos != query_pos
    ]

    # Stable sort: highest similarity first, ties in original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [pos for pos, _ in candidates[:top_n]]

    # return the titles
    return data["title"].iloc[top_positions]

In [84]:
# Try the recommendation system on one title taken from the dataset;
# the returned titles are displayed as the cell output.
movie_title = "Father of the Bride Part II (1995)"
get_recommendations(movie_title)

Unnamed: 0,title
17,Four Rooms (1995)
18,Ace Ventura: When Nature Calls (1995)
