<a href="https://colab.research.google.com/github/Abhishek1923/Edureka_DSML/blob/main/Movie_Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1.
Create a popularity-based recommender system at the genre level. The user inputs a genre (g), a minimum reviews threshold (t) for a movie, and the number of
recommendations (N). The system should then recommend the top N movies that are
most popular within that genre (g), ordered by rating in descending order, where each movie has at least (t) reviews.

Example:
Input:

• Genre (g) : Comedy
• Minimum reviews threshold (t) : 100
• Num recommendations (N) : 5

In [None]:
import pandas as pd

# Read the per-user ratings and the movie catalogue from the Colab working directory.
# The movies file carries an ".xls" extension but is parsed here as delimited text.
ratings_df = pd.read_csv(filepath_or_buffer="/content/ratings.csv")
movies_df = pd.read_csv(filepath_or_buffer="/content/movies.xls")

In [None]:
# Preview the first five rows of the movie catalogue
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first five rows of the ratings table
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [None]:
# Join the ratings onto the movie metadata via the shared movieId key.
# This is an inner join, so rows without a match on both sides are dropped.
movie_ratings = movies_df.merge(ratings_df, how='inner', on='movieId')

# Peek at the joined frame
movie_ratings.head(n=5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286


In [None]:
# Collect the three parameters of the popularity-based recommender.
# The genre is stored as `genre_input`, which is the name the filtering and display
# cells below read; `genre` is kept as an alias for backward compatibility.
# Surrounding whitespace is stripped so a stray space does not defeat the genre match.
genre_input = input("Enter the genre you are interested in: ").strip()
genre = genre_input
min_reviews_threshold = int(input("Enter the minimum number of reviews required: "))
num_recommendations = int(input("Enter the number of recommendations you want: "))

Enter the genre you are interested in: Comedy
Enter the minimum number of reviews required: 100
Enter the number of recommendations you want: 5


In [None]:
# Keep only the rating rows of movies whose genre string contains the requested genre.
# regex=False matches the typed text literally (no regex metacharacter surprises), and
# .copy() yields an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
genre_mask = movie_ratings['genres'].str.contains(genre_input, case=False, na=False, regex=False)
genre_movies = movie_ratings[genre_mask].copy()

# Attach to every rating row the number of ratings its movie received
genre_movies['num_reviews'] = genre_movies.groupby('title')['rating'].transform('count')

# Keep only movies that reach the minimum reviews threshold; copy again so that later
# cells can add columns to the result without the same warning.
genre_movies = genre_movies[genre_movies['num_reviews'] >= min_reviews_threshold].copy()

genre_movies.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_movies['num_reviews'] = genre_movies.groupby('title')['rating'].transform('count')


Unnamed: 0,movieId,title,genres,userId,rating,timestamp,num_reviews
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895,232
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039,232
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933,232
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810,232
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286,232


In [None]:
# Calculate a popularity score for each movie: its own mean rating multiplied by its
# own number of ratings. Both statistics are computed per title and broadcast back onto
# the rating rows, so different movies receive different scores (a frame-wide
# mean() * count() would give every row the same constant and rank nothing).
ratings_by_title = genre_movies.groupby('title')['rating']
genre_movies = genre_movies.assign(
    popularity_score=ratings_by_title.transform('mean') * ratings_by_title.transform('count')
)


In [None]:
# Order rows from highest to lowest popularity score, breaking ties by rating (also descending)
sorted_movies = genre_movies.sort_values(['popularity_score', 'rating'], ascending=False)


In [None]:
# Collapse to one row per title (keeping the first occurrence in sorted order), then take the first N rows
recommendations = sorted_movies.drop_duplicates(subset=['title'], keep='first').iloc[:num_recommendations]


In [None]:
# Display the recommended movies with per-movie statistics.
# The 'rating' column holds a single user's rating per row, so printing it would show one
# arbitrary rating per movie. It is replaced here by the movie's average rating over all of
# its rows in the filtered genre frame, alongside its review count.
average_rating_by_title = genre_movies.groupby('title')['rating'].mean()
recommendation_summary = recommendations[['title', 'num_reviews']].assign(
    average_rating=recommendations['title'].map(average_rating_by_title).round(2)
)
print("Top", num_recommendations, "movies in the", genre_input, "genre with at least", min_reviews_threshold, "reviews:")
print(recommendation_summary[['title', 'average_rating', 'num_reviews']])

Top 5 movies in the Comedy genre with at least 100 reviews:
                      title  rating
0          Toy Story (1995)     5.0
1254      Get Shorty (1995)     5.0
4367   Birdcage, The (1996)     5.0
4874  Batman Forever (1995)     5.0
6587          Clerks (1994)     5.0


# 2
Create a content-based recommender system that recommends the top N movies whose genres are most similar to those of a given movie (m).

Example:
Input:

• Movie Title (t): Toy Story
• Num recommendations (N): 5

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Read the movie catalogue; the ".xls" file is parsed as delimited text
movies_df = pd.read_csv(filepath_or_buffer='/content/movies.xls')

# Preview the first five rows
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Read the reference movie title and the number of recommendations to return.
# Surrounding whitespace is stripped so a stray leading/trailing space does not defeat
# the substring-based title lookup performed in the next cell.
movie_title = input("Enter the Movie Title: ").strip()
num_recommendations = int(input("Enter the number of recommendations you want: "))

Enter the Movie Title: Toy Story
Enter the number of recommendations you want: 5


In [None]:
# Locate the first movie whose title contains the typed text (case-insensitive).
# regex=False treats the text literally, so a full title such as "Toy Story (1995)" —
# whose parentheses would otherwise be parsed as a regex group — still matches.
title_matches = movies_df['title'].str.contains(movie_title, case=False, na=False, regex=False)
if not title_matches.any():
    # Fail with an explicit message instead of an IndexError from indexing an empty result
    raise ValueError(f"No movie title contains {movie_title!r}")

# Row position (not index label) of the first match: the similarity matrix is indexed by
# row position and the later title lookup uses .iloc, so a position is what both need.
input_movie_index = int(title_matches.to_numpy().argmax())


In [None]:
# Convert each movie's pipe-separated genre string into a TF-IDF vector, one token per genre.
# token_pattern=None states explicitly that the default regex token pattern is unused because
# a custom tokenizer is supplied; this avoids scikit-learn's "token_pattern will not be used"
# warning.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    tokenizer=lambda genres: genres.split('|'),
    token_pattern=None,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['genres'])



In [None]:
# Pairwise genre similarity between all movies. TfidfVectorizer L2-normalises rows by default,
# so the dot product computed by linear_kernel equals cosine similarity. With a single
# argument the matrix is compared against itself.
cosine_sim = linear_kernel(tfidf_matrix)


In [None]:
# Pair every movie's row position with its similarity to the input movie
similarity_scores = [
    (position, score)
    for position, score in enumerate(cosine_sim[input_movie_index])
]


In [None]:
# Rank the (position, score) pairs from most to least similar; the sort is stable, so
# pairs with equal scores keep their original relative order
sorted_similar_movies = list(similarity_scores)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)


In [None]:
# Get the row positions of the top N most similar movies, excluding the input movie itself.
# The input movie is filtered out by position rather than by dropping the first ranked entry:
# when another movie has an identical genre set (similarity tie at the top), the input movie
# is not guaranteed to be ranked first, and slicing [1:] would then recommend it to itself.
top_N_similar_indices = [
    movie_position
    for movie_position, _ in sorted_similar_movies
    if movie_position != input_movie_index
][:num_recommendations]


In [None]:
# Pull the titles at the selected row positions, preserving the ranked order
recommended_movies = movies_df['title'].iloc[top_N_similar_indices]


In [None]:
# Announce the request on one space-separated line, then show the recommended titles
print(" ".join(str(part) for part in ("Top", num_recommendations, "movies similar to", movie_title, "based on genres:")))
print(recommended_movies)

Top 5 movies similar to Toy Story based on genres:
1815                                       Antz (1998)
2496                                Toy Story 2 (1999)
2967    Adventures of Rocky and Bullwinkle, The (2000)
3166                  Emperor's New Groove, The (2000)
3811                             Monsters, Inc. (2001)
Name: title, dtype: object


# 3
Create a collaborative-filtering-based recommender system that recommends the top N movies
based on the “K” most similar users for a target user “u”.

Example:
Input:

• UserID: 1
• Num recommendations(N): 5
• Threshold for similar users (k): 100

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Read the long-format ratings table (one row per user/movie rating)
ratings_df = pd.read_csv(filepath_or_buffer='/content/ratings.csv')

# Preview the first five rows
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [None]:
# Movie catalogue, used later to translate recommended movieIds into titles
movies_df = pd.read_csv(filepath_or_buffer='/content/movies.xls')

In [None]:
# Parameters of the user-based collaborative recommender, each parsed as a base-10 integer:
# the target user, how many movies to return, and how many similar users to consult.
user_id = int(input("Enter the UserID: "), 10)
num_recommendations = int(input("Enter the number of recommendations you want: "), 10)
k_similar_users = int(input("Enter the threshold for similar users (k): "), 10)


Enter the UserID: 1
Enter the number of recommendations you want: 5
Enter the threshold for similar users (k): 100


In [None]:
# Reshape the long ratings table into a users x movies matrix:
# one row per userId, one column per movieId, NaN where a user has not rated a movie
user_item_matrix = ratings_df.pivot(values='rating', columns='movieId', index='userId')


In [None]:
# Replace every missing rating with a single global value: the mean of the per-movie
# (column-wise) mean ratings
mean_rating = user_item_matrix.mean(axis=0).mean()
user_item_matrix = user_item_matrix.fillna(value=mean_rating)

In [None]:
# User-to-user cosine similarity over the filled rating vectors; with a single argument
# the matrix is compared against itself
cosine_sim_users = cosine_similarity(user_item_matrix)


In [None]:
# Find the row positions of the K users most similar to the target user.
# The target's row is looked up by its userId label rather than assumed to sit at
# position user_id - 1, which only holds when userIds are exactly 1..n with no gaps.
if user_id not in user_item_matrix.index:
    raise ValueError(f"UserID {user_id} has no ratings in the dataset")
target_user_position = user_item_matrix.index.get_loc(user_id)

# Rank every user from most to least similar, then drop the target explicitly instead of
# assuming it is always the single top-ranked entry (another user can tie at the top).
ranked_user_positions = cosine_sim_users[target_user_position].argsort()[::-1]
similar_users_indices = ranked_user_positions[ranked_user_positions != target_user_position][:k_similar_users]


In [None]:
# Select the full rating rows (all movie columns) of the similar users by row position
similar_users_ratings = user_item_matrix.iloc[similar_users_indices, :]


In [None]:
# Average each movie column over the similar users' rows
mean_ratings = similar_users_ratings.mean(axis=0)


In [None]:
# Rank movies from highest to lowest mean rating among the similar users
sorted_movies = mean_ratings.sort_values(axis=0, ascending=False)


In [None]:
# Get the movieIds of the top N recommended movies, skipping movies the target user has
# already rated — recommending titles the user has already seen adds nothing new.
# The already-rated set is taken from the raw ratings table.
already_rated_movie_ids = ratings_df.loc[ratings_df['userId'] == user_id, 'movieId']
unseen_ranked_movie_ids = sorted_movies.index[~sorted_movies.index.isin(already_rated_movie_ids)]
top_N_movie_indices = unseen_ranked_movie_ids[:num_recommendations]


In [None]:
# Look up the titles of the top N recommended movies in ranked order.
# Filtering movies_df with isin() would return titles in catalogue order and lose the ranking,
# so the ranked movieIds are used to reindex a movieId -> title lookup instead.
# drop_duplicates keeps the lookup index unique; dropna discards ids missing from the catalogue.
title_by_movie_id = movies_df.drop_duplicates(subset='movieId').set_index('movieId')['title']
recommended_movies = title_by_movie_id.reindex(top_N_movie_indices).dropna().unique()


In [None]:
# Announce the request on one space-separated line, then show the recommended titles
print(" ".join(str(part) for part in ("Top", num_recommendations, "recommended movies for User", user_id, "based on", k_similar_users, "similar users:")))
print(recommended_movies)

Top 5 recommended movies for User 1 based on 100 similar users:
['Pulp Fiction (1994)' 'Shawshank Redemption, The (1994)'
 'Fugitive, The (1993)' "Schindler's List (1993)"
 'Silence of the Lambs, The (1991)']
