### Importing required libraries and frameworks

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [None]:
# Folder that holds movies.csv and ratings.csv. "/content" is presumably the
# Google Colab working directory; change this one constant to run elsewhere.
DATA_DIR="/content"
Movies_Data=pd.read_csv(f"{DATA_DIR}/movies.csv")
Ratings_Data=pd.read_csv(f"{DATA_DIR}/ratings.csv")

### Describing the Data

In [None]:
# Preview the movie catalogue: movieId, title and pipe-separated genres.
Movies_Data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the ratings table: one row per (userId, movieId) rating with its timestamp.
Ratings_Data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [None]:
# Column dtypes and non-null counts for the movie catalogue.
Movies_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [None]:
# Column dtypes and non-null counts for the ratings table.
Ratings_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [None]:
# (rows, columns) of each table.
print("Shape of Movie Dataset : ",Movies_Data.shape)
print("Shape of Rating Dataset : ",Ratings_Data.shape)

Shape of Movie Dataset :  (10329, 3)
Shape of Rating Dataset :  (105339, 4)


In [None]:
# Summary statistics of the numeric columns (only movieId here, so mostly an id-range check).
Movies_Data.describe()

Unnamed: 0,movieId
count,10329.0
mean,31924.282893
std,37734.741149
min,1.0
25%,3240.0
50%,7088.0
75%,59900.0
max,149532.0


In [None]:
# Summary statistics of the ratings table; the `rating` column is the one of interest.
Ratings_Data.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


### Finding unique users and movies

In [None]:
# (distinct movies in the catalogue, distinct movies that received a rating, distinct users).
# A smaller second number means some catalogue movies were never rated.
Movies_Data['movieId'].nunique(), Ratings_Data['movieId'].nunique(),Ratings_Data['userId'].nunique()

(10329, 10325, 668)

### Average rating and total movies at genre level

In [None]:
# Turn the pipe-delimited genre string into a list of genres per movie.
# The isinstance guard keeps the cell idempotent: running `.str.split` a second
# time on values that are already lists would replace every entry with NaN.
Movies_Data['genres']=Movies_Data['genres'].map(lambda g: g.split("|") if isinstance(g,str) else g)

In [None]:
# Confirm that `genres` now holds a list of genre names per movie.
Movies_Data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [None]:
# One row per (movie, genre) so that genre-level statistics can be computed.
Movies_Data1=Movies_Data.explode('genres')
# Attach title/genre to every rating; a rating is repeated once per genre of its movie.
Merged_Data=Ratings_Data.merge(Movies_Data1,on=['movieId'],how="inner")
# Per genre: number of distinct rated titles and the mean rating.
print(
    Merged_Data.groupby('genres')
    .agg(unique_movies_count=("title","nunique"),
         mean_rating=("rating","mean"))
    .reset_index()
)

                genres  unique_movies_count  mean_rating
0   (no genres listed)                    7     3.071429
1               Action                 1736     3.451450
2            Adventure                 1164     3.518027
3            Animation                  400     3.635350
4             Children                  540     3.439429
5               Comedy                 3513     3.420996
6                Crime                 1440     3.642392
7          Documentary                  415     3.643035
8                Drama                 5217     3.650266
9              Fantasy                  670     3.500459
10           Film-Noir                  195     3.913636
11              Horror                 1001     3.281097
12                IMAX                  152     3.641821
13             Musical                  409     3.571962
14             Mystery                  675     3.652043
15             Romance                 1788     3.544255
16              Sci-Fi         

In [None]:
# Drop the placeholder genre from the exploded catalogue.
# Merged_Data was built before this filter and therefore still contains it.
has_genre=Movies_Data1['genres'].ne("(no genres listed)")
Movies_Data1=Movies_Data1.loc[has_genre]

In [None]:
# Preview the merged table; each rating appears once per genre of the rated movie.
Merged_Data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime
1,1,16,4.0,1217897793,Casino (1995),Drama
2,9,16,4.0,842686699,Casino (1995),Crime
3,9,16,4.0,842686699,Casino (1995),Drama
4,12,16,1.5,1144396284,Casino (1995),Crime


### 1. Popularity-based Recommendations

In [None]:

def TopNPopularMovies(genre,num_ratings_threshold,topN=5):
  """Return the best-rated movies of one genre that have enough ratings.

  Reads the notebook-level ``Merged_Data`` frame (one row per rating and genre).

  Parameters
  ----------
  genre : str
      Genre label to restrict to, e.g. ``"Action"``.
  num_ratings_threshold : int
      Minimum number of ratings a title needs in order to be considered.
  topN : int, default 5
      Maximum number of titles to return.

  Returns
  -------
  pandas.DataFrame
      Columns ``S.No``, ``Movie Title``, ``Average Movie Rating`` and
      ``Num Reviews``, ordered by descending average rating. Empty when no
      title of that genre reaches the threshold.
  """
  # Restrict to the requested genre first, so only its rows are aggregated
  # instead of every (genre, title) pair on each call.
  genre_ratings=Merged_Data[Merged_Data['genres']==genre]
  popularity_df=genre_ratings.groupby('title')['rating'].agg(['mean','size']).reset_index()
  popularity_df.columns=['Movie Title','Average Movie Rating','Num Reviews']

  enough_reviews=popularity_df['Num Reviews']>=num_ratings_threshold
  # reset_index returns a fresh frame, so adding the rank column below does not
  # write into a slice of popularity_df.
  topN_recommendations=(popularity_df[enough_reviews]
                        .sort_values(by=['Average Movie Rating'],ascending=False)
                        .head(topN)
                        .reset_index(drop=True))
  topN_recommendations['S.No']=list(range(1,len(topN_recommendations)+1))
  return topN_recommendations[['S.No','Movie Title','Average Movie Rating','Num Reviews']]


In [None]:
# Test: top 5 "Action" titles that have at least 100 ratings.
# These assignments create notebook-level variables; `topN` is reassigned by later test cells.
genre="Action"
num_ratings_threshold=100
topN=5
TopNPopularMovies(genre=genre,
                  num_ratings_threshold=num_ratings_threshold,
                  topN=topN)

Unnamed: 0,S.No,Movie Title,Average Movie Rating,Num Reviews
0,1,"Matrix, The (1999)",4.264368,261
1,2,Star Wars: Episode V - The Empire Strikes Back...,4.22807,228
2,3,Raiders of the Lost Ark (Indiana Jones and the...,4.212054,224
3,4,Inception (2010),4.18932,103
4,5,Star Wars: Episode IV - A New Hope (1977),4.188645,273


### 2. Content-based Recommendations

In [None]:
# Collapse the exploded catalogue back to one row per title, with its genres
# joined into a single space-separated string (the text that gets vectorised next).
Movies_Data2=Movies_Data1.groupby("title")["genres"].agg(" ".join).reset_index()
Movies_Data2

Unnamed: 0,title,genres
0,'71 (2014),Action Drama Thriller War
1,'Hellboy': The Seeds of Creation (2004),Action Adventure Comedy Documentary Fantasy
2,'Round Midnight (1986),Drama Musical
3,'Til There Was You (1997),Drama Romance
4,"'burbs, The (1989)",Comedy
...,...,...
10315,loudQUIETloud: A Film About the Pixies (2006),Documentary
10316,xXx (2002),Action Crime Thriller
10317,xXx: State of the Union (2005),Action Crime Thriller
10318,¡Three Amigos! (1986),Comedy Western


In [None]:
# TF-IDF over each title's genre string, using word uni-, bi- and tri-grams.
# min_df=1 keeps every term, exactly as the former min_df=0 did, but is accepted
# by scikit-learn's parameter validation (an integer min_df must be >= 1).
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(Movies_Data2['genres'])
# Dense title-by-title similarity; row/column i corresponds to row i of Movies_Data2.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
def recommendations_genre(movie_df,similarity_matrix,movie_title,topN=5):
    """Return the ``topN`` titles most similar to ``movie_title``.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Must contain a ``title`` column; row ``i`` corresponds to row/column
        ``i`` of ``similarity_matrix``.
    similarity_matrix : array-like, shape (len(movie_df), len(movie_df))
        Pairwise similarity scores between the rows of ``movie_df``.
    movie_title : str
        Title to find neighbours for.
    topN : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        Columns ``S.No`` and ``Movie Title``, most similar first. Ties keep
        the row order of ``movie_df``.

    Raises
    ------
    KeyError
        If ``movie_title`` is not present in ``movie_df['title']``.
    """
    titles = movie_df['title'].to_numpy()
    # Positional lookup, so the frame does not need a 0..n-1 index.
    positions = np.flatnonzero(titles == movie_title)
    if len(positions) == 0:
        raise KeyError(movie_title)
    query_position = int(positions[0])

    scores = np.asarray(similarity_matrix[query_position], dtype=float)
    # Stable descending sort: equal scores stay in movie_df order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query title explicitly. Dropping "the first hit" is wrong when
    # other movies tie with it (identical genres give the same top score), because
    # the first hit may then be a genuine recommendation.
    ranked = ranked[titles[ranked] != movie_title][:topN]

    matches_df = (movie_df.iloc[ranked]
                  .rename(columns={'title':'Movie Title'})
                  .reset_index(drop=True))
    matches_df['S.No'] = range(1, len(matches_df)+1)
    return matches_df[['S.No','Movie Title']]

In [None]:
# Test: five titles whose genre profile is closest to "Toy Story (1995)".
# `movie_title` and `topN` are notebook-level variables set here.
movie_title="Toy Story (1995)"
topN=5
recommendations_genre(movie_df=Movies_Data2,
                      similarity_matrix=cosine_sim,
                      movie_title=movie_title,
                      topN=topN)

Unnamed: 0,S.No,Movie Title
0,1,Antz (1998)
1,2,Asterix and the Vikings (Astérix et les Viking...
2,3,"Boxtrolls, The (2014)"
3,4,DuckTales: The Movie - Treasure of the Lost La...
4,5,"Emperor's New Groove, The (2000)"


### 3. Collaborative Filtering based Recommendations

In [None]:
# One-hot genre matrix: one row per title, one 0/1 column per genre.
# pivot_table aggregates the only remaining column (movieId) per (title, genre);
# a value > 0 therefore means "this title has this genre".
# The vectorised comparison replaces the deprecated element-wise DataFrame.applymap.
Movies_Data3=(
    (Movies_Data1.pivot_table(index=['title'],columns=['genres']).fillna(0) > 0)
    .astype(int)
    .reset_index()
)
Movies_Data3.index=range(0,len(Movies_Data3))
# After the pivot the columns are (value, genre) pairs; keep only the genre part.
new_columns=['title']+list(map(lambda x : x[1] , Movies_Data3.columns))[1:]
Movies_Data3.columns=new_columns
# Re-attach movieId; the merge joins on the shared 'title' column.
Movies_Data3=pd.merge(Movies_Data[['title','movieId']],Movies_Data3)

In [None]:
# Title lookup keyed by movieId, used to label ratings and recommendations.
Movies_Data4=Movies_Data3.loc[:,['title','movieId']]
def FetchUserHistory(userid):
  """Return every rating made by ``userid`` with the movie title attached.

  Rows are ordered by ``movieId``. Movies absent from ``Movies_Data4`` are
  dropped by the inner join.
  """
  is_target_user=Ratings_Data['userId'].eq(userid)
  ratings_of_user=Ratings_Data[is_target_user]
  return ratings_of_user.merge(Movies_Data4,how="inner",on="movieId").sort_values('movieId')

In [None]:
def GenerateSimilarUsersRecommendations(target_user_id,p,k,topN):
  """Recommend movies to a user from the ratings of the most similar users.

  Steps:
    1. Fetch the target user's rated movies via ``FetchUserHistory``.
    2. Keep other users who rated at least ``p`` percent of those movies.
    3. Score each of them by cosine similarity to the target user over the
       target's movies (a movie the other user did not rate counts as 0).
    4. Take the ``k`` most similar users and average their ratings of movies
       the target user has not rated.
    5. Return the ``topN`` best-rated of those movies.

  Reads the notebook-level ``Ratings_Data`` and ``Movies_Data4`` frames.

  Parameters
  ----------
  target_user_id : int
      ``userId`` to generate recommendations for.
  p : float
      Minimum overlap, as a percentage (0-100) of the target user's movies.
  k : int
      Number of nearest users to aggregate over.
  topN : int
      Maximum number of recommendations.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      ``(history, recommendations)``, each with columns ``S.No`` and
      ``Movie Title``. History is ordered by the user's own rating, best
      first; recommendations by mean neighbour rating, best first. Both are
      empty frames when there is nothing to report; in particular the
      recommendations are empty (instead of an error being raised) when the
      user has no history or no other user reaches the overlap threshold.
  """
  user_history=FetchUserHistory(userid=target_user_id)
  rated_movie_ids=user_history['movieId'].unique()

  # Ratings that other users gave to the movies the target user rated.
  overlap_ratings=Ratings_Data[Ratings_Data['movieId'].isin(rated_movie_ids)
                               & (Ratings_Data['userId']!=target_user_id)]
  overlap_counts=overlap_ratings.groupby('userId')['movieId'].nunique()
  min_movies_rated_threshold=int(len(rated_movie_ids)*(p/100))
  candidate_ids=overlap_counts[overlap_counts>=min_movies_rated_threshold].index
  candidate_ratings=overlap_ratings[overlap_ratings['userId'].isin(candidate_ids)]

  if user_history.empty or candidate_ratings.empty:
    # cosine_similarity rejects matrices with zero rows; without neighbours
    # there is simply nothing to recommend.
    topK_user_ids=[]
  else:
    user_vector=user_history.pivot(index='userId',columns='movieId',values='rating')
    # Align candidates to the target's movie columns in one step; movies no
    # candidate rated come back as all-NaN columns and are zero-filled.
    candidate_vectors=(candidate_ratings.pivot(index='userId',columns='movieId',values='rating')
                       .reindex(columns=user_vector.columns)
                       .fillna(0))
    similarity_df=pd.DataFrame(
        {'similarity_score':cosine_similarity(user_vector.to_numpy(),candidate_vectors.to_numpy())[0]},
        index=candidate_vectors.index)
    topK_user_ids=list(similarity_df.sort_values('similarity_score',ascending=False).index[:k])

  # Ratings by the nearest users for movies the target user has not rated.
  unseen_ratings=Ratings_Data[Ratings_Data['userId'].isin(topK_user_ids)
                              & ~Ratings_Data['movieId'].isin(user_history['movieId'])]
  best_recommended_movies=(unseen_ratings.groupby('movieId')['rating'].mean()
                           .reset_index()
                           .sort_values('rating',ascending=False)[:topN])
  # Inner join: recommended ids missing from Movies_Data4 are dropped, so fewer
  # than topN rows may come back.
  output_df=pd.merge(best_recommended_movies,Movies_Data4,on='movieId')
  output_df=output_df.rename(columns={'movieId':'Movie ID','rating':'Average Rating','title':'Movie Title'})
  output_df['S.No']=range(1,len(output_df)+1)

  history_df=user_history.rename(columns={'title':"Movie Title"}).sort_values('rating',ascending=False)
  history_df['S.No']=range(1,len(history_df)+1)
  return history_df[['S.No',"Movie Title"]],output_df[['S.No',"Movie Title"]]


In [None]:
# Test: recommend 10 movies to user 2, using the 100 most similar users among
# those who rated at least 90% of the movies user 2 rated.
target_user_id=2
p=90
k=100
topN=10
history,recommendations=GenerateSimilarUsersRecommendations(target_user_id=target_user_id,
                                    p=p,
                                    k=k,
                                    topN=topN)

In [None]:
# Movies the target user has already rated, best-rated first.
print("History :")
history

History :


Unnamed: 0,S.No,Movie Title
0,1,Toy Story (1995)
26,2,"Time to Kill, A (1996)"
4,3,Sense and Sensibility (1995)
7,4,Dead Man Walking (1995)
10,5,Mr. Holland's Opus (1995)
17,6,Fargo (1996)
15,7,"River Wild, The (1994)"
27,8,Willy Wonka & the Chocolate Factory (1971)
25,9,Phenomenon (1996)
24,10,"Nutty Professor, The (1996)"


In [None]:
# Movies suggested from the ratings of the most similar users.
print("Recommendations :")
recommendations

Recommendations :


Unnamed: 0,S.No,Movie Title
0,1,Mother (1996)
1,2,Up Close and Personal (1996)
2,3,Ransom (1996)
3,4,"Rock, The (1996)"
4,5,"Birdcage, The (1996)"
5,6,Heat (1995)
6,7,Happy Gilmore (1996)
7,8,Down Periscope (1996)
8,9,Muppet Treasure Island (1996)
9,10,Sabrina (1995)
