In [70]:
#importing all the necessary modules
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [71]:
#load the ratings and the movie metadata from the CSV files (paths are relative to the working directory)
rating, movies = (pd.read_csv(csv_name) for csv_name in ("ratings.csv", "movies.csv"))

In [72]:
#sanity check: print the first rows of each frame to confirm the load worked
for frame in (rating, movies):
    print(frame.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [73]:
#join every rating with its movie row on the shared movieId column
#(default inner join: only movieIds present in both frames are kept)
totdata = rating.merge(movies, on='movieId')

In [74]:
#users-by-titles rating matrix; (user, title) pairs without a rating are filled with 0
UserPivotTable = totdata.pivot_table(
    index='userId',
    columns='title',
    values='rating',
    fill_value=0,
)
print(UserPivotTable)

title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              0.0                                      0.0   
2              0.0                                      0.0   
3              0.0                                      0.0   
4              0.0                                      0.0   
5              0.0                                      0.0   
...            ...                                      ...   
606            0.0                                      0.0   
607            0.0                                      0.0   
608            0.0                                      0.0   
609            0.0                                      0.0   
610            4.0                                      0.0   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          0.0                  0.0   
2              

In [75]:
#recommendation algorithm
'''
- We use TF-IDF vectorization to convert the genres of the various movies into numerical values
- We calculate how close the values are using cosine similarity (open question: could k-means clustering be used here instead?)
- We recommend movies based on how close each one is to the current movie
'''

'\n- We use TF-IDF vectorization to convert the genres of the various movies into numerical values\n- We calculate how close the values are using cosine similarity (open question: could k-means clustering be used here instead?)\n- We recommend movies based on how close each one is to the current movie\n'

In [76]:
#performing the TF-IDF vectorization of all the genres in movies
#missing genres become an empty string, which vectorizes to an all-zero row
movies['genres'] = movies['genres'].fillna('')
#genres are '|'-separated labels, not free text, so every label is kept as ONE token.
#The default word tokenizer would break a hyphenated or multi-word label into several
#terms that always occur together, inflating that genre's weight in the similarity.
#English stop-word removal is not needed for genre labels, so it is not requested.
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
tfidf_mat = tfidf.fit_transform(movies['genres'])


In [77]:
#pairwise cosine similarity between the TF-IDF genre vectors of all movies
#(with a single argument the matrix is compared against itself)
cosineSim = cosine_similarity(tfidf_mat)
print(cosineSim)

[[1.         0.81357774 0.15276924 ... 0.         0.4210373  0.26758648]
 [0.81357774 1.         0.         ... 0.         0.         0.        ]
 [0.15276924 0.         1.         ... 0.         0.         0.57091541]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.4210373  0.         0.         ... 0.         1.         0.        ]
 [0.26758648 0.         0.57091541 ... 0.         0.         1.        ]]


In [78]:
#pairwise cosine similarity between users, each represented by their row of ratings
#(any remaining missing ratings are treated as 0)
UserSim = cosine_similarity(X=UserPivotTable.fillna(value=0))
print(UserSim)

[[1.         0.02728287 0.05972026 ... 0.29109737 0.09357193 0.14532081]
 [0.02728287 1.         0.         ... 0.04621095 0.0275654  0.10242675]
 [0.05972026 0.         1.         ... 0.02112846 0.         0.03211875]
 ...
 [0.29109737 0.04621095 0.02112846 ... 1.         0.12199271 0.32205486]
 [0.09357193 0.0275654  0.         ... 0.12199271 1.         0.05322546]
 [0.14532081 0.10242675 0.03211875 ... 0.32205486 0.05322546 1.        ]]


In [79]:
#the rating matrix is large and mostly zeros, so keep it in compressed sparse row form
UserSparseMat = csr_matrix(UserPivotTable.to_numpy())

In [80]:
#truncated SVD of the sparse rating matrix with k=50 components
u, singular_values, vtranspose = svds(UserSparseMat, k=50)
#svds returns the singular values as a 1-D array; expand them into a diagonal matrix
sigma = np.diag(singular_values)


In [81]:
#low-rank reconstruction of the rating matrix from the SVD factors: (U . Sigma) . V^T
PredictedRating = u @ sigma @ vtranspose
print(PredictedRating)

[[-6.92873637e-02  1.87668108e-02 -4.28949139e-02 ... -1.87782264e-01
   1.50037535e+00  3.34632090e-02]
 [-2.86782837e-02 -1.20149084e-02 -1.07852546e-02 ... -1.02054894e-02
   5.66087362e-02 -3.67801466e-04]
 [ 1.74872101e-02  1.23869158e-03  4.90684906e-03 ...  8.86581055e-03
   5.27317419e-02  3.76147903e-03]
 ...
 [ 1.27096406e-03 -8.19546796e-02 -5.69910637e-02 ...  2.31730054e-02
   5.37805707e-01  1.74674416e-02]
 [-2.22592510e-03  5.34112987e-03  5.61299748e-03 ...  2.47015214e-02
   2.31089246e-02 -3.79650636e-03]
 [ 3.87914228e+00 -2.45434938e-02 -2.53885265e-02 ...  1.42288530e+00
  -2.58587981e-01  9.69463724e-03]]


In [None]:
#building the actual recommendation function
def Get_Recommendations(title, cosine_sim = cosineSim, top_n = 10):
    """Print the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movies['title']``.
    cosine_sim : 2-D array-like, optional
        Similarity matrix; row ``i`` is read as the similarity of the movie at
        position ``i`` of ``movies`` to every position. The default is the
        ``cosineSim`` object that exists when this cell is executed, so re-run
        this cell after recomputing ``cosineSim``.
    top_n : int, optional
        Maximum number of recommendations to print (default 10).

    Returns
    -------
    None
        The recommendations are printed. If ``title`` is not found, a message
        is printed instead and nothing else happens.
    """
    # Look the title up by POSITION (not index label) so the value is valid both
    # as a row of cosine_sim and for .iloc, whatever index ``movies`` carries.
    title_positions = np.flatnonzero((movies['title'] == title).to_numpy())

    # Check if the title exists
    if title_positions.size == 0:
        print(f"Movie '{title}' not found in the dataset.")
        return

    #if the title occurs more than once, the first occurrence is used
    query_pos = int(title_positions[0])

    #similarity of the queried movie to every movie
    scores = np.asarray(cosine_sim[query_pos]).ravel()

    #sorting the scores in non-increasing order; the stable sort keeps ties in
    #their original (positional) order
    ranked = np.argsort(-scores, kind='stable')

    #drop the queried movie explicitly: movies with identical genres tie with it at
    #the top score, so it is not guaranteed to be the first entry after sorting
    ranked = ranked[ranked != query_pos][:max(int(top_n), 0)]

    #recommendations which are written in a list
    recs = movies['title'].iloc[ranked].values

    #printing all the top recommendations
    print(f"Recommendations are : \n")
    for name in recs:
        print(name)

In [85]:
#Taking the inputs from the user
#input() already returns a str; surrounding whitespace is stripped because the
#title lookup is an exact string match and a stray space would make it fail
MovName = input("Enter name of the film you already like: ").strip()
MovYear = input("Enter the year of the film's release: ").strip()

#making the proper input for the recommendation algo: titles have the form "Name (Year)"
fullMovieName = f"{MovName} ({MovYear})"

#Using the aforementioned recommender
Get_Recommendations(fullMovieName)


[1706, 2355, 2809, 3000, 3568, 6194, 6486, 6948, 7760, 8219]
Recommendations are : 

Antz (1998)
Toy Story 2 (1999)
Adventures of Rocky and Bullwinkle, The (2000)
Emperor's New Groove, The (2000)
Monsters, Inc. (2001)
Wild, The (2006)
Shrek the Third (2007)
Tale of Despereaux, The (2008)
Asterix and the Vikings (Astérix et les Vikings) (2006)
Turbo (2013)
