## Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import sklearn 
import matplotlib.pyplot as plt 
import seaborn as sb 

## Import the datasets

In [None]:
# Load the ratings table; later cells read its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")

In [None]:
# Load the movies table; later cells read its movieId and title columns.
movies = pd.read_csv("movies.csv")

## Analysis of dataset

In [None]:
# Dataset size: total rating rows, plus how many distinct movies and users appear in them.
n_ratings = ratings.shape[0]
n_movies = ratings['movieId'].nunique(dropna=False)
n_users = ratings['userId'].nunique(dropna=False)

In [None]:
# Headline statistics of the ratings table, one per line.
print(
    f"Number of ratings: {n_ratings}",
    f"Number of unique movieId's: {n_movies}",
    f"Number of unique users: {n_users}",
    f"Average ratings per user: {round(n_ratings/n_users, 2)}",
    f"Average ratings per movie: {round(n_ratings/n_movies, 2)}",
    sep="\n",
)

In [None]:
# Number of ratings each user has submitted (one row per userId).
user_freq = (
    ratings.groupby('userId')['movieId']
    .count()
    .reset_index(name='n_ratings')
)
user_freq.head()

In [None]:
# Average rating of every movie; the next cells use it to find the lowest- and highest-rated titles.
mean_rating = ratings.groupby('movieId')['rating'].mean().to_frame()
print(mean_rating)

In [None]:
# Movie with the smallest average rating (the first one if several tie), shown with its metadata.
lowest_rated = mean_rating.idxmin()['rating']
movies[movies['movieId'].eq(lowest_rated)]

In [None]:
# Movie with the largest average rating (the first one if several tie), shown with its metadata.
highest_rated = mean_rating.idxmax()['rating']
movies[movies['movieId'].eq(highest_rated)]

In [None]:
# Show every individual rating row that belongs to the movie with the highest
# average rating (the movieId stored in highest_rated).

print(ratings.loc[ratings['movieId'].eq(highest_rated)])

In [None]:
# Show every individual rating row that belongs to the movie with the lowest
# average rating (the movieId stored in lowest_rated).

print(ratings.loc[ratings['movieId'].eq(lowest_rated)])

In [None]:
# Per-movie rating count and average, with flat 'count' / 'mean' columns.
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])

print(movie_stats)

## Creating the data structures needed for collaborative filtering

In [None]:
from scipy.sparse import csr_matrix


def create_matrix(df):
    """Build the sparse movie-by-user rating matrix and its id/index lookups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'userId', 'movieId' and 'rating'.

    Returns
    -------
    X : scipy.sparse.csr_matrix
        Shape (number of distinct movies, number of distinct users); entry
        [movie_index, user_index] holds the rating.
    user_mapper : dict
        userId -> column index of X.
    movie_mapper : dict
        movieId -> row index of X.
    user_inv_mapper : dict
        column index of X -> userId.
    movie_inv_mapper : dict
        row index of X -> movieId.
    """
    # One pass per column: the sorted distinct ids, plus each row's position
    # among them. This replaces repeated np.unique calls and a per-row
    # Python dict lookup.
    unique_users, user_index = np.unique(df["userId"], return_inverse=True)
    unique_movies, movie_index = np.unique(df["movieId"], return_inverse=True)

    n_users = unique_users.size
    n_movies = unique_movies.size

    # Map ids to indices
    user_mapper = {user_id: idx for idx, user_id in enumerate(unique_users)}
    movie_mapper = {movie_id: idx for idx, movie_id in enumerate(unique_movies)}

    # Map indices back to ids
    user_inv_mapper = dict(enumerate(unique_users))
    movie_inv_mapper = dict(enumerate(unique_movies))

    # Rows are movies and columns are users, so X[movie_index] is one movie's rating vector.
    X = csr_matrix(
        (df["rating"], (movie_index, user_index)),
        shape=(n_movies, n_users),
    )

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Build the sparse rating matrix and the four id<->index lookups from the full ratings table.
# find_similar_movies later reads movie_mapper and movie_inv_mapper as notebook-level globals.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [None]:
# Show the stored entries of the sparse matrix (rows are movie indices, columns are user indices).
print(X)

In [None]:
# Look up the matrix row that belongs to movieId 3 and print that movie's ratings.
movie_ind = movie_mapper[3]
movie_vec = X[movie_ind]
print(movie_vec)
# How to read the printed entries. The row is always 0 because movie_vec is a
# single-row slice of X; the second number is the user's column index in X
# (see user_mapper), not the raw userId. For example:
# (0, 0) 4.0: the user at column index 0 rated movieId 3 with a score of 4.0.
# (0, 5) 5.0: the user at column index 5 rated it 5.0.
# (0, 18) 3.0: the user at column index 18 rated it 3.0.

## Training Model

In [None]:
from sklearn.neighbors import NearestNeighbors


def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Return the movies whose rating vectors are closest to that of ``movie_id``.

    Uses the notebook-level ``movie_mapper`` and ``movie_inv_mapper``
    dictionaries (produced by ``create_matrix``) to translate between movieIds
    and row indices of ``X``.

    Parameters
    ----------
    movie_id :
        movieId of the query movie; must be a key of ``movie_mapper``.
    X :
        Matrix with one row per movie, rows ordered as in ``movie_mapper``.
    k : int
        Number of similar movies requested. Fewer are returned when ``X`` has
        fewer than ``k + 1`` rows.
    metric : str
        Distance metric passed to ``NearestNeighbors`` (default ``'cosine'``).
    show_distance : bool
        When False (default) a list of movieIds is returned. When True a list
        of ``(movieId, distance)`` tuples is returned instead.

    Returns
    -------
    list
        Neighbours ordered from nearest to farthest, never containing
        ``movie_id`` itself.

    Raises
    ------
    KeyError
        If ``movie_id`` is not a key of ``movie_mapper``.
    """
    if movie_id not in movie_mapper:
        raise KeyError(f"movie_id {movie_id!r} is not present in movie_mapper")

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # Ask for one extra neighbour because the query movie is part of the fitted
    # data and comes back as a neighbour of itself. Never ask for more
    # neighbours than there are rows, which scikit-learn rejects.
    n_neighbors = min(k + 1, X.shape[0])

    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances so the return value has one shape: (distances, indices).
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by index rather than by position: with zero-distance
    # ties it is not guaranteed to be the first neighbour.
    neighbours = [
        (movie_inv_mapper[int(idx)], float(dist))
        for dist, idx in zip(distances[0], indices[0])
        if idx != movie_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]


EXPLANATION:

find_similar_movies is a Python function that finds the movies most similar to a given movie. It runs k-Nearest Neighbors (kNN) on the rows of the sparse matrix X, and lets the caller choose the distance metric and whether distances are reported. This is a common approach for building movie recommendation systems. Here's an explanation of what the function does:

neighbour_ids: This is a list that will store the IDs of the k most similar movies, excluding the input movie itself.

movie_ind: This line gets the index of the movie in the sparse matrix X using the provided movie_mapper dictionary. This index is used to access the movie's vector in the matrix.

movie_vec: This line extracts the feature vector of the input movie from the sparse matrix X.

k += 1: One extra neighbour is requested because the input movie is itself part of the fitted data and is returned as a neighbour of itself. After it is removed, k similar movies remain.

A kNN model is created using the NearestNeighbors class from scikit-learn. It is configured to find the nearest neighbors using the chosen distance metric (cosine distance by default) and the brute-force algorithm.

The movie_vec is reshaped to have the shape (1, -1) to make it compatible with the kNN.kneighbors method.

kNN.kneighbors returns the nearest neighbours of the query vector. With return_distance=False it returns only an array of neighbour indices. With return_distance=True it returns a tuple (distances, indices): the distances come first and the indices second.

A loop iterates over the indices of the nearest neighbors, extracts the corresponding movie IDs using the movie_inv_mapper dictionary, and appends them to the neighbour_ids list.

Finally, the input movie itself (normally the first neighbour, because its distance to itself is zero) is removed from the list before returning it.

This function returns a list of movie IDs that are the most similar to the input movie, based on the chosen distance metric and k value. It can be used for building movie recommendation systems where you find movies similar to a given movie and recommend them to users who liked the input movie.



## Getting Recommendations

In [None]:
# Lookup from movieId to title, printed so that a movieId can be picked for the next cell.
movie_titles = {
    movie_id: title
    for movie_id, title in zip(movies['movieId'], movies['title'])
}
print(movie_titles)

In [None]:
# Ask which movie was watched and print up to 10 similar titles.
# Bad input is reported with a message instead of an uncaught traceback.
raw_movie_id = input("Provide movie_id of film you watched recently (from above list): ")

try:
    movie_id = int(raw_movie_id)
except ValueError:
    movie_id = None

if movie_id is None:
    print(f"'{raw_movie_id}' is not a valid integer movie_id.")
elif movie_id not in movie_titles:
    print(f"movie_id {movie_id} is not in the movies list.")
elif movie_id not in movie_mapper:
    # The movie is listed but has no row in the rating matrix, so it has no neighbours.
    print(f"{movie_titles[movie_id]} has no ratings, so no similar movies can be found.")
else:
    similar_ids = find_similar_movies(movie_id, X, k=10)

    print(f"Since you watched {movie_titles[movie_id]}")
    for i in similar_ids:
        # A rated movieId may be missing from the movies table; do not crash on it.
        print(movie_titles.get(i, f"<unknown title for movieId {i}>"))