<a href="https://colab.research.google.com/github/1chrizty/movie-recommender/blob/main/movie_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings

# Silence FutureWarning messages for the rest of the session so they do not
# clutter the cell outputs below.
warnings.simplefilter("ignore", FutureWarning)

In [None]:
# Read the user ratings table from disk and preview its first five rows.
ratings = pd.read_csv(filepath_or_buffer="/content/ratings.csv")
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880000.0
1,1,306,3.5,1147869000.0
2,1,307,5.0,1147869000.0
3,1,665,5.0,1147879000.0
4,1,899,3.5,1147869000.0


In [None]:
# Read the movie metadata table from disk and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer="/content/movies.csv")
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Basic size statistics of the ratings table:
# total number of rating rows, and how many distinct movies / users appear.
n_ratings = ratings.shape[0]
n_movies = ratings['movieId'].nunique(dropna=False)  # dropna=False: count NaN as a value, like unique()
n_users = ratings['userId'].nunique(dropna=False)

In [None]:
# Report the dataset size statistics, one "label: value" line per statistic.
print("\n".join(
    f"{label}: {value}"
    for label, value in (
        ("Number of ratings", n_ratings),
        ("Number of unique movieId's", n_movies),
        ("Number of unique users", n_users),
        ("Average ratings per user", round(n_ratings / n_users, 2)),
        ("Average ratings per movie", round(n_ratings / n_movies, 2)),
    )
))

Number of ratings: 6277663
Number of unique movieId's: 41929
Number of unique users: 40681
Average ratings per user: 154.31
Average ratings per movie: 149.72


In [None]:
# Count how many movie ratings each user has submitted.
# Result columns: 'userId' and 'n_ratings'.
user_freq = (
    ratings
    .groupby('userId')['movieId']
    .count()
    .rename('n_ratings')
    .reset_index()
)
user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,70
1,2,184
2,3,656
3,4,242
4,5,101


In [None]:
# Average rating of every movie (indexed by movieId); used below to look up
# the lowest and highest rated movies.
mean_rating = ratings[['movieId', 'rating']].groupby('movieId').mean()
print(mean_rating)

           rating
movieId          
1        3.895267
2        3.254411
3        3.147220
4        2.884494
5        3.064900
...           ...
209053   3.500000
209055   3.500000
209069   3.000000
209103   4.000000
209163   4.500000

[41929 rows x 1 columns]


In [None]:
# movieId with the smallest mean rating, and its row in the movies table.
lowest_rated = mean_rating['rating'].idxmin()
is_lowest = movies['movieId'].eq(lowest_rated)
movies[is_lowest]

Unnamed: 0,movieId,title,genres
5218,5326,"Frank McKlusky, C.I. (2002)",Comedy


In [None]:
# movieId with the largest mean rating, and its row in the movies table.
highest_rated = mean_rating['rating'].idxmax()
is_highest = movies['movieId'].eq(highest_rated)
movies[is_highest]

Unnamed: 0,movieId,title,genres
7986,8699,Dancing in September (2000),Drama


In [None]:
# List every individual rating given to the lowest rated movie
# (shows how many users that average is based on).
ratings.loc[ratings['movieId'].eq(lowest_rated)]

Unnamed: 0,userId,movieId,rating,timestamp
3100297,20438,5326,0.5,1192714000.0


In [None]:
# Per-movie rating statistics: number of ratings ('count') and average
# rating ('mean'), indexed by movieId. Aggregating the 'rating' Series
# directly yields flat column labels.
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])

In [None]:
# Show the per-movie statistics table followed by its column labels.
print(movie_stats, movie_stats.columns, sep="\n")

         count      mean
movieId                 
1        14346  3.895267
2         6008  3.254411
3         2914  3.147220
4          632  2.884494
5         2943  3.064900
...        ...       ...
209053       1  3.500000
209055       1  3.500000
209069       1  3.000000
209103       1  4.000000
209163       1  4.500000

[41929 rows x 2 columns]
Index(['count', 'mean'], dtype='object')


In [None]:
# Create a user-item matrix using scipy's csr_matrix
from scipy.sparse import csr_matrix

def create_matrix(df):
  """
  Build a sparse movie-by-user rating matrix from a ratings table.

  Args:
    df: DataFrame with 'userId', 'movieId' and 'rating' columns.

  Returns:
    X: csr_matrix of shape (n_movies, n_users); row = movie, column = user,
       value = rating.
    user_mapper: dict mapping userId -> column index in X.
    movie_mapper: dict mapping movieId -> row index in X.
    user_inv_mapper: dict mapping column index in X -> userId.
    movie_inv_mapper: dict mapping row index in X -> movieId.
  """
  # np.unique returns the sorted distinct IDs; with return_inverse=True it
  # also returns, for every input row, the position of its ID in that sorted
  # array. That position is exactly the matrix index, so each ID column is
  # scanned once instead of being mapped row by row through a Python dict.
  user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
  movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

  N = len(user_ids)
  M = len(movie_ids)

  # Map Ids to indices
  user_mapper = dict(zip(user_ids, range(N)))
  movie_mapper = dict(zip(movie_ids, range(M)))

  # Map indices to IDs
  user_inv_mapper = dict(zip(range(N), user_ids))
  movie_inv_mapper = dict(zip(range(M), movie_ids))

  X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

  return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [None]:
# Build the sparse movie-by-user matrix and the ID <-> index lookup tables
# from the full ratings table.
(
    X,
    user_mapper,
    movie_mapper,
    user_inv_mapper,
    movie_inv_mapper,
) = create_matrix(df=ratings)

In [None]:
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
  """
  Find similar movies using KNN.

  Relies on the notebook-level `movie_mapper` (movieId -> row index of X) and
  `movie_inv_mapper` (row index of X -> movieId) lookup tables.

  Args:
    movie_id: ID of the movie to find neighbours for; must be a key of
      `movie_mapper` (otherwise KeyError is raised).
    X: matrix whose rows are the movie vectors to search.
    k: number of similar movies to return.
    metric: distance metric passed to NearestNeighbors.
    show_distance: when True, return (movieId, distance) pairs instead of
      bare movieIds.

  Returns:
    A list of at most k movieIds ordered from nearest to farthest, or a list
    of (movieId, distance) tuples when show_distance is True. The query movie
    itself is never included.
  """
  movie_ind = movie_mapper[movie_id]
  movie_vec = X[movie_ind].reshape(1, -1)

  # The query movie is part of the fitted data, so ask for one extra
  # neighbour; never ask for more neighbours than there are rows in X.
  n_neighbors = min(k + 1, X.shape[0])
  kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
  kNN.fit(X)
  distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

  # Drop the query movie by index rather than by position: when several
  # movies are at the same distance it is not guaranteed to come first.
  neighbours = [
      (movie_inv_mapper[int(ind)], float(dist))
      for ind, dist in zip(indices[0], distances[0])
      if ind != movie_ind
  ][:k]

  if show_distance:
    return neighbours
  return [neighbour_id for neighbour_id, _ in neighbours]

In [None]:
# Lookup table movieId -> title, used to print human-readable results.
movie_titles = dict(zip(movies['movieId'], movies['title']))

raw_movie_id = input("Enter movie Code: ")
try:
  movie_id = int(raw_movie_id)
except ValueError:
  # Non-numeric input cannot be a movie code.
  movie_id = None

if movie_id is None or movie_id not in movie_titles:
  print(f"Unknown movie code: {raw_movie_id!r}")
elif movie_id not in movie_mapper:
  # The movie exists in the movies table but has no row in the rating matrix,
  # so there is nothing to compare it against.
  print(f"No ratings available for {movie_titles[movie_id]}")
else:
  similar_ids = find_similar_movies(movie_id, X, k=10)
  movie_title = movie_titles[movie_id]

  print(f"Since you watched {movie_title}")
  for i in similar_ids:
    # A rated movie may be missing from the movies table; fall back to its ID.
    print(movie_titles.get(i, f"<unknown title, movieId={i}>"))

Enter movie Code: 10
Since you watched GoldenEye (1995)
True Lies (1994)
Die Hard: With a Vengeance (1995)
Batman (1989)
Batman Forever (1995)
Stargate (1994)
Speed (1994)
Clear and Present Danger (1994)
Cliffhanger (1993)
Fugitive, The (1993)
Waterworld (1995)
