# Recommendation System

In [43]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings that signal real problems
# are hidden too — confirm that is intended.
warnings.filterwarnings("ignore")

In [44]:
# Location of the ratings CSV.
# NOTE(review): hard-coded absolute path (presumably a Colab session directory);
# adjust it when running elsewhere.
path = "/content/ratings.csv"

# Read the ratings table into a DataFrame.
ratings = pd.read_csv(path)

# Preview the first rows to confirm the file parsed as expected.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,877,4155,5,1651201566
1,305,7661,2,1639553712
2,381,8423,2,1610704432
3,208,6433,1,1650223767
4,47,7752,4,1663998365


In [45]:
# Location of the movies CSV.
# NOTE(review): hard-coded absolute path (presumably a Colab session directory);
# adjust it when running elsewhere.
path_movie = "/content/movies.csv"

# Read the movie metadata table into a DataFrame.
movies = pd.read_csv(path_movie)

# Preview the first rows to confirm the file parsed as expected.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [47]:
# Show the column labels of the ratings table.
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [46]:
# Total number of rating rows in the table.
n_ratings = ratings.shape[0]

In [51]:
# Number of distinct movieId values among the ratings
# (dropna=False keeps the count identical to len(unique())).
n_movies = ratings["movieId"].nunique(dropna=False)

In [49]:
# Number of distinct userId values among the ratings
# (dropna=False keeps the count identical to len(unique())).
n_users = ratings["userId"].nunique(dropna=False)

In [52]:
# Report the dataset size: raw counts first, then the average number of
# ratings per user and per movie, rounded to two decimals.
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

Number of ratings: 100836
Number of unique movieId's: 9742
Number of unique users: 999
Average ratings per user: 100.94
Average ratings per movie: 10.35


In [53]:
# Ratings submitted per user: group by userId, count the non-null movieId
# entries, and expose the result as a two-column frame (userId, n_ratings).
user_freq = (
    ratings.groupby("userId")["movieId"]
    .count()
    .reset_index(name="n_ratings")
)

user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,120
1,2,105
2,3,89
3,4,100
4,5,107


In [54]:
# Mean rating of every movie (one row per movieId).
mean_rating = ratings.groupby("movieId")[["rating"]].mean()

# NOTE: only the last expression of a cell is rendered, so the bare lookups
# below are not displayed as written; wrap them in display(...) to see them.

# movieId with the lowest mean rating, and its row in the movies table.
lowest_rated = mean_rating["rating"].idxmin()

movies.loc[movies["movieId"] == lowest_rated]

# movieId with the highest mean rating, and its row in the movies table.
# (This lookup previously re-used `lowest_rated`, so the top movie was never selected.)
highest_rated = mean_rating["rating"].idxmax()

movies.loc[movies["movieId"] == highest_rated]

# The individual ratings behind each extreme.
ratings[ratings["movieId"] == highest_rated]
ratings[ratings["movieId"] == lowest_rated]


# Per-movie rating count and mean rating.
movie_stats = ratings.groupby("movieId")[["rating"]].agg(["count", "mean"])

# Aggregating a one-column frame with a list of functions yields two-level
# column labels; drop the outer level so the columns are just "count" and "mean".
movie_stats.columns = movie_stats.columns.droplevel()

In [55]:
from scipy.sparse import csr_matrix


def create_matrix(df):
    """Build a sparse movie-by-user rating matrix and its id/index lookups.

    Args:
        df: DataFrame with "userId", "movieId" and "rating" columns.

    Returns:
        A 5-tuple ``(X, user_mapper, movie_mapper, user_inv_mapper,
        movie_inv_mapper)`` where

        * ``X`` is a ``csr_matrix`` of shape (n_movies, n_users) holding the
          ratings, rows indexed by movie and columns by user;
        * ``user_mapper`` / ``movie_mapper`` map an id to its column / row
          position (positions follow the sorted order of the unique ids);
        * ``user_inv_mapper`` / ``movie_inv_mapper`` map a position back to
          the id.
    """
    # Sorted unique ids; an id's position in these arrays is its matrix index.
    user_ids = np.unique(df["userId"])
    movie_ids = np.unique(df["movieId"])
    n_users = len(user_ids)
    n_movies = len(movie_ids)

    # id -> position
    user_mapper = {uid: pos for pos, uid in enumerate(user_ids)}
    movie_mapper = {mid: pos for pos, mid in enumerate(movie_ids)}

    # position -> id
    user_inv_mapper = dict(enumerate(user_ids))
    movie_inv_mapper = dict(enumerate(movie_ids))

    # Translate every rating row into (row, column) coordinates.
    col_positions = [user_mapper[uid] for uid in df["userId"]]
    row_positions = [movie_mapper[mid] for mid in df["movieId"]]

    X = csr_matrix(
        (df["rating"], (row_positions, col_positions)),
        shape=(n_movies, n_users),
    )

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


# Build the movie-by-user matrix and the id/index lookup dictionaries from the
# full ratings table; later cells read these as module-level names.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [77]:
from sklearn.neighbors import NearestNeighbors


def find_similar_movies(movie_id, X, k, metric="cosine", show_distance=False):
    """Return the ids of the ``k`` movies whose rating vectors are closest to ``movie_id``.

    Relies on the module-level ``movie_mapper`` (movieId -> row of ``X``) and
    ``movie_inv_mapper`` (row of ``X`` -> movieId).

    Args:
        movie_id: Id of the query movie.
        X: Matrix with one row per movie, indexed consistently with
            ``movie_mapper`` (e.g. the matrix returned by ``create_matrix``).
        k: Number of similar movies to return.
        metric: Distance metric handed to ``NearestNeighbors``.
        show_distance: When True, return ``(movie_id, distance)`` tuples
            instead of bare ids.

    Returns:
        A list with at most ``k`` entries, nearest first. Empty (after printing
        a message) when ``movie_id`` is not in ``movie_mapper``.
    """
    if movie_id not in movie_mapper:
        print(f"Movie Id {movie_id} not found in movie_mapper")
        return []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # Request one extra neighbour because the query movie is itself a row of X,
    # but never more neighbours than X has rows (the estimator rejects that).
    n_neighbors = min(k + 1, X.shape[0])

    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    similar_ids = []
    for idx, dist in zip(indices[0], distances[0]):
        # Drop the query movie by index rather than by position: when another
        # movie is tied at the same distance, the query is not guaranteed to be
        # returned first.
        if idx == movie_ind:
            continue
        # If the query was not among the results we hold k + 1 candidates; keep k.
        if len(similar_ids) == k:
            break
        movie_id_sim = movie_inv_mapper[idx]
        if show_distance:
            similar_ids.append((movie_id_sim, dist))
        else:
            similar_ids.append(movie_id_sim)

    return similar_ids


In [80]:
def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, ratings, movies, k=10):
    """Print movies similar to the highest-rated movie of ``user_id``.

    The user's top-rated movie (first one in table order on ties) seeds a
    ``find_similar_movies`` query. Of the returned ids, only those that have a
    title in ``movies`` and that the user has not rated are printed, so fewer
    than ``k`` titles may appear.

    Args:
        user_id: Id of the user to recommend for.
        X: Movie-by-user rating matrix passed through to ``find_similar_movies``.
        user_mapper, movie_mapper, movie_inv_mapper: Accepted for interface
            compatibility; not read by this function.
        ratings: DataFrame with "userId", "movieId" and "rating" columns.
        movies: DataFrame with "movieId" and "title" columns.
        k: Number of neighbours requested from ``find_similar_movies``.

    Returns:
        None. Results are printed.
    """
    df1 = ratings[ratings['userId'] == user_id]
    if df1.empty:
        print("User not found or has no ratings.")
        return

    # Series.max() skips NaN, unlike the builtin max(); a user whose ratings
    # are all missing is treated as having no ratings instead of crashing on
    # the .iloc[0] lookup below.
    top_rating = df1['rating'].max()
    if pd.isna(top_rating):
        print("User not found or has no ratings.")
        return

    movie_id = df1.loc[df1['rating'] == top_rating, 'movieId'].iloc[0]

    movie_titles = dict(zip(movies['movieId'], movies['title']))
    movie_title = movie_titles.get(movie_id, "Unknown Movie")

    similar_ids = find_similar_movies(movie_id, X, k)
    rated_movie_ids = set(df1['movieId'])

    print(f"Since you watched '{movie_title}', you might also like:")

    for i in similar_ids:
        # Skip ids without a known title and movies the user has already rated.
        if i in movie_titles and i not in rated_movie_ids:
            print(movie_titles[i])

In [81]:
# Example run: print recommendations for one user, requesting 10 neighbours.
user_id = 877
recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, ratings, movies, k=10)

Since you watched 'Sweet November (2001)', you might also like:
Weekend (a.k.a. Le Week-end) (Week End) (1967)
Someone to Watch Over Me (1987)
