## _Importing Libraries_


In [11]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install matplotlib seaborn pandas numpy scipy scikit-learn implicit

Collecting matplotlib
  Downloading matplotlib-3.10.5-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp310-cp310-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.59.2-cp310-cp310-win_amd64.whl.metadata (111 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.3.0-cp310-cp310-win_amd64.whl.metadata (9.2 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.5-cp310-cp310-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## _Loading Dataset_


_rating dataset that contains ratings of movies_


In [13]:
# Read the ratings table from "ratings.csv" (path is relative to the working directory)
# and preview its first five rows.
ratings = pd.read_csv(filepath_or_buffer="ratings.csv")
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,877,4155,5,1651201566
1,305,7661,2,1639553712
2,381,8423,2,1610704432
3,208,6433,1,1650223767
4,47,7752,4,1663998365


_Movies dataset containing each movie's ID, title and genres; it is used to translate recommended movie IDs back into titles._


In [14]:
# Read the movie table from "movies.csv" (path is relative to the working directory)
# and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer="movies.csv")
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## _Statistical Analysis of Ratings_


In [None]:
# Gather the raw counts once, derive the averages, then report everything together.
total_ratings = len(ratings)
unique_users = ratings["userId"].nunique()
unique_rated_movies = ratings["movieId"].nunique()

# Mean number of ratings per distinct user / per distinct rated movie.
avg_number_of_ratings_all_users = total_ratings / unique_users
avg_number_of_ratings_all_movies = total_ratings / unique_rated_movies

print(f"Total number of ratings: {total_ratings}")
# Note: the movie count is taken from the `movies` table, not from `ratings`.
print(f"Number of Unique Movies: {movies['movieId'].nunique()}")
print(f"Number of Unique Users: {unique_users}")
print(f"Average number of ratings for all users: {round(avg_number_of_ratings_all_users, 2)}")
print(f"Average number of ratings for all movies: {round(avg_number_of_ratings_all_movies, 2)}")

Total number of ratings: 100836
Number of Unique Movies: 9742
Number of Unique Users: 999
Average number of ratings for all users: 100.94
Average number of ratings for all movies: 10.35


## _User Rating Frequency_

_number of ratings each user has made_


In [17]:
# One row per userId; the "rating" column holds how many ratings that user submitted.
number_of_ratings_per_user = ratings.groupby('userId')[['rating']].count()
print(f"Number of ratings per user:\n {number_of_ratings_per_user.head()}")

Number of ratings per user:
         rating
userId        
1          120
2          105
3           89
4          100
5          107


## _Movie Rating Analysis_


In [18]:
# One row per movieId; the "rating" column holds that movie's mean rating.
avg_rate_per_movie = ratings.groupby('movieId')[['rating']].mean()
print(f"Average ratings per movies:\n {avg_rate_per_movie.head()}")

Average ratings per movies:
            rating
movieId          
1        2.230769
2        3.000000
3        2.571429
4        3.916667
5        2.909091


In [19]:
# Find the movie ID with the lowest average rating.
# Select the "rating" column first so idxmin() returns the movieId label directly;
# the previous `idxmin()[0]` indexed a label-indexed Series by position, which pandas
# has deprecated.
lowest_rated = avg_rate_per_movie['rating'].idxmin()
# The value printed is a movie ID, not a rating, so the message says so.
print(f"Movie ID with the lowest average rating: {lowest_rated}")

movies.loc[movies['movieId'] == lowest_rated]

Movie that has the lowest average rating of 1285


Unnamed: 0,movieId,title,genres
984,1285,Heathers (1989),Comedy


In [20]:
# Find the movie ID with the highest average rating.
# Select the "rating" column first so idxmax() returns the movieId label directly;
# the previous `idxmax()[0]` indexed a label-indexed Series by position, which pandas
# has deprecated.
highest_rated = avg_rate_per_movie['rating'].idxmax()
# The value printed is a movie ID, not a rating, so the message says so.
print(f"Movie ID with the highest average rating: {highest_rated}")

movies.loc[movies['movieId'] == highest_rated]

Movie that has the highest average rating of 7831


Unnamed: 0,movieId,title,genres
5029,7831,Another Thin Man (1939),Comedy|Crime|Drama|Mystery|Romance


In [21]:
# Every individual rating given to the lowest-rated movie.
ratings.loc[ratings['movieId'].eq(lowest_rated)]

Unnamed: 0,userId,movieId,rating,timestamp
78025,667,1285,1,1613097939
78504,840,1285,1,1643070897
84758,979,1285,1,1616319498


In [22]:
# Every individual rating given to the highest-rated movie.
ratings.loc[ratings['movieId'].eq(highest_rated)]

Unnamed: 0,userId,movieId,rating,timestamp
10060,171,7831,5,1632880028
51716,992,7831,4,1646316676
52680,120,7831,5,1653456682
84994,610,7831,5,1632111958
91240,540,7831,5,1619124112


In [23]:
# Rating count and mean rating per movie. Aggregating the "rating" Series directly
# yields flat "count"/"mean" columns, so no column level has to be dropped afterwards.
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
movie_stats

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,13,2.230769
2,6,3.000000
3,7,2.571429
4,12,3.916667
5,11,2.909091
...,...,...
9738,7,3.428571
9739,7,3.428571
9740,7,2.571429
9741,9,2.666667


## _User-Item Matrix Creation_

_`Creates a sparse movie-by-user rating matrix (rows are movies, columns are users) using csr_matrix from scipy. It also generates mappings between user and movie IDs and their corresponding indices for use in the matrix.`_


- User-Item Matrix
  - `csr_matrix`: creates a sparse matrix (Compressed Sparse Row) from the user-item ratings data to save memory.
  - `user_mapper and movie_mapper`: create dictionaries that map user IDs and movie IDs to indices in the sparse matrix.
  - `user_inv_mapper and movie_inv_mapper`: create reverse dictionaries that map matrix indices back to user IDs and movie IDs.
  - `user_index and movie_index`: create lists of indices for users and movies from the ratings DataFrame.
  - `X = csr_matrix(...)`: constructs the sparse matrix X of shape (number of movies, number of users) from the movie indices, user indices and ratings.


In [None]:
from scipy.sparse import csr_matrix


def create_matrix(df):
    """Build a sparse movie-by-user rating matrix and the ID/index lookup tables.

    Args:
        df: DataFrame with "userId", "movieId" and "rating" columns.

    Returns:
        A tuple ``(X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper)``:
        X is a ``csr_matrix`` of shape (number of distinct movies, number of
        distinct users) holding the ratings; rows are movies and columns are users.
        ``user_mapper`` / ``movie_mapper`` map an ID to its matrix index, and
        ``user_inv_mapper`` / ``movie_inv_mapper`` map a matrix index back to the ID.
        Indices follow the sorted order of the distinct IDs.
    """
    # A single np.unique call per column yields both the sorted distinct IDs and,
    # via return_inverse, the matrix index of every row of df.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # Row coordinate = movie index, column coordinate = user index.
    X = csr_matrix(
        (df["rating"].to_numpy(), (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


# Build the rating matrix and the ID <-> index lookup tables from the full ratings table.
(X,
 user_mapper,
 movie_mapper,
 user_inv_mapper,
 movie_inv_mapper) = create_matrix(ratings)

## _Movie Similarity Analysis_

_`Uses the k-nearest neighbors algorithm with the cosine distance metric to find movies similar to a given movie ID, and returns a list of the similar movie IDs.`_


In [None]:
from sklearn.neighbors import NearestNeighbors


def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Return the movies closest to ``movie_id`` under a k-nearest-neighbours search.

    Relies on the module-level ``movie_mapper`` and ``movie_inv_mapper`` dictionaries
    to translate between movie IDs and row indices of ``X``.

    Args:
        movie_id: ID of the query movie.
        X: Matrix whose rows are the movie vectors that are fitted and searched.
        k: Number of similar movies requested.
        metric: Distance metric passed to ``NearestNeighbors``.
        show_distance: When True, return ``(movie_id, distance)`` pairs instead of
            bare movie IDs.

    Returns:
        A list of at most ``k`` neighbour movie IDs (or ``(movie_id, distance)``
        tuples when ``show_distance`` is True), in the order returned by
        ``kneighbors``. Returns an empty list, after printing a message, when
        ``movie_id`` is not in ``movie_mapper``.
    """
    if movie_id not in movie_mapper:
        print(f"Movie ID {movie_id} not found in movie_mapper!")
        return []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]

    # The query movie is itself a row of X, so request one extra neighbour; cap the
    # request at the number of rows because kneighbors rejects larger values.
    n_neighbors = min(k + 1, X.shape[0])
    knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute', metric=metric)
    knn.fit(X)
    # Always fetch distances so both return shapes share one code path
    # (kneighbors returns a (distances, indices) tuple when return_distance=True).
    distances, indices = knn.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by index rather than by position: with tied distances
    # it is not guaranteed to be the first result.
    neighbors = [
        (movie_inv_mapper[int(ind)], float(dist))
        for ind, dist in zip(indices[0], distances[0])
        if ind != movie_ind
    ][:k]

    if show_distance:
        return neighbors
    return [neighbor_id for neighbor_id, _ in neighbors]

## _`Collaborative Filtering`_

_Movie recommendations based on a user's preferences_


In [26]:
def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    """Print movies similar to the first movie the user gave their highest rating.

    Reads the module-level ``ratings`` and ``movies`` DataFrames and delegates the
    neighbour search to ``find_similar_movies``.

    Args:
        user_id: ID of the user to recommend for.
        X: Rating matrix forwarded to ``find_similar_movies``.
        user_mapper, movie_mapper, movie_inv_mapper: Accepted for interface
            compatibility; the body does not read them.
        k: Number of similar movies requested from ``find_similar_movies``.

    Returns:
        None. Output is printed. Similar movies whose ID has no title in ``movies``
        are skipped.
    """
    user_ratings = ratings[ratings['userId'] == user_id]
    # A user with no ratings has no seed movie; report it instead of raising.
    if user_ratings.empty:
        print(f"User ID {user_id} has no ratings!")
        return

    # Seed movie: the first row carrying the user's maximum rating.
    best_rating = user_ratings['rating'].max()
    movie_id = user_ratings[user_ratings['rating'] == best_rating]['movieId'].iloc[0]
    movie_titles = dict(zip(movies['movieId'], movies['title']))
    similar_ids = find_similar_movies(movie_id, X, k)

    # Fall back to the raw ID when the seed movie has no entry in `movies`.
    seed_title = movie_titles.get(movie_id, f"movie ID {movie_id}")
    print(f"Since you watched {seed_title}, you might also like:")

    for i in similar_ids:
        if i in movie_titles:
            print(movie_titles[i])

#### _`Recommending movies`_


In [None]:
# Example run: print up to 10 recommendations for user 150.
user_id = 150
recommend_movies_for_user(
    user_id=user_id,
    X=X,
    user_mapper=user_mapper,
    movie_mapper=movie_mapper,
    movie_inv_mapper=movie_inv_mapper,
    k=10,
)

Since you watched Miller's Crossing (1990), you might also like:
Flawless (1999)
Lilya 4-Ever (Lilja 4-ever) (2002)
Bells of St. Mary's, The (1945)
Dark City (1998)
Cradle 2 the Grave (2003)
Japanese Story (2003)
