## **Import Important Libraries**

In [34]:
import numpy as np #for linear algebra
import pandas as pd #for dataframes
import sklearn #for data preprocessing
#for visualization
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix #for matrix
from sklearn.neighbors import NearestNeighbors #to use knn model

In [2]:
import warnings
# Ignore every FutureWarning raised from here on, which keeps library
# deprecation notices out of the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

## **Read the Dataset**

In [3]:
# Read the rating dataset. "ratings.csv" is resolved relative to the
# notebook's working directory; the preview below shows the columns
# userId, movieId, rating and timestamp.
ratings_df = pd.read_csv("ratings.csv")
# Preview the first rows to sanity-check the load.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Read the movie dataset. "movies.csv" is resolved relative to the
# notebook's working directory; the preview below shows the columns
# movieId, title and genres (genres are '|'-separated in one string).
movies_df = pd.read_csv("movies.csv")
# Preview the first rows to sanity-check the load.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## **Perform Statistical Analysis**

In [6]:
# Total number of rating rows (one row per user/movie rating).
number_ratings = ratings_df.shape[0]
print(f"Number of ratings: {number_ratings}")

Number of ratings: 100836


In [7]:
# Count distinct movie IDs that received at least one rating.
# dropna=False keeps the count identical to len(Series.unique()).
number_movies = ratings_df['movieId'].nunique(dropna=False)
print(f"Number of unique movie Id's: {number_movies}")

Number of unique movie Id's: 9724


In [8]:
# Count distinct user IDs that submitted at least one rating.
# dropna=False keeps the count identical to len(Series.unique()).
number_users = ratings_df['userId'].nunique(dropna=False)
print(f"Number of unique users: {number_users}")

Number of unique users: 610


In [9]:
# Mean number of ratings submitted by a single user, rounded to 2 decimals.
average_ratings_per_user = round(number_ratings / number_users, 2)
print(f"Average ratings per user: {average_ratings_per_user}")

Average ratings per user: 165.3


In [10]:
# Mean number of ratings received by a single movie, rounded to 2 decimals.
average_ratings_per_movie = round(number_ratings / number_movies, 2)
print(f"Average ratings per movie: {average_ratings_per_movie}")

Average ratings per movie: 10.37


In [14]:
# Number of ratings submitted by each user, as a two-column frame
# ('User Id', 'Number of Ratings') ordered by user ID.
user_frequency = (
    ratings_df.groupby('userId')['movieId']
    .count()
    .rename('Number of Ratings')
    .rename_axis('User Id')
    .reset_index()
)
user_frequency.head()

Unnamed: 0,User Id,Number of Ratings
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [18]:
# Mean rating per movie (indexed by movieId); used below to find the
# lowest- and highest-rated movies.
mean_rating = ratings_df[['movieId', 'rating']].groupby('movieId').mean()

In [19]:
# Lowest rated movie: movieId with the smallest mean rating
# (idxmin returns the first one when several movies tie).
lowest_rated = mean_rating['rating'].idxmin()
# Look the ID up in movies_df -- the frame loaded from movies.csv.
# (`movies` was never defined in this notebook and raised NameError
# on a fresh kernel.)
movies_df.loc[movies_df['movieId'] == lowest_rated]

Unnamed: 0,movieId,title,genres
2689,3604,Gypsy (1962),Musical


In [20]:
# Highest rated movie: movieId with the largest mean rating
# (idxmax returns the first one when several movies tie).
highest_rated = mean_rating['rating'].idxmax()
# Look the ID up in movies_df -- the frame loaded from movies.csv.
# (`movies` was never defined in this notebook and raised NameError
# on a fresh kernel.)
movies_df.loc[movies_df['movieId'] == highest_rated]

Unnamed: 0,movieId,title,genres
48,53,Lamerica (1994),Adventure|Drama


In [23]:
# Show every individual rating given to the highest-rated movie, to see
# how many users its mean rating is actually based on.
is_highest_rated = ratings_df['movieId'] == highest_rated
ratings_df.loc[is_highest_rated]

Unnamed: 0,userId,movieId,rating,timestamp
13368,85,53,5.0,889468268
96115,603,53,5.0,963180003


In [24]:
# Show every individual rating given to the lowest-rated movie, to see
# how many users its mean rating is actually based on.
is_lowest_rated = ratings_df['movieId'] == lowest_rated
ratings_df.loc[is_lowest_rated]

Unnamed: 0,userId,movieId,rating,timestamp
13633,89,3604,0.5,1520408880


In [25]:
# Per-movie rating count and mean rating (columns 'count' and 'mean',
# indexed by movieId). These are the inputs a Bayesian average would need;
# the Bayesian average itself is not computed in this cell.
movie_stats = ratings_df.groupby('movieId')['rating'].agg(['count', 'mean'])

## **Find similar movies using KNN**

In [42]:
def Creation_Matrix(df):
    """Build a sparse movie-by-user rating matrix and its ID/index look-ups.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns:
        A 5-tuple ``(Matrix, Users_Map, Movies_Map, Users_index_Map,
        Movies_index_Map)`` where

        * Matrix is a scipy CSR matrix of shape (n_movies, n_users); rows are
          movies, columns are users, values are ratings.
        * Users_Map / Movies_Map map each user / movie ID to its column / row
          index in Matrix.
        * Users_index_Map / Movies_index_Map are the inverse look-ups
          (index -> ID).
    """
    # np.unique returns the IDs sorted, so index i belongs to the i-th
    # smallest ID.
    unique_users = np.unique(df["userId"])
    unique_movies = np.unique(df["movieId"])
    n_users = len(unique_users)
    n_movies = len(unique_movies)

    # ID -> index
    user_to_index = {user: position for position, user in enumerate(unique_users)}
    movie_to_index = {movie: position for position, movie in enumerate(unique_movies)}

    # index -> ID
    index_to_user = dict(enumerate(unique_users))
    index_to_movie = dict(enumerate(unique_movies))

    # Coordinates of every rating inside the matrix.
    row_positions = [movie_to_index[movie] for movie in df["movieId"]]
    col_positions = [user_to_index[user] for user in df["userId"]]

    rating_matrix = csr_matrix(
        (df["rating"], (row_positions, col_positions)),
        shape=(n_movies, n_users),
    )

    return rating_matrix, user_to_index, movie_to_index, index_to_user, index_to_movie

In [43]:
# Build the movie-by-user rating matrix and the ID <-> index look-up tables
# from the full ratings frame; later cells read these names as globals.
Matrix, Users_Map, Movies_Map, Users_index_Map, Movies_index_Map = Creation_Matrix(ratings_df)

In [80]:
#define a function to find the similar movies
def Find_Similar_Movies(movie_id, Matrix, k, metric='cosine', show_distance=False):
    """Return the k movies whose rating vectors are closest to ``movie_id``.

    Reads the module-level look-ups ``Movies_Map`` (movieId -> row index) and
    ``Movies_index_Map`` (row index -> movieId) produced by ``Creation_Matrix``.

    Args:
        movie_id: ID of the query movie; must be a key of ``Movies_Map``.
        Matrix: sparse movie-by-user rating matrix (one row per movie).
        k: number of similar movies to return.
        metric: distance metric passed to ``NearestNeighbors``.
        show_distance: when True, return ``(movieId, distance)`` pairs instead
            of bare movie IDs.

    Returns:
        A list of at most ``k`` movie IDs (or ``(movieId, distance)`` tuples),
        nearest first, never containing ``movie_id`` itself.

    Raises:
        KeyError: if ``movie_id`` is not present in ``Movies_Map``.
    """
    #define movies index and vector
    Movies_Index = Movies_Map[movie_id]
    Movies_Vector = Matrix[Movies_Index].reshape(1, -1)

    #define KNN model
    # Ask for one extra neighbour because the query movie is normally its own
    # nearest neighbour, but never more than the number of movies available.
    n_neighbors = min(k + 1, Matrix.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    #fit KNN model
    kNN.fit(Matrix)
    # Always fetch distances so the result has one shape to unpack; whether
    # they are exposed to the caller is decided below.
    Distances, Indices = kNN.kneighbors(Movies_Vector, return_distance=True)

    Neighbours_List = []
    for distance, index in zip(Distances[0], Indices[0]):
        # Drop the query movie by index rather than by position: with tied
        # distances it is not guaranteed to be the first result.
        if index == Movies_Index:
            continue
        neighbour_id = Movies_index_Map[index]
        Neighbours_List.append((neighbour_id, distance) if show_distance else neighbour_id)

    # If the query movie was not among the results, trim the surplus entry.
    return Neighbours_List[:k]

In [81]:
# Look-up table from movieId to title, built from movies_df (the frame
# loaded from movies.csv).
Movies_Titles = dict(zip(movies_df['movieId'], movies_df['title']))
movie_id = 6

# Matrix is the movie-by-user rating matrix returned by Creation_Matrix.
Similar_IDs = Find_Similar_Movies(movie_id, Matrix, k=10)
Movie_Title = Movies_Titles[movie_id]

print(f"Since you watched the {Movie_Title} Movie")
for i in Similar_IDs:
    print(Movies_Titles[i])

Since you watched the Heat (1995) Movie
Rock, The (1996)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Léon: The Professional (a.k.a. The Professional) (Léon) (1994)
Casino (1995)
Fargo (1996)
Fugitive, The (1993)
Seven (a.k.a. Se7en) (1995)
Broken Arrow (1996)
Desperado (1995)
Independence Day (a.k.a. ID4) (1996)


## **Recommend Movies for Users**

In [83]:
def Recommend_Movies_For_Users(user_id, Matrix, Users_Map, Movies_Map, movie_inv_mapper, k=10):
    """Print up to ``k`` movies similar to the user's top-rated movie.

    The user's ratings are read from the module-level ``ratings_df`` and the
    titles from the module-level ``movies_df``. The seed movie is the first
    movie (in ``ratings_df`` order) carrying the user's highest rating.

    Args:
        user_id: ID of the user to recommend for.
        Matrix: sparse movie-by-user rating matrix passed on to
            ``Find_Similar_Movies``.
        Users_Map: not used by this function; kept so existing callers work.
        Movies_Map: not used by this function; kept so existing callers work.
        movie_inv_mapper: not used by this function; kept so existing
            callers work.
        k: number of recommendations to print.

    Returns:
        None. Results (or a "not found" message) are printed.
    """
    Users_df = ratings_df[ratings_df['userId'] == user_id]

    if Users_df.empty:
        print(f"The User with ID {user_id} Does Not Exist.")
        return

    # First movie that received this user's highest rating.
    Top_Rating = Users_df['rating'].max()
    Movie_ID = Users_df[Users_df['rating'] == Top_Rating]['movieId'].iloc[0]

    Movie_Titles = dict(zip(movies_df['movieId'], movies_df['title']))
    Movie_Title = Movie_Titles.get(Movie_ID)

    # Check the title before running the (comparatively expensive) KNN search.
    if Movie_Title is None:
        print(f"Movie with ID {Movie_ID} not found.")
        return

    Similar_IDs = Find_Similar_Movies(Movie_ID, Matrix, k)

    print(f"Since you watched {Movie_Title}, you might also like:")
    for i in Similar_IDs:
        print(Movie_Titles.get(i, "Movie not found"))

In [84]:
user_id = 150
# Pass Movies_index_Map (index -> movieId, returned by Creation_Matrix) as the
# inverse mapper; `movie_inv_mapper` was never defined in this notebook.
Recommend_Movies_For_Users(user_id, Matrix, Users_Map, Movies_Map, Movies_index_Map, k=10)

Since you watched Twelve Monkeys (a.k.a. 12 Monkeys) (1995), you might also like:
Pulp Fiction (1994)
Terminator 2: Judgment Day (1991)
Independence Day (a.k.a. ID4) (1996)
Seven (a.k.a. Se7en) (1995)
Fargo (1996)
Fugitive, The (1993)
Usual Suspects, The (1995)
Jurassic Park (1993)
Star Wars: Episode IV - A New Hope (1977)
Heat (1995)
