# Importing Dependencies

In [34]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Loading the Data 

In [26]:
ratings = pd.read_csv("ml-25m/ratings.csv")
movies = pd.read_csv("ml-25m/movies.csv")

## Bayesian Average

In [27]:
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
C = movie_stats['count'].mean()
m = movie_stats['mean'].mean()

def bayesian_avg(ratings):
    bayesian_avg = (C*m+ratings.sum())/(C+ratings.count())
    return round(bayesian_avg, 3)

bayesian_avg_ratings = ratings.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()
bayesian_avg_ratings.columns = ['movieId', 'bayesian_avg']
movie_stats = movie_stats.merge(bayesian_avg_ratings, on='movieId')

movie_stats = movie_stats.merge(movies[['movieId', 'title']])
movie_stats.sort_values(by='bayesian_avg', ascending=False)

Unnamed: 0,movieId,count,mean,bayesian_avg,title
314,318,81482,4.413576,4.407,"Shawshank Redemption, The (1994)"
840,858,52498,4.324336,4.314,"Godfather, The (1972)"
49,50,55366,4.284353,4.275,"Usual Suspects, The (1995)"
1190,1221,34188,4.261759,4.247,"Godfather: Part II, The (1974)"
522,527,60411,4.247579,4.239,Schindler's List (1993)
...,...,...,...,...,...
9603,31698,633,1.232227,1.969,Son of the Mask (2005)
11349,50798,1180,1.457203,1.883,Epic Movie (2007)
6464,6587,758,1.214380,1.880,Gigli (2003)
4669,4775,669,1.125561,1.880,Glitter (2001)


# Dealing with the genres column

In [28]:
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

# Colaborative Filtering

In [29]:
def create_matrix(df):
  
    unique_users = df['userId'].nunique()
    unique_movies = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(unique_users))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(unique_movies))))
    
    user_inv_mapper = dict(zip(list(range(unique_users)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(unique_movies)), np.unique(df["movieId"])))
    
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]

    matrix = csr_matrix((df["rating"], (user_index,item_index)), shape=(unique_users,unique_movies))
    
    return matrix, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

matrix, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [30]:
matrix.shape

(162541, 59047)

# Item-Item Colaborative Filtering with K-nearest Neighbors

In [31]:
def find_similar_movies(movie_id, matrix, movie_mapper, movie_inv_mapper, k, metric='cosine'):
  
    matrix = matrix.T
    neighbour_ids = []
    
    movie_ind = movie_mapper[movie_id]
    movie_vec = matrix[movie_ind]
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(matrix)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [32]:
movie_titles = dict(zip(movies['movieId'], movies['title']))

movie_id = 1

similar_movies = find_similar_movies(movie_id, matrix, movie_mapper, movie_inv_mapper, metric='cosine', k=10)
movie_title = movie_titles[movie_id]
print(f"Because you watched {movie_title}:")
print(" ")
for i in similar_movies:
    print(movie_titles[i])

Because you watched Toy Story (1995):
 
Star Wars: Episode IV - A New Hope (1977)
Toy Story 2 (1999)
Back to the Future (1985)
Forrest Gump (1994)
Jurassic Park (1993)
Star Wars: Episode VI - Return of the Jedi (1983)
Independence Day (a.k.a. ID4) (1996)
Lion King, The (1994)
Aladdin (1992)


In [33]:
from fuzzywuzzy import process

def movie_finder(title, movies_df):
    all_titles = movies_df['title'].tolist()
    closest_match = process.extractOne(title, all_titles)
    matched_title = closest_match[0]  # Get the best match title
    matched_id = movies_df[movies_df['title'] == matched_title]['movieId'].iloc[0]  # Get the corresponding movie ID
    return matched_id, matched_title


movie_input = input("")

matched_id, matched_title = movie_finder(movie_input, movies)

# Now use the matched movie ID to find similar movies
similar_movies_ids = find_similar_movies(matched_id, matrix, movie_mapper, movie_inv_mapper, k=10)

print(f"Because you watched {matched_title}:")
print("")
for movie_id in similar_movies_ids:
    print(movie_titles[movie_id])


toy stori
Because you watched Toy Story (1995):

Star Wars: Episode IV - A New Hope (1977)
Toy Story 2 (1999)
Back to the Future (1985)
Forrest Gump (1994)
Jurassic Park (1993)
Star Wars: Episode VI - Return of the Jedi (1983)
Independence Day (a.k.a. ID4) (1996)
Lion King, The (1994)
Aladdin (1992)
