In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Load data
movies = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv")
ratings = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv")

print(movies,"\n ",ratings)

      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                  

In [2]:
# Create user-item matrix
def create_matrix(df):
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())
   # print("total usrid ",N,"\ntotal movie id ",M)
    
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))
    #print("userr mapper\n",user_mapper,"\n","moviee mapper \n",movie_mapper)
    
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))
    
    #print("user inv \n",user_inv_mapper,"\n","movie inv \n",movie_inv_mapper)
    
    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]
    
    #print("user index\n",user_index,"\nmovie index\n",movie_index)
    
    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))  # Transposed

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)




In [3]:
# Find similar movies using KNN
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    neighbour_ids = []
    
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)  # +1 to account for itself
    kNN.fit(X)
    
    movie_vec = movie_vec.reshape(1, -1)
    neighbour = kNN.kneighbors(movie_vec, n_neighbors=k+1, return_distance=show_distance)
    
    similar_indices = neighbour[0][1:] if len(neighbour) > 0 else []
    neighbour_ids = [movie_inv_mapper[n] for n in similar_indices]
    
    return neighbour_ids






In [4]:
# Popularity-Based Recommendations for New Users
def recommend_popular_movies(n_recommendations):
    popular_movies = ratings.groupby('movieId').agg({'rating': ['count', 'mean']})
    popular_movies.columns = popular_movies.columns.droplevel()
    popular_movies = popular_movies[popular_movies['count'] >= 100]  # Filter out less popular movies
    popular_movies = popular_movies.sort_values(by='mean', ascending=False).head(n_recommendations)
    return popular_movies.index.tolist()



In [5]:
# Handling Recommendations for Users (Both Old and New)
def recommend_movies_for_user(user_id, n_recommendations):
    if user_id in user_mapper:
        # Existing user, use collaborative filtering
        user_ind = user_mapper[user_id]
        similar_ids = find_similar_movies(user_id, X, k=n_recommendations)
        recommended_movies = [movie_titles[movie_id] for movie_id in similar_ids]
    else:
        # New user, use popularity-based recommendations
        recommended_movie_ids = recommend_popular_movies(n_recommendations)
        recommended_movies = [movie_titles[movie_id] for movie_id in recommended_movie_ids]
    return recommended_movies



In [6]:
# Movie titles dictionary
movie_titles = dict(zip(movies['movieId'], movies['title']))

# Test the recommendation function for an existing user
existing_user_id = 1
n_recommendations = 10
existing_user_recommendations = recommend_movies_for_user(existing_user_id, n_recommendations)

print(f"Recommendations for Existing User {existing_user_id}:")
for movie_title in existing_user_recommendations:
    print(movie_title)

# Test the recommendation function for a new user
new_user_id = ratings['userId'].max() + 1
new_user_recommendations = recommend_movies_for_user(new_user_id, n_recommendations)

print(f"\n\nRecommendations for New User {new_user_id}:")
for movie_title in new_user_recommendations:
    print(movie_title)


Recommendations for Existing User 1:
Toy Story 2 (1999)
Jurassic Park (1993)
Independence Day (a.k.a. ID4) (1996)
Star Wars: Episode IV - A New Hope (1977)
Forrest Gump (1994)
Lion King, The (1994)
Star Wars: Episode VI - Return of the Jedi (1983)
Mission: Impossible (1996)
Groundhog Day (1993)
Back to the Future (1985)


Recommendations for New User 611:
Shawshank Redemption, The (1994)
Godfather, The (1972)
Fight Club (1999)
Godfather: Part II, The (1974)
Departed, The (2006)
Goodfellas (1990)
Casablanca (1942)
Dark Knight, The (2008)
Usual Suspects, The (1995)
Princess Bride, The (1987)
