In [12]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_csv('ratings.csv')
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,title,rating
0,1,8,Fight Club,3.7
1,1,1,The Shawshank Redemption,4.6
2,1,24,The Departed,4.3
3,1,9,Forrest Gump,4.4
4,1,32,The Pianist,3.5


In [3]:
df.head()

Unnamed: 0,userId,movieId,title,rating
0,1,8,Fight Club,3.7
1,1,1,The Shawshank Redemption,4.6
2,1,24,The Departed,4.3
3,1,9,Forrest Gump,4.4
4,1,32,The Pianist,3.5


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   120 non-null    int64  
 1   movieId  120 non-null    int64  
 2   title    120 non-null    object 
 3   rating   120 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 3.9+ KB


In [5]:
movieId = {uid : i for i, uid in enumerate(df['title'].unique())}
inv_movieId = {i : uid for uid, i in movieId.items()}

In [6]:
movieId

{'Fight Club': 0,
 'The Shawshank Redemption': 1,
 'The Departed': 2,
 'Forrest Gump': 3,
 'The Pianist': 4,
 'Joker': 5,
 "Schindler's List": 6,
 'Mad Max: Fury Road': 7,
 'The Silence of the Lambs': 8,
 'Whiplash': 9,
 'Avengers: Endgame': 10,
 'Shutter Island': 11,
 'The Green Mile': 12,
 'Se7en': 13,
 'Interstellar': 14,
 'Parasite': 15,
 '12 Angry Men': 16,
 'The Dark Knight': 17,
 'Goodfellas': 18,
 'The Social Network': 19,
 'The Lion King': 20,
 'The Prestige': 21,
 'The Lord of the Rings: The Return of the King': 22,
 'The Godfather': 23,
 'Django Unchained': 24,
 'The Matrix': 25,
 'The Usual Suspects': 26,
 'Pulp Fiction': 27,
 'The Wolf of Wall Street': 28,
 'Inception': 29,
 'Saving Private Ryan': 30,
 'Gladiator': 31}

In [7]:
inv_movieId

{0: 'Fight Club',
 1: 'The Shawshank Redemption',
 2: 'The Departed',
 3: 'Forrest Gump',
 4: 'The Pianist',
 5: 'Joker',
 6: "Schindler's List",
 7: 'Mad Max: Fury Road',
 8: 'The Silence of the Lambs',
 9: 'Whiplash',
 10: 'Avengers: Endgame',
 11: 'Shutter Island',
 12: 'The Green Mile',
 13: 'Se7en',
 14: 'Interstellar',
 15: 'Parasite',
 16: '12 Angry Men',
 17: 'The Dark Knight',
 18: 'Goodfellas',
 19: 'The Social Network',
 20: 'The Lion King',
 21: 'The Prestige',
 22: 'The Lord of the Rings: The Return of the King',
 23: 'The Godfather',
 24: 'Django Unchained',
 25: 'The Matrix',
 26: 'The Usual Suspects',
 27: 'Pulp Fiction',
 28: 'The Wolf of Wall Street',
 29: 'Inception',
 30: 'Saving Private Ryan',
 31: 'Gladiator'}

In [8]:
print(Counter(df['title']))

Counter({'Fight Club': 9, 'Parasite': 8, 'The Pianist': 7, 'Mad Max: Fury Road': 7, 'Forrest Gump': 6, 'Joker': 6, 'Avengers: Endgame': 6, 'The Dark Knight': 5, 'The Social Network': 5, 'The Departed': 4, 'The Green Mile': 4, 'The Lord of the Rings: The Return of the King': 4, 'Saving Private Ryan': 4, "Schindler's List": 3, 'Whiplash': 3, 'Shutter Island': 3, 'Interstellar': 3, '12 Angry Men': 3, 'Goodfellas': 3, 'The Lion King': 3, 'The Godfather': 3, 'The Usual Suspects': 3, 'The Wolf of Wall Street': 3, 'The Silence of the Lambs': 2, 'Se7en': 2, 'Django Unchained': 2, 'Pulp Fiction': 2, 'Inception': 2, 'Gladiator': 2, 'The Shawshank Redemption': 1, 'The Prestige': 1, 'The Matrix': 1})


In [15]:
def create_matrix(df):
    user_mapper = {uid: i for i, uid in enumerate(df['userId'].unique())}
    movie_mapper = {mid: i for i, mid in enumerate(df['movieId'].unique())}
    movie_inv_mapper = {i: mid for mid, i in movie_mapper.items()}

    user_index = df['userId'].map(user_mapper)
    movie_index = df['movieId'].map(movie_mapper)

    X = csr_matrix((df["rating"], (movie_index, user_index)),
                   shape=(len(movie_mapper), len(user_mapper)))
    return X, movie_mapper, movie_inv_mapper


X, movie_mapper, movie_inv_mapper = create_matrix(ratings)

user_item_matrix = ratings.pivot_table(
    index="title", columns="userId", values="rating")
print(user_item_matrix.iloc[:10, :15])

userId              1    2    3    4    5    6    7    8    9    10   11   12  \
title                                                                           
12 Angry Men       NaN  NaN  3.6  NaN  NaN  NaN  NaN  4.1  NaN  NaN  NaN  NaN   
Avengers: Endgame  NaN  3.5  NaN  NaN  4.5  4.1  NaN  4.3  4.6  3.7  NaN  NaN   
Django Unchained   NaN  NaN  NaN  NaN  NaN  4.9  NaN  NaN  4.6  NaN  NaN  NaN   
Fight Club         3.7  3.7  NaN  3.7  NaN  3.8  3.8  NaN  4.0  NaN  3.6  NaN   
Forrest Gump       4.4  NaN  NaN  NaN  3.7  NaN  4.8  NaN  4.6  NaN  NaN  4.1   
Gladiator          NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  4.3   
Goodfellas         NaN  NaN  3.6  NaN  NaN  NaN  NaN  NaN  NaN  NaN  4.9  NaN   
Inception          NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  4.9  4.6  NaN   
Interstellar       NaN  4.9  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN   
Joker              3.6  NaN  NaN  NaN  NaN  4.2  NaN  3.6  5.0  4.1  NaN  NaN   

userId              13   14

In [16]:
def recommend_similar(movie_title, df, X, movie_mapper, movie_inv_mapper, k=5):
    movie_id = df[df['title'] == movie_title]['movieId'].iloc[0]
    movie_idx = movie_mapper[movie_id]
    movie_vec = X[movie_idx]

    model = NearestNeighbors(metric='cosine', algorithm='brute')
    model.fit(X)
    distances, indices = model.kneighbors(movie_vec, n_neighbors=k + 1)

    neighbor_ids = [movie_inv_mapper[i] for i in indices.flatten()[1:]]
    recommendations = df[df['movieId'].isin(neighbor_ids)]['title'].unique()

    print(f"\nBecause you liked **{movie_title}**, you might also enjoy:")
    for rec in recommendations:
        print(f"- {rec}")

In [17]:
recommend_similar("The Dark Knight", ratings, X,
                  movie_mapper, movie_inv_mapper, k=5)


Because you liked **The Dark Knight**, you might also enjoy:
- Fight Club
- The Pianist
- Parasite
- 12 Angry Men
- The Lord of the Rings: The Return of the King


In [18]:
movie_id = df[df['title'] == movie_title]['movieId'].iloc[0]

NameError: name 'movie_title' is not defined