In [1]:
import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine 
import operator
  
import warnings
warnings.filterwarnings('ignore')

In [2]:
ratings = pd.read_csv(r"D:\Machine learning\Workbook\Recommendation\Movie_I Recommender\ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies = pd.read_csv(r"D:\Machine learning\Workbook\Recommendation\Movie_I Recommender\movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
data = pd.merge(ratings, movies, on='movieId')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [5]:
data.groupby('title')['rating'].mean().sort_values(ascending=False).head()

title
Karlson Returns (1970)                                                         5.0
Zeitgeist: Moving Forward (2011)                                               5.0
Dream of Light (a.k.a. Quince Tree Sun, The) (Sol del membrillo, El) (1992)    5.0
Dragons: Gift of the Night Fury (2011)                                         5.0
12 Angry Men (1997)                                                            5.0
Name: rating, dtype: float64

In [6]:
data.groupby('title')['userId'].count().sort_values(ascending=False).head()

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: userId, dtype: int64

In [7]:
n_ratings = len(data)
n_movies = len(data['movieId'].unique())
n_users = len(data['userId'].unique())

In [8]:
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

Number of ratings: 100836
Number of unique movieId's: 9724
Number of unique users: 610
Average ratings per user: 165.3
Average ratings per movie: 10.37


In [10]:
user_freq = data[['userId', 'movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId', 'n_movies']
user_freq.head()

Unnamed: 0,userId,n_movies
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [14]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [9]:
# Get all unique genres from the DataFrame
unique_genres = set()
for genres in data['genres']:
    genre_list = genres.split('|')
    for genre in genre_list:
        unique_genres.add(genre)

# Convert the set to a sorted list
all_genres = sorted(list(unique_genres))
all_genres

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [10]:
# Convert genres into numerical vectors
def genre_to_vector(genres, all_genres):
    vector = [0] * len(all_genres)
    genre_list = genres.split('|')
    for genre in genre_list:
        if genre in all_genres:
            index = all_genres.index(genre)
            vector[index] = 1
    return vector

In [11]:
data['genres'][1]

'Comedy|Romance'

In [12]:
genre_to_vector(data['genres'][1], all_genres)

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

In [13]:
# Create movieDict
movieDict = {}
for index, row in data.iterrows():
    movieId = row['movieId']
    title = row['title']
    genre_vector = genre_to_vector(row['genres'], all_genres)
    rating = row['rating']
    movieDict[movieId] = (title, genre_vector, rating)

In [14]:
movieDict[1]

('Toy Story (1995)',
 [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 5.0)

In [15]:
# Function to compute the distance between two movies and return the individual distances as well
def compute_distance(movie_a, movie_b):
    genre_distance = cosine(movie_a[1], movie_b[1])
    rating_distance = abs(movie_a[2] - movie_b[2])
    total_distance = genre_distance + rating_distance
    return total_distance, genre_distance, rating_distance

In [16]:
# Function to get the nearest neighbors of a movie
def get_neighbors(movie_id, k):
    distances = []
    for movie in movieDict:
        if movie != movie_id:
            dist = compute_distance(movieDict[movie_id], movieDict[movie])
            distances.append((movie, dist))
    
    # Sort distances by the second item in the tuple (the distance)
    distances.sort(key=operator.itemgetter(1))
    
    neighbors = []
    for i in range(min(k, len(distances))):
        neighbors.append(distances[i][0])
    return neighbors

In [17]:
# Compute the average rating of the K nearest neighbors
movie_id = int(input("Enter the movie ID: "))

K = 5
neighbors = get_neighbors(movie_id, K)
avg_rating = 0
for neighbor in neighbors:
    avg_rating += movieDict[neighbor][2]
    print(f"{movieDict[neighbor][0]} {movieDict[neighbor][2]}")

if neighbors:
    avg_rating /= len(neighbors)
    print(f"Average Rating of Nearest Neighbors: {avg_rating}")
else:
    print("No neighbors found.")

Rear Window (1954) 5.0
Memento (2000) 5.0
Just Cause (1995) 5.0
Old Boy (2003) 5.0
Reservoir Dogs (1992) 5.0
Average Rating of Nearest Neighbors: 5.0
