# KNN 1

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import operator

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Only the first three tab-separated columns of u.data are read; they are
# labelled user_id, movie_id and rating (the file itself has no header row).
Rcols = ['user_id', 'movie_id', 'rating']
Ratings = pd.read_csv(
    r"D:\Machine learning\Workbook\KNN\KNearest Neighbours\u.data",
    sep='\t',
    names=Rcols,
    usecols=[0, 1, 2],
)
Ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


Now we'll group everything by movie ID and compute, for every movie, the total number of ratings (the movie's popularity) and the average rating:

In [3]:
# Per-movie summary of the 'rating' column: 'size' is how many ratings the
# movie received, 'mean' is its average rating. Indexed by movie_id.
movieProperties = (
    Ratings
    .groupby('movie_id')['rating']
    .agg(size='size', mean='mean')
)
movieProperties.head()

Unnamed: 0_level_0,size,mean
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


The raw number of ratings isn't very useful for computing distances between movies, so we'll create a new Series that contains the min-max normalized number of ratings. A value of 0 means the movie has the fewest ratings of any movie in the dataset, and a value of 1 means it's the most popular movie there is.

In [4]:
# Min-max scale the per-movie rating counts: the smallest count maps to 0
# and the largest count maps to 1. The result keeps the movie_id index.
movieNormalizedNumRatings = movieProperties['size'].pipe(
    lambda counts: (counts - counts.min()) / (counts.max() - counts.min())
)
movieNormalizedNumRatings.head()

movie_id
1    0.773585
2    0.222985
3    0.152659
4    0.356775
5    0.145798
Name: size, dtype: float64

Now, let's get the genre information from the u.item file. The way this works is there are 19 fields, each corresponding to a specific genre - a value of '0' means it is not in that genre, and '1' means it is in that genre. A movie may have more than one genre associated with it.

While we're at it, we'll put together everything into one big Python dictionary called movieDict. Each entry will contain the movie name, list of genre values, the normalized popularity score, and the average rating for each movie:

In [5]:
# Build movieDict: movie ID -> (name, genre flags, normalized popularity, mean rating).
movieDict = {}
# Name the encoding explicitly instead of relying on the platform default,
# which differs between operating systems.
# NOTE(review): assumes this is the MovieLens 100K u.item file, which is
# distributed as ISO-8859-1 text -- confirm against the actual data file.
with open(r"D:\Machine learning\Workbook\KNN\KNearest Neighbours\u.item", encoding="ISO-8859-1") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # rather than crashing on int('').
            continue
        fields = line.split('|')
        movieID = int(fields[0])
        name = fields[1]
        # The 19 genre flags occupy fields 5 through 23 (one 0/1 value each).
        genres = np.array(list(map(int, fields[5:24])))
        # Both lookups raise KeyError if the movie never appears in Ratings.
        size = movieNormalizedNumRatings.loc[movieID]
        mean_rating = movieProperties.loc[movieID, 'mean']
        movieDict[movieID] = (name, genres, size, mean_rating)

In [6]:
# Show the first five (movie ID, entry) pairs in insertion order.
print([(key, movieDict[key]) for key in list(movieDict)[:5]])

[(1, ('Toy Story (1995)', array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.7735849056603774, 3.8783185840707963)), (2, ('GoldenEye (1995)', array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 0.22298456260720412, 3.2061068702290076)), (3, ('Four Rooms (1995)', array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 0.15265866209262435, 3.033333333333333)), (4, ('Get Shorty (1995)', array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.3567753001715266, 3.550239234449761)), (5, ('Copycat (1995)', array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 0.1457975986277873, 3.302325581395349))]


Now let's define a function that computes the "distance" between two movies based on how similar their genres are and how similar their popularity is. Just to make sure it works, we'll first look at the entries for movie IDs 51 and 53, and then compute the distance between them:

In [7]:
# Display the stored entry for movie 51: (name, genre flags, normalized popularity, mean rating).
movieDict[51]

('Legends of the Fall (1994)',
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1]),
 0.137221269296741,
 3.45679012345679)

In [8]:
# Display the stored entry for movie 53: (name, genre flags, normalized popularity, mean rating).
movieDict[53]

('Natural Born Killers (1994)',
 array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 0.21783876500857632,
 2.953125)

In [9]:
# Function to compute the distance between two movies and return the individual distances as well
def compute_distance(movie_a, movie_b):
    """Return the distance between two movie entries together with its parts.

    Parameters
    ----------
    movie_a, movie_b : sequence
        Entries shaped like the values of ``movieDict``: index 1 holds the
        genre flag vector and index 2 the normalized popularity. Other
        positions are not read.

    Returns
    -------
    tuple
        ``(total_distance, genre_distance, popularity_distance)`` where the
        total is the sum of the other two.
    """
    genres_a = np.asarray(movie_a[1])
    genres_b = np.asarray(movie_b[1])
    has_genres_a = bool(genres_a.any())
    has_genres_b = bool(genres_b.any())

    if has_genres_a and has_genres_b:
        genre_distance = cosine(genres_a, genres_b)
    elif has_genres_a or has_genres_b:
        # Cosine distance divides by both vector norms, so it is undefined
        # when one vector is all zeros. A movie with no genre flags shares no
        # genre with one that has some: treat them as maximally dissimilar.
        genre_distance = 1.0
    else:
        # Neither movie has any genre flag set: identical genre profiles.
        genre_distance = 0.0

    popularity_distance = abs(movie_a[2] - movie_b[2])
    total_distance = genre_distance + popularity_distance
    return total_distance, genre_distance, popularity_distance

In [10]:
# Sanity check: distance from movie 53 to movie 51, split into its parts
total_distance, genre_distance, popularity_distance = compute_distance(movieDict[53], movieDict[51])

# Report the total and its two components, one per line
print(
    f"Total Distance: {total_distance}",
    f"Genre Distance: {genre_distance}",
    f"Popularity Distance: {popularity_distance}",
    sep="\n",
)

Total Distance: 1.0806174957118353
Genre Distance: 1.0
Popularity Distance: 0.08061749571183532


In [11]:
# Ask the reader which movie to find neighbors for. int() raises ValueError
# on non-numeric input. NOTE(review): because this waits for keyboard input,
# the cell cannot run unattended (e.g. Restart & Run All) -- consider a
# constant in a config cell if reproducible output is needed.
movie_id = int(input("Enter the movie ID: "))

# Function to get the nearest neighbors of a movie
def get_neighbors(movie_id, k):
    """Return the IDs of the ``k`` movies in ``movieDict`` closest to ``movie_id``.

    Parameters
    ----------
    movie_id : key of ``movieDict``
        The movie to find neighbors for; it is never returned as its own
        neighbor.
    k : int
        Maximum number of neighbors to return. If fewer than ``k`` other
        movies exist, all of them are returned.

    Returns
    -------
    list
        Movie IDs ordered from smallest to largest total distance.

    Raises
    ------
    KeyError
        If ``movie_id`` is not a key of ``movieDict``.
    """
    # Look the query movie up once instead of on every loop iteration.
    target = movieDict[movie_id]

    distances = []
    for movie, candidate in movieDict.items():
        if movie != movie_id:
            # compute_distance returns (total, genre, popularity); only the
            # total is the ranking criterion, so keep just that value.
            total_distance = compute_distance(target, candidate)[0]
            distances.append((movie, total_distance))
    distances.sort(key=operator.itemgetter(1))

    # Slicing (unlike indexing range(k)) cannot run past the end of the list;
    # max() keeps a negative k meaning "no neighbors".
    return [movie for movie, _ in distances[:max(k, 0)]]

# Compute the average rating of the K nearest neighbors
K = 5
neighbors = get_neighbors(movie_id, K)
avg_rating = 0
for neighbor in neighbors:
    # Index 3 of a movieDict entry is the movie's mean rating, index 0 its name.
    avg_rating += movieDict[neighbor][3]
    print(f"{movieDict[neighbor][0]} {movieDict[neighbor][3]}")

# avg_rating is only a running sum at this point: divide by the number of
# neighbors actually returned to get the mean, then show it.
if neighbors:
    avg_rating /= len(neighbors)
    print(f"Average rating of the {len(neighbors)} nearest neighbors: {avg_rating}")

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
