# **Book Recommendation: k-Nearest Neighbors (Item Based)**

# KNN Algorithm Overview



1.   Basic Concept:

*   KNN classifies a data point based on the majority label of its k nearest neighbors in the feature space.
*   In regression, the output is the average (or weighted average) of the labels of k nearest neighbors.


2.   Key Features:

*   Instance-based: It does not build an explicit model but uses the dataset as its model.
*   Distance Metric: Determines the similarity between data points using metrics like Euclidean, Manhattan, or Minkowski distance.


3.   Parameters:

*   k: Number of neighbors to consider.
*   Distance metric: Defines how "closeness" is calculated


In [100]:
!pip install datasets



In [101]:
!pip install pandas requests onedrivedownloader



# Import and Load Data

In [102]:
import os
import requests
import io
import pandas as pd

from onedrivedownloader import download

dataset_file_path = "../Dataset/"
dataset_file_name = "filtered_user_rating.csv"
file_path = os.path.join(dataset_file_path, dataset_file_name)

if not os.path.isfile(file_path):
  print(f"Dataset not found at {dataset_file_path}, downloading from OneDrive...")

  # Replace with your direct OneDrive link for the csv file
  onedrive_link = "https://indianinstituteofscience-my.sharepoint.com/:x:/g/personal/rishavg_iisc_ac_in/ET-n21kcA3tIh-n2BjHvLjMBWI-sTFpE0O6zdUDLokuajQ?e=JZ4NjZ"

  #download(onedrive_link, filename="filtered_user_rating.csv")
  download(onedrive_link, filename=os.path.join(dataset_file_path, dataset_file_name))
  print("Dataset download completed...")
else:
  print(f"File {dataset_file_name} found at {dataset_file_path}")

File filtered_user_rating.csv found at ../Dataset/


In [103]:
import pandas as pd
all_users_rating_df = pd.read_csv("filtered_user_rating.csv", sep=',', on_bad_lines='skip')

In [104]:
all_users_rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203540 entries, 0 to 203539
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   customer_id      203540 non-null  int64  
 1   review_id        203540 non-null  object 
 2   product_id       203540 non-null  object 
 3   product_title    203540 non-null  object 
 4   star_rating      203540 non-null  float64
 5   helpful_votes    203540 non-null  float64
 6   total_votes      203540 non-null  float64
 7   review_headline  203540 non-null  object 
 8   review_date      203540 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 14.0+ MB


In [105]:
all_users_rating_df.shape

(203540, 9)

## Data Cleaning and Preprocessing


In [106]:
# Drop Rows with Any Missing Values
all_users_rating_df.dropna(inplace=True)

all_users_rating_df.shape

(203540, 9)

In [107]:
#Number of distinct customers, and distinct products.
distinct_customer_ids = all_users_rating_df['customer_id'].unique()
distinct_product_ids = all_users_rating_df['product_id'].unique()

print(f"Number of distinct customers: {len(distinct_customer_ids)}")
print(f"Number of distinct products: {len(distinct_product_ids)}")


Number of distinct customers: 81797
Number of distinct products: 89507


In [108]:
# # Filtering out the data with star_rating >= 3 to focus on positive interactions
# filter_user_rating_df = all_users_rating_df[all_users_rating_df['star_rating'] >= 3]
# filter_user_rating_df.shape
filter_user_rating_df = all_users_rating_df

## Prepare dataset for KNN

In [109]:
# Map categorical IDs to numerical indices (sparse matrix requires numerical indices)
customer_mapping = {id_: idx for idx, id_ in enumerate(filter_user_rating_df['customer_id'].unique())}
product_mapping = {id_: idx for idx, id_ in enumerate(filter_user_rating_df['product_id'].unique())}

filter_user_rating_df['customer_idx'] = filter_user_rating_df['customer_id'].map(customer_mapping)
filter_user_rating_df['product_idx'] = filter_user_rating_df['product_id'].map(product_mapping)

filter_user_rating_df.head()

Unnamed: 0,customer_id,review_id,product_id,product_title,star_rating,helpful_votes,total_votes,review_headline,review_date,customer_idx,product_idx
0,51964897,R1TNWRKIVHVYOV,262181533,The Psychology of Proof: Deductive Reasoning i...,4.0,0.0,2.0,Execellent cursor examination,2005-10-14,0,0
1,24853483,RCYSGJQVQLD3R,373513194,Kiss of the Blue Dragon (Silhouette Bombshell),4.0,0.0,0.0,A different sort of futuristic & very interest...,2005-10-14,1,1
2,50122160,R36ACJURUNHD38,1410202984,Dahcotah: Life and Legends of the Sioux,5.0,0.0,0.0,A groundbreaking look into Sioux (Dakota) cust...,2005-10-14,2,2
3,50122160,R3QP8VTFWA343T,816524718,Navajo Nation Peacemaking: Living Traditional ...,5.0,0.0,1.0,An anthology of essays offering insights from ...,2005-10-14,2,3
4,47412112,R229JMAAVX4SMK,1591160529,"Inuyasha, Volume 5",5.0,0.0,0.0,TONIGHT I'M A BOY,2005-10-14,3,4


In [110]:
from scipy.sparse import csr_matrix
#Sparse matrix: rows = products, columns = customers, values = ratings
sparse_matrix = csr_matrix((filter_user_rating_df['star_rating'], (filter_user_rating_df['product_idx'], filter_user_rating_df['customer_idx'])))
sparse_matrix

<89507x81797 sparse matrix of type '<class 'numpy.float64'>'
	with 193686 stored elements in Compressed Sparse Row format>

In [111]:
#Split into training and testing datasets
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(filter_user_rating_df, test_size=0.3, random_state=42)
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

Train data shape: (142478, 11)
Test data shape: (61062, 11)


In [112]:
# Build a sparse matrix for the train dataset
train_sparse_matrix = csr_matrix((train_data['star_rating'],
                                  (train_data['product_idx'], train_data['customer_idx'])))
train_sparse_matrix

<89507x81797 sparse matrix of type '<class 'numpy.float64'>'
	with 137014 stored elements in Compressed Sparse Row format>

In [113]:
# Fit KNN Model
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5, n_jobs=-1)
knn.fit(train_sparse_matrix)

In [114]:
# Reverse mapping for product indices
reverse_product_mapping = {idx: id_ for id_, idx in product_mapping.items()}

In [115]:
# Recommend books similar to the given product_id using KNN.
def recommend_books(product_id, n_recommendations=5):
    """
    Recommends books similar to the given product_id using KNN.

    Args:
        product_id: The ID of the product to find recommendations for.
        n_recommendations: The number of recommendations to generate.

    Returns:
        A tuple containing two lists:
            - recommended_products: A list of recommended product IDs.
            - distances: A list of distances corresponding to the recommended products.
    """

    # Get the index for the given product_id
    product_idx = product_mapping.get(product_id)
    if product_idx is None or product_idx >= train_sparse_matrix.shape[0]:
        return f"Product ID {product_id} not found in the training data."

    # Find K nearest neighbors
    distances, indices = knn.kneighbors(train_sparse_matrix[product_idx], n_neighbors=n_recommendations + 1)

    # Get recommended product IDs and distances
    recommended_indices = indices.flatten()[1:]  # Exclude the input product itself
    recommended_products = [reverse_product_mapping[idx] for idx in recommended_indices]
    distances = distances.flatten()[1:]  # Exclude the distance to the input product itself

    return recommended_products, distances  # Return both recommendations and distances

## Create get_recommends()

In [116]:
import random

# Get a random index within the range of the DataFrame's length
random_index = random.randint(0, len(product_mapping) - 1)

# Access the product_id at the random index
test_product_id = list(product_mapping.keys())[random_index]

# Get recommendations and distances
recommendations, distances = recommend_books(test_product_id, n_recommendations=5)  # Get both values

# Display the recommendations and distances
print(f"Recommendations for Product ID {test_product_id}:")
for product_id, distance in zip(recommendations, distances):  # Iterate through both lists
    print(f"Product ID: {product_id}, Distance: {distance:.4f}")


    # 0807219916, 3927552003, 0446608386, 1928936229, 0596004877, 0434410152

Recommendations for Product ID 0786928484:
Product ID: 0786011327, Distance: 1.0000
Product ID: 038549887X, Distance: 1.0000
Product ID: 0201154870, Distance: 1.0000
Product ID: 0849381185, Distance: 1.0000
Product ID: 0761314105, Distance: 1.0000


In [117]:
# prompt: randomly select 10 distinct product_ids from test_data and generate a map of product_id and product_title

# Assuming 'test_data' is your DataFrame (replace with your actual DataFrame name)
# and it has columns 'product_id' and 'product_title'

def get_product_map(test_data, num_products=10):
    """Randomly selects distinct product IDs and creates a product ID-title map."""

    # Ensure 'product_id' is unique and there are enough unique product ids
    unique_products = test_data['product_id'].unique()
    if len(unique_products) < num_products:
        num_products = len(unique_products)
        print(f"Warning: Only {num_products} unique products available.")

    selected_product_ids = random.sample(list(unique_products), num_products)
    product_map = {}
    for product_id in selected_product_ids:
      product_title = test_data[test_data['product_id'] == product_id]['product_title'].iloc[0]
      product_map[product_id] = product_title
    return product_map

# Example usage:
product_id_title_map = get_product_map(test_data)
product_id_title_map

{'0115915095': 'Documents on British Foreign Policy, 1919-39: 1920-German Affairs 1st Series, v. 9',
 '0375402314': 'Midnight in the Garden of Good and Evil (AUDIO CD)',
 '0060196572': "The Girls Of Summer: The U.S. Women's Soccer Team and How It Changed The World",
 '0590634275': 'Captain Underpants and the Attack of the Talking Toilets',
 '0449002616': 'Vampire Virus',
 '0812570693': 'Brotherhood of the Wolf (The Runelords, Book Two)',
 'B0006ALAJE': 'Hunger and love,',
 '9686636498': 'Apoyo y Estímulo:¡LA ESPOSA IDEAL! (The Ideal Wife )',
 '0802713750': 'Lusitania: An Epic Tragedy',
 '0684870754': 'Life Lessons: Two Experts on Death and Dying Teach Us About the Mysteries of Life and Living'}

In [124]:
# prompt: create a dropdown of product_title and no of recommendations, map product_title to product_id in backend and use it to recommend the books using above method, print product_id, Product_title and it;s distance in the output on clicking the submit button

import ipywidgets as widgets
from IPython.display import display, clear_output

# Assuming the necessary libraries and variables (product_mapping, recommend_books, etc.) are defined as in the original code.

# Create the dropdown for product selection
product_options = list(product_id_title_map.values()) # Use product titles for dropdown
product_dropdown = widgets.Dropdown(
    options=product_options,
    value=product_options[0],
    description='Product Title:'
)

# Create the dropdown for the number of recommendations
recommendations_dropdown = widgets.Dropdown(
    options=[1, 2, 3, 4, 5],
    value=5,
    description='No. of Recommendations:'
)

# Create the submit button
submit_button = widgets.Button(description='Submit')

# Create an output area for displaying the results
output_area = widgets.Output()

# Function to handle button click
def on_submit_button_clicked(b):
    with output_area:
        clear_output()  # Clear previous output

        selected_product_title = product_dropdown.value
        selected_num_recommendations = recommendations_dropdown.value

        # Find corresponding product_id from the title in product_id_title_map
        product_id = None
        for id, title in product_id_title_map.items():
            if title == selected_product_title:
                product_id = id
                break
        if product_id is None:
          print(f"Error: Product ID not found for the selected product title '{selected_product_title}'.")
          return

        # Get the recommendations
        recommendations, distances = recommend_books(product_id, selected_num_recommendations)

        print(f"Recommendations for Product Title '{selected_product_title}' (Product ID: {product_id}):\n")
        if isinstance(recommendations, str):
          print(recommendations) # Handle the case where product_id is not found
        else:
          for product_id, distance in zip(recommendations, distances):
              product_title = filter_user_rating_df[filter_user_rating_df['product_id'] == product_id]['product_title'].iloc[0] if not filter_user_rating_df[filter_user_rating_df['product_id'] == product_id].empty else "Product title not available"
              # print(f"Product ID: {product_id}, Product Title: {product_title}, Distance: {distance:.4f}")
              print(f"{product_title}")


# Attach the click event handler to the submit button
submit_button.on_click(on_submit_button_clicked)

# Display the widgets
display(product_dropdown)
display(recommendations_dropdown)
display(submit_button)
display(output_area)

Dropdown(description='Product Title:', options=('Documents on British Foreign Policy, 1919-39: 1920-German Aff…

Dropdown(description='No. of Recommendations:', index=4, options=(1, 2, 3, 4, 5), value=5)

Button(description='Submit', style=ButtonStyle())

Output()