# Dependencies

In [1]:
# imports
import pandas as pd 
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import joblib

# Load data

In [9]:
# Load the book catalogue; 'books.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv('books.csv')
df.head()

Unnamed: 0,BookID,Title,Author,Genre,Description
0,79,Harry Potter and the Sorcerer's Stone,J.K. Rowling,Fantasy,Harry Potter has no idea how famous he is. Tha...
1,81,IT,Stephen King,Horror,A promise made twenty-eight years ago calls se...
2,84,Harry Potter and the chamber of secrets,J.K. Rowling,Fantasy,Ever since Harry Potter had come home for the ...
3,85,Grokking Algorithms,Aditya Bhargava,Software Engineering,"Grokking Algorithms is a fully illustrated, fr..."
4,86,Introduction to Algorithms,Thomas H. Cormen,Computer Science,Some books on algorithms are rigorous but inco...


In [10]:
# Column names, non-null counts and dtypes — a quick check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   BookID       10 non-null     int64 
 1   Title        10 non-null     object
 2   Author       10 non-null     object
 3   Genre        10 non-null     object
 4   Description  10 non-null     object
dtypes: int64(1), object(4)
memory usage: 532.0+ bytes


In [11]:
# Summary statistics for the numeric columns (BookID is the only one here)
df.describe()

Unnamed: 0,BookID
count,10.0
mean,86.7
std,4.620005
min,79.0
25%,84.25
50%,87.0
75%,89.75
max,94.0


In [12]:
# (rows, columns) of the loaded table
df.shape

(10, 5)

# Prepare data to create embeddings

In [14]:
# Build one text field per book by joining title, author, genre and description
# with single spaces. Missing values are replaced by '' first, because
# ' '.join() raises TypeError on NaN (a float) and would abort the whole cell.
df['combined_text'] = (
    df[['Title', 'Author', 'Genre', 'Description']]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

df

Unnamed: 0,BookID,Title,Author,Genre,Description,combined_text
0,79,Harry Potter and the Sorcerer's Stone,J.K. Rowling,Fantasy,Harry Potter has no idea how famous he is. Tha...,Harry Potter and the Sorcerer's Stone J.K. Row...
1,81,IT,Stephen King,Horror,A promise made twenty-eight years ago calls se...,IT Stephen King Horror A promise made twenty-e...
2,84,Harry Potter and the chamber of secrets,J.K. Rowling,Fantasy,Ever since Harry Potter had come home for the ...,Harry Potter and the chamber of secrets J.K. ...
3,85,Grokking Algorithms,Aditya Bhargava,Software Engineering,"Grokking Algorithms is a fully illustrated, fr...",Grokking Algorithms Aditya Bhargava Software E...
4,86,Introduction to Algorithms,Thomas H. Cormen,Computer Science,Some books on algorithms are rigorous but inco...,Introduction to Algorithms Thomas H. Cormen Co...
5,88,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,Fantasy,"For twelve long years, the dread fortress of A...",Harry Potter and the Prisoner of Azkaban J.K. ...
6,89,Harry Potter and the Goblet of Fire,J.K. Rowling,Fantasy,Harry Potter is midway through his training as...,Harry Potter and the Goblet of Fire J.K. Rowli...
7,90,Harry Potter and the Order of the Phoenix,J.K. Rowling,Fantasy,Harry Potter is about to start his fifth year ...,Harry Potter and the Order of the Phoenix J.K....
8,91,Harry Potter and the Half-Blood Prince,J.K. Rowling,Fantasy,"As the Harry Potter sequence draws to a close,...",Harry Potter and the Half-Blood Prince J.K. Ro...
9,94,The Fellowship Of The Ring: The Lord of The Rings,J.R.R. Tolkein,Fantasy,"One Ring to rule them all, One Ring to find th...",The Fellowship Of The Ring: The Lord of The Ri...


# Create embeddings matrix for training the model 

In [26]:
# Sentence-embedding model used to vectorise the book text
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the combined text of every book in a single call (one input string per row of df)
embeddings = model.encode(
    list(df['combined_text']),
    show_progress_bar=True,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

# Book recommendation based on textual similarity using a K-Nearest Neighbors (KNN) model

In [27]:
# Cosine distance is used as the similarity measure between text embeddings;
# neighbours are found by brute-force search over the whole matrix.
model_knn = NearestNeighbors(
    n_neighbors=5,
    algorithm='brute',
    metric='cosine',
)
model_knn.fit(embeddings)

# Testing recommendation system model

In [34]:
# Function to make recommendations with case-insensitive and trimmed string matching
def make_recommendations(title, data=df, model=model_knn, embeddings=embeddings, n_recommendations=5):
    """Return the titles of the books most similar to ``title``.

    The title lookup ignores case and leading/trailing whitespace.

    Parameters
    ----------
    title : str
        Title of the book to base the recommendations on.
    data : pandas.DataFrame
        Book table with a 'Title' column. Row ``i`` (by position) must
        correspond to ``embeddings[i]``.
    model : object
        Fitted neighbours model exposing ``kneighbors``; it must have been
        fitted on ``embeddings``.
    embeddings : sequence of vectors
        One embedding per row of ``data``, in the same order.
    n_recommendations : int
        Maximum number of titles to return.

    Returns
    -------
    list of str or str
        The recommended titles, nearest first. If no title matches, the
        string "Book not found in the dataset." is returned instead.

    Notes
    -----
    The defaults for ``data``, ``model`` and ``embeddings`` are bound when this
    cell runs; re-run the cell after rebuilding any of those objects.
    """
    normalized_title = title.strip().lower()

    # Normalise into a standalone Series: the caller's DataFrame must not gain
    # a helper column as a side effect of asking for recommendations.
    normalized_titles = data['Title'].str.strip().str.lower()

    # Work with row *positions*, not index labels: both `embeddings[...]` and
    # `data.iloc[...]` below are positional, so labels would be wrong for any
    # frame without a default 0..n-1 index.
    matching_positions = [
        pos for pos, value in enumerate(normalized_titles) if value == normalized_title
    ]
    if not matching_positions:
        return "Book not found in the dataset."
    book_pos = matching_positions[0]

    # Ask for one extra neighbour because the queried book is itself part of
    # the fitted set, but never more than there are books (kneighbors raises
    # when n_neighbors exceeds the number of fitted samples).
    n_neighbors = min(n_recommendations + 1, len(embeddings))
    indices = model.kneighbors(
        [embeddings[book_pos]],
        n_neighbors=n_neighbors,
        return_distance=False,
    )

    # Drop the queried book by position instead of assuming it is ranked
    # first; with tied distances (e.g. duplicate entries) it may not be.
    closest_books_indices = [
        idx for idx in indices.flatten() if idx != book_pos
    ][:n_recommendations]

    return data.iloc[closest_books_indices]['Title'].tolist()


In [35]:
# Try the function with a title whose casing differs from the dataset entry,
# to exercise the normalized (case-insensitive) matching
title_to_test = 'Harry Potter and the Chamber of Secrets'  # Add book title
recommended_books = make_recommendations(
    title_to_test,
    data=df,
    model=model_knn,
    embeddings=embeddings,
    n_recommendations=3,
)
print("Books recommended:", recommended_books)


Books recommended: ['Harry Potter and the Order of the Phoenix', 'Harry Potter and the Goblet of Fire', "Harry Potter and the Sorcerer's Stone"]


# Save models

In [36]:
# Save the embeddings
# Row i of this matrix was encoded from row i of df, so whatever loads it later
# needs the book table in the same row order to map neighbours back to titles.
joblib.dump(embeddings, 'embeddings.pkl')

# Save the K-Nearest Neighbors model
joblib.dump(model_knn, 'model_knn.pkl')

['model_knn.pkl']