# Dependencies

In [85]:
# imports
import pandas as pd 
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import joblib

# Load data

In [86]:
df = pd.read_csv('books_data.csv')
df.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 
 4   previewLink    188568 non-null  object 
 5   publisher      136518 non-null  object 
 6   publishedDate  187099 non-null  object 
 7   infoLink       188568 non-null  object 
 8   categories     171205 non-null  object 
 9   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(9)
memory usage: 16.2+ MB


In [88]:
df.describe()

Unnamed: 0,ratingsCount
count,49752.0
mean,21.252975
std,201.340431
min,1.0
25%,1.0
50%,2.0
75%,5.0
max,4895.0


In [89]:
df.shape

(212404, 10)

# Cleaning data

In [90]:
# Dropping unnecessary columns
df = df.drop(['image', 'previewLink', 'publisher', 'publishedDate', 'infoLink', 'ratingsCount'], axis=1)

# Dropping all NAN values
df_cleaned = df.dropna(subset=['description', 'categories', 'Title', 'authors'])

# Rename categories to genre to better fit project data
df_cleaned = df_cleaned.rename(columns={'categories': 'genre'})

# Regex to remove unwanted characters
df_cleaned['authors'] = df_cleaned['authors'].str.replace("[\[\]']", "", regex=True)
df_cleaned['genre'] = df_cleaned['genre'].str.replace("[\[\]']", "", regex=True)

# Concatenate text into a new column
df_cleaned['combined_text'] = df_cleaned.apply(lambda row: ' '.join([
    row['Title'], 
    row['authors'], 
    row['genre'], 
    row['description']
]), axis=1)

df_cleaned


Unnamed: 0,Title,description,authors,genre
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,Philip Nel,Biography & Autobiography
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,David R. Ray,Religion
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,Veronica Haddon,Fiction
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,Everett Ferguson,Religion
8,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,Mary Fabyan Windeatt,Biography & Autobiography
...,...,...,...,...
212394,Final things,Grace's father believes in science and builds ...,Jenny Offill,Fiction
212397,The Magic of the Soul: Applying Spiritual Powe...,"""The Magic of the Soul, Applying Spiritual Pow...",Patrick J. Harbula,"Body, Mind & Spirit"
212398,Autodesk Inventor 10 Essentials Plus,Autodesk Inventor 2017 Essentials Plus provide...,"Daniel Banach, Travis Jones",Computers
212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",Elvira Woodruff,Juvenile Fiction


# Create embeddings matrix for training the model 

In [91]:
# Load the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the concatenated text
embeddings = model.encode(df_cleaned['combined_text'].tolist(), show_progress_bar=True)

Shape of the TF-IDF Matrix: (136138, 10000)


# Book recomendation based off textual similarity using K-Nearest Neighbors (KNN) model 

In [None]:
# Using 'cosine' for metric for text similarity
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(embeddings)

# Testing recommendation system model

In [94]:
# Function to to make recommendations
def make_recommendations(title, data=df_cleaned, model=model_knn, embeddings=embeddings, n_recommendations=5):
    # Find the index of the book in the dataframe
    book_idx = data.index[data['Title'] == title].tolist()[0]

    # Use the KNN model to find the n_recommendations closest books
    distances, indices = model.kneighbors([embeddings[book_idx]], n_neighbors=n_recommendations + 1)

    # Get the indices of the closest books (excluding the first one, which is the book itself)
    closest_books_indices = indices.flatten()[1:]

    # Return the titles of the closest books
    recommended_titles = data.iloc[closest_books_indices]['Title'].tolist()
    return recommended_titles


In [97]:
# Test book recommendations from dataset
recommended_books = make_recommendations('HERE COMES ALEX PUMPERNICKEL!', df_cleaned, model_knn, embeddings, 5)
print("Books recommended:", recommended_books)


Books recommended: ['Distemper (Alex Bernier Mysteries)', 'Bad Seed: An Alex Bernier Mystery', 'The Pandora Key (Alex Shanahan)', 'Texas Crude', 'I Wanna Iguana']


# Save models

In [None]:
# Save the embeddings
joblib.dump(embeddings, 'embeddings.pkl')

# Save the K-Nearest Neighbors model
joblib.dump(model_knn, 'model_knn.pkl')