In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize, MinMaxScaler
import pickle
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model, RagTokenizer, RagRetriever, RagTokenForGeneration, RagModel
from tqdm import tqdm
import re

In [98]:
# Load in csv dataset
input_dataset = pd.read_csv(f"datasets/7k/books.csv")
input_dataset

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9.780000e+12,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9.780000e+12,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9.780010e+12,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9.780010e+12,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9.780010e+12,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6805,9.788190e+12,8185300534,I Am that,Talks with Sri Nisargadatta Maharaj,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0
6806,9.788190e+12,8185944601,Secrets Of The Heart,,Khalil Gibran,Mysticism,http://books.google.com/books/content?id=XcrVp...,,1993.0,4.08,74.0,324.0
6807,9.788450e+12,8445074873,Fahrenheit 451,,Ray Bradbury,Book burning,,,2004.0,3.98,186.0,5733.0
6808,9.789030e+12,9027712050,The Berlin Phenomenology,,Georg Wilhelm Friedrich Hegel,History,http://books.google.com/books/content?id=Vy7Sk...,Since the three volume edition ofHegel's Philo...,1981.0,0.00,210.0,0.0


In [99]:
data = deepcopy(input_dataset)

# Drop missing titled books
data = data.dropna(subset=["title", "categories", "description"])

# Drop unnecessary columns
drop_columns = ["isbn13", "isbn10", "thumbnail"]
data = data.drop(columns=drop_columns)

# Preprocess text-based columns
text_columns = ["title", "subtitle", "authors", "categories", "description"]
data[text_columns] = data[text_columns].fillna("")
for feature in text_columns:
    data[feature] = data[feature].str.lower()
# Remove punctuation
data[text_columns] = data[text_columns].replace(r"[^\w\s]", "", regex=True)
# Remove any excess spaces
data[text_columns] = data[text_columns].replace(r"\s+", " ", regex=True)

# Preprocess numerical columns
numerical_columns = ["published_year", "average_rating", "num_pages", "ratings_count"]
data[numerical_columns] = data[numerical_columns].fillna(0)
scaler = MinMaxScaler()
data[numerical_columns] = data[numerical_columns].fillna(0)
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

data

Unnamed: 0,title,subtitle,authors,categories,description,published_year,average_rating,num_pages,ratings_count
0,gilead,,marilynne robinson,fiction,a novel that readers and critics have been eag...,0.992571,0.770,0.073908,0.000064
1,spiders web,a novel,charles osborneagatha christie,detective and mystery stories,a new christie for christmas a fulllength nove...,0.990589,0.766,0.072113,0.000917
2,the one tree,,stephen r donaldson,american fiction,volume two of stephen donaldsons acclaimed sec...,0.981674,0.794,0.143327,0.000031
3,rage of angels,,sidney sheldon,fiction,a memorable mesmerizing heroine jennifer brill...,0.987122,0.786,0.153202,0.005246
4,the four loves,,clive staples lewis,christian life,lewis work on the nature of love divides love ...,0.991580,0.830,0.050868,0.005983
...,...,...,...,...,...,...,...,...,...
6803,journey to the east,,hermann hesse,adventure stories,this book tells the tale of a man who goes on ...,0.991580,0.740,0.052364,0.000004
6804,the monk who sold his ferrari a fable about fu...,,robin sharma,health fitness,wisdom to create a life of passion purpose and...,0.992075,0.764,0.059246,0.000279
6805,i am that,talks with sri nisargadatta maharaj,sri nisargadatta maharajsudhakar s dikshit,philosophy,this collection of the timeless teachings of o...,0.990094,0.902,0.158887,0.000018
6808,the berlin phenomenology,,georg wilhelm friedrich hegel,history,since the three volume edition ofhegels philos...,0.981179,0.000,0.062837,0.000000


In [100]:
# Load pre-trained BERT model and tokenizer
text_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
text_tokenizer.pad_token = text_tokenizer.eos_token
text_model = GPT2Model.from_pretrained("gpt2")

In [101]:
# Function to encode text
def encode_text(input_text):
    inputs = text_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = text_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

In [102]:
# Encode text features
combined_text = data[text_columns].agg(" ".join, axis=1)
text_embeddings = np.array([encode_text(text) for text in tqdm(combined_text)])


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [None]:
# # Encode numerical features
# numerical_embeddings = data[numerical_columns].values

In [None]:
# Calculate similarity scores for text features
def calculate_text_similarity(input_text):
    input_embedding = encode_text(input_text).squeeze()  # Remove extra dimension
    text_embeddings_reshaped = text_embeddings.reshape(text_embeddings.shape[0], -1)  # Flatten text_embeddings
    similarities = cosine_similarity(input_embedding.reshape(1, -1), text_embeddings_reshaped)
    return similarities.flatten()

In [None]:
# Recommend books based on input data
def recommend_books(input_text, num_books=5):
    text_similarities = calculate_text_similarity(input_text)
    top_indices = text_similarities.argsort()[-num_books:][::-1]
    recommended_books = data.iloc[top_indices]
    return recommended_books

In [None]:
# Example usage
user_input_text = "I'm interested in programming, and want to learn more about Python. Maybe even some data science and machine learning."
recommended_books = recommend_books(user_input_text)

final_result = input_dataset.loc[recommended_books.index, ["title", "authors", "categories", "description", "published_year", "average_rating", "num_pages"]]
final_result

Unnamed: 0,title,authors,categories,description,published_year,average_rating,num_pages
2535,Introduction to Phenomenology,Dermot Moran,Philosophy,The book should be of interest to all students...,2000.0,4.25,592.0
1326,More Than You Know,Michael J. Mauboussin,Business & Economics,Mauboussin mines disciplines that are not norm...,2006.0,4.08,268.0
3317,"Philosophical Papers: Volume 1, Human Agency a...",Charles Taylor,Philosophy,Philosophical Papers will interest a very wide...,1985.0,4.31,304.0
6412,The Nature of Play,Anthony D. Pellegrini;Peter K. Smith,Psychology,"""Comprehensive and up to date, this tightly ed...",2005.0,4.25,308.0
6364,Crash Course in Web Design for Libraries,Charles P. Rubenstein,Computers,A handbook offers guidance in developing profe...,2007.0,2.67,196.0


In [None]:
# consolidated_model = {"text_tokenizer": tokenizer, "text_model": model}
# with open("model.pkl", "wb") as f:
#     pickle.dump(consolidated_model, f)