In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("BooksDataset.csv")
df_clean = pd.read_csv("BooksDatasetClean.csv")

In [4]:
df.shape, df_clean.shape

((103082, 7), (103063, 8))

In [5]:
df.columns, df_clean.columns

(Index(['Title', 'Authors', 'Description', 'Category', 'Publisher',
        'Publish Date', 'Price'],
       dtype='object'),
 Index(['Title', 'Authors', 'Description', 'Category', 'Publisher',
        'Price Starting With ($)', 'Publish Date (Month)',
        'Publish Date (Year)'],
       dtype='object'))

In [6]:
df.describe()

Unnamed: 0,Title,Authors,Description,Category,Publisher,Publish Date,Price
count,103082,103082,70213,76912,103074,103082,103082
unique,97818,63580,68831,3106,13029,956,1387
top,The Nutcracker,By,For Ingest Only - Data needs to be cleaned up ...,"Fiction , General",Simon & Schuster,"Thursday, January 1, 2004",Price Starting at $5.29
freq,12,1043,30,2549,1521,868,41876


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103082 entries, 0 to 103081
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Title         103082 non-null  object
 1   Authors       103082 non-null  object
 2   Description   70213 non-null   object
 3   Category      76912 non-null   object
 4   Publisher     103074 non-null  object
 5   Publish Date  103082 non-null  object
 6   Price         103082 non-null  object
dtypes: object(7)
memory usage: 5.5+ MB


In [8]:
df["Category"].value_counts()

Category
Fiction , General                                                            2549
Fiction , Literary                                                           1709
Fiction , Mystery & Detective , General                                      1690
Fiction , Thrillers , General                                                1115
Fiction , Romance , Contemporary                                             1074
                                                                             ... 
Juvenile Fiction , Concepts , Money                                             1
Juvenile Nonfiction , People & Places , Caribbean & Latin America               1
Fiction , Thrillers , Supernatural                                              1
Mathematics , Counting & Numeration                                             1
Young Adult Nonfiction , Biography & Autobiography , Science & Technology       1
Name: count, Length: 3106, dtype: int64

In [12]:
wdf = df_clean.copy()
wdf["Description"] = wdf["Description"].fillna(wdf["Category"])
wdf["Description"] = wdf["Description"].fillna(wdf["Title"])
wdf["Category"] = wdf["Category"].fillna("")

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer

wdf['Category_list'] = wdf['Category'].str.split(' , ')
wdf['Category_list'] = wdf['Category_list'].apply(lambda arr: [s.strip() for s in arr])


In [14]:
wdf.head()

Unnamed: 0,Title,Authors,Description,Category,Publisher,Price Starting With ($),Publish Date (Month),Publish Date (Year),Category_list
0,Goat Brothers,"By Colton, Larry","History , General","History , General",Doubleday,8.79,January,1993,"[History, General]"
1,The Missing Person,"By Grumbach, Doris","Fiction , General","Fiction , General",Putnam Pub Group,4.99,March,1981,"[Fiction, General]"
2,Don't Eat Your Heart Out Cookbook,"By Piscatella, Joseph C.","Cooking , Reference","Cooking , Reference",Workman Pub Co,4.99,September,1983,"[Cooking, Reference]"
3,When Your Corporate Umbrella Begins to Leak: A...,"By Davis, Paul D.",When Your Corporate Umbrella Begins to Leak: A...,,Natl Pr Books,4.99,April,1991,[]
4,Amy Spangler's Breastfeeding : A Parent's Guide,"By Spangler, Amy",Amy Spangler's Breastfeeding : A Parent's Guide,,Amy Spangler,5.32,February,1997,[]


In [15]:

mlb = MultiLabelBinarizer()
encoded_categories = mlb.fit_transform(wdf['Category_list'])

encoded_df = pd.DataFrame(encoded_categories, columns=mlb.classes_)

wdf = pd.concat([wdf, encoded_df], axis=1)


In [16]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


# Set a random seed
random_seed = 42
random.seed(random_seed)

# Set a random seed for PyTorch (for GPU as well)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)


# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')




In [17]:


def product_text_embedding(text):
    encoding = tokenizer.batch_encode_plus( [text],
        padding=True,              
        truncation=True,           
        return_tensors='pt',      
        add_special_tokens=True    
    )

    input_ids = encoding['input_ids'] 
    attention_mask = encoding['attention_mask']

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state 

    sentence_embedding = word_embeddings.mean(dim=1)
    return sentence_embedding


text = "Check working"


product_text_embedding(text).shape

torch.Size([1, 768])

In [18]:
exp_df = wdf.copy()

exp_df = exp_df.drop(columns=['Authors', 'Category', 'Category_list', 'Publisher', 'Price Starting With ($)', 'Publish Date (Month)', 'Publish Date (Year)'])

exp_df.head()

Unnamed: 0,Title,Description,Unnamed: 3,17th Century,18th Century,19th Century,20th Century,21st Century,A+,ACT,...,XML,Yearbooks & Annuals,Yiddish,Yoga,Young Adult Fiction,Young Adult Nonfiction,Youth,Zen,Zoology,Zoos
0,Goat Brothers,"History , General",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Missing Person,"Fiction , General",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Don't Eat Your Heart Out Cookbook,"Cooking , Reference",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,When Your Corporate Umbrella Begins to Leak: A...,When Your Corporate Umbrella Begins to Leak: A...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Amy Spangler's Breastfeeding : A Parent's Guide,Amy Spangler's Breastfeeding : A Parent's Guide,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def create_new_dataframe(df):
  new_df = pd.DataFrame()
  numerical_cols = df.select_dtypes(include=np.number).columns

  length = df.shape[0]
  for index, row in df.iterrows():
    vector = product_text_embedding(row['Description']).reshape(768)
    numerical_values = row[numerical_cols].values
    name = row['Title']  

    combined_vector = np.concatenate((vector, numerical_values))  

    new_df = pd.concat([new_df, pd.DataFrame({'book_embedding': [combined_vector], 
                                            'name': [name]})], ignore_index=True)
    
    print(index/length, end='\r')

  return new_df

def find_closest_records(record, new_df, n=5):
  record_vector = record['book_embedding']

  distances = []
  names = []

  length = new_df.shape[0]
  for index, row in new_df.iterrows():
    other_vector = row['book_embedding']

    similarity = cosine_similarity(record_vector.reshape(1, -1), other_vector.reshape(1, -1))[0][0]

    distance = 1 - similarity

    distances.append(distance)
    names.append(row['name'])
    
    print(index/length, end='\r')

  sorted_indices = sorted(range(len(distances)), key=lambda k: distances[k])
  
  closest_names = [names[i] for i in sorted_indices[:n]]
  closest_distances = [distances[i] for i in sorted_indices[:n]]

  return closest_names, closest_distances 

In [40]:
new_df = create_new_dataframe(exp_df)

0.99999029719686025657

In [47]:
new_df['book_embedding'].shape

(103063,)

In [49]:
2771 * 103063 / 1000000

285.587573

In [51]:
new_df.to_csv('new_df.csv', index=False)
new_df.to_csv('book_embeddings.csv', index=True)

In [67]:
new_df.iloc[9809]

book_embedding    [-0.33744266629219055, 0.36259767413139343, 0....
name                                         Writing Was Everything
Name: 9809, dtype: object

In [68]:
find_closest_records(new_df.iloc[9809], new_df, n=10)

0.99999029719686025657

(['Writing Was Everything',
  'Readings',
  'Living by Fiction',
  "The Style's the Man: Reflections on Proust, Fitzgerald, Wharton, Vidal, and Others",
  'Jafsie and John Henry: Essays',
  'Walt Whitman: The Song of Himself',
  'Penchants and Places: Essays and Criticism',
  "Why Kerouac Matters: The Lessons of On the Road (They're Not What You Think)",
  'An American Procession',
  'The Best American Spiritual Writing 2006 (The Best American Series)'],
 [-8.881784197001252e-16,
  0.04478843953272027,
  0.04814461994689079,
  0.05072595540718339,
  0.05231547279624038,
  0.06466600989494808,
  0.06471385790572293,
  0.06644086051241072,
  0.0666229453083158,
  0.06724310170713488])

In [38]:
exp_df.shape

(103063, 2005)