In [32]:
import pandas as pd
import numpy as np

In [9]:
dataset = pd.read_csv("Datasets/BooksDatasetClean.csv")

### Simply preprocess for dataset

- Fill in some missing values
- Split all categories into separate columns of categories
- Delete redundant columns

In [23]:
# Create new preprocessed dataset
preprocessed_dataset = dataset.copy()

# Fill missing values
preprocessed_dataset["Description"] = preprocessed_dataset["Description"].fillna(preprocessed_dataset["Category"])
preprocessed_dataset["Description"] = preprocessed_dataset["Description"].fillna(preprocessed_dataset["Title"])
preprocessed_dataset["Category"] = preprocessed_dataset["Category"].fillna("")

# Split categories into a list
preprocessed_dataset['Category_list'] = preprocessed_dataset['Category'].str.split(' , ')
preprocessed_dataset['Category_list'] = preprocessed_dataset['Category_list'].apply(lambda arr: [s.strip() for s in arr])

# Split list category into different columns
exploded = preprocessed_dataset.explode('Category_list')
dummies = pd.get_dummies(exploded['Category_list'])
preprocessed_dataset = preprocessed_dataset.join(dummies.groupby(exploded.index).sum())

# Drop unnecessary columns
preprocessed_dataset.drop(columns=['Authors', 'Category', 'Category_list', 'Publisher',
                                    'Price Starting With ($)', 'Publish Date (Month)', 'Publish Date (Year)'], inplace=True)

# Show new dataset
preprocessed_dataset.head(1)

Unnamed: 0,Title,Description,Unnamed: 3,17th Century,18th Century,19th Century,20th Century,21st Century,A+,ACT,...,XML,Yearbooks & Annuals,Yiddish,Yoga,Young Adult Fiction,Young Adult Nonfiction,Youth,Zen,Zoology,Zoos
0,Goat Brothers,"History , General",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Setup embeddings producer

- Import libraries
- Setup torch and BERT
- Create embeddings producer function

In [30]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel

# Setup torch
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)


# Define function to get text embedding
def produce_text_embedding(text):
    encoding = tokenizer.batch_encode_plus(
        [text],
        padding=True,              
        truncation=True,           
        return_tensors='pt',      
        add_special_tokens=True    
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state  

    sentence_embedding = word_embeddings.mean(dim=1)

    return sentence_embedding.cpu() 


### Function to produce all embeddings
- Never tested
- Probably needs optimization
- Has to save intermediate results
- Has to have ability to start from arbitrary point
- Saving in numpy format has to be provided

In [None]:
def create_new_dataframe(df):
    new_df = pd.DataFrame()
    numerical_cols = df.select_dtypes(include=np.number).columns

    length = df.shape[0]
    for index, row in df.iterrows():
        vector = produce_text_embedding(row['Description']).reshape(768)

        numerical_values = row[numerical_cols].values
        name = row['Title']

        combined_vector = np.concatenate((vector, numerical_values))

        vector_str = ','.join(map(str, combined_vector))

        new_row = pd.DataFrame({'book_embedding': [vector_str], 'name': [name]})
        new_df = pd.concat([new_df, new_row], ignore_index=True)

        print(f'Progress: {index / length:.2%}', end='\r')

    return new_df

### Importing numpy vectors
- Imports numpy matrix. Also such matrix has to be saved by previous method
- Consists of all books in dataframe
- For each vector first element is the book name, others are embedding

In [None]:
dataset = np.load("./Datasets/books_embeddings.npy", allow_pickle= True)

### Get most similar vectors in the dataset

- So far only finds itself. kinda enough

In [59]:
def cosine_similarity(a, b):
    l2a = 0
    l2b = 0
    scalarMultiplication = 0
    for i in range(1, a.shape[0]):
        scalarMultiplication += a[i] * b[i]
        l2a += a[i] ** 2
        l2b += b[i] ** 2

    l2a = np.sqrt(l2a) + 0.0001
    l2b = np.sqrt(l2b) + 0.0001

    return 1 - scalarMultiplication / (l2a * l2b)

def find_closest_records(record, df, n=5):
    record_vector = (record[1:]).reshape(1,-1)
    print("record vector", record_vector)
    data_matrix = df[:, 1:].astype(np.float64)
    all_names = df[:,0]
    similiraties = cosine_similarity(record_vector, data_matrix).flatten()
    distances = 1 - similiraties
    sorted_indeces = np.argsort(distances)[:n]
    names = all_names[sorted_indeces]
    return list(names)

find_closest_records(dataset[0], dataset, n=10)

record vector [[0.012158381752669811 0.12222578376531601 -0.3661309778690338 ... 0 0 0]]


['Goat Brothers']

In [None]:
new_df['book_embedding'].shape

In [None]:
find_closest_records(load_df.iloc[0], load_df, n=10)