In [1]:
import pandas as pd

# Load dataset

In [2]:
!ls

embedded_data.csv               goodreads_semantic_search.ipynb
goodreads_data.csv              readme.md


In [3]:
df = pd.read_csv("goodreads_data.csv", index_col=0)
df.head(5)

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Ki...
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical...",4.28,3944155,https://www.goodreads.com/book/show/1885.Pride...
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp...",4.18,3488438,https://www.goodreads.com/book/show/48855.The_...
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",3.98,3575172,https://www.goodreads.com/book/show/170448.Ani...


In [4]:
df.loc[1, "Description"]

'Harry Potter thinks he is an ordinary boy - until he is rescued by an owl, taken to Hogwarts School of Witchcraft and Wizardry, learns to play Quidditch and does battle in a deadly duel. The Reason ... HARRY POTTER IS A WIZARD!'

In [5]:
len(df)

10000

## data preprocessing

In [6]:
df["Description"].isnull().values.any()

True

In [7]:
df = df[~df["Description"].isnull().values]
df["Description"].isnull().values.any()

False

In [8]:
df["Description"].map(type).eq(str).all()

True

# Embed description

In [9]:
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def embed_text(text: str):
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    # model_id ="sentence-transformers/paraphrase-xlm-r-multilingual-v1"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    
    model = AutoModel.from_pretrained(model_id)
    
    with torch.no_grad():
        model_output = model(**encoded_input)
        
    token_embeddings = model_output[0]
    input_mask_expanded = encoded_input['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [11]:
text_vector = embed_text('Harry Potter thinks he is an ordinary boy - until he is rescued by an owl, taken to Hogwarts School of Witchcraft and Wizardry, learns to play Quidditch and does battle in a deadly duel.')
text_vector.shape

torch.Size([1, 384])

In [12]:
text_vector[0]

tensor([ 4.2993e-02,  1.9790e-01,  9.8992e-02, -1.5816e-01, -1.6097e-01,
        -3.2356e-01,  4.0950e-01, -1.5431e-01, -1.2849e-01, -1.2688e-01,
         2.5054e-02, -1.8622e-01, -4.0656e-01,  3.0986e-01,  1.5803e-01,
        -1.2521e-01,  1.2770e-01, -8.3968e-02, -1.3942e-01,  1.1383e-01,
        -1.8851e-01,  2.3556e-01,  1.4153e-01, -1.5108e-01, -3.2936e-02,
        -1.5616e-01,  1.9694e-01,  2.2613e-02, -1.0271e-01,  6.3527e-02,
         7.5810e-02, -1.2322e-01, -2.8831e-01, -1.1784e-01, -2.6006e-01,
         7.0224e-02,  1.2808e-01,  3.0344e-01,  2.8782e-01, -1.3410e-01,
        -2.5474e-01, -1.0768e-01, -4.1185e-01,  4.6213e-02, -7.5831e-02,
         2.4211e-03, -2.2581e-01, -1.1629e-01, -5.5889e-02,  3.4285e-02,
        -1.7312e-01,  4.3307e-02, -1.7202e-01,  9.9613e-02,  2.1858e-01,
         1.1413e-01,  2.4236e-01,  3.0370e-02,  2.3173e-01,  1.0230e-01,
        -1.5516e-01,  9.7178e-02, -9.6926e-02,  2.7740e-01, -4.7354e-02,
        -1.4012e-01, -1.1380e-01, -7.5994e-02,  1.1

# Similarity

In [13]:
import numpy as np

def cosine_similarity(v1, v2):
    return np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

In [14]:
harry_potter = embed_text(df.loc[1, "Description"])[0]
test_sentence = embed_text("A normal boy who becomes a wizard")[0]

In [15]:
cosine_similarity(harry_potter, test_sentence)

0.63204676

In [16]:
anne_frank = embed_text(df.loc[3, "Description"])[0]
cosine_similarity(anne_frank, test_sentence)

0.08396207

# Embed data

In [17]:
import time
start_time = time.time()

test_data = df.iloc[:100]
test_data["vector"] = test_data["Description"].apply(lambda text: embed_text(text)[0])

end_time = time.time()

print(end_time - start_time)

54.720436811447144


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["vector"] = test_data["Description"].apply(lambda text: embed_text(text)[0])


In [18]:
test_data.head(5)

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL,vector
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Ki...,"[tensor(0.0377), tensor(0.0389), tensor(-0.065..."
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...,"[tensor(0.0345), tensor(0.1521), tensor(0.1143..."
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical...",4.28,3944155,https://www.goodreads.com/book/show/1885.Pride...,"[tensor(-0.1175), tensor(-0.1331), tensor(0.07..."
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp...",4.18,3488438,https://www.goodreads.com/book/show/48855.The_...,"[tensor(0.0041), tensor(0.1269), tensor(-0.008..."
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",3.98,3575172,https://www.goodreads.com/book/show/170448.Ani...,"[tensor(-0.0434), tensor(0.0593), tensor(-0.06..."


# Search similarity

In [19]:
request = embed_text("Boy who became a wizard.")[0]

In [23]:
start_time = time.time()

# calcualte embedding of possible request
request = embed_text("Boy who became a wizard.")[0]

end_request = time.time()

# calucalte similarity scores between request and book description
test_data["score"] = test_data["vector"].apply(lambda vec: cosine_similarity(request, vec))
test_data.sort_values("score", ascending=False, inplace=True)

end_time = time.time()

print(f"Embed Request: {end_request - start_time} seconds")
print(f"Response: {end_time - end_request} seconds")

Embed Request: 0.5208179950714111 seconds
Response: 0.009649038314819336 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["score"] = test_data["vector"].apply(lambda vec: cosine_similarity(request, vec))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.sort_values("score", ascending=False, inplace=True)


In [24]:
test_data

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL,vector,score
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...,"[tensor(0.0345), tensor(0.1521), tensor(0.1143...",0.595246
17,"The Giver (The Giver, #1)",Lois Lowry,"At the age of twelve, Jonas, a young boy from ...","['Young Adult', 'Fiction', 'Classics', 'Dystop...",4.12,2285401,https://www.goodreads.com/book/show/3636.The_G...,"[tensor(-0.0335), tensor(0.3690), tensor(-0.16...",0.423278
84,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,Ever since Harry Potter had come home for the ...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.43,3597747,https://www.goodreads.com/book/show/15881.Harr...,"[tensor(-0.0252), tensor(0.0513), tensor(0.096...",0.364060
37,The Adventures of Huckleberry Finn,Mark Twain,A nineteenth-century boy from a Mississippi Ri...,"['Classics', 'Fiction', 'Historical Fiction', ...",3.83,1237071,https://www.goodreads.com/book/show/2956.The_A...,"[tensor(-0.3741), tensor(0.2123), tensor(-0.08...",0.357276
58,Where the Wild Things Are,Maurice Sendak,"Max, a wild and naughty boy, is sent to bed wi...","['Childrens', 'Picture Books', 'Fiction', 'Cla...",4.24,963323,https://www.goodreads.com/book/show/19543.Wher...,"[tensor(0.1523), tensor(0.2483), tensor(-0.081...",0.338852
...,...,...,...,...,...,...,...,...,...
21,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,"Could you survive on your own in the wild, wit...","['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",4.33,7963002,https://www.goodreads.com/book/show/2767052-th...,"[tensor(0.1271), tensor(0.0620), tensor(0.0228...",0.046601
48,A Christmas Carol,Charles Dickens,"'If I had my way, every idiot who goes around ...","['Classics', 'Fiction', 'Christmas', 'Fantasy'...",4.07,779092,https://www.goodreads.com/book/show/5326.A_Chr...,"[tensor(-0.0177), tensor(0.1255), tensor(0.021...",0.019640
52,The Grapes of Wrath,John Steinbeck,The Pulitzer Prize-winning epic of the Great D...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.00,864334,https://www.goodreads.com/book/show/18114322-t...,"[tensor(0.0600), tensor(0.0772), tensor(-0.063...",0.016832
39,Les Misérables,Victor Hugo,"Victor Hugo's tale of injustice, heroism and l...","['Classics', 'Fiction', 'Historical Fiction', ...",4.20,769757,https://www.goodreads.com/book/show/24280.Les_...,"[tensor(-0.1524), tensor(0.0338), tensor(0.026...",-0.001694


In [22]:
# TODOs
# KNN
# FAISS und andere algorithmen zum Beschleungigen (-> App mit milvus schneller)
# Vector size
# Welche similarity function bei welcher vector size