In [1]:
import pandas as pd

# Load dataset

In [2]:
# List the working directory to check that the dataset file is present.
!ls

goodreads_data.csv              readme.md
goodreads_semantic_search.ipynb


In [3]:
# Read the Goodreads CSV into a DataFrame and preview its first rows.
books_csv = "goodreads_data.csv"
df = pd.read_csv(books_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL
0,0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Ki...
1,1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...
2,2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical...",4.28,3944155,https://www.goodreads.com/book/show/1885.Pride...
3,3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp...",4.18,3488438,https://www.goodreads.com/book/show/48855.The_...
4,4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",3.98,3575172,https://www.goodreads.com/book/show/170448.Ani...


In [4]:
# Show the full description text of the row labelled 1.
df.at[1, "Description"]

'Harry Potter thinks he is an ordinary boy - until he is rescued by an owl, taken to Hogwarts School of Witchcraft and Wizardry, learns to play Quidditch and does battle in a deadly duel. The Reason ... HARRY POTTER IS A WIZARD!'

In [5]:
# Number of rows in the dataset.
df.shape[0]

10000

## Data preprocessing

In [6]:
# Is any description missing?
df["Description"].isna().to_numpy().any()

True

In [7]:
# Keep only the rows that have a description. The explicit .copy() makes the
# result an independent frame, so adding a column to it later does not trigger
# pandas' SettingWithCopyWarning. The original row labels are kept on purpose:
# later cells look rows up by label with df.loc[...].
df = df[df["Description"].notna()].copy()

# Sanity check: no missing descriptions should remain.
df["Description"].isnull().values.any()

False

In [8]:
# Confirm that every remaining description is a plain str.
description_types = df["Description"].map(type)
(description_types == str).all()

True

# Embed description

In [9]:
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
_EMBEDDING_MODEL_NAME = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'

# Loaded (tokenizer, model) pairs keyed by model name, so the weights are read
# once instead of on every call.
_ENCODER_CACHE = {}


def _load_encoder(model_name: str):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it on first use only."""
    if model_name not in _ENCODER_CACHE:
        _ENCODER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _ENCODER_CACHE[model_name]


def embbed_text(text: str):
    """Embed ``text`` as the attention-masked mean of the model's token embeddings.

    The tokenizer and model are fetched through ``_load_encoder``, so repeated
    calls (for example one call per DataFrame row) reuse the same loaded
    instances instead of re-instantiating them each time.

    Args:
        text: The text to embed. It is tokenized with padding and truncation
            enabled.

    Returns:
        A tensor with one pooled embedding per input sequence. For a single
        string the result keeps a leading batch dimension of size 1, so index
        ``[0]`` to get the vector itself.
    """
    tokenizer, model = _load_encoder(_EMBEDDING_MODEL_NAME)
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # The first element of the model output holds the per-token embeddings.
    token_embeddings = model_output[0]

    # Broadcast the attention mask over the embedding dimension so that padded
    # positions contribute nothing to the sum.
    input_mask_expanded = (
        encoded_input['attention_mask']
        .unsqueeze(-1)
        .expand(token_embeddings.size())
        .float()
    )
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp the token count to avoid dividing by zero for an all-masked input.
    counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / counts

In [11]:
# Embed one sample description and inspect the shape of the result.
sample_text = 'Harry Potter thinks he is an ordinary boy - until he is rescued by an owl, taken to Hogwarts School of Witchcraft and Wizardry, learns to play Quidditch and does battle in a deadly duel.'
text_vector = embbed_text(sample_text)
text_vector.size()

torch.Size([1, 768])

In [12]:
# Display the embedding of the single sequence in the batch.
text_vector[0, :]

tensor([-2.8036e-02, -3.2336e-02,  1.9003e-01,  2.2420e-01,  5.0261e-01,
         1.1537e-01, -4.0926e-03, -9.2236e-02,  1.1352e-01,  3.4750e-01,
        -2.0338e-02, -5.9704e-03,  1.2111e-01, -1.6532e-01, -6.1809e-02,
        -6.0135e-02, -3.0957e-01, -3.3927e-01,  2.1840e-01, -6.6099e-01,
        -9.4402e-03, -4.6507e-01,  3.5305e-01, -2.5839e-01,  4.3892e-01,
         2.0926e-01, -1.8385e-01,  3.1524e-01,  5.9155e-01, -4.5683e-01,
         4.3781e-02, -2.1923e-01, -1.3661e-02,  3.1315e-01, -8.1621e-02,
         1.1361e-01, -1.6899e-02,  8.3974e-02,  5.0431e-01,  6.4424e-02,
        -1.8838e-01,  1.2939e-02,  5.1717e-02,  1.1867e-01,  8.2663e-02,
        -9.8702e-02, -2.0744e-01,  5.9413e-01,  5.1866e-03, -1.1067e-01,
         2.6276e-01,  4.1311e-02,  1.5343e-01,  3.1478e-02,  6.6940e-02,
         2.1501e-01, -1.0987e-02, -4.5606e-02,  2.0691e-02, -2.8726e-01,
         3.0701e-01,  2.2944e-01, -3.6302e-01, -1.0851e-01, -2.1084e-02,
        -4.6555e-02, -2.3323e-01,  1.5387e-01,  1.1

# Similarity

In [13]:
import numpy as np

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (anything ``np.asarray`` can convert).
        v2: Second vector, with the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of NaN so that the
        result can still be compared and sorted.
    """
    a = np.asarray(v1)
    b = np.asarray(v2)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard against 0/0, which would otherwise produce NaN and a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [14]:
# Embed one book description and a free-text query to compare against it.
harry_potter_description = df.loc[1, "Description"]
harry_potter = embbed_text(harry_potter_description)[0]

query_text = "A normal boy who becomes a wizard"
test_sentence = embbed_text(query_text)[0]

In [15]:
# Similarity between the query and the matching book description.
cosine_similarity(v1=harry_potter, v2=test_sentence)

0.6423878

In [16]:
# Compare the same query against a different book's description.
anne_frank_description = df.loc[3, "Description"]
anne_frank = embbed_text(anne_frank_description)[0]
cosine_similarity(v1=anne_frank, v2=test_sentence)

0.009424428

# Embed data

In [None]:
import time

# Embed every description (one embbed_text call per row) and report how long
# the whole pass took. perf_counter() is a monotonic clock intended for
# measuring intervals, so the reading is not affected by system clock changes.
start_time = time.perf_counter()

# embbed_text returns a batch of one embedding; [0] extracts the vector itself.
df["vectors"] = df["Description"].apply(lambda text: embbed_text(text)[0])

end_time = time.perf_counter()

# Elapsed time in seconds.
print(end_time - start_time)

In [None]:
# Preview the frame, now including the "vectors" column.
df.head()

In [None]:
# Persist the embedded dataset. The "vectors" column holds tensor objects, and
# to_csv would write their printed form ("tensor([...])", rounded to the print
# precision), which is lossy and awkward to parse back. Convert each vector to
# a plain list of floats for the export only; the in-memory df is unchanged.
df.assign(
    vectors=df["vectors"].map(lambda vector: vector.tolist())
).to_csv("embedded_data.csv", encoding='utf-8')

In [None]:
# TODOs
# KNN
# FAISS and other algorithms to speed up the search