In [1]:
import pandas as pd 
from langchain.vectorstores import FAISS
import re
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
df = pd.read_csv("./data/book_data.csv").drop_duplicates(subset=["book_desc"])

In [3]:
df

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9.78044E+12,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,Paperback,9.78044E+12,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...
2,Harper Lee,The unforgettable novel of a childhood in a sl...,50th Anniversary,Paperback,9.78006E+12,324 pages,4.27,3745197,79450,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,«È cosa ormai risaputa che a uno scapolo in po...,"Modern Library Classics, USA / CAN",Paperback,9.78068E+12,279 pages,4.25,2453620,54322,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...
4,Stephenie Meyer,About three things I was absolutely positive.F...,,Paperback,9.78032E+12,498 pages,3.58,4281268,97991,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...
...,...,...,...,...,...,...,...,...,...,...,...,...
54295,Avi Steinberg,Avi Steinberg is stumped. After defecting from...,,Hardcover,9.78039E+12,399 pages,3.51,3717,661,Running the Books: The Adventures of an Accide...,Nonfiction|Autobiography|Memoir|Biography|Writ...,https://images.gr-assets.com/books/1320533033l...
54296,Howard Megdal,"In this fearless and half-crazy story, Howard ...",,Hardcover,9.78161E+12,256 pages,3.37,27,9,Taking the Field: A Fan's Quest to Run the Tea...,Sports|Baseball|Sports and Games|Sports|Nonfic...,https://images.gr-assets.com/books/1312074392l...
54297,Howard Megdal,From the icons of the game to the players who ...,,Hardcover,9.78006E+12,256 pages,3.97,34,5,"The Baseball Talmud: Koufax, Greenberg, and th...",Nonfiction|Sports and Games|Sports,https://images.gr-assets.com/books/1348841629l...
54299,Mimi Baird|Eve Claxton,"Soon to be a major motion picture, from Brad P...",,Hardcover,9.7808E+12,272 pages,3.82,867,187,He Wanted the Moon: The Madness and Medical Ge...,Nonfiction|Autobiography|Memoir|Biography|Psyc...,https://images.gr-assets.com/books/1403192135l...


In [4]:
df = df[df["book_desc"].astype(str).apply(len) < 1500]

In [5]:
df["book_desc"] = df["book_desc"].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["book_desc"] = df["book_desc"].fillna('')


# Разобьем документ на чанки

In [6]:
docstring = ' <s><s> '.join(df["book_desc"].values[:300])

In [7]:
def document_processing(data: str) -> str:
    return re.sub(r'[\r\n\t\f\v]+', '', data)

In [8]:
document = document_processing(docstring)

In [9]:
from langchain.text_splitter import CharacterTextSplitter

source_chunks = []
splitter = CharacterTextSplitter(        
    separator = " <s><s> ",
    chunk_size = 800,
    chunk_overlap  = 200,
    length_function = len,
)
for chunk in splitter.split_text(document):
    source_chunks.append(Document(page_content=chunk, metadata={}))

# всего получилось чанков:
len(source_chunks)

Created a chunk of size 1121, which is longer than the specified 800
Created a chunk of size 818, which is longer than the specified 800
Created a chunk of size 814, which is longer than the specified 800
Created a chunk of size 1141, which is longer than the specified 800
Created a chunk of size 1089, which is longer than the specified 800
Created a chunk of size 1116, which is longer than the specified 800
Created a chunk of size 951, which is longer than the specified 800
Created a chunk of size 1368, which is longer than the specified 800
Created a chunk of size 1038, which is longer than the specified 800
Created a chunk of size 1016, which is longer than the specified 800
Created a chunk of size 1394, which is longer than the specified 800
Created a chunk of size 948, which is longer than the specified 800
Created a chunk of size 1479, which is longer than the specified 800
Created a chunk of size 982, which is longer than the specified 800
Created a chunk of size 916, which is l

288

In [10]:
# source_chunks = []
# # splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)

# for chunk in document.split(" <s><s> "):
#     source_chunks.append(Document(page_content=chunk, metadata={}))

# # всего получилось чанков:
# len(source_chunks)

# Инициализируем модель для эмбеддингов

In [12]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('sergeyzh/rubert-tiny-turbo')

# sentences = ["привет мир", "hello world", "здравствуй вселенная"]
# embeddings = model.encode(sentences)
# print(util.dot_score(embeddings, embeddings))

  from tqdm.autonotebook import tqdm, trange


In [13]:
embeddings = HuggingFaceEmbeddings(
    model_name='sergeyzh/rubert-tiny-turbo',  # distilbert-base-uncased  sergeyzh/rubert-tiny-turbo
    # multi_process=True,
    encode_kwargs={"normalize_embeddings": True}
)


# Инициализируем векторную базу Faiss

In [14]:
db = FAISS.from_documents(source_chunks, embeddings)

In [15]:
import joblib

joblib.dump(db, 'models/vector_db.pkl')

['models/vector_db.pkl']

In [16]:
def get_simillar_book(book_desc: str, index_db: FAISS, n_chunks: int = 5) -> str:
    # Поиск релевантных отрезков из базы знаний
    docs = index_db.similarity_search(book_desc, k=n_chunks)

    # message_content = re.sub(r'\n{2}', ' ', '\n '.join([f'\nОтрывок документа №{i+1}\n=====================' + doc.page_content + '\n' for i, doc in enumerate(docs)]))

    return docs

# Получим похожие книги на 1ю в датасете

In [17]:
book_desc = df["book_desc"].values[0]

In [18]:
message_content = get_simillar_book(book_desc=book_desc, index_db=db, n_chunks=6)

In [19]:
descs = [desc.page_content for desc in message_content]

In [20]:
df[df["book_desc"].isin(descs)][["book_authors", "book_title"]]

Unnamed: 0,book_authors,book_title
0,Suzanne Collins,The Hunger Games
19,Veronica Roth,Divergent
77,Louisa May Alcott,Little Women
153,Veronica Roth,Insurgent
262,Suzanne Collins,Catching Fire
292,Kristin Cashore,Graceling


# Qdrant исследование: