In [1]:
import pickle
import re

import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load the prepared movie table; the path is relative to the notebook's folder.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index from an earlier to_csv — confirm whether it should be dropped.
movies_df = pd.read_csv(filepath_or_buffer='./../data/movies_data.csv')
movies_df.head(n=5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,PlotSummary,movie_text
0,1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,https://en.wikipedia.org/wiki/Daniel_Boone_(19...,Boone's daughter befriends an Indian maiden as...,"Boone's cabin is attacked by the Indians, who ...",Title: Daniel Boone.\nRelease Year: 1907.\nDir...
1,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...,A father and mother take their daughter Dollie...,Title: The Adventures of Dollie.\nRelease Year...
2,1908,The Black Viper,American,D. W. Griffith,D. W. Griffith,drama,https://en.wikipedia.org/wiki/The_Black_Viper,A thug accosts a girl as she leaves her workpl...,A thug accosts a girl as she leaves her workpl...,Title: The Black Viper.\nRelease Year: 1908.\n...
3,1908,The Call of the Wild,American,D. W. Griffith,Charles Inslee,adventure,https://en.wikipedia.org/wiki/The_Call_of_the_...,A white girl (Florence Lawrence) rejects a pro...,Florence Lawrence was already the most popular...,Title: The Call of the Wild.\nRelease Year: 19...
4,1908,A Christmas Carol,American,Unknown,Tom Ricketts,drama,https://en.wikipedia.org/wiki/A_Christmas_Caro...,No prints of the first American film adaptatio...,No prints of the first American film adaptatio...,Title: A Christmas Carol.\nRelease Year: 1908....


In [4]:
# Character-based splitter: chunk length is measured with len() and capped at
# 300, with a 30-character overlap setting between neighbouring chunks.
# Separators are listed coarsest-first: paragraph, line, sentence, word, char.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_size=300,
    chunk_overlap=30,
)

In [5]:
# `chunks` collects every text chunk across all movies; `chunk_source` is a
# parallel list holding, for each chunk, the movies_df index label of the movie
# it was cut from.
chunks, chunk_source = [], []

for movie_idx, movie_text in movies_df['movie_text'].items():
    pieces = splitter.split_text(movie_text)
    chunks += pieces
    # One source entry per chunk keeps the two lists aligned position-by-position.
    chunk_source += [movie_idx] * len(pieces)

print(f"Total chunks: {len(chunks)}")

Total chunks: 76335


In [6]:
# Embedding model used for both the chunks and the queries.
# Alternative checkpoint considered: multi-qa-mpnet-base-dot-v1
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Embed every chunk; row i of `embeddings` corresponds to chunks[i].
# batch_size=32 is the library default, spelled out here because it determines
# the batch count shown by the progress bar.
embeddings = model.encode(
    chunks,
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/2386 [00:00<?, ?it/s]

In [8]:
# Sanity check: shape of a single chunk embedding (its length is the vector size).
embeddings[0].shape

(768,)

In [9]:
# Build a flat L2 index whose vector width matches the embeddings, then load
# all chunk vectors into it in their original order.
_, dimension = embeddings.shape
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [10]:
# Persist the populated index next to the source data so it can be reloaded
# without re-embedding the chunks.
faiss.write_index(
    index,
    "./../data/movies_info.index",
)


In [49]:
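# To reuse the saved index in a fresh session instead of re-embedding,
# load it from the same path it was written to:
# index = faiss.read_index("./../data/movies_info.index")
# index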

In [11]:
# Per-chunk metadata stored as parallel sequences: position i of every entry
# describes chunks[i], in the same order the chunk embeddings were added to
# the index.
#
# `chunk_source` holds movies_df index *labels* (the first element yielded by
# iterating the frame), so the lookups below are label-based (.loc). Positional
# .iloc only gives the same rows while movies_df keeps its default 0..n-1
# index and would silently pick wrong movies otherwise.
chunk_metadata = {
    'chunks': chunks,
    'source_index': chunk_source,
    'titles': movies_df.loc[chunk_source, 'Title'].values,
    'movie_text': movies_df.loc[chunk_source, 'movie_text'].values
}

# Written next to the index file so search hits can be mapped back to their
# chunk text, title and full movie text. `pickle` is imported in the notebook's
# top import cell.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
with open('./../data/chunks_metadata.pkl', 'wb') as f:
    pickle.dump(chunk_metadata, f)

In [12]:
# Smoke test: embed a free-text query with the same model used for the chunks
# and fetch its 5 nearest chunk vectors from the index.
query_embedding = model.encode(['space adventure'])
distances, indices = index.search(query_embedding, k=5)

In [13]:
# Distances returned by the search above: one row per query, one column per hit.
distances

array([[0.86031044, 0.8655559 , 0.91898096, 0.91898096, 0.91898096]],
      dtype=float32)

In [14]:
# Index ids of the hits, same layout as `distances`.
# Presumably these are positions into `chunks` / `chunk_source`, since the
# embeddings were added to the index in chunk order — confirm before relying on it.
indices

array([[57173, 75286, 42099, 46850, 47005]])