In [3]:
import os
import polars as pl
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from tqdm import tqdm

# Load the speeches and keep only rows whose `komora_komplet` contains the
# substring below (adjust path/format as needed). literal=True makes the
# match a plain substring test rather than a regular expression.
df = pl.read_parquet("data/projevy.parquet").filter(
    pl.col("komora_komplet").str.contains("PČR, PS 2021-", literal=True)
)

# Multilingual sentence-embedding model, used here on Czech text.
# Queries must later be embedded with the same model for scores to be meaningful.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Alternatives: "intfloat/multilingual-e5-large", "DeepPavlov/LaBSE-en-ru"
# NOTE(review): an en-ru model presumably has weak Czech coverage -- verify before switching.

# Generate embeddings in batches (avoids OOM errors)
BATCH_SIZE = 128  # Adjust based on GPU/CPU memory

# Ceiling division: a floor division undercounts whenever the last batch is
# partial, which makes tqdm overrun its total and drop the bar/ETA.
n_batches = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE

batch_embeddings = []
for batch in tqdm(df.iter_slices(n_rows=BATCH_SIZE), total=n_batches):
    texts = batch["text"].to_list()
    batch_emb = model.encode(
        texts,
        convert_to_numpy=True,
        batch_size=BATCH_SIZE,
        show_progress_bar=False
    )
    batch_embeddings.append(batch_emb)

if not batch_embeddings:
    # np.vstack([]) would fail with a much less helpful message.
    raise ValueError("No rows matched the filter; there is nothing to embed.")

# One (n_rows, dim) float32 matrix, in the same row order as `df`.
embeddings = np.vstack(batch_embeddings).astype(np.float32)

# Build FAISS index (cosine similarity)
index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner Product = Cosine when normalized
faiss.normalize_L2(embeddings)                  # In-place; must happen before index.add
index.add(embeddings)

# FAISS ids are row positions, so the index and the frame must stay aligned.
assert index.ntotal == len(df), "FAISS index size does not match the number of rows"

652it [1:30:46,  8.35s/it]                                                                                             


In [25]:
# Save the FAISS index together with the frame it was built from.
# The index stores no ids of its own: a hit is identified by its row position,
# so the parquet written here must keep exactly the row order used for indexing.
output_dir = "data_raw"
os.makedirs(output_dir, exist_ok=True)  # writing fails if the directory is missing

faiss.write_index(index, os.path.join(output_dir, "speech_index.faiss"))
df.write_parquet(os.path.join(output_dir, "speeches_with_ids.parquet"))

In [9]:
class CzechSpeechSearcher:
    """Semantic search over a speech table backed by a FAISS index.

    Row ``i`` of the parquet file is assumed to correspond to vector ``i`` of
    the FAISS index; results are looked up purely by row position.
    """

    def __init__(
        self,
        df_path,
        index_path,
        model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
    ):
        """Load the speech table, the FAISS index and the embedding model.

        Args:
            df_path: Path to the parquet file holding one row per indexed vector.
            index_path: Path to the FAISS index file.
            model_name: SentenceTransformer model used to embed queries. It
                should be the model the index was built with.

        Raises:
            ValueError: If the index holds more vectors than the table has
                rows, in which case some hits could not be mapped to a row.
        """
        self.df = pl.read_parquet(df_path)
        self.index = faiss.read_index(index_path)
        self.model = SentenceTransformer(model_name)

        if self.index.ntotal > self.df.height:
            raise ValueError(
                f"Index holds {self.index.ntotal} vectors but the table has only "
                f"{self.df.height} rows; they were not built from the same data."
            )

    def search(self, query: str, top_k: int = 5) -> pl.DataFrame:
        """Return the rows most similar to ``query``, best match first.

        Args:
            query: Free-text query to embed and search for.
            top_k: Maximum number of rows to return; must be at least 1.

        Returns:
            The matching rows of the speech table with an added
            ``similarity_score`` column, sorted by that score in descending
            order. Fewer than ``top_k`` rows are returned when the index holds
            fewer vectors.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        if top_k < 1:
            raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

        # Never request more neighbours than the index contains.
        k = min(top_k, self.index.ntotal)
        if k == 0:
            # Empty index: return an empty frame with the usual result schema.
            return self.df.head(0).with_columns(
                similarity_score=pl.Series([], dtype=pl.Float32)
            )

        # Embed query
        query_embed = self.model.encode(
            [query],
            convert_to_numpy=True,
            show_progress_bar=False
        ).astype(np.float32)
        faiss.normalize_L2(query_embed)  # in-place; matches the normalized index vectors

        # Search FAISS
        scores, row_ids = self.index.search(query_embed, k)

        # FAISS pads missing results with label -1. Indexing the frame with -1
        # would silently return its last row, so drop those slots first.
        found = row_ids[0] >= 0

        # Get results
        results = self.df[row_ids[0][found]].with_columns(
            similarity_score=pl.Series(scores[0][found])
        )
        return results.sort("similarity_score", descending=True)

In [31]:
# Build the searcher a single time and reuse it for every query:
# construction reads the parquet table and the FAISS index and loads the model.
searcher = CzechSpeechSearcher(
    index_path="data_raw/speech_index.faiss",
    df_path="data_raw/speeches_with_ids.parquet",
)

In [45]:
# Raise polars' display limits: string cells up to 1000 characters,
# tables up to 1000 characters wide.
pl.Config(
    tbl_width_chars=1000,
    fmt_str_lengths=1000,
)

<polars.config.Config at 0x1a75b2d7f20>

In [89]:
# Example: the ten speeches closest to the query "autismus" (Czech for "autism")
results = searcher.search("autismus", top_k=10)

# Print only the identifying columns together with the similarity score
print(
    results.select(
        ["datum", "mluvci", "similarity_score", "soubor", "poradi"]
    )
)

shape: (10, 5)
┌────────────┬────────────────────────────┬──────────────────┬──────────────────────────────────────┬────────┐
│ datum      ┆ mluvci                     ┆ similarity_score ┆ soubor                               ┆ poradi │
│ ---        ┆ ---                        ┆ ---              ┆ ---                                  ┆ ---    │
│ date       ┆ str                        ┆ f32              ┆ str                                  ┆ i32    │
╞════════════╪════════════════════════════╪══════════════════╪══════════════════════════════════════╪════════╡
│ 2023-09-12 ┆ null                       ┆ 0.589722         ┆ 2021ps_stenprot_074schuz_s074189.htm ┆ 1      │
│ 2022-01-25 ┆ Poslanec David Kasal       ┆ 0.571919         ┆ 2021ps_stenprot_006schuz_s006203.htm ┆ 4      │
│ 2024-11-20 ┆ null                       ┆ 0.527713         ┆ 2021ps_stenprot_119schuz_s119056.htm ┆ 1      │
│ 2023-10-13 ┆ Poslankyně Jana Pastuchová ┆ 0.492555         ┆ 2021ps_stenprot_077schuz_s077101.h