In [153]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

In [130]:
# Name of the sentence-transformers model that _get_model() passes to
# SentenceTransformer(); it is also the default of create_vector_embedding().
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

In [131]:
# Loaded models keyed by model name, so repeated calls reuse one instance
# instead of re-reading the weights from disk every time.
_MODEL_CACHE = {}


def _get_model():
    """Return the SentenceTransformer for ``MODEL_NAME``, loading it at most once.

    The model is constructed with ``cache_folder="./models_cache"`` and
    ``token=None``. The instance is memoised per model name, so changing
    ``MODEL_NAME`` and calling again loads (and caches) the new model.

    Returns:
        The cached ``SentenceTransformer`` instance for ``MODEL_NAME``.
    """
    model = _MODEL_CACHE.get(MODEL_NAME)
    if model is None:
        model = SentenceTransformer(MODEL_NAME,
                                    cache_folder="./models_cache",
                                    token=None
                                   )
        _MODEL_CACHE[MODEL_NAME] = model
    return model

In [132]:
def create_vector_embedding(text, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode ``text`` into a normalized embedding with the requested model.

    Args:
        text: Input handed unchanged to ``SentenceTransformer.encode``.
        embedding_model: Name of the sentence-transformers model to use.
            Previously this argument was accepted but ignored.

    Returns:
        Whatever ``model.encode(text, normalize_embeddings=True)`` returns.
    """
    if embedding_model == MODEL_NAME:
        # Same model as the notebook-wide default: go through the shared loader.
        model = _get_model()
    else:
        # Honor an explicitly requested model, using the same loader settings.
        model = SentenceTransformer(embedding_model,
                                    cache_folder="./models_cache",
                                    token=None
                                   )
    return model.encode(text, normalize_embeddings=True)

In [149]:
# Locations of the three library artifacts; the pull_* and find_n_matches
# functions pass them to load_library() in this order.
library_directory_path = "library.csv"  # directory/metadata table, read with pd.read_csv
library_text_path = "library.parquet"  # text table, read with pd.read_parquet
library_vectors_path = "library.pkl"  # vector store, read with pickle.load

In [155]:
def get_directory(file_path="library.csv"):
    """Read the library directory CSV at ``file_path`` and return it as a DataFrame."""
    directory_frame = pd.read_csv(file_path)
    return directory_frame

def get_text(file_path):
    """Read the parquet file at ``file_path`` and return it as a DataFrame."""
    text_frame = pd.read_parquet(file_path)
    return text_frame

def get_vectors(file_path="library.pkl"):
    """Unpickle and return the object stored at ``file_path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only point this
    at files you produced yourself.
    """
    with open(file_path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

def load_library(csv_path="library.csv",
                 parquet_path="library.parquet",
                 pkl_path="library.pkl"):
    """Load the three library artifacts and return them together.

    Args:
        csv_path: Path handed to ``get_directory`` (CSV).
        parquet_path: Path handed to ``get_text`` (parquet).
        pkl_path: Path handed to ``get_vectors`` (pickle). The default was
            previously misspelled ``"libary.pkl"``, so a default call failed.

    Returns:
        A ``(metadata, text, vectors)`` tuple, in that order.
    """
    metadata = get_directory(csv_path)
    text = get_text(parquet_path)
    vectors = get_vectors(pkl_path)

    return metadata, text, vectors

In [150]:
def pull_text_for_document(document_name):
    """Return the text rows belonging to one document.

    The extension is stripped from ``document_name`` and the remainder is
    compared against the ``document`` column of the directory table. Only the
    directory and text tables are read; the pickled vector store is not needed
    here and is no longer loaded.

    Args:
        document_name: Document name, with or without a file extension.

    Returns:
        The rows of the text table whose ``chunk_uid`` belongs to the document.
        When nothing matches, a message is printed and the selection is empty.
    """
    base_name = os.path.splitext(document_name)[0]

    metadata = get_directory(library_directory_path)
    text = get_text(library_text_path)

    rows = metadata[metadata["document"] == base_name]

    if rows.empty:
        print(f"No document found matching: {document_name}")

    # With no matching rows the uid list is empty, so isin() selects nothing.
    uids = rows["chunk_uid"].tolist()

    return text[text["chunk_uid"].isin(uids)]
    

In [151]:
def pull_data_for_document(document_name):
    """Return directory, text and vector data joined for one document.

    The extension is stripped from ``document_name`` and the remainder is
    matched against the ``document`` column of the directory table. Matching
    rows are inner-joined on ``chunk_uid`` with the text table and then with
    the vectors converted to a DataFrame.

    Returns:
        The joined DataFrame, or ``None`` (after printing a message) when no
        directory row matches.
    """
    stem, _extension = os.path.splitext(document_name)

    directory, chunks, raw_vectors = load_library(
        library_directory_path,
        library_text_path,
        library_vectors_path,
    )

    is_match = directory["document"] == stem
    matching = directory[is_match]

    if matching.empty:
        print(f"No document found matching: {document_name}")
        return None

    # assumes pd.DataFrame(raw_vectors) yields a "chunk_uid" column — TODO confirm
    embeddings = pd.DataFrame(raw_vectors)

    return (
        matching
        .merge(chunks, on="chunk_uid", how="inner")
        .merge(embeddings, on="chunk_uid", how="inner")
    )

In [187]:
def topk_cosine(df: pd.DataFrame, query_vector: np.ndarray, K: int=5):
    """Return the K rows of ``df`` most similar to ``query_vector``.

    Args:
        df: Frame with a ``vector_embedding`` column holding equal-length
            vectors (they are stacked into one 2-D array).
        query_vector: Query embedding of the same length as the stored vectors.
        K: Maximum number of rows to return; capped at the number of rows.

    Returns:
        A copy of the selected rows, ordered by descending similarity, with an
        added ``cosine`` column. An empty frame (still carrying the ``cosine``
        column) is returned when ``df`` has no rows or ``K`` is not positive.
    """
    if K <= 0 or len(df) == 0:
        # Nothing to rank: np.stack fails on an empty sequence and a
        # non-positive K has no meaningful partition index.
        out = df.iloc[:0].copy()
        out["cosine"] = np.array([], dtype=float)
        return out

    X = np.stack(df["vector_embedding"].to_numpy())
    # Clip the norms so all-zero vectors do not divide by zero.
    Xn = X / np.linalg.norm(X, axis=1, keepdims=True).clip(min=1e-12)

    # Normalize the query argument itself; the original read an undefined
    # name (_input_vector) here instead of query_vector.
    query = np.asarray(query_vector)
    qn = query / np.linalg.norm(query).clip(min=1e-12)

    sims = Xn @ qn

    K = min(K, sims.size)
    # argpartition finds the K best in linear time; only those K get sorted.
    idx = np.argpartition(-sims, K-1)[:K]
    idx = idx[np.argsort(-sims[idx])]

    out = df.iloc[idx].copy()
    out["cosine"] = sims[idx]

    return out

In [197]:
def find_n_matches(_input, K):
    """Find the K library chunks most similar to ``_input``.

    ``_input`` is embedded with ``create_vector_embedding``, ranked against
    the stored vectors with ``topk_cosine``, and the ranked rows are then
    inner-joined on ``chunk_uid`` with the directory and text tables.

    Returns:
        The joined DataFrame of matches.
    """
    query_embedding = create_vector_embedding(_input)

    directory, chunks, raw_vectors = load_library(
        library_directory_path,
        library_text_path,
        library_vectors_path,
    )

    # assumes pd.DataFrame(raw_vectors) has chunk_uid and vector_embedding columns — TODO confirm
    embeddings = pd.DataFrame(raw_vectors)
    top_hits = topk_cosine(embeddings, query_embedding, K)

    return (
        top_hits
        .merge(directory, on="chunk_uid", how="inner")
        .merge(chunks, on="chunk_uid", how="inner")
    )

In [212]:
# Example query passage to look up in the library.
_input = "Whether because they resented the lack of choice — the way access to discounts effectively forced you to pay extra for shopping without a card — or worried about the unknown fate of their shopping data, customers found ways to make the data gathered about them less reliable, less useful, for its conjectured purposes."

# Retrieve the three closest chunks for the query above.
matches = find_n_matches(_input, K=3)

In [213]:
for idx, row in matches.iterrows(

Unnamed: 0,chunk_uid,vector_embedding,cosine,document,document_type,n_tokens,embedding_model,character_start,character_end,timestamp_iso,timestamp_ms,text
0,2173cf50e5da055b5ff7af6204187280a9837cc0a33b3a...,"[-0.07057506, 0.06895024, -0.024253165, 0.0449...",0.717731,Obfuscation,.docx,201,sentence-transformers/all-MiniLM-L6-v2,1317,2456,2025-09-01T17:51:25.070Z,1756749085070,Whether because they resented the lack of choi...
1,2791d212bef0adec6dc00583aec2aab49dc787537f62e3...,"[-0.1424175, 0.064272344, -0.025587615, 0.0227...",0.710471,Obfuscation,.docx,239,sentence-transformers/all-MiniLM-L6-v2,591,1846,2025-09-01T17:51:25.029Z,1756749085029,"So far, so normal — but the appearance of “loy..."
2,483c073d837ac8e09f4730de2c07ff0e66d05c28867abe...,"[-0.0809205, 0.06468982, -0.027038231, 0.01004...",0.69137,Obfuscation,.docx,207,sentence-transformers/all-MiniLM-L6-v2,0,1049,2025-09-01T17:51:24.989Z,1756749084989,1. Introduction: The problem of data gathering...
