In [56]:
import csv
import os
import datetime as datetime
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import hashlib
from sentence_transformers import SentenceTransformer
from pathlib import Path
import pickle
import docx2txt
from typing import List, Dict, Tuple, Sequence, Optional
from transformers import AutoTokenizer
import re
import fitz

In [3]:
_TOKENIZER = None

_CLEAN_PUA = re.compile(r'[\uE000-\uF8FF]')                 # private-use (e.g., \uf0b7)
_CLEAN_ZW  = re.compile(r'[\u200B-\u200D\uFEFF]')           # zero-widths/BOM
_CLEAN_CTRL= re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')    # other control chars
_SOFT_HY   = re.compile(r'\u00AD')                          # soft hyphen
_LINE_HY   = re.compile(r'-\s*\n\s*')                       # hyphen line-break joins
_BULLETS   = re.compile(r'[\u2022\u00B7]')                  # • or ·

In [32]:
# Embedding model identifier; read by _get_model() and written to the
# "embedding_model" column of the library CSV by process_document().
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

In [4]:
def get_iso_time():
    """Return the current UTC time as ``(iso_string, epoch_milliseconds)``.

    The ISO-8601 string has millisecond precision and ends in ``Z`` rather
    than ``+00:00``. Both values are derived from the same clock reading.
    """
    utc_now = datetime.now(timezone.utc)
    iso_text = utc_now.isoformat(timespec="milliseconds").replace("+00:00", "Z")
    epoch_ms = int(utc_now.timestamp() * 1000)
    return iso_text, epoch_ms

In [5]:
def get_uid(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

In [6]:
def create_vector_embedding(text, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode ``text`` with the named SentenceTransformer and return the vector.

    The model is instantiated on every call and ``encode`` is invoked with
    ``normalize_embeddings=True``.

    NOTE: a later cell in this notebook defines ``create_vector_embedding``
    again without ``normalize_embeddings``; when the notebook is run top to
    bottom, that later definition replaces this one.
    """
    encoder = SentenceTransformer(embedding_model)
    vector = encoder.encode(text, normalize_embeddings=True)
    return vector

In [7]:
def check_existing_input(user_input, prompt_model, filename):
    """Find earlier history rows with the same input text and prompt model.

    Args:
        user_input: Input text to look for (compared as an exact string).
        prompt_model: Model name the input was used with.
        filename: Path of the user's input-history CSV.

    Returns:
        A DataFrame copy of the matching rows, or ``None`` when the file does
        not exist or no row matches.
    """
    if not os.path.isfile(filename):
        return None

    # Read the key columns as raw text. With default type inference an input
    # such as "007" is loaded as the integer 7 and "NA"/"null" become NaN, so
    # the string comparison below would never match and duplicates slip through.
    history = pd.read_csv(
        filename,
        dtype={"user_input": str, "prompt_model": str},
        keep_default_na=False,
    )

    same_model = history["prompt_model"] == prompt_model
    same_text = history["user_input"] == user_input
    matches = history.loc[same_model & same_text].copy()

    return matches if not matches.empty else None

In [8]:
def save_input_embedding(user_hash, user_input, input_uid, embedding_model='sentence-transformers/all-MiniLM-L6-v2'):
    """Embed ``user_input`` and append it to the user's pickled embedding list.

    The store is ``<user_hash>_InputEmbeddings.pkl``: a pickled list of
    ``{"uid", "embedding"}`` dicts. The whole list is loaded, extended by one
    record and written back.

    NOTE(review): pickle.load can execute arbitrary code; only use this on
    files this notebook produced.
    """
    store_path = user_hash+"_InputEmbeddings.pkl"
    record = {
        "uid": input_uid,
        "embedding": create_vector_embedding(user_input, embedding_model),
    }

    records = []
    if os.path.isfile(store_path):
        with open(store_path, "rb") as src:
            records = pickle.load(src)

    records.append(record)

    with open(store_path, "wb") as dst:
        pickle.dump(records, dst, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
def create_all_input_embeddings(user, embedding_model='sentence-transformers/all-MiniLM-L6-v2'):
    """Create embeddings for every history row that does not have one yet.

    Rows of ``<sha256(user)>_InputHistory.csv`` whose ``embedding_exists`` is 0
    are embedded via ``save_input_embedding``; the row is then flagged and the
    model name recorded. The CSV is rewritten through a temp file and swapped
    in with ``os.replace``.

    Args:
        user: User name; hashed with ``get_uid`` to build the file names.
        embedding_model: Model name passed on to ``save_input_embedding``.

    Returns:
        ``False`` when the history file does not exist; otherwise ``None``.
    """
    user_hash = get_uid(user)
    filename = user_hash+"_InputHistory.csv"
    temp_file = user_hash+"InputHistory_temp.csv"

    if not os.path.isfile(filename):
        print(f"File does not exist for user: {user}")
        return False

    # Text columns are read verbatim: default inference would turn "007" into 7
    # and "NA" into NaN, and the rewrite below would then corrupt the history.
    df = pd.read_csv(
        filename,
        dtype={
            "user_input" : str,
            "input_uid" : str,
            "prompt_model" : str,
            "embedding_exists" : "int64",
            "embedding_model" : "string"
        },
        keep_default_na=False,
    )

    pending = df.index[df["embedding_exists"] == 0]
    completed = 0

    try:
        for index in pending:
            save_input_embedding(
                user_hash,
                df.at[index, "user_input"],
                df.at[index, "input_uid"],
                embedding_model,
            )
            df.at[index, "embedding_exists"] = 1
            df.at[index, "embedding_model"] = embedding_model
            completed += 1
    finally:
        # Write once instead of once per row, and persist finished rows even
        # if a later row raised, so a retry does not store duplicates.
        if completed:
            df.to_csv(temp_file, index=False)
            os.replace(temp_file, filename)

    if completed:
        print("Updates written to file")

    else:
        print("No updates written")

In [10]:
def load_input_embeddings(user):
    """Load and return the pickled embedding records stored for ``user``.

    Reads ``<sha256(user)>_InputEmbeddings.pkl``; a missing file propagates
    the ``open`` error to the caller.

    NOTE(review): pickle.load can execute arbitrary code; only use this on
    files this notebook produced.
    """
    store_path = get_uid(user) + "_InputEmbeddings.pkl"

    with open(store_path, "rb") as src:
        records = pickle.load(src)
    return records

In [11]:
def save_user_input(user, user_input, prompt_model):
    """Append ``user_input`` to the user's history CSV unless it is a repeat.

    The history file is ``<sha256(user)>_InputHistory.csv`` and is created with
    a header row when missing. If the same input/model pair is already
    recorded, the time of its first match is printed and nothing is written.

    Returns:
        ``True`` when a row was appended, ``False`` for a repeated input.
    """
    user_hash = get_uid(user)
    input_uid = get_uid(user_input)
    filename = user_hash+"_InputHistory.csv"

    columns = [
        'user_input',
        'input_uid',
        'prompt_model',
        'timestamp_iso',
        'timestamp_ms',
        'embedding_exists',
        'embedding_model',
        'processed',
    ]

    if not os.path.isfile(filename):
        with open(filename, mode='w', newline='', encoding='utf-8') as out:
            csv.writer(out).writerow(columns)

    previous = check_existing_input(user_input, prompt_model, filename)
    if previous is not None:
        first_ms = previous['timestamp_ms'].iloc[0]
        dt = datetime.fromtimestamp(first_ms / 1000)
        print(f"Input already used at: {dt}")
        return False

    timestamp_iso, timestamp_ms = get_iso_time()
    record = [
        user_input,
        input_uid,
        prompt_model,
        timestamp_iso,
        timestamp_ms,
        0,       # embedding_exists: no vector embedding stored yet
        "None",  # embedding_model: placeholder until an embedding is created
        0,       # processed: not yet processed by the LLM
    ]

    with open(filename, mode='a', newline='', encoding='utf-8') as out:
        csv.writer(out).writerow(record)

    return True

In [12]:
# Loaded tokenizers keyed by model name, so different names never share one.
_TOKENIZERS_BY_NAME = {}


def _get_tokenizer(name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Return the (cached) fast tokenizer for ``name``.

    The previous single-slot cache returned whichever tokenizer was loaded
    first and ignored ``name`` on later calls. Tokenizers are now cached per
    name. ``_TOKENIZER`` is still updated to the most recently requested
    tokenizer for code that inspects that global.
    """
    global _TOKENIZER
    tokenizer = _TOKENIZERS_BY_NAME.get(name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)
        _TOKENIZERS_BY_NAME[name] = tokenizer
    _TOKENIZER = tokenizer
    return tokenizer

In [13]:
def _tok_len(tok, s: str) -> int:
    return len(tok.encode(s, add_special_tokens=False))

In [14]:
def clean_text(text: str) -> str:
    """Normalise extracted text into a single whitespace-collapsed line.

    Substitutions are applied in order; finally every whitespace run becomes
    one space and the result is stripped.
    """
    substitutions = (
        (_LINE_HY, '-'),       # hyphen + line break -> plain hyphen
        (_SOFT_HY, ''),        # remove soft hyphens
        (_CLEAN_ZW, ''),       # remove zero-width characters / BOM
        (_CLEAN_PUA, ''),      # remove private-use glyphs (incl. \uf0b7)
        (_CLEAN_CTRL, ' '),    # stray control characters become spaces
        (_BULLETS, ' • '),     # normalise bullet glyphs, padded with spaces
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    return re.sub(r'\s+', ' ', text).strip()

In [15]:
def simple_sentence_split(text: str) -> List[Dict]:
    """Split ``text`` into sentence dicts with character offsets.

    The text is first passed through ``clean_text``; offsets in the result
    refer to that cleaned string, not to the raw input. A boundary is
    whitespace preceded by ``.``, ``!`` or ``?`` and followed by an uppercase
    letter, digit, quote or ``(``. A segment following one that ends in
    ``e.g.``, ``i.e.``, ``Mr.``, ``Ms.`` or ``Dr.`` is merged back into it.

    Returns:
        A list of ``{"text", "start", "end"}`` dicts (empty for blank input).
    """
    boundary = re.compile(r'(?<=[.!?])\s+(?=[A-Z0-9"\'(])')
    abbreviation_tail = re.compile(r'\b(e\.g|i\.e|Mr|Ms|Dr)\.$')

    norm = re.sub(r'\s+', ' ', clean_text(text)).strip()
    if not norm:
        return []

    # Turn the boundary matches into parallel lists of segment begins / ends.
    gaps = [(m.start(), m.end()) for m in boundary.finditer(norm)]
    begins = [0] + [gap_end for _, gap_end in gaps]
    ends = [gap_start for gap_start, _ in gaps] + [len(norm)]

    sentences: List[Dict] = []
    for begin, end in zip(begins, ends):
        piece = norm[begin:end]
        if not piece:
            continue

        previous = sentences[-1] if sentences else None
        if previous is not None and abbreviation_tail.search(previous["text"]):
            # The "boundary" was just an abbreviation: extend the previous sentence.
            previous["text"] = norm[previous["start"]:end]
            previous["end"] = end
        else:
            sentences.append({"text": piece, "start": begin, "end": end})

    return sentences

In [16]:
def _split_overlong_unit(u: Dict, tok, max_tokens: int) -> List[Dict]:
    enc = tok(u["text"], add_special_tokens=False, return_offsets_mapping=True)
    ids = enc["input_ids"]
    offs = enc["offset_mapping"]  # [(start,end) in u["text"]]
    out: List[Dict] = []
    
    i = 0
    while i < len(ids):
        j = min(i + max_tokens, len(ids))
        sub_rel_start = offs[i][0]
        sub_rel_end   = offs[j-1][1]
        sub_text = u["text"][sub_rel_start:sub_rel_end]
        out.append({
            "text": sub_text,
            "start": u["start"] + sub_rel_start,
            "end":   u["start"] + sub_rel_end,
            **{k: v for k, v in u.items() if k not in ("text","start","end")}
        })
        i = j
        
    return out

In [76]:
def pack_chunks(
    units: Sequence[Dict],
    tokenizer_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    max_tokens: int = 240,
    overlap_tokens: int = 48,
    corpus: Optional[str] = None,
    carry_keys: Sequence[str] = ("page",)
    ) -> List[Dict]:
    """Greedily pack sentence units into overlapping, token-bounded chunks.

    Args:
        units: Dicts with ``"text"``, ``"start"``, ``"end"`` (plus optional
            metadata such as ``"page"``). The input dicts are not modified.
        tokenizer_name: Name handed to ``_get_tokenizer``.
        max_tokens: Token budget per chunk; must be >= 1. Units longer than
            this are first split with ``_split_overlong_unit``.
        overlap_tokens: Trailing units of a finished chunk are carried into the
            next one until at least this many tokens are carried (0 disables).
        corpus: When given, chunk text is ``corpus[start:end]``; otherwise the
            unit texts are joined with single spaces. Pass by keyword: the
            second positional parameter is ``tokenizer_name``.
        carry_keys: Unit keys whose distinct values are collected per chunk
            under the key name plus ``"s"`` (e.g. ``"pages"``).

    Returns:
        Chunk dicts with ``id``, ``text``, ``start``, ``end``, ``n_tokens``
        (sum of the per-unit token counts), ``unit_indices`` (indices into
        ``units``) and any carried metadata.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    tok = _get_tokenizer(tokenizer_name)

    # 1) Expand any overlong sentences
    expanded: List[Dict] = []
    src_idx: List[int] = []
    for i, u in enumerate(units):
        if _tok_len(tok, u["text"]) > max_tokens:
            parts = _split_overlong_unit(u, tok, max_tokens)
            expanded.extend(parts)
            src_idx.extend([i] * len(parts))
        else:
            expanded.append(u.copy())
            src_idx.append(i)

    # 2) Precompute token lengths
    for u in expanded:
        u["_tok"] = _tok_len(tok, u["text"])

    # 3) Greedy pack with overlap
    chunks: List[Dict] = []
    cur: List[Dict] = []
    cur_src: List[int] = []
    cur_tok = 0

    def flush():
        """Emit the current units as a chunk, then keep only the overlap tail."""
        nonlocal cur, cur_src, cur_tok
        if not cur:
            return
        s0, eN = cur[0]["start"], cur[-1]["end"]
        text = corpus[s0:eN] if corpus is not None else " ".join(u["text"] for u in cur).strip()

        meta = {}
        for k in carry_keys:
            vals = [u.get(k) for u in cur if u.get(k) is not None]
            if vals:
                meta[k + "s"] = sorted(set(vals))  # e.g., 'pages'

        chunks.append({
            "id": len(chunks),
            "text": text,
            "start": s0,
            "end": eN,
            "n_tokens": cur_tok,
            "unit_indices": sorted(set(cur_src)),
            **meta
        })

        # build overlap tail: whole units taken from the end of the chunk
        if overlap_tokens > 0:
            t = 0
            tail, tail_src = [], []
            for u, si in zip(reversed(cur), reversed(cur_src)):
                tail.insert(0, u); tail_src.insert(0, si)
                t += u["_tok"]
                if t >= overlap_tokens:
                    break
            cur, cur_src = tail, tail_src
            cur_tok = sum(u["_tok"] for u in cur)
        else:
            cur, cur_src, cur_tok = [], [], 0

    for u, si in zip(expanded, src_idx):
        if cur and cur_tok + u["_tok"] > max_tokens:
            flush()
            # The tail is made of whole units and can be large (a single unit
            # may hold up to max_tokens). Shed its oldest units until the
            # incoming unit fits, otherwise the next chunk would exceed
            # max_tokens.
            while cur and cur_tok + u["_tok"] > max_tokens:
                cur_tok -= cur.pop(0)["_tok"]
                cur_src.pop(0)
        cur.append(u); cur_src.append(si); cur_tok += u["_tok"]
    flush()

    # optional: drop temp fields
    for u in expanded:
        u.pop("_tok", None)

    return chunks

In [18]:
def save_chunk_to_parquet(chunk_uid, text, filename="library.parquet"):
    """Store ``(chunk_uid, text)`` in the parquet library unless the uid exists.

    A missing file is created with the single row. Otherwise the whole file is
    read, and rewritten with the row appended only when ``chunk_uid`` is new.
    """
    addition = pd.DataFrame([{"chunk_uid": chunk_uid, "text": text}])

    if os.path.isfile(filename):
        stored = pd.read_parquet(filename, engine="fastparquet")
        if chunk_uid in stored["chunk_uid"].values:
            return  # already stored; leave the file untouched
        combined = pd.concat([stored, addition], ignore_index=True)
        combined.to_parquet(filename, engine="fastparquet", index=False)
        return

    addition.to_parquet(filename, engine="fastparquet", index=False)

In [19]:
def save_vector(chunk_uid, vector, filename="library.pkl"):
    """Insert or update the embedding stored for ``chunk_uid`` in a pickle file.

    The file holds a list of ``{"chunk_uid", "vector_embedding"}`` dicts.
    An entry with the same uid and a numerically close vector
    (``np.allclose``) leaves the file untouched; a differing vector replaces
    the stored one; an unknown uid is appended.

    NOTE(review): pickle.load can execute arbitrary code; only use this on
    files this notebook produced.
    """
    library = []
    if os.path.isfile(filename):
        with open(filename, "rb") as src:
            library = pickle.load(src)

    for entry in library:
        if entry["chunk_uid"] != chunk_uid:
            continue
        if np.allclose(entry["vector_embedding"], vector):
            return  # identical vector already stored, nothing to write
        entry["vector_embedding"] = vector
        break
    else:
        # No entry carries this uid yet.
        library.append({"chunk_uid": chunk_uid, "vector_embedding": vector})

    with open(filename, "wb") as dst:
        pickle.dump(library, dst)

In [20]:
def parse_docx(file_path):
    """Return whatever ``docx2txt.process`` extracts from the file at ``file_path``."""
    extracted = docx2txt.process(file_path)
    return extracted

In [64]:
def parse_pdf(file_path):
    """Extract cleaned page text and sentence units from a PDF.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        ``(corpus, units)``. ``corpus`` is the cleaned page texts joined by a
        blank line. ``units`` holds one dict per sentence with ``"text"``,
        ``"start"``/``"end"`` (offsets into ``corpus``) and the 1-based
        ``"page"`` number. A document without pages yields ``("", [])``.
    """
    SEP = "\n\n"

    doc = fitz.open(file_path)
    try:
        page_texts = [clean_text(doc[p].get_text("text")) for p in range(len(doc))]
    finally:
        # Release the file handle even if extraction fails.
        doc.close()

    # Build the corpus and units once, after all pages are read. This used to
    # sit inside the page loop, which redid the work per page and left both
    # names undefined for an empty document.
    corpus = SEP.join(page_texts)

    units = []
    offset = 0  # position of the current page's first character within corpus

    for page_idx, page_clean in enumerate(page_texts, start=1):
        # Sentence offsets are relative to the text simple_sentence_split
        # normalises internally; this assumes that re-cleaning already-clean
        # page text leaves it unchanged (TODO confirm).
        for s in simple_sentence_split(page_clean):
            units.append({
                "text" : s["text"],
                "start" : offset + s["start"],
                "end" : offset + s["end"],
                "page" : page_idx
            })
        # Advance past this page and the separator that follows it in corpus.
        offset += len(page_clean) + len(SEP)

    return corpus, units

In [78]:
#corpus, units = parse_pdf("2001.00973v1.pdf")

In [79]:
#pack_chunks(units, corpus=corpus)[2]  # corpus must be passed by keyword; the 2nd positional parameter is tokenizer_name

In [80]:
def process_docx(file_path):
    """Turn a .docx file into packed chunks.

    The extracted text (empty string when extraction returns nothing) is split
    into sentences and packed with a 240-token budget and 48-token overlap.
    No corpus is passed, so chunk text is the unit texts joined by spaces.
    """
    raw_text = parse_docx(file_path)
    if not raw_text:
        raw_text = ""

    units = simple_sentence_split(raw_text)

    return pack_chunks(
        units,
        tokenizer_name="sentence-transformers/all-MiniLM-L6-v2",
        max_tokens=240,
        overlap_tokens=48,
        corpus=None,
        carry_keys=("page",),
    )

In [77]:
def process_pdf(file_path):
    """Turn a PDF into packed chunks.

    ``corpus`` is now passed by keyword. Passed positionally, it landed in
    ``pack_chunks``'s second parameter, ``tokenizer_name``.
    """
    corpus, units = parse_pdf(file_path)

    # pack_chunks slices chunk text out of the corpus using unit offsets. Only
    # hand it the corpus when every unit's (start, end) really addresses its
    # own text there; otherwise let it join the unit texts instead.
    offsets_index_corpus = all(
        corpus[u["start"]:u["end"]] == u["text"] for u in units
    )

    return pack_chunks(units, corpus=corpus if offsets_index_corpus else None)

In [81]:
def get_pipeline(ext):
    """Return the processing function registered for ``ext``, or ``None``.

    The lookup is case-insensitive. Extensions that are listed but have no
    handler yet, and extensions that are not listed at all, both give ``None``.
    """
    implemented = {
        ".pdf": process_pdf,
        ".docx": process_docx,
    }
    # Listed formats that have no handler yet.
    not_yet_implemented = (
        ".txt", ".md", ".csv", ".xlsx", ".pptx",
        ".rtf", ".epub", ".odt", ".ods", ".odp",
        ".html", ".json", ".yml", ".eml",
    )

    registry = dict.fromkeys(not_yet_implemented)
    registry.update(implemented)
    return registry.get(ext.lower())

In [None]:
def _get_model():
    """Return a SentenceTransformer for ``MODEL_NAME``, cached under ./models_cache.

    ``use_auth_token=True`` was dropped: it is a deprecated argument, and
    requesting a stored Hugging Face token fails on machines that are not
    logged in even though this model is public.
    """
    return SentenceTransformer(MODEL_NAME, cache_folder="./models_cache")

In [30]:
# Loaded SentenceTransformer instances keyed by model name.
_EMBEDDING_MODELS = {}


def create_vector_embedding(text, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode ``text`` with the named SentenceTransformer and return a NumPy vector.

    The model used to be re-instantiated on every call, which is expensive for
    callers that embed one row at a time. It is now loaded once per model name
    and reused.

    NOTE: this redefines the ``create_vector_embedding`` from an earlier cell;
    unlike that version, it does not pass ``normalize_embeddings=True``.

    Args:
        text: A string (or list of strings) to encode.
        embedding_model: Model name; weights are cached under ./models_cache.
    """
    model = _EMBEDDING_MODELS.get(embedding_model)
    if model is None:
        model = SentenceTransformer(embedding_model,
                                    cache_folder="./models_cache"
                                    )
        _EMBEDDING_MODELS[embedding_model] = model

    vector = model.encode(text, convert_to_numpy=True, batch_size=32, show_progress_bar=False)

    return vector

In [37]:
def process_document(file_path, **kwargs):
    """Chunk a document, embed new chunks and add them to the library files.

    For every chunk whose text hash is not yet in ``library.csv``, a metadata
    row is appended to that CSV, the text is stored via
    ``save_chunk_to_parquet`` and the vector via ``save_vector`` (both using
    their default file names). The number of added chunks is printed.

    Args:
        file_path: Document path; the suffix selects the pipeline.
        **kwargs: Forwarded unchanged to the selected pipeline function.

    Returns:
        None.

    Raises:
        ValueError: If no pipeline is registered for the file type.
    """
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    library_csv = "library.csv"
    ext = Path(file_path).suffix.lower()
    document_name = Path(file_path).stem
    pipeline = get_pipeline(ext)

    # Reject unsupported files before paying for the model load.
    if pipeline is None:
        raise ValueError(f"Unsupported file type: {ext}")

    chunks = pipeline(file_path, **kwargs)

    model = SentenceTransformer(embedding_model,
                                cache_folder="./models_cache"
                               )

    if not os.path.isfile(library_csv):
        with open(library_csv, mode='w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([
                "chunk_uid",
                "document",
                "document_type",
                "n_tokens",
                "embedding_model",
                "character_start",
                "character_end",
                "timestamp_iso",
                "timestamp_ms"
            ])

    # Read the known uids once instead of re-reading the CSV for every chunk;
    # uids added below are tracked in the same set.
    known_uids = set(pd.read_csv(library_csv, dtype={"chunk_uid": str})["chunk_uid"])

    added = 0
    for chunk in chunks:
        chunk_uid = get_uid(chunk['text'])

        if chunk_uid in known_uids:
            continue

        vector = model.encode(chunk['text'], convert_to_numpy=True, batch_size=32, show_progress_bar=False)

        timestamp_iso, timestamp_ms = get_iso_time()

        # Opened per chunk so the row is on disk before the text/vector stores
        # are touched.
        with open(library_csv, mode='a', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([
                chunk_uid,
                document_name,
                ext,
                chunk['n_tokens'],
                embedding_model,  # the model actually used above
                chunk['start'],
                chunk['end'],
                timestamp_iso,
                timestamp_ms
            ])
        known_uids.add(chunk_uid)

        # save the raw text and the vector under the same uid
        save_chunk_to_parquet(chunk_uid, chunk['text'])
        save_vector(chunk_uid, vector)

        added += 1

    print(f"{added} new chunks added to the library")

In [100]:
# Example: record one input for a user (returns False if it was already saved).
user = "TylerTwohig"
user_input = "z"
prompt_model = "DeepSeek-R1:latest"
save_user_input(user, user_input, prompt_model)

Input already used at: 2025-08-31 17:44:57.977000


False

In [82]:
# Ingest the sample PDF; process_document prints how many new chunks were added.
process_document("2001.00973v1.pdf")

105 new chunks added to the library


In [130]:
# Load the chunk-text store ("library.parquet" is save_chunk_to_parquet's default file).
df = pd.read_parquet("library.parquet", engine="fastparquet")

#for text in df['text']:
#    print(text, "\n")

# Show the text stored under index label 0.
df['text'][0]

'1. Introduction: The problem of data gathering — Asymmetries of power and knowledge What is obfuscation? Supermarkets and grocery chains have always been in the data business, as well as the food business: with small profit margins and a product that can quickly spoil, they pay close attention to inventory, purchasing patterns, and geography. The introduction of store “loyalty cards” perfectly fit a decades–long pattern: rewarding loyal customers with additional discounts in return for better data, which could inform mailings, coupon campaigns, even which products to shelve together. So far, so normal — but the appearance of “loyalty cards,” with their rather sinister Orwellian name, and direct connection of data collection with access to sales and discounts, sparked a strange revolt. Customers engaged in boycotts and tongue–in–cheek protests, but as loyalty cards became more common, and apparently permanent, strategies appeared to mitigate the perceived loss of privacy without entire

In [50]:
# Load the pickled vector store ("library.pkl" is save_vector's default file).
# NOTE(review): pickle.load can execute arbitrary code; only open files this notebook wrote.
with open("library.pkl", "rb") as f:
    data = pickle.load(f)

In [51]:
# Number of entries in the loaded vector store.
len(data)

129