In [18]:
import csv
import os
import datetime as datetime
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import hashlib
from sentence_transformers import SentenceTransformer
from pathlib import Path
import pickle
import docx2txt
from typing import List, Dict, Tuple
from transformers import AutoTokenizer
import re

CMC Act Taxonomy (Herring, Das & Penumarthy, updated 2024 — 18 acts + 2 meta-acts)

In [2]:
def get_iso_time():
    now = datetime.now(timezone.utc)
    timestamp_iso = now.isoformat(timespec="milliseconds")
    timestamp_iso = timestamp_iso.replace("+00:00", "Z")
    timestamp_ms = int(now.timestamp()*1000)

    return timestamp_iso, timestamp_ms

In [3]:
def get_uid(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

In [4]:
def create_vector_embedding(text, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    model = SentenceTransformer(embedding_model)
    return model.encode(text, normalize_embeddings=True) #return vector

In [5]:
#check if there is an existing row in the input history for user where the same input was already used on the same model
#returns row where user_input and model combination already exist
def check_existing_input(user_input, prompt_model, filename):
    if not os.path.isfile(filename):
        return None

    df = pd.read_csv(filename)

    matches = df.loc[(df['prompt_model'] == prompt_model) & (df['user_input'] == user_input)].copy()

    return matches if not matches.empty else None

In [6]:
def save_input_embedding(user_hash, user_input, input_uid, embedding_model='sentence-transformers/all-MiniLM-L6-v2'):
    filename = user_hash+"_InputEmbeddings.pkl"
    embedding = create_vector_embedding(user_input, embedding_model)

    new_row = {
        "uid" : input_uid,
        "embedding" : embedding
    }
    
    if os.path.isfile(filename):
        with open(filename, "rb") as f:
            data = pickle.load(f)

    else:
        data = []

    data.append(new_row)

    with open(filename, "wb") as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
def create_all_input_embeddings(user, embedding_model='sentence-transformers/all-MiniLM-L6-v2'):
    user_hash = get_uid(user)
    filename = user_hash+"_InputHistory.csv"
    temp_file = user_hash+"InputHistory_temp.csv"
    changed = False

    if not os.path.isfile(filename):
        print(f"File does not exist for user: {user}")
        return False

    df = pd.read_csv(
        filename,
        dtype={
            "embedding_exists" : "int64",
            "embedding_model" : "string"
        }
    )

    for index, row in df.iterrows():
        if row['embedding_exists'] == 0:
            save_input_embedding(user_hash, row['user_input'], row['input_uid'], embedding_model)
            df.at[index, 'embedding_exists'] = 1
            df.at[index, 'embedding_model'] = embedding_model
            df.to_csv(temp_file, index=False)
            changed = True

    if changed:
        os.replace(temp_file, filename)
        print("Updates written to file")

    else:
        print("No updates written")

In [8]:
def load_input_embeddings(user):
    user_hash = get_uid(user)
    filename = user_hash+"_InputEmbeddings.pkl"

    with open(filename, "rb") as f:
        return pickle.load(f)

In [9]:
def save_user_input(user, user_input, prompt_model):
    user_hash = get_uid(user)
    input_uid = get_uid(user_input)
    filename = user_hash+"_InputHistory.csv"
    
    if not os.path.isfile(filename):
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([
                'user_input',
                'input_uid',
                'prompt_model',
                'timestamp_iso',
                'timestamp_ms',
                'embedding_exists',
                'embedding_model',
                'processed'
            ])

    existing_row = check_existing_input(user_input, prompt_model, filename)
    
    if existing_row is not None:
        dt = existing_row['timestamp_ms'].iloc[0]
        dt = datetime.fromtimestamp(dt / 1000)
        print(f"Input already used at: {dt}")
        return False

    timestamp_iso, timestamp_ms = get_iso_time()

    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([
            user_input,
            input_uid,
            prompt_model,
            timestamp_iso,
            timestamp_ms,
            0, #flag for if a vector embedding exists for the text, default=0
            "None", #placeholder for vector embedding model used
            0 #flag for if the text has been processed by LLM
        ])
        
    return True

In [33]:
def simple_sentence_split(text: str) -> List[str]:
    text = re.sub(r'\s+', ' ', text).strip()
    parts = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9"\'(])', text)
    
    joined = []
    for p in parts:
        if joined and re.search(r'\b(e\.g|i\.e|Mr|Ms|Dr)\.$', joined[-1]):
            joined[-1] += " " + p
        else:
            joined.append(p)

    return [s for s in joined if s]

In [74]:
def simple_sentence_split(text: str) -> List[Dict]:
    _SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-Z0-9"\'(])')
    _ABBR_END = re.compile(r'\b(e\.g|i\.e|Mr|Ms|Dr)\.$')
    norm = re.sub(r'\s+', ' ', text).strip()
    
    if not norm:
        return []

    parts = []
    start = 0

    for m in _SPLIT.finditer(norm):
        parts.append((start, m.start()))
        start = m.end()
    parts.append((start, len(norm)))

    joined: List[Dict] = []
    for s, e in parts:
        seg = norm[s:e]
        
        if not seg:
            continue

        if joined and _ABBR_END.search(joined[-1]["text"]):
            joined[-1]["text"] = norm[joined[-1]["start"]:e]
            joined[-1]["end"] = e
        else:
            joined.append({
                "text": seg, 
                "start": s,
                "end" : e
            })

    return joined

In [10]:
def parse_docx(file_path):
    return docx2txt.process(file_path)

In [63]:
def process_docx(file_path):
    text = parse_docx(file_path)
    sentences = simple_sentence_split(text)
    return sentences

In [None]:
def get_pipeline(ext):
    document_pipeline = {
        ".pdf" : None,
        ".docx" : process_docx,
        ".txt" : None,
        ".md" : None,
        ".csv" : None,
        ".xlsx" : None,
        ".pptx" : None,
        ".rtf" : None,
        ".epub" : None,
        ".odt" : None,
        ".ods" : None,
        ".odp" : None,
        ".html" : None,
        ".json" : None,
        ".yml" : None,
        ".eml" : None,
    }

    return document_pipeline.get(ext.lower())

In [69]:
def process_document(file_path, **kwargs):
    ext = Path(file_path).suffix.lower()
    pipeline = get_pipeline(ext)
    
    if pipeline is None:
        raise ValueError(f"Unsupported file type: {ext}")
        
    return pipeline(file_path, **kwargs)
    #return text

In [70]:
sentences = process_document("Obfuscation.docx")
#for s in sentences:
#    print(s['text'], "\n")

sentences

In [73]:
text = process_docx("Obfuscation.docx")
text

[{'text': '1.', 'start': 0, 'end': 2},
 {'text': 'Introduction: The problem of data gathering — Asymmetries of power and knowledge What is obfuscation?',
  'start': 3,
  'end': 104},
 {'text': 'Supermarkets and grocery chains have always been in the data business, as well as the food business: with small profit margins and a product that can quickly spoil, they pay close attention to inventory, purchasing patterns, and geography.',
  'start': 105,
  'end': 344},
 {'text': 'The introduction of store “loyalty cards” perfectly fit a decades–long pattern: rewarding loyal customers with additional discounts in return for better data, which could inform mailings, coupon campaigns, even which products to shelve together.',
  'start': 345,
  'end': 590},
 {'text': 'So far, so normal — but the appearance of “loyalty cards,” with their rather sinister Orwellian name, and direct connection of data collection with access to sales and discounts, sparked a strange revolt.',
  'start': 591,
  'end': 

In [96]:
user = "TylerTwohig"
user_input = "z"
prompt_model = "DeepSeek-R1:latest"
save_user_input("TylerTwohig", user_input, prompt_model)

True

In [97]:
create_all_input_embeddings("TylerTwohig")

Updates written to file


In [100]:
load_input_embeddings("TylerTwohig")

[{'uid': 'd7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592',
  'embedding': array([ 3.54968309e-02,  6.12862259e-02,  5.26920967e-02,  7.07050711e-02,
          3.31013724e-02, -3.06695681e-02,  6.62064692e-03, -6.11833110e-02,
         -1.32599648e-03,  1.06456354e-02,  3.86499465e-02,  3.99531834e-02,
         -3.83675732e-02, -1.66688077e-02, -5.61557757e-03, -2.43558567e-02,
         -3.59968878e-02, -3.02429292e-02,  5.84699847e-02, -4.94961441e-02,
         -7.72955045e-02, -5.23877069e-02,  2.45272014e-02,  2.93106139e-02,
         -7.39092082e-02, -2.49591768e-02, -6.53142333e-02, -4.28864323e-02,
          7.11656213e-02, -1.13819472e-01, -1.26593551e-02,  3.96260880e-02,
         -2.10035760e-02,  1.78064331e-02, -3.18874940e-02, -9.11229476e-02,
          5.91224693e-02, -7.30399555e-03,  3.31367590e-02,  2.99061500e-02,
          4.21688482e-02, -1.69130042e-02, -4.50015664e-02,  2.96744686e-02,
         -9.92585197e-02,  5.32891788e-02, -7.64785260e-02, -1.

In [150]:
text = docx2txt.process("Youtube Recommendations.docx")

In [75]:
text

[{'text': '1.', 'start': 0, 'end': 2},
 {'text': 'Introduction: The problem of data gathering — Asymmetries of power and knowledge What is obfuscation?',
  'start': 3,
  'end': 104},
 {'text': 'Supermarkets and grocery chains have always been in the data business, as well as the food business: with small profit margins and a product that can quickly spoil, they pay close attention to inventory, purchasing patterns, and geography.',
  'start': 105,
  'end': 344},
 {'text': 'The introduction of store “loyalty cards” perfectly fit a decades–long pattern: rewarding loyal customers with additional discounts in return for better data, which could inform mailings, coupon campaigns, even which products to shelve together.',
  'start': 345,
  'end': 590},
 {'text': 'So far, so normal — but the appearance of “loyalty cards,” with their rather sinister Orwellian name, and direct connection of data collection with access to sales and discounts, sparked a strange revolt.',
  'start': 591,
  'end': 