In [3]:
import csv
import os
import datetime as datetime
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import hashlib
from sentence_transformers import SentenceTransformer
from pathlib import Path
import pickle

CMC Act Taxonomy (Herring, Das & Penumarthy, updated 2024 — 18 acts + 2 meta-acts)

In [4]:
def get_iso_time():
    """Return the current UTC time in two forms.

    Returns:
        tuple: ``(iso_text, epoch_ms)`` where ``iso_text`` is an ISO-8601
        string with millisecond precision whose ``+00:00`` offset is written
        as ``Z``, and ``epoch_ms`` is the same instant as integer
        milliseconds since the Unix epoch.
    """
    utc_now = datetime.now(timezone.utc)
    iso_text = utc_now.isoformat(timespec="milliseconds").replace("+00:00", "Z")
    epoch_ms = int(utc_now.timestamp() * 1000)

    return iso_text, epoch_ms

In [5]:
def get_uid(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` encoded as UTF-8.

    The same text always yields the same 64-character lowercase hex string.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

In [6]:
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# reuse an already-loaded model instead of loading it again.
_EMBEDDING_MODELS = {}


def create_vector_embedding(text, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode ``text`` into a vector with a SentenceTransformer model.

    The model named by ``embedding_model`` is instantiated on first use and
    kept in a module-level cache; later calls with the same name reuse it.

    Args:
        text: Value passed straight to ``model.encode``.
        embedding_model: Name or path handed to ``SentenceTransformer``.

    Returns:
        Whatever ``model.encode(text, normalize_embeddings=True)`` returns.
    """
    model = _EMBEDDING_MODELS.get(embedding_model)
    if model is None:
        model = SentenceTransformer(embedding_model)
        _EMBEDDING_MODELS[embedding_model] = model

    return model.encode(text, normalize_embeddings=True) #return vector

In [12]:
def check_existing_input(user_input, prompt_model, filename):
    """Find history rows where the same input was already used on the same model.

    Args:
        user_input: Input text to look for in the ``user_input`` column.
        prompt_model: Model name to look for in the ``prompt_model`` column.
        filename: Path of the input-history CSV.

    Returns:
        A DataFrame (copy) of the rows where both columns match, or ``None``
        when the file does not exist or no row matches.
    """
    if not os.path.isfile(filename):
        return None

    # Read the two key columns as raw text and turn off NA detection. With
    # pandas' default inference an input such as "123" is parsed as a number
    # and "NA"/"null" as NaN, so comparing against the string would never match.
    df = pd.read_csv(
        filename,
        dtype={"user_input": str, "prompt_model": str},
        keep_default_na=False,
    )

    is_duplicate = (df['prompt_model'] == prompt_model) & (df['user_input'] == user_input)
    matches = df.loc[is_duplicate].copy()

    return matches if not matches.empty else None

In [22]:
def save_input_embedding(user_hash, user_input, input_uid, embedding_model='sentence-transformers/all-MiniLM-L6-v2'):
    """Embed ``user_input`` and append it to the user's pickled embedding list.

    The store is ``<user_hash>_InputEmbeddings.pkl`` and holds a list of
    ``{"uid": ..., "embedding": ...}`` dicts. The updated list is written to a
    temporary file and then swapped in with ``os.replace``, so an interrupted
    write cannot truncate the embeddings saved so far.

    Args:
        user_hash: Prefix of the embeddings file name.
        user_input: Text passed to ``create_vector_embedding``.
        input_uid: Identifier stored under ``"uid"`` next to the embedding.
        embedding_model: Model name passed to ``create_vector_embedding``.
    """
    filename = user_hash+"_InputEmbeddings.pkl"
    temp_filename = filename+".tmp"
    embedding = create_vector_embedding(user_input, embedding_model)

    if os.path.isfile(filename):
        # NOTE: pickle.load can execute arbitrary code while loading; this is
        # only appropriate for files this notebook wrote itself.
        with open(filename, "rb") as f:
            data = pickle.load(f)
    else:
        data = []

    data.append({
        "uid" : input_uid,
        "embedding" : embedding
    })

    try:
        with open(temp_filename, "wb") as f:
            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
        # Swap the finished file into place in one step.
        os.replace(temp_filename, filename)
    except BaseException:
        # Do not leave a partial temp file behind; the old store is untouched.
        if os.path.isfile(temp_filename):
            os.remove(temp_filename)
        raise

In [63]:
def create_all_input_embeddings(user, embedding_model='sentence-transformers/all-MiniLM-L6-v2'):
    """Create embeddings for every history row of ``user`` that lacks one.

    Reads ``<hash>_InputHistory.csv`` (hash from ``get_uid(user)``), calls
    ``save_input_embedding`` for each row whose ``embedding_exists`` is 0, sets
    that row's ``embedding_exists`` to 1 and ``embedding_model`` to the model
    used, then rewrites the CSV once through a temporary file and
    ``os.replace``.

    Args:
        user: User name; hashed with ``get_uid`` to build the file names.
        embedding_model: Model name passed to ``save_input_embedding``.

    Returns:
        ``False`` when the history file does not exist; otherwise ``None``.
    """
    user_hash = get_uid(user)
    filename = user_hash+"_InputHistory.csv"
    temp_file = user_hash+"_InputHistory_temp.csv"
    changed = False

    if not os.path.isfile(filename):
        print(f"File does not exist for user: {user}")
        return False

    # Text columns are read as raw strings with NA detection turned off.
    # Otherwise inputs such as "007", "NA" or "null" are parsed as numbers or
    # NaN and would be embedded and written back in altered form.
    df = pd.read_csv(
        filename,
        dtype={
            "user_input" : str,
            "input_uid" : str,
            "prompt_model" : str,
            "embedding_exists" : "int64",
            "embedding_model" : "string"
        },
        keep_default_na=False
    )

    pending = df.index[df['embedding_exists'] == 0]

    try:
        for index in pending:
            save_input_embedding(
                user_hash,
                df.at[index, 'user_input'],
                df.at[index, 'input_uid'],
                embedding_model
            )
            df.at[index, 'embedding_exists'] = 1
            df.at[index, 'embedding_model'] = embedding_model
            changed = True
    finally:
        # Write once, and also when a later row fails: embeddings already
        # appended to the store must stay flagged so they are not created twice.
        if changed:
            df.to_csv(temp_file, index=False)
            os.replace(temp_file, filename)
            print("Updates written to file")

        else:
            print("No updates written")

In [9]:
def load_input_embeddings(user):
    """Load the pickled embedding records stored for ``user``.

    The file name is ``<hash>_InputEmbeddings.pkl`` with the hash taken from
    ``get_uid(user)``. A missing file raises ``FileNotFoundError``.

    Returns:
        The object unpickled from that file.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    store_name = f"{get_uid(user)}_InputEmbeddings.pkl"
    with open(store_name, mode="rb") as handle:
        records = pickle.load(handle)
    return records

In [59]:
def save_user_input(user, user_input, prompt_model):
    """Append one input to the user's history CSV unless it is a repeat.

    The history file is ``<hash>_InputHistory.csv`` (hash from
    ``get_uid(user)``); it is created with a header row when missing. If
    ``check_existing_input`` finds the same input/model pair, the time of the
    first match is printed in UTC and nothing is written.

    Args:
        user: User name; hashed to build the file name.
        user_input: Input text to record; its hash is stored as ``input_uid``.
        prompt_model: Name of the model the input was used with.

    Returns:
        ``True`` when a new row was appended, ``False`` for a repeated input.
    """
    user_hash = get_uid(user)
    input_uid = get_uid(user_input)
    filename = user_hash+"_InputHistory.csv"

    if not os.path.isfile(filename):
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([
                'user_input',
                'input_uid',
                'prompt_model',
                'timestamp_iso',
                'timestamp_ms',
                'embedding_exists',
                'embedding_model',
                'processed'
            ])

    existing_row = check_existing_input(user_input, prompt_model, filename)

    if existing_row is not None:
        first_used_ms = existing_row['timestamp_ms'].iloc[0]
        # timestamp_ms comes from get_iso_time(), i.e. UTC; show it as an aware
        # UTC datetime instead of converting it to unlabeled local time.
        first_used = datetime.fromtimestamp(first_used_ms / 1000, tz=timezone.utc)
        print(f"Input already used at: {first_used}")
        return False

    timestamp_iso, timestamp_ms = get_iso_time()

    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([
            user_input,
            input_uid,
            prompt_model,
            timestamp_iso,
            timestamp_ms,
            0, #flag for if a vector embedding exists for the text, default=0
            "None", #placeholder for vector embedding model used
            0 #flag for if the text has been processed by LLM
        ])

    return True

In [64]:
# Demo: record one prompt for a sample user. The call evaluates to True when
# the row is stored and False when the same input/model pair was already saved.
user = "TylerTwohig"
prompt_model = "DeepSeek-R1:latest"
user_input = "The quick brown fox jumps over the lazy dog"
save_user_input(user, user_input, prompt_model)

True

CMC Act Taxonomy (Herring, Das & Penumarthy, updated 2024 — 18 acts + 2 meta-acts)

In [65]:
# Backfill embeddings for every history row of the sample user that lacks one.
create_all_input_embeddings(user="TylerTwohig")

Updates written to file
