#  Prompt Understanding & Optimization (Proof of Concept)

This notebook:
- Parses a user prompt
- Assesses its linguistic complexity
- Generates a more concise or semantically similar version
- (Optionally) compares semantic similarity using sentence embeddings

Future integration:
- Use as a backend service for Sustainable AI Prompt Optimizer UI.


In [2]:
#Imports & basic setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

# NLP / embeddings
from sentence_transformers import SentenceTransformer

# For simplification using T5 (can switch to GPT-2 or API later)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Linguistic processing
import re
import nltk
from nltk.corpus import stopwords

# Ensure NLTK resources are ready
nltk.download("punkt")
nltk.download("stopwords")

plt.rcParams["figure.figsize"] = (6, 4)





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joseg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joseg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#Prompt parsing & basic text utilities
#Goal: clean text, tokenize, and get basic stats.

STOPWORDS = set(stopwords.words("english"))

def normalize_whitespace(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip()

def tokenize_words(text: str):
    # simple word split; you can swap in nltk.word_tokenize if needed
    text = text.lower()
    tokens = re.findall(r"[a-zA-Z']+", text)
    return tokens

def split_sentences(text: str):
    # very simple sentence splitter
    sentences = re.split(r"[.!?]+", text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences


In [4]:
#Linguistic complexity metrics
#We want: length, type–token ratio, stopword ratio, avg sentence length, etc.

def compute_complexity_features(prompt: str) -> dict:
    text = normalize_whitespace(prompt)
    tokens = tokenize_words(text)
    sentences = split_sentences(text)

    n_tokens = len(tokens)
    n_types = len(set(tokens)) if tokens else 0
    type_token_ratio = n_types / n_tokens if n_tokens > 0 else 0.0

    n_stopwords = sum(1 for t in tokens if t in STOPWORDS)
    stopword_ratio = n_stopwords / n_tokens if n_tokens > 0 else 0.0

    avg_sentence_len = np.mean([len(tokenize_words(s)) for s in sentences]) if sentences else 0.0

    features = {
        "char_count": len(prompt),
        "token_count": n_tokens,
        "unique_token_count": n_types,
        "type_token_ratio": type_token_ratio,
        "stopword_ratio": stopword_ratio,
        "sentence_count": len(sentences),
        "avg_sentence_len": avg_sentence_len,
    }
    return features

# quick smoke test
sample = "Could you please help me rewrite this very long and slightly redundant prompt, so it becomes shorter but keeps the same meaning?"
compute_complexity_features(sample)


{'char_count': 128,
 'token_count': 22,
 'unique_token_count': 22,
 'type_token_ratio': 1.0,
 'stopword_ratio': 0.45454545454545453,
 'sentence_count': 1,
 'avg_sentence_len': np.float64(22.0)}

In [5]:
#Load a lightweight sentence transformer model
#Use sentence-transformers for embeddings, e.g. all-MiniLM-L6-v2.

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(embedding_model_name)

def get_embedding(text: str) -> np.ndarray:
    emb = embedder.encode([text], convert_to_numpy=True, normalize_embeddings=True)
    return emb[0]

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    return float(cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [7]:
#T5-based prompt simplifier
#We’ll use a small T5 model (you can swap models later). Prompt pattern: "summarize:" or "paraphrase:".
simplifier_model_name = "t5-small"  # can be upgraded later

t5_tokenizer = AutoTokenizer.from_pretrained(simplifier_model_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(simplifier_model_name)

def simplify_prompt(prompt: str,
                    max_length: int = 64,
                    num_beams: int = 4) -> str:
    """
    Uses a T5 model to generate a shorter / simplified version of the prompt.
    """
    input_text = f"summarize: {prompt}"
    inputs = t5_tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
    )

    output_ids = t5_model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    simplified = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return simplified.strip()


In [8]:
#Semantic similarity between original & simplified prompt
#Now combine embeddings + T5 to check “did we keep the meaning?”

def evaluate_simplification(original: str, simplified: str) -> dict:
    # complexity before and after
    orig_features = compute_complexity_features(original)
    simp_features = compute_complexity_features(simplified)

    # embeddings + cosine similarity
    emb_orig = get_embedding(original)
    emb_simp = get_embedding(simplified)
    sim = cosine_sim(emb_orig, emb_simp)

    return {
        "original": original,
        "simplified": simplified,
        "semantic_similarity": sim,
        "original_features": orig_features,
        "simplified_features": simp_features,
    }


In [9]:
#End-to-end “optimize_prompt” pipeline
#This function is what you’ll eventually call from a UI or API.
def optimize_prompt(user_prompt: str) -> dict:
    """
    Given a user prompt:
    - Compute its complexity
    - Suggest a simplified version
    - Compute semantic similarity between original & simplified
    """
    original = normalize_whitespace(user_prompt)

    # 1) Complexity metrics
    complexity = compute_complexity_features(original)

    # 2) Simplified version
    simplified = simplify_prompt(original)

    # 3) Semantic similarity & delta in complexity
    result = evaluate_simplification(original, simplified)

    return {
        "complexity": complexity,
        "simplified_prompt": simplified,
        "semantic_similarity": result["semantic_similarity"],
        "simplified_complexity": result["simplified_features"],
    }


In [10]:
test_prompt = """
I would like you to please generate a detailed, step-by-step explanation of how to deploy
a medium-sized transformer model to production, including all possible configuration options,
even if some of them are not strictly necessary for a basic deployment.
"""

result = optimize_prompt(test_prompt)

print("=== Original Prompt ===")
print(normalize_whitespace(test_prompt))
print("\n=== Simplified Prompt ===")
print(result["simplified_prompt"])
print("\n=== Semantic Similarity ===")
print(f"{result['semantic_similarity']:.3f}")

print("\n=== Original Complexity ===")
for k, v in result["complexity"].items():
    print(f"{k:20s}: {v}")

print("\n=== Simplified Complexity ===")
for k, v in result["simplified_complexity"].items():
    print(f"{k:20s}: {v}")


=== Original Prompt ===
I would like you to please generate a detailed, step-by-step explanation of how to deploy a medium-sized transformer model to production, including all possible configuration options, even if some of them are not strictly necessary for a basic deployment.

=== Simplified Prompt ===
I would like you to generate a detailed, step-by-step explanation of how to deploy a medium-sized transformer model to production, including all possible configuration options. some of the options are not strictly necessary for a basic deployment.

=== Semantic Similarity ===
0.994

=== Original Complexity ===
char_count          : 255
token_count         : 42
unique_token_count  : 36
type_token_ratio    : 0.8571428571428571
stopword_ratio      : 0.4523809523809524
sentence_count      : 1
avg_sentence_len    : 42.0

=== Simplified Complexity ===
char_count          : 247
token_count         : 40
unique_token_count  : 33
type_token_ratio    : 0.825
stopword_ratio      : 0.45
sentence_c

In [11]:
#Batch evaluation & clustering
#If we need to add the “clustering” part from our spec:
from sklearn.cluster import KMeans

def cluster_prompts(prompts, n_clusters=3):
    embs = embedder.encode(prompts, convert_to_numpy=True, normalize_embeddings=True)
    km = KMeans(n_clusters=n_clusters, random_state=42)
    labels = km.fit_predict(embs)
    return labels, km

# Example usage:
prompt_list = [
    "Explain gradient descent in simple terms.",
    "Summarize the concept of backpropagation.",
    "How do I cook pasta?",
    "Give me tips to optimize a neural network.",
    "Best way to boil spaghetti?"
]

labels, model_km = cluster_prompts(prompt_list, n_clusters=2)
pd.DataFrame({"prompt": prompt_list, "cluster": labels})


Unnamed: 0,prompt,cluster
0,Explain gradient descent in simple terms.,0
1,Summarize the concept of backpropagation.,0
2,How do I cook pasta?,1
3,Give me tips to optimize a neural network.,0
4,Best way to boil spaghetti?,1


In [12]:
#We can save complexity results, or later export a small CSV of prompts before/after optimization.

def log_optimization(user_prompt: str, path: str = "prompt_optimization_log.csv"):
    res = optimize_prompt(user_prompt)
    row = {
        "original_prompt": normalize_whitespace(user_prompt),
        "simplified_prompt": res["simplified_prompt"],
        "semantic_similarity": res["semantic_similarity"],
        **{f"orig_{k}": v for k, v in res["complexity"].items()},
        **{f"simp_{k}": v for k, v in res["simplified_complexity"].items()},
    }
    df_row = pd.DataFrame([row])

    try:
        df_existing = pd.read_csv(path)
        df = pd.concat([df_existing, df_row], ignore_index=True)
    except FileNotFoundError:
        df = df_row

    df.to_csv(path, index=False)
    return df

# test log
log_optimization("Please rewrite this extremely verbose and redundant prompt into something shorter.")


Unnamed: 0,original_prompt,simplified_prompt,semantic_similarity,orig_char_count,orig_token_count,orig_unique_token_count,orig_type_token_ratio,orig_stopword_ratio,orig_sentence_count,orig_avg_sentence_len,simp_char_count,simp_token_count,simp_unique_token_count,simp_type_token_ratio,simp_stopword_ratio,simp_sentence_count,simp_avg_sentence_len
0,Please rewrite this extremely verbose and redu...,rewrite this extremely verbose and redundant p...,0.965815,82,11,11,1.0,0.272727,1,11.0,75,10,10,1.0,0.3,1,10.0
