JSON dataset

In [1]:
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pandas as pd
#spacy
import spacy
from nltk.corpus import stopwords

#visualization
import pyLDAvis
import pyLDAvis.gensim
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from bertopic import BERTopic
import re
# NOTE: this rebinds the name `stopwords` from the object imported from
# nltk.corpus to the result of `.words("english")`; after this line the
# corpus object is no longer reachable under this name.
stopwords = stopwords.words("english")

In [2]:
def load_data(file):
    """Read the UTF-8 encoded JSON file at `file` and return the parsed object."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def write_data(file, data):
    """Serialize `data` as JSON (4-space indentation) into the UTF-8 file `file`.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [3]:
# for item in data:
#     if isinstance(item["Description"], str):
#         original_text = item["Description"]
#         # Remove unwanted tags and patterns
#         cleaned_text = remove_tags(original_text)
#         # Apply lemmatization on the cleaned text
#         lemmatized_text = lemmatization(cleaned_text)
#         # Tokenize the lemmatized text
#         tokenized_words = gen_words(lemmatized_text)
#         # Update "Description" with the joined tokenized words
#         item["Description_new"] = " ".join(tokenized_words)
# write_data("data_processed.json", data)
# data = load_data("data_processed.json")
# for item in data:
#     Loop over possible comment fields
#     for i in range(20):  # From Comment to Comment.19
#         comment_key = "Comment" if i == 0 else f"Comment.{i}"
#         if isinstance(item.get(comment_key), str):
#             comment_text = item[comment_key]
#             cleaned_comment = remove_tags(comment_text)
#             lemmatized_comment = lemmatization(cleaned_comment)
#             tokenized_comment = gen_words(lemmatized_comment)
#             item[comment_key] = " ".join(tokenized_comment)
# write_data("data_processed.json", data)
#Extract values and collect unique ones using a set
# unique_categories = set()

# for item in data:
#     if "Custom field (Request Category)" in item:
#         unique_categories.add(item["Custom field (Request Category)"])

# # Print the unique values
# for value in unique_categories:
#     print(value)


# Extract values and collect unique ones using a set
# unique_categories = set()

# for item in data:
#     if "Custom field (Cause of issue)" in item:
#         unique_categories.add(item["Custom field (Cause of issue)"])

# # Print the unique values
# for value in unique_categories:
#     print(value)


In [30]:
# Helper functions
# preprocessing for LDA
# Cache of loaded spaCy pipelines, keyed by model name, so the (slow) model
# load happens once per kernel instead of once per document.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for `model_name`, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        # Parser and NER are disabled: only lemmas and POS tags are read below.
        _SPACY_PIPELINES[model_name] = spacy.load(model_name, disable=["parser", "ner"])
    return _SPACY_PIPELINES[model_name]


def lemmatization(text, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize `text`, keeping only tokens whose POS tag is allowed.

    Args:
        text: Raw text to process.
        allowed_postags: Iterable of coarse POS tags (`token.pos_` values) to
            keep. The default is an immutable tuple so it cannot be altered
            between calls.

    Returns:
        A single string of the kept lemmas joined by spaces.
    """
    nlp = _get_spacy_pipeline()
    allowed = set(allowed_postags)
    return " ".join(token.lemma_ for token in nlp(text) if token.pos_ in allowed)

def gen_words(text):
    """Tokenize `text` with gensim's `simple_preprocess`, called with `deacc=True`."""
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return tokens
import re

def remove_tags(text):
    """Drop every whitespace-separated token that looks like markup or contains a digit.

    A token is discarded when it contains any of the listed special characters
    or at least one digit; the surviving tokens are re-joined with single spaces.
    """
    special_chars = {'{', '[', '(', '*', '!', '/', '"', '\\', ':', ';', '<'}

    def is_clean(token):
        # Reject on the first special character, otherwise reject on any digit.
        if not special_chars.isdisjoint(token):
            return False
        return re.search(r'\d', token) is None

    return ' '.join(token for token in text.split() if is_clean(token))
  
    

# preprocessing for bertopic


def clean_for_bertopic(text):
    """Normalize a ticket text for BERTopic.

    Non-string input yields "". Otherwise a fixed sequence of regex removals is
    applied (reply headers, signature fragments, pipes, bracketed spans, URLs,
    digits), whitespace is collapsed, and only purely alphabetic tokens of at
    least three characters are kept, lower-cased.
    """
    if not isinstance(text, str):
        return ""

    # Removal patterns, applied strictly in this order.
    removal_patterns = (
        r'On .* wrote:',                      # quoted-reply header
        # NOTE(review): lazy `.*?` followed by an optional group means this only
        # strips the name itself -- confirm whether more was intended.
        r'(?i)Yvonne Brown.*?(Director)?',
        r'\*P\.*.*?\d{3}-\d{3}-\d{4}',        # "*P..." up to a phone number
        r'Capital Area Food Bank.*',          # organization name to end of line
        r'\|',
        r'\[.*?\]',
        r'https?://\S+',
        r'\d+',                               # all digits
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize and filter
    kept = []
    for token in text.split():
        if len(token) < 3 or not token.isalpha():
            continue
        kept.append(token.lower())

    return ' '.join(kept)

def combine_description_and_comments(item, max_comments=20):
    """Concatenate a ticket's description with all of its string comments.

    Args:
        item: Ticket mapping with an optional "Description" key and comment
            keys named "Comment", "Comment.1", ..., "Comment.<max_comments-1>".
        max_comments: Number of comment fields to inspect (default 20, i.e.
            "Comment" through "Comment.19").

    Returns:
        The description, one space, then the comments joined by single spaces.
        A description that is missing or not a string (for example null) is
        treated as empty instead of raising TypeError; non-string comments are
        skipped.
    """
    description = item.get("Description")
    base_text = description if isinstance(description, str) else ""

    comments = []
    for i in range(max_comments):
        key = "Comment" if i == 0 else f"Comment.{i}"
        comment = item.get(key)
        if isinstance(comment, str):
            comments.append(comment)

    return base_text + " " + " ".join(comments)
def get_top_words(topic_id, model, top_n=10):
    """Format the leading words of a topic as "word:weight" pairs.

    Args:
        topic_id: Identifier passed to `model.get_topic`.
        model: Object exposing `get_topic(topic_id)` that yields a sequence of
            (word, weight) pairs.
        top_n: Maximum number of pairs to include.

    Returns:
        A comma-separated string such as "order:0.123, late:0.045", or "" when
        the model has no representation for the topic.
    """
    topic = model.get_topic(topic_id)
    # Any falsy result (None, False, empty list) means "no words for this
    # topic"; checking only `is None` let a False result reach the slice below.
    if not topic:
        return ""
    return ", ".join(f"{word}:{weight:.3f}" for word, weight in topic[:top_n])

def save_bertopic_to_csv(model, docs, topics, filename="bertopic_summary.csv", top_n_words=10):
    """Write a per-topic summary of a fitted topic model to a CSV file.

    Args:
        model: Fitted model exposing `get_topic_info()` (a DataFrame with a
            "Topic" column) and `get_topic()`.
        docs: Documents the model was fitted on.
        topics: Topic id assigned to each document, aligned with `docs`.
        filename: Destination CSV path.
        top_n_words: Number of "word:weight" pairs stored per topic. This value
            is now forwarded to `get_top_words`; before, it was accepted but
            ignored.

    Side effects:
        Writes `filename` and prints a confirmation line.
    """
    # Topic summary info
    df_info = model.get_topic_info()

    # Add top words with weights (topic -1 gets an empty string)
    df_info["Top_Words"] = df_info["Topic"].apply(
        lambda x: get_top_words(int(x), model, top_n=top_n_words) if x != -1 else ""
    )

    # Add one example document per topic: the first document assigned to it
    df_docs = pd.DataFrame({"Document": docs, "Topic": topics})
    sample_docs = df_docs.groupby("Topic")["Document"].first().reset_index()
    df_info = df_info.merge(sample_docs, how="left", on="Topic")

    # Save to CSV
    df_info.to_csv(filename, index=False)
    print(f"Saved BERTopic summary to {filename}")
import numpy as np

def filter_low_tfidf_words(docs, threshold_percentile=30):
    """Drop, per document, the words whose TF-IDF score is below a percentile.

    Args:
        docs: Sequence of whitespace-tokenizable document strings.
        threshold_percentile: Percentile (0-100) of each document's non-zero
            TF-IDF scores; words scoring below it are removed.

    Returns:
        A list of filtered document strings, aligned with `docs`. A document
        with no non-zero TF-IDF score (for example an empty string) is returned
        unchanged instead of failing on the percentile of an empty array.
    """
    vectorizer = TfidfVectorizer()
    tfidf_scores = vectorizer.fit_transform(docs).toarray()
    feature_names = np.array(vectorizer.get_feature_names_out())

    filtered_docs = []
    for doc, doc_scores in zip(docs, tfidf_scores):
        non_zero_scores = doc_scores[doc_scores > 0]
        if non_zero_scores.size == 0:
            # Nothing to rank in this document; keep it as-is.
            filtered_docs.append(doc)
            continue

        threshold = np.percentile(non_zero_scores, threshold_percentile)
        # A set gives constant-time membership tests in the filter below.
        selected_words = set(feature_names[doc_scores >= threshold])
        filtered_docs.append(" ".join(word for word in doc.split() if word in selected_words))

    return filtered_docs
def filter_low_tfidf_words_justdescripton(docs, threshold_percentile=30):
    """Remove low-TF-IDF words from each document.

    For every document, the `threshold_percentile`-th percentile of its
    non-zero TF-IDF scores is used as a cut-off; only words whose score reaches
    the cut-off are kept. Documents without any non-zero score are passed
    through unchanged. The result is a list aligned with `docs`.
    """
    tfidf = TfidfVectorizer()
    score_matrix = tfidf.fit_transform(docs).toarray()
    vocabulary = np.array(tfidf.get_feature_names_out())

    results = []
    for row_idx, row in enumerate(score_matrix):
        positive = row[row > 0]

        if len(positive) != 0:
            cutoff = np.percentile(positive, threshold_percentile)
            kept_vocab = set(vocabulary[np.where(row >= cutoff)[0]])
            kept_tokens = [tok for tok in docs[row_idx].split() if tok in kept_vocab]
            results.append(" ".join(kept_tokens))
        else:
            # No scored words at all: keep the original document untouched.
            results.append(docs[row_idx])

    return results

In [31]:
# dataset prepared for bertopic
data_bertopic = load_data("data_case2-1.json")

# Every free-text field of a ticket: Description, Comment, Comment.1, ..., Comment.19
text_fields = ["Description"] + ["Comment" if i == 0 else f"Comment.{i}" for i in range(20)]

for ticket in data_bertopic:
    for key in text_fields:
        # Only string values are cleaned; missing or non-string fields stay untouched.
        if key in ticket and isinstance(ticket[key], str):
            ticket[key] = remove_tags(ticket[key])

write_data("data_bertopic.json", data_bertopic)

BERTopic

In [25]:
# Hyperparameters
# Passed to BERTopic as `embedding_model` in the fitting cell.
embedding_model_name = "all-MiniLM-L6-v2"
# Passed to BERTopic as `min_topic_size`; also embedded in the output CSV filename.
min_topic_size = 3

Using only the ticket descriptions

In [54]:
# Load the tag-stripped tickets written to data_bertopic.json by the preprocessing cell.
data = load_data("data_bertopic.json")
# Keep only tickets whose "Description" is a string; all other tickets are skipped.
data_all_desc = [item["Description"] for item in data if isinstance(item.get("Description"), str)]

# TF-IDF filtering is currently disabled; the model is fitted on the unfiltered descriptions.
# docs_descriptions = filter_low_tfidf_words_justdescripton(data_all_desc)
# NOTE(review): no random seed is set anywhere in this cell -- confirm whether
# topic assignments are reproducible across runs.
topic_model_all_descriptions = BERTopic(min_topic_size=min_topic_size, embedding_model=embedding_model_name)
topics_all_descriptions, probs_all_descriptions = topic_model_all_descriptions.fit_transform(data_all_desc)
# Persist the topic summary; the filename encodes the min_topic_size used.
save_bertopic_to_csv(
    model=topic_model_all_descriptions,
    docs=data_all_desc,
    topics=topics_all_descriptions,
    filename=f"bertopic_onlydescriptions_processeddata_min_topic_{min_topic_size}.csv"
)

Saved BERTopic summary to bertopic_onlydescriptions_processeddata_min_topic_3.csv


Use a local LLM (TinyLlama via llama.cpp) to generate meaningful topic labels

In [None]:
%%bash
# Install cmake, pkg-config and openblas with Homebrew (macOS).
# Presumably these are the build prerequisites for llama.cpp, which is compiled in the next cell.
brew install cmake pkg-config openblas



In [None]:
%%bash
# Build llama.cpp from the local checkout in ./llama.cpp (the directory must already exist).
cd llama.cpp
# LLAMA_METAL=1 is passed to make as a build variable; presumably it enables the
# Metal backend (the later run log reports "using device Metal") -- confirm.
make LLAMA_METAL=1


In [1]:
import os

from huggingface_hub import login

# Never store the access token in the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, token=None lets
# login() fall back to its own prompt rather than sending a placeholder string.
login(token=os.environ.get("HF_TOKEN"))



  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import pandas as pd
from llama_cpp import Llama
import re

def parse_top_words(text):
    """Extract the words from a "word:weight, word:weight" string.

    Non-string input gives an empty list. Comma-separated pieces without a
    colon are ignored; for the others, the text before the first colon is
    returned with surrounding whitespace removed.
    """
    if not isinstance(text, str):
        return []

    words = []
    for piece in text.split(','):
        if ':' not in piece:
            continue
        words.append(piece.split(':')[0].strip())
    return words

import os

# Load your BERTopic CSV data
df = pd.read_csv("bertopic_onlydescriptions_processeddata_min_topic_3.csv")  # Adjust path if needed

# Location of the GGUF model file. Set the LLAMA_MODEL_PATH environment variable
# to run on another machine; the previous hard-coded path stays as the default
# so existing setups keep working.
LLAMA_MODEL_PATH = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/alifehmiyildiz/Desktop/GitHub/FoodTicketAI/llama.cpp/build/tinyllama.gguf",
)

# Initialize llama.cpp
llm = Llama(model_path=LLAMA_MODEL_PATH, n_ctx=2048)

def generate_topic_label(keywords, docs, max_tokens=32, max_doc_chars=2000):
    """Ask the module-level `llm` for a short label describing a topic.

    Args:
        keywords: Topic keywords, joined with ", " in the prompt.
        docs: Example documents for the topic. At most the first two are used;
            an empty sequence is accepted and produces a prompt without
            example bullets (previously this raised IndexError).
        max_tokens: Generation limit forwarded to `llm`.
        max_doc_chars: Each example is cut to this many characters so a long
            ticket cannot push the prompt past the model's context window.
            Pass None to disable truncation.

    Returns:
        The model output up to the first newline, period, or colon, stripped.
    """
    # Only bullets for documents that exist; no dangling empty "- " line.
    examples = "\n".join(f"- {str(doc)[:max_doc_chars]}" for doc in list(docs)[:2])

    prompt = f"""Return only a short 1 to 3 word topic label.

Keywords:
{', '.join(keywords)}

Examples:
{examples}

Topic Label:"""

    response = llm(prompt, max_tokens=max_tokens)
    output = response['choices'][0]['text'].strip()

    # Only take the first line or sentence before a newline, period, or colon
    clean_label = re.split(r"[\n\.\:]", output)[0].strip()

    return clean_label

import ast

# List to store results
topic_id_label_pairs = []

for _, row in df.iterrows():
    keywords = parse_top_words(row['Top_Words'])

    # The CSV stores Representative_Docs as the text of a Python list literal.
    # ast.literal_eval parses literals only, so file content is never executed
    # as code (unlike eval); malformed cells fall back to "no documents".
    raw_docs = row['Representative_Docs']
    try:
        docs = ast.literal_eval(raw_docs) if isinstance(raw_docs, str) else []
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        docs = []
    if not isinstance(docs, (list, tuple)):
        # A parsable value that is not a sequence of documents is unusable here.
        docs = []

    if len(keywords) == 0 or len(docs) == 0:
        label = "[insufficient data]"
    else:
        label = generate_topic_label(keywords, docs)

    print(f"Topic {row['Topic']} → {label}")
    topic_id_label_pairs.append({'Topic': row['Topic'], 'Generated_Label': label})

# Convert to DataFrame for display/export
output_df = pd.DataFrame(topic_id_label_pairs)

# Optional: Save to CSV if needed
output_df.to_csv("topic_id_label_only.csv", index=False)

# Optional: Display just the result
print("\nGenerated Topic Labels:")
print(output_df)


llama_model_load_from_file_impl: using device Metal (Apple M1) - 10916 MiB free
llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from /Users/alifehmiyildiz/Desktop/GitHub/FoodTicketAI/llama.cpp/build/tinyllama.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: -

Topic -1 → [insufficient data]


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     772.15 ms /    98 tokens (    7.88 ms per token,   126.92 tokens per second)
llama_perf_context_print:        eval time =     419.35 ms /    31 runs   (   13.53 ms per token,    73.92 tokens per second)
llama_perf_context_print:       total time =    1195.10 ms /   129 tokens
Llama.generate: 20 prefix-match hit, remaining 66 prompt tokens to eval


Topic 0 → Delivery


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     585.98 ms /    66 tokens (    8.88 ms per token,   112.63 tokens per second)
llama_perf_context_print:        eval time =     425.62 ms /    31 runs   (   13.73 ms per token,    72.83 tokens per second)
llama_perf_context_print:       total time =    1014.99 ms /    97 tokens
Llama.generate: 20 prefix-match hit, remaining 142 prompt tokens to eval


Topic 1 → Order Produce Pound Limit Last Sales


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     762.41 ms /   142 tokens (    5.37 ms per token,   186.25 tokens per second)
llama_perf_context_print:        eval time =     440.39 ms /    31 runs   (   14.21 ms per token,    70.39 tokens per second)
llama_perf_context_print:       total time =    1206.63 ms /   173 tokens
Llama.generate: 20 prefix-match hit, remaining 88 prompt tokens to eval


Topic 2 → - Add (1), Pallet (2), Potatoes (3), Add (4), Pallets (5), Onions (6), Potatoes


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     597.93 ms /    88 tokens (    6.79 ms per token,   147.17 tokens per second)
llama_perf_context_print:        eval time =     457.32 ms /    31 runs   (   14.75 ms per token,    67.79 tokens per second)
llama_perf_context_print:       total time =    1058.72 ms /   119 tokens
Llama.generate: 20 prefix-match hit, remaining 47 prompt tokens to eval


Topic 3 → "Greeing"


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     512.24 ms /    47 tokens (   10.90 ms per token,    91.75 tokens per second)
llama_perf_context_print:        eval time =     419.88 ms /    31 runs   (   13.54 ms per token,    73.83 tokens per second)
llama_perf_context_print:       total time =     935.47 ms /    78 tokens
Llama.generate: 20 prefix-match hit, remaining 229 prompt tokens to eval


Topic 4 → Cancelled Order


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     971.54 ms /   229 tokens (    4.24 ms per token,   235.71 tokens per second)
llama_perf_context_print:        eval time =     435.16 ms /    31 runs   (   14.04 ms per token,    71.24 tokens per second)
llama_perf_context_print:       total time =    1410.40 ms /   260 tokens
Llama.generate: 20 prefix-match hit, remaining 244 prompt tokens to eval


Topic 5 → Eternal


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     986.07 ms /   244 tokens (    4.04 ms per token,   247.45 tokens per second)
llama_perf_context_print:        eval time =     436.50 ms /    31 runs   (   14.08 ms per token,    71.02 tokens per second)
llama_perf_context_print:       total time =    1426.34 ms /   275 tokens
Llama.generate: 20 prefix-match hit, remaining 180 prompt tokens to eval


Topic 6 → Best By Date


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     812.22 ms /   180 tokens (    4.51 ms per token,   221.62 tokens per second)
llama_perf_context_print:        eval time =     475.34 ms /    31 runs   (   15.33 ms per token,    65.22 tokens per second)
llama_perf_context_print:       total time =    1291.62 ms /   211 tokens
Llama.generate: 20 prefix-match hit, remaining 790 prompt tokens to eval


Topic 7 → Total Amount Applied


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    3135.40 ms /   790 tokens (    3.97 ms per token,   251.96 tokens per second)
llama_perf_context_print:        eval time =      62.16 ms /     4 runs   (   15.54 ms per token,    64.35 tokens per second)
llama_perf_context_print:       total time =    3198.58 ms /   794 tokens
Llama.generate: 20 prefix-match hit, remaining 577 prompt tokens to eval


Topic 8 → Internal Communication


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    2375.17 ms /   577 tokens (    4.12 ms per token,   242.93 tokens per second)
llama_perf_context_print:        eval time =     454.40 ms /    31 runs   (   14.66 ms per token,    68.22 tokens per second)
llama_perf_context_print:       total time =    2833.46 ms /   608 tokens
Llama.generate: 20 prefix-match hit, remaining 734 prompt tokens to eval


Topic 9 → User Account Login, User Account, User Login, User Login Error, User Login Help, User Login Help, User Login Help Center, User Login Support,


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    2872.60 ms /   734 tokens (    3.91 ms per token,   255.52 tokens per second)
llama_perf_context_print:        eval time =     146.73 ms /    10 runs   (   14.67 ms per token,    68.15 tokens per second)
llama_perf_context_print:       total time =    3021.13 ms /   744 tokens
Llama.generate: 20 prefix-match hit, remaining 80 prompt tokens to eval


Topic 10 → Delivery Cancellation and Re-Scheduling


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     601.97 ms /    80 tokens (    7.52 ms per token,   132.90 tokens per second)
llama_perf_context_print:        eval time =     415.89 ms /    31 runs   (   13.42 ms per token,    74.54 tokens per second)
llama_perf_context_print:       total time =    1021.48 ms /   111 tokens
Llama.generate: 20 prefix-match hit, remaining 260 prompt tokens to eval


Topic 11 → Cancelled Order


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1148.99 ms /   260 tokens (    4.42 ms per token,   226.29 tokens per second)
llama_perf_context_print:        eval time =     443.45 ms /    31 runs   (   14.30 ms per token,    69.91 tokens per second)
llama_perf_context_print:       total time =    1596.21 ms /   291 tokens
Llama.generate: 20 prefix-match hit, remaining 1022 prompt tokens to eval


Topic 12 → temporary help center


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    3840.46 ms /  1022 tokens (    3.76 ms per token,   266.11 tokens per second)
llama_perf_context_print:        eval time =     478.27 ms /    31 runs   (   15.43 ms per token,    64.82 tokens per second)
llama_perf_context_print:       total time =    4322.96 ms /  1053 tokens
Llama.generate: 20 prefix-match hit, remaining 238 prompt tokens to eval


Topic 13 → Disability, Food, and Delivery


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     952.29 ms /   238 tokens (    4.00 ms per token,   249.92 tokens per second)
llama_perf_context_print:        eval time =     471.71 ms /    31 runs   (   15.22 ms per token,    65.72 tokens per second)
llama_perf_context_print:       total time =    1427.89 ms /   269 tokens
Llama.generate: 20 prefix-match hit, remaining 105 prompt tokens to eval


Topic 14 → - order


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     633.30 ms /   105 tokens (    6.03 ms per token,   165.80 tokens per second)
llama_perf_context_print:        eval time =     421.04 ms /    31 runs   (   13.58 ms per token,    73.63 tokens per second)
llama_perf_context_print:       total time =    1057.87 ms /   136 tokens
Llama.generate: 20 prefix-match hit, remaining 166 prompt tokens to eval


Topic 15 → This is a short 1 to 3 word topic label that indicates the type of food item requested


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     783.01 ms /   166 tokens (    4.72 ms per token,   212.00 tokens per second)
llama_perf_context_print:        eval time =     424.98 ms /    31 runs   (   13.71 ms per token,    72.94 tokens per second)
llama_perf_context_print:       total time =    1211.48 ms /   197 tokens
Llama.generate: 20 prefix-match hit, remaining 645 prompt tokens to eval


Topic 16 → FRUITS, VEGETABLES, AVAILABLE


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    2608.70 ms /   645 tokens (    4.04 ms per token,   247.25 tokens per second)
llama_perf_context_print:        eval time =     465.82 ms /    31 runs   (   15.03 ms per token,    66.55 tokens per second)
llama_perf_context_print:       total time =    3078.37 ms /   676 tokens
Llama.generate: 20 prefix-match hit, remaining 132 prompt tokens to eval


Topic 17 → Food Expired and Damaged


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     699.27 ms /   132 tokens (    5.30 ms per token,   188.77 tokens per second)
llama_perf_context_print:        eval time =     422.74 ms /    31 runs   (   13.64 ms per token,    73.33 tokens per second)
llama_perf_context_print:       total time =    1125.56 ms /   163 tokens
Llama.generate: 20 prefix-match hit, remaining 467 prompt tokens to eval


Topic 18 → BINS


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1684.71 ms /   467 tokens (    3.61 ms per token,   277.20 tokens per second)
llama_perf_context_print:        eval time =     511.13 ms /    31 runs   (   16.49 ms per token,    60.65 tokens per second)
llama_perf_context_print:       total time =    2199.88 ms /   498 tokens
Llama.generate: 20 prefix-match hit, remaining 69 prompt tokens to eval


Topic 19 → Delivery Status


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     567.03 ms /    69 tokens (    8.22 ms per token,   121.69 tokens per second)
llama_perf_context_print:        eval time =     423.31 ms /    31 runs   (   13.66 ms per token,    73.23 tokens per second)
llama_perf_context_print:       total time =     993.86 ms /   100 tokens
Llama.generate: 20 prefix-match hit, remaining 126 prompt tokens to eval


Topic 20 → cancel


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     672.94 ms /   126 tokens (    5.34 ms per token,   187.24 tokens per second)
llama_perf_context_print:        eval time =     258.00 ms /    19 runs   (   13.58 ms per token,    73.64 tokens per second)
llama_perf_context_print:       total time =     933.30 ms /   145 tokens
Llama.generate: 20 prefix-match hit, remaining 36 prompt tokens to eval


Topic 21 → Error/issue with system, shopping, inventory, list, system, problem


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     505.10 ms /    36 tokens (   14.03 ms per token,    71.27 tokens per second)
llama_perf_context_print:        eval time =     418.93 ms /    31 runs   (   13.51 ms per token,    74.00 tokens per second)
llama_perf_context_print:       total time =     927.58 ms /    67 tokens
Llama.generate: 20 prefix-match hit, remaining 191 prompt tokens to eval


Topic 22 → Thank You For Your Service


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     828.52 ms /   191 tokens (    4.34 ms per token,   230.53 tokens per second)
llama_perf_context_print:        eval time =     434.04 ms /    31 runs   (   14.00 ms per token,    71.42 tokens per second)
llama_perf_context_print:       total time =    1266.49 ms /   222 tokens
Llama.generate: 20 prefix-match hit, remaining 198 prompt tokens to eval


Topic 23 → payment, payment method, payment error, payment system, payment processing, payment transaction, payment system, payment solution, payment gateway, payment gateway error, payment


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     902.65 ms /   198 tokens (    4.56 ms per token,   219.35 tokens per second)
llama_perf_context_print:        eval time =     436.14 ms /    31 runs   (   14.07 ms per token,    71.08 tokens per second)
llama_perf_context_print:       total time =    1342.66 ms /   229 tokens
Llama.generate: 20 prefix-match hit, remaining 404 prompt tokens to eval


Topic 24 → Produce, Distribute, Available, Fresh, Have, Donâ, Produce, Add, Ana, Ana, Add, Ana,


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1455.35 ms /   404 tokens (    3.60 ms per token,   277.60 tokens per second)
llama_perf_context_print:        eval time =     447.28 ms /    31 runs   (   14.43 ms per token,    69.31 tokens per second)
llama_perf_context_print:       total time =    1906.96 ms /   435 tokens
Llama.generate: 20 prefix-match hit, remaining 579 prompt tokens to eval


Topic 25 → - order status (pending, confirmed, completed, refunded, cancelled, rejected, expired, abandoned)


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    2419.16 ms /   579 tokens (    4.18 ms per token,   239.34 tokens per second)
llama_perf_context_print:        eval time =     268.67 ms /    18 runs   (   14.93 ms per token,    67.00 tokens per second)
llama_perf_context_print:       total time =    2690.12 ms /   597 tokens
Llama.generate: 20 prefix-match hit, remaining 207 prompt tokens to eval


Topic 26 → Capital Area Food Bank will be closed on Memorial Day, May 30


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     908.74 ms /   207 tokens (    4.39 ms per token,   227.79 tokens per second)
llama_perf_context_print:        eval time =     437.33 ms /    31 runs   (   14.11 ms per token,    70.88 tokens per second)
llama_perf_context_print:       total time =    1350.38 ms /   238 tokens
Llama.generate: 20 prefix-match hit, remaining 177 prompt tokens to eval


Topic 27 → Ebenezer Church of God SeaRChe, Organize,


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     800.29 ms /   177 tokens (    4.52 ms per token,   221.17 tokens per second)
llama_perf_context_print:        eval time =     439.09 ms /    31 runs   (   14.16 ms per token,    70.60 tokens per second)
llama_perf_context_print:       total time =    1243.39 ms /   208 tokens
Llama.generate: 20 prefix-match hit, remaining 195 prompt tokens to eval


Topic 28 → - Cancelled


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     928.35 ms /   195 tokens (    4.76 ms per token,   210.05 tokens per second)
llama_perf_context_print:        eval time =     438.56 ms /    31 runs   (   14.15 ms per token,    70.69 tokens per second)
llama_perf_context_print:       total time =    1371.00 ms /   226 tokens
Llama.generate: 20 prefix-match hit, remaining 202 prompt tokens to eval


Topic 29 → - Pickup, Wednesday, System, Pick, Select, Place, Like, Feb, Change, Pending


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     989.36 ms /   202 tokens (    4.90 ms per token,   204.17 tokens per second)
llama_perf_context_print:        eval time =     435.56 ms /    31 runs   (   14.05 ms per token,    71.17 tokens per second)
llama_perf_context_print:       total time =    1428.99 ms /   233 tokens
Llama.generate: 20 prefix-match hit, remaining 226 prompt tokens to eval


Topic 30 → ITEM FEE


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     947.83 ms /   226 tokens (    4.19 ms per token,   238.44 tokens per second)
llama_perf_context_print:        eval time =     474.86 ms /    31 runs   (   15.32 ms per token,    65.28 tokens per second)
llama_perf_context_print:       total time =    1426.75 ms /   257 tokens
Llama.generate: 20 prefix-match hit, remaining 154 prompt tokens to eval


Topic 31 → Payment, Payment, Payment, Payment, Payment, Payment, Payment, Payment, Payment, Payment, Pay


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     714.37 ms /   154 tokens (    4.64 ms per token,   215.58 tokens per second)
llama_perf_context_print:        eval time =     435.47 ms /    31 runs   (   14.05 ms per token,    71.19 tokens per second)
llama_perf_context_print:       total time =    1153.69 ms /   185 tokens
Llama.generate: 20 prefix-match hit, remaining 139 prompt tokens to eval


Topic 32 → Soup, Vegetables, Vegetable Cases, Food Cases, Bean Cases, Chewy, Soup, Soups,


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     700.28 ms /   139 tokens (    5.04 ms per token,   198.49 tokens per second)
llama_perf_context_print:        eval time =      82.46 ms /     6 runs   (   13.74 ms per token,    72.76 tokens per second)
llama_perf_context_print:       total time =     783.92 ms /   145 tokens
Llama.generate: 20 prefix-match hit, remaining 135 prompt tokens to eval


Topic 33 → "Gift Basket"


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     704.36 ms /   135 tokens (    5.22 ms per token,   191.66 tokens per second)
llama_perf_context_print:        eval time =     432.56 ms /    31 runs   (   13.95 ms per token,    71.67 tokens per second)
llama_perf_context_print:       total time =    1140.84 ms /   166 tokens
Llama.generate: 20 prefix-match hit, remaining 145 prompt tokens to eval


Topic 34 → chicken, ground chub, frog chub, chub, poutches, shef stable, packing


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     704.28 ms /   145 tokens (    4.86 ms per token,   205.88 tokens per second)
llama_perf_context_print:        eval time =     430.39 ms /    31 runs   (   13.88 ms per token,    72.03 tokens per second)
llama_perf_context_print:       total time =    1138.75 ms /   176 tokens
Llama.generate: 20 prefix-match hit, remaining 125 prompt tokens to eval


Topic 35 → Based on the text material above, generate the response to the following quesion or instruction


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     659.34 ms /   125 tokens (    5.27 ms per token,   189.58 tokens per second)
llama_perf_context_print:        eval time =      67.89 ms /     5 runs   (   13.58 ms per token,    73.65 tokens per second)
llama_perf_context_print:       total time =     728.64 ms /   130 tokens
Llama.generate: 20 prefix-match hit, remaining 544 prompt tokens to eval


Topic 36 → Produce


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    2314.61 ms /   544 tokens (    4.25 ms per token,   235.03 tokens per second)
llama_perf_context_print:        eval time =     464.37 ms /    31 runs   (   14.98 ms per token,    66.76 tokens per second)
llama_perf_context_print:       total time =    2782.88 ms /   575 tokens
Llama.generate: 20 prefix-match hit, remaining 637 prompt tokens to eval


Topic 37 → Capital Area Food Bank, Partners, Partner Support, Cuostoamer Relations, Delay, At, Important, Important Operating


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    2510.66 ms /   637 tokens (    3.94 ms per token,   253.72 tokens per second)
llama_perf_context_print:        eval time =     469.62 ms /    31 runs   (   15.15 ms per token,    66.01 tokens per second)
llama_perf_context_print:       total time =    2984.63 ms /   668 tokens
Llama.generate: 20 prefix-match hit, remaining 99 prompt tokens to eval


Topic 38 → Additional Details


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     638.74 ms /    99 tokens (    6.45 ms per token,   154.99 tokens per second)
llama_perf_context_print:        eval time =     428.11 ms /    31 runs   (   13.81 ms per token,    72.41 tokens per second)
llama_perf_context_print:       total time =    1070.83 ms /   130 tokens
Llama.generate: 20 prefix-match hit, remaining 207 prompt tokens to eval


Topic 39 → Winter Squash


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     885.06 ms /   207 tokens (    4.28 ms per token,   233.88 tokens per second)
llama_perf_context_print:        eval time =     481.84 ms /    31 runs   (   15.54 ms per token,    64.34 tokens per second)
llama_perf_context_print:       total time =    1371.22 ms /   238 tokens
Llama.generate: 20 prefix-match hit, remaining 547 prompt tokens to eval


Topic 40 → Pick Up or Deliver?


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    2417.19 ms /   547 tokens (    4.42 ms per token,   226.30 tokens per second)
llama_perf_context_print:        eval time =     466.84 ms /    31 runs   (   15.06 ms per token,    66.40 tokens per second)
llama_perf_context_print:       total time =    2888.20 ms /   578 tokens
Llama.generate: 20 prefix-match hit, remaining 850 prompt tokens to eval


Topic 41 → Product Feedback


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    3726.93 ms /   850 tokens (    4.38 ms per token,   228.07 tokens per second)
llama_perf_context_print:        eval time =     484.98 ms /    31 runs   (   15.64 ms per token,    63.92 tokens per second)
llama_perf_context_print:       total time =    4216.58 ms /   881 tokens
Llama.generate: 20 prefix-match hit, remaining 67 prompt tokens to eval


Topic 42 → Your order was received by our staff on 08/01/2019 at 09


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     570.00 ms /    67 tokens (    8.51 ms per token,   117.54 tokens per second)
llama_perf_context_print:        eval time =     431.94 ms /    31 runs   (   13.93 ms per token,    71.77 tokens per second)
llama_perf_context_print:       total time =    1005.94 ms /    98 tokens
Llama.generate: 20 prefix-match hit, remaining 320 prompt tokens to eval


Topic 43 → - TEFAP Bon Shrimp, short 1 word topic label


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1202.03 ms /   320 tokens (    3.76 ms per token,   266.22 tokens per second)
llama_perf_context_print:        eval time =     496.64 ms /    31 runs   (   16.02 ms per token,    62.42 tokens per second)
llama_perf_context_print:       total time =    1703.18 ms /   351 tokens
Llama.generate: 20 prefix-match hit, remaining 264 prompt tokens to eval


Topic 44 → College Park Community Food Bank | | | | | | â€Š | | Valued Site | | â€Š | | email serves as a


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1027.97 ms /   264 tokens (    3.89 ms per token,   256.82 tokens per second)
llama_perf_context_print:        eval time =     456.98 ms /    31 runs   (   14.74 ms per token,    67.84 tokens per second)
llama_perf_context_print:       total time =    1489.33 ms /   295 tokens
Llama.generate: 20 prefix-match hit, remaining 501 prompt tokens to eval


Topic 45 → TEFAP Plums Pitted


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1878.67 ms /   501 tokens (    3.75 ms per token,   266.68 tokens per second)
llama_perf_context_print:        eval time =     471.64 ms /    31 runs   (   15.21 ms per token,    65.73 tokens per second)
llama_perf_context_print:       total time =    2355.24 ms /   532 tokens
Llama.generate: 20 prefix-match hit, remaining 318 prompt tokens to eval


Topic 46 → Schedule, rescheduling, week, weather, angela, our, angela, our, angela, our, angela, our,


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1323.23 ms /   318 tokens (    4.16 ms per token,   240.32 tokens per second)
llama_perf_context_print:        eval time =     455.83 ms /    31 runs   (   14.70 ms per token,    68.01 tokens per second)
llama_perf_context_print:       total time =    1783.64 ms /   349 tokens
Llama.generate: 20 prefix-match hit, remaining 125 prompt tokens to eval


Topic 47 → Registration, meeting, attendance, registered, excited, teach, attend, register, meeting, registration, attendance, registered, excited, teach,


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     664.34 ms /   125 tokens (    5.31 ms per token,   188.16 tokens per second)
llama_perf_context_print:        eval time =     439.52 ms /    31 runs   (   14.18 ms per token,    70.53 tokens per second)
llama_perf_context_print:       total time =    1108.06 ms /   156 tokens
Llama.generate: 20 prefix-match hit, remaining 207 prompt tokens to eval


Topic 48 → PeanuT


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     877.79 ms /   207 tokens (    4.24 ms per token,   235.82 tokens per second)
llama_perf_context_print:        eval time =     452.62 ms /    31 runs   (   14.60 ms per token,    68.49 tokens per second)
llama_perf_context_print:       total time =    1334.79 ms /   238 tokens
Llama.generate: 20 prefix-match hit, remaining 71 prompt tokens to eval


Topic 49 → Turkey Gravy, Cranberry Sauce, Overdue


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     609.45 ms /    71 tokens (    8.58 ms per token,   116.50 tokens per second)
llama_perf_context_print:        eval time =     433.88 ms /    31 runs   (   14.00 ms per token,    71.45 tokens per second)
llama_perf_context_print:       total time =    1047.69 ms /   102 tokens
Llama.generate: 20 prefix-match hit, remaining 387 prompt tokens to eval


Topic 50 → "Produce Pound Limit"


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1459.71 ms /   387 tokens (    3.77 ms per token,   265.12 tokens per second)
llama_perf_context_print:        eval time =     102.83 ms /     7 runs   (   14.69 ms per token,    68.07 tokens per second)
llama_perf_context_print:       total time =    1564.23 ms /   394 tokens
Llama.generate: 20 prefix-match hit, remaining 199 prompt tokens to eval


Topic 51 → Lorton Community Center


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     864.39 ms /   199 tokens (    4.34 ms per token,   230.22 tokens per second)
llama_perf_context_print:        eval time =     485.35 ms /    31 runs   (   15.66 ms per token,    63.87 tokens per second)
llama_perf_context_print:       total time =    1356.38 ms /   230 tokens
Llama.generate: 20 prefix-match hit, remaining 120 prompt tokens to eval


Topic 52 → Overage


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     657.78 ms /   120 tokens (    5.48 ms per token,   182.43 tokens per second)
llama_perf_context_print:        eval time =     440.76 ms /    31 runs   (   14.22 ms per token,    70.33 tokens per second)
llama_perf_context_print:       total time =    1102.86 ms /   151 tokens
Llama.generate: 20 prefix-match hit, remaining 454 prompt tokens to eval


Topic 53 → Cancellation of Food Distribution


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1664.74 ms /   454 tokens (    3.67 ms per token,   272.72 tokens per second)
llama_perf_context_print:        eval time =     470.58 ms /    31 runs   (   15.18 ms per token,    65.88 tokens per second)
llama_perf_context_print:       total time =    2140.10 ms /   485 tokens
Llama.generate: 20 prefix-match hit, remaining 116 prompt tokens to eval


Topic 54 → Blvd, San, Blvd, Almaden, Blvd, San, Blvd, San, Blvd, San, Blvd, San


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     741.05 ms /   116 tokens (    6.39 ms per token,   156.53 tokens per second)
llama_perf_context_print:        eval time =     447.91 ms /    31 runs   (   14.45 ms per token,    69.21 tokens per second)
llama_perf_context_print:       total time =    1193.28 ms /   147 tokens
Llama.generate: 20 prefix-match hit, remaining 473 prompt tokens to eval


Topic 55 → - Legends


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1750.73 ms /   473 tokens (    3.70 ms per token,   270.17 tokens per second)
llama_perf_context_print:        eval time =     472.23 ms /    31 runs   (   15.23 ms per token,    65.65 tokens per second)
llama_perf_context_print:       total time =    2227.57 ms /   504 tokens
Llama.generate: 20 prefix-match hit, remaining 79 prompt tokens to eval


Topic 56 → ERIN, JUAN, MONTHLY REPORT, MERCER MERCADO


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     593.89 ms /    79 tokens (    7.52 ms per token,   133.02 tokens per second)
llama_perf_context_print:        eval time =     437.79 ms /    31 runs   (   14.12 ms per token,    70.81 tokens per second)
llama_perf_context_print:       total time =    1035.91 ms /   110 tokens
Llama.generate: 20 prefix-match hit, remaining 96 prompt tokens to eval


Topic 57 → Sentences


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     591.60 ms /    96 tokens (    6.16 ms per token,   162.27 tokens per second)
llama_perf_context_print:        eval time =     445.25 ms /    31 runs   (   14.36 ms per token,    69.62 tokens per second)
llama_perf_context_print:       total time =    1041.08 ms /   127 tokens
Llama.generate: 20 prefix-match hit, remaining 435 prompt tokens to eval


Topic 58 → 1


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1671.73 ms /   435 tokens (    3.84 ms per token,   260.21 tokens per second)
llama_perf_context_print:        eval time =     471.74 ms /    31 runs   (   15.22 ms per token,    65.71 tokens per second)
llama_perf_context_print:       total time =    2148.06 ms /   466 tokens
Llama.generate: 20 prefix-match hit, remaining 189 prompt tokens to eval


Topic 59 → Capital Christian Fellowship


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     831.60 ms /   189 tokens (    4.40 ms per token,   227.27 tokens per second)
llama_perf_context_print:        eval time =     262.06 ms /    18 runs   (   14.56 ms per token,    68.69 tokens per second)
llama_perf_context_print:       total time =    1096.90 ms /   207 tokens
Llama.generate: 20 prefix-match hit, remaining 393 prompt tokens to eval


Topic 60 → Food CoDing GuideLines Physical Copies | Sourcing Area


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1474.84 ms /   393 tokens (    3.75 ms per token,   266.47 tokens per second)
llama_perf_context_print:        eval time =     466.06 ms /    31 runs   (   15.03 ms per token,    66.51 tokens per second)
llama_perf_context_print:       total time =    1945.59 ms /   424 tokens
Llama.generate: 20 prefix-match hit, remaining 340 prompt tokens to eval


Topic 61 → - Crisped Ricer, Cornflakes, Oatmeal, Oz, Milk, Poached Pork, Potatoes, R


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1333.52 ms /   340 tokens (    3.92 ms per token,   254.96 tokens per second)
llama_perf_context_print:        eval time =     488.48 ms /    31 runs   (   15.76 ms per token,    63.46 tokens per second)
llama_perf_context_print:       total time =    1826.82 ms /   371 tokens
Llama.generate: 20 prefix-match hit, remaining 186 prompt tokens to eval


Topic 62 → ccda, communication, intentional, information, legal, legally, privileged, st, unlawful, uei, vaccination,


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =     879.81 ms /   186 tokens (    4.73 ms per token,   211.41 tokens per second)
llama_perf_context_print:        eval time =     460.86 ms /    31 runs   (   14.87 ms per token,    67.27 tokens per second)
llama_perf_context_print:       total time =    1345.14 ms /   217 tokens
Llama.generate: 20 prefix-match hit, remaining 297 prompt tokens to eval


Topic 63 → Request a Change in Delivery Date/Time


llama_perf_context_print:        load time =     772.28 ms
llama_perf_context_print: prompt eval time =    1257.21 ms /   297 tokens (    4.23 ms per token,   236.24 tokens per second)
llama_perf_context_print:        eval time =     464.28 ms /    31 runs   (   14.98 ms per token,    66.77 tokens per second)
llama_perf_context_print:       total time =    1726.32 ms /   328 tokens


Topic 64 → - Quality of produce inspection, quality of produce, quality of produce inspection, quality of produce inspections, quality of produce inspections

Generated Topic Labels:
    Topic                                    Generated_Label
0      -1                                [insufficient data]
1       0                                           Delivery
2       1               Order Produce Pound Limit Last Sales
3       2  - Add (1), Pallet (2), Potatoes (3), Add (4), ...
4       3                                          "Greeing"
..    ...                                                ...
61     60  Food CoDing GuideLines Physical Copies | Sourc...
62     61  - Crisped Ricer, Cornflakes, Oatmeal, Oz, Milk...
63     62  ccda, communication, intentional, information,...
64     63             Request a Change in Delivery Date/Time
65     64  - Quality of produce inspection, quality of pr...

[66 rows x 2 columns]


In [None]:
# train separate model for each category
from bertopic import BERTopic
from umap import UMAP
import os

def train_bertopic_per_request_category(
    json_path,
    min_topic_size,
    embedding_model_name="all-MiniLM-L6-v2",
    output_dir="bertopic_outputs_per_category",
    random_state=None,
):
    """Fit one BERTopic model per request category and save each result as CSV.

    Tickets are read from ``json_path`` and grouped by their
    "Custom field (Request Category)" value. Tickets whose category or
    "Description" is not a string are ignored. Each group's descriptions are
    passed through ``clean_for_bertopic``, ``remove_tags`` and
    ``filter_low_tfidf_words_justdescripton``; a group left with fewer than
    10 documents is skipped.

    Args:
        json_path: Path handed to ``load_data``.
        min_topic_size: Forwarded to ``BERTopic(min_topic_size=...)``.
        embedding_model_name: Forwarded to ``BERTopic(embedding_model=...)``.
        output_dir: Directory (created when missing) that receives one
            ``bertopic_<category>.csv`` file per processed category.
        random_state: Optional seed forwarded to ``UMAP``. The default
            ``None`` leaves UMAP unseeded, as before; pass an integer to make
            the projection repeatable between runs.

    Returns:
        None. Results are written through ``save_bertopic_to_csv``.
    """
    # Characters that are path separators or not allowed in file names on
    # common file systems; whitespace is handled separately below.
    unsafe_chars = set('\\/:*?"<>|')

    # Load & group descriptions by request category
    data = load_data(json_path)
    grouped_data = {}
    for item in data:
        category = item.get("Custom field (Request Category)")
        description = item.get("Description")
        if isinstance(category, str) and isinstance(description, str):
            grouped_data.setdefault(category, []).append(description)

    os.makedirs(output_dir, exist_ok=True)

    for category, descriptions in grouped_data.items():
        print(f"\n📂 Processing category: '{category}' ({len(descriptions)} items)")

        # Step 1: Clean
        docs_cleaned = [clean_for_bertopic(desc) for desc in descriptions]
        docs_cleaned = [remove_tags(doc) for doc in docs_cleaned]
        docs_filtered = filter_low_tfidf_words_justdescripton(docs_cleaned)

        if len(docs_filtered) < 10:
            print(f"⚠️  Skipped '{category}' — Not enough data after filtering ({len(docs_filtered)} docs)")
            continue

        # Step 2: Custom UMAP for small batches.
        # n_neighbors is capped below the number of documents in the batch.
        custom_umap = UMAP(
            n_neighbors=min(10, len(docs_filtered) - 1),
            n_components=5,
            metric='cosine',
            random_state=random_state,
        )

        # Step 3: Train model
        topic_model = BERTopic(
            min_topic_size=min_topic_size,
            embedding_model=embedding_model_name,
            umap_model=custom_umap
        )
        topics, _ = topic_model.fit_transform(docs_filtered)

        # Step 4: Save to CSV.
        # "/" and " " still map to "_" exactly as before; the other unsafe
        # characters and any remaining whitespace are now neutralised too.
        safe_category = "".join(
            "_" if (ch in unsafe_chars or ch.isspace()) else ch
            for ch in category
        ).lower()
        save_path = os.path.join(output_dir, f"bertopic_{safe_category}.csv")
        save_bertopic_to_csv(
            model=topic_model,
            docs=docs_filtered,
            topics=topics,
            filename=save_path
        )
        print(f"✅ Saved results for '{category}' to: {save_path}")
# Run the per-category training on the processed tickets.
# `min_topic_size` is not defined in this cell; it must already exist in the kernel.
train_bertopic_per_request_category(json_path="data_processed.json", min_topic_size=min_topic_size)

Using the comment section and the description

In [57]:
# Fit a single BERTopic model on every ticket that has a string "Description".
# Each document is produced by combine_description_and_comments() followed by
# clean_for_bertopic(); those helpers, `min_topic_size` and
# `embedding_model_name` are expected to exist from earlier cells.
data = load_data("data_bertopic.json")
data_all_desc = list(filter(lambda item: isinstance(item.get("Description"), str), data))
docs_all_fields = list(
    map(lambda ticket: clean_for_bertopic(combine_description_and_comments(ticket)), data_all_desc)
)

topic_model_all_fields = BERTopic(
    min_topic_size=min_topic_size,
    embedding_model=embedding_model_name,
)
topics_all_fields, probs_all_fields = topic_model_all_fields.fit_transform(docs_all_fields)

# Export the fitted topics; the file name records the min_topic_size in use.
save_bertopic_to_csv(
    filename=f"bertopic_summary_processeddata_min_topic_{min_topic_size}.csv",
    topics=topics_all_fields,
    docs=docs_all_fields,
    model=topic_model_all_fields,
)



Saved BERTopic summary to bertopic_summary_processeddata_min_topic_5.csv


In [None]:
import pandas as pd

def generate_ticket_headings(topic_model, save_path="ticket_headings.csv"):
    """Write a CSV that maps each topic id to a "Ticket - ..." heading.

    Args:
        topic_model: Object whose ``get_topic_info()`` returns a table with
            ``Name`` and ``Topic`` columns.
        save_path: Destination of the CSV file.

    Returns:
        None. The table is written to ``save_path`` and a confirmation line
        is printed.
    """
    info = topic_model.get_topic_info()
    raw_names = info['Name'].tolist()
    ids = info['Topic'].tolist()

    def _as_heading(raw_name):
        # Underscores become spaces, then str.capitalize() is applied.
        readable = raw_name.replace('_', ' ').capitalize()
        return f"Ticket - {readable}"

    headings_table = pd.DataFrame({
        "Topic_ID": ids,
        "Ticket_Heading": list(map(_as_heading, raw_names)),
    })

    # index=False keeps the row index out of the file.
    headings_table.to_csv(save_path, index=False)
    print(f"Ticket headings saved to: {save_path}")


Ticket headings saved to: ticket_headings.csv


In [12]:
# Show BERTopic's topic visualization for the model fitted on docs_all_fields.
topic_model_all_fields.visualize_topics()


In [13]:
# Show BERTopic's topic hierarchy visualization for the model fitted on docs_all_fields.
topic_model_all_fields.visualize_hierarchy()


LDA

Create corpus for the LDA model

In [217]:
from gensim.models import TfidfModel
from gensim import corpora

def bigram_trigram_generator(tokenized_words, min_occurrence=5, threshold_score=100):
    """Merge frequent word pairs and triples in tokenized documents.

    Args:
        tokenized_words: Re-iterable collection of documents, each a sequence
            of tokens. It is traversed more than once, so a one-shot
            generator will not work.
        min_occurrence: ``min_count`` given to both gensim ``Phrases`` models.
        threshold_score: ``threshold`` given to both ``Phrases`` models.

    Returns:
        A list with one entry per input document: the document after the
        bigram model and then the trigram model have each been applied once.
    """
    # First level: learn two-token phrases from the raw documents.
    bigram_phrases = gensim.models.Phrases(
        tokenized_words, min_count=min_occurrence, threshold=threshold_score
    )
    # Second level: learn phrases on the bigram-merged corpus. min_count is
    # forwarded here as well so that min_occurrence governs both levels
    # instead of the second level silently using the library default.
    trigram_phrases = gensim.models.Phrases(
        bigram_phrases[tokenized_words], min_count=min_occurrence, threshold=threshold_score
    )

    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    trigram = gensim.models.phrases.Phraser(trigram_phrases)

    # Apply each model exactly once per document: bigrams first, then the
    # trigram model on the already-merged tokens.
    data_bigrams = [bigram[doc] for doc in tokenized_words]
    data_bigrams_trigrams = [trigram[doc] for doc in data_bigrams]
    return data_bigrams_trigrams

def corpus_generator(corpus_words, threshold_tfidf=0.04):
    """Drop low TF-IDF tokens and build the bag-of-words corpus for LDA.

    Args:
        corpus_words: Re-iterable collection of tokenized documents (each a
            sequence of tokens).
        threshold_tfidf: Minimum TF-IDF score a token needs within a document
            to be kept in that document. Tokens for which the TF-IDF model
            returns no weight are dropped as well.

    Returns:
        A tuple ``(filtered_texts, corpus, id2word)``:
        ``filtered_texts`` holds the surviving tokens of every document in
        their original order and with their original repetitions,
        ``id2word`` is a dictionary rebuilt from those texts, and ``corpus``
        is the bag-of-words form of ``filtered_texts`` under ``id2word``.
    """
    texts = corpus_words

    # Step 1: Build initial dictionary and corpus
    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]

    # Step 2: Create TF-IDF model
    tfidf = TfidfModel(corpus, id2word=id2word)

    # Step 3: Filter every document token by token. Filtering the original
    # token list (rather than rebuilding it from the unique BoW ids) keeps
    # repeated tokens, so term counts survive into the final corpus.
    filtered_texts = []
    for text, bow in zip(texts, corpus):
        kept_ids = {
            term_id for term_id, score in tfidf[bow] if score >= threshold_tfidf
        }
        filtered_texts.append(
            [token for token in text if id2word.token2id.get(token) in kept_ids]
        )

    # Step 4: Rebuild dictionary and final corpus from cleaned text
    id2word = corpora.Dictionary(filtered_texts)
    corpus = [id2word.doc2bow(text) for text in filtered_texts]
    return filtered_texts, corpus, id2word

Create an LDA model for each request category

In [None]:
from collections import defaultdict

# Group the processed tickets by request category; tickets that lack the
# field are collected under "Unknown".
data = load_data("data_processed.json")
grouped_tickets = defaultdict(list)
for item in data:
    key = item.get("Custom field (Request Category)", "Unknown")
    grouped_tickets[key].append(item)
grouped_dict = dict(grouped_tickets)
grouped_ticket_by_request_category = [
    {"category": key, "items": value} for key, value in grouped_dict.items()
]

# Tokenize the descriptions of every category. Only string descriptions are
# used: calling str() on a missing value would add a bogus one-token document
# (for example ["None"]) to that category's corpus.
descriptions_matrix_by_reqtype = []
for ticket_type in grouped_ticket_by_request_category:
    descriptions = [
        ticket["Description"].split()
        for ticket in ticket_type["items"]
        if isinstance(ticket.get("Description"), str)
    ]
    descriptions_matrix_by_reqtype.append(descriptions)

# Merge frequent phrases per category ...
words_matrix_by_reqtype = []
for descriptions_by_reqtype in descriptions_matrix_by_reqtype:
    words_matrix_by_reqtype.append(bigram_trigram_generator(descriptions_by_reqtype))

# ... and build one (filtered_texts, corpus, id2word) tuple per category.
corpus_matrix_by_reqtype = []
for words_by_reqtype in words_matrix_by_reqtype:
    corpus_matrix_by_reqtype.append(corpus_generator(words_by_reqtype))
import html

import gensim
import pyLDAvis
import pyLDAvis.gensim
from IPython.display import display, HTML

lda_models = []          # (category index, model) pairs
lda_visualizations = []  # (category index, prepared visualization) pairs
lda_models_ = []         # the same models, stored without the index

# Notebook rendering only needs to be enabled once, not on every iteration.
pyLDAvis.enable_notebook()

# Loop through each request type; index i lines up with
# grouped_ticket_by_request_category because all lists were built in parallel.
for i, (words_by_reqtype, (filtered_texts, corpus, id2word)) in enumerate(zip(words_matrix_by_reqtype, corpus_matrix_by_reqtype)):

    # Skip if corpus is too small
    if len(corpus) < 10:
        print(f"Skipping request type index {i} (too few documents)")
        continue

    # Train LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=8,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha="auto",
        per_word_topics=True
    )
    lda_models_.append(lda_model)
    # Store model for reference
    lda_models.append((i, lda_model))

    # Create the pyLDAvis visualization
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
    lda_visualizations.append((i, vis))

    # Display with label. The category comes straight from the ticket data and
    # is placed into raw HTML, so it is escaped first.
    request_type = grouped_ticket_by_request_category[i]['category']
    display(HTML(f"<h2 style='color: teal;'>Request Type: {html.escape(str(request_type))}</h2>"))
    display(vis)



Create a single LDA model for all descriptions

In [218]:
# Train one LDA model on every string description and export its pyLDAvis view.
data = load_data("data_processed.json")
data_all_desc = [item for item in data if isinstance(item.get("Description"), str)]

# split() without an argument never produces empty-string tokens, whereas
# split(" ") returns [""] for an empty description.
tokenized_docs_lda = [doc["Description"].split() for doc in data_all_desc]

bigrams_trigrams_LDA = bigram_trigram_generator(tokenized_docs_lda)
filtered_texts, corpus_lda, id2word_lda = corpus_generator(bigrams_trigrams_LDA, 0.2)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_lda,
                                           id2word=id2word_lda,
                                           num_topics=17,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto",
                                           per_word_topics=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus_lda, id2word_lda, mds="mmds", R=30)
pyLDAvis.save_html(vis, "lda_visualization_only_descriptions.html")
# A bare expression is only rendered when it is the last statement of the
# cell, so `vis` goes after the save call.
vis


Create a single LDA model using descriptions and comments

In [221]:
import pyLDAvis
import pyLDAvis.gensim

data = load_data("data_processed.json")

# Step 1: Collect combined text (description + all comments).
# The comment fields are named "Comment", "Comment.1", ..., "Comment.19".
comment_keys = ["Comment"] + [f"Comment.{i}" for i in range(1, 20)]

combined_docs = []
for item in data:
    # Tickets without a string description are left out entirely.
    if not isinstance(item.get("Description"), str):
        continue
    parts = [item["Description"]]
    parts.extend(
        item[comment_key]
        for comment_key in comment_keys
        if isinstance(item.get(comment_key), str)
    )
    combined_docs.append(" ".join(parts))

# Step 2: Tokenize.
# split() without an argument is used on purpose: an empty description or
# comment leaves a leading, trailing or doubled space in the joined text, and
# split(" ") would turn each of those into an empty-string token.
tokenized_docs_lda = [doc.split() for doc in combined_docs]

# Step 3: Generate bigrams/trigrams
bigrams_trigrams_LDA = bigram_trigram_generator(tokenized_docs_lda)

# Step 4: TF-IDF filtering and corpus preparation
filtered_texts, corpus_lda, id2word_lda = corpus_generator(bigrams_trigrams_LDA, threshold_tfidf=0.2)

# Step 5: Train LDA model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus_lda,
    id2word=id2word_lda,
    num_topics=17,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
    per_word_topics=True
)

# Step 6: Visualize, save a standalone HTML copy, and render in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus_lda, id2word_lda, mds="mmds", R=30)
pyLDAvis.save_html(vis, "lda_visualization_with_comments.html")
vis
