JSON dataset

In [192]:
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#visualization
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import re
stopwords = stopwords.words("english")

In [126]:
def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [None]:
# DATA PREPROCESSING
def lemmatization(text, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    doc = nlp(text)
    new_text = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return " ".join(new_text)

def gen_words(text):
    return gensim.utils.simple_preprocess(text, deacc=True)
def remove_tags(text):
    text = re.sub(r'^\s*(From|Sent|To|Subject).*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'{[^}]*}', '', text)
    text = re.sub(r'"[^"]*"', '', text)
    text = re.sub(r'![^"]*!', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\b\d{3}-\d{3}-\d{4}\b', 'phone number', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

BERT TOPIC DATASET CREATION

Using all ticket descriptions

In [196]:
import re
from nltk.tokenize import word_tokenize
data = load_data("data_case2-1.json")
def clean_for_bertopic(text):
    # Remove email headers/footers and metadata
    text = re.sub(r'On .* wrote:', '', text)
    text = re.sub(r'(?i)Yvonne Brown.*?(Director)?', '', text)
    text = re.sub(r'\*P\.*.*?\d{3}-\d{3}-\d{4}', '', text)
    text = re.sub(r'Capital Area Food Bank.*', '', text)
    text = re.sub(r'\|', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [197]:
from bertopic import BERTopic
# Step 2: Fit BERTopic
data_all_desc = [item for item in data if isinstance(item.get("Description"), str)]
docs_all_descriptions = [clean_for_bertopic(ticket["Description"]) for ticket in data_all_desc]

docs_descriptions = [remove_tags(text) for text in docs_all_descriptions]




topic_model_all_descriptions = BERTopic(embedding_model="all-MiniLM-L6-v2")
topics_all_descriptions, probs_all_descriptions = topic_model_all_descriptions.fit_transform(docs_descriptions)


In [198]:
topic_model_all_descriptions.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,140,-1_to_the_we_of,"[to, the, we, of, and, you, for, is, food, our]",[Hi Maria Thank you for reaching out about the...
1,0,160,0_to_the_delivery_for,"[to, the, delivery, for, this, your, you, we, ...",[Good day I hope this finds you well Is it pos...
2,1,87,1_cases_add_10_please,"[cases, add, 10, please, following, 20, beans,...",[Please reduce 500372 chicken drumsticks from ...
3,2,80,2_cancel_order_please_thank,"[cancel, order, please, thank, for, you, to, l...",[Good morning Please we would like to cancel t...
4,3,66,3_add_produce_to_order,"[add, produce, to, order, available, potatoes,...",[Good morning Please we would like the item 30...
5,4,45,4_pallets_up_we_pallet,"[pallets, up, we, pallet, pick, of, driver, ha...",[Good morning Please we have some empty pallet...
6,5,41,5_their_limit_exceed_pound,"[their, limit, exceed, pound, sales, last, has...",[has exceed their Produce pound limit on their...
7,6,36,6_training_to_you_will,"[training, to, you, will, that, the, feature, ...",[I have already attended this training Angela ...
8,7,33,7_edit_invoice_payment_the,"[edit, invoice, payment, the, order, to, my, c...",[I am unable to edit my order because it was b...
9,8,23,8_dates_milk_best_shelf,"[dates, milk, best, shelf, stable, the, expira...",[When do you expect to get more 8 oz shelf sta...


In [190]:
topic_model_all_descriptions.get_topic(10)

[('the', 0.045789700421301614),
 ('we', 0.03899544832298534),
 ('of', 0.03810192831960621),
 ('that', 0.03355993903312951),
 ('had', 0.03298340532810616),
 ('and', 0.0317263716708201),
 ('received', 0.03116540485572979),
 ('to', 0.030759839664768442),
 ('in', 0.025159888843307443),
 ('boxes', 0.024399684272786586)]

In [191]:
topic_model_all_descriptions.get_representative_docs(10)

['Good afternoon Thank you sharing your concerns with your last order as well as photos I have included Partner Support in this response and they will work with the internal CAFB departments to convey the feedback and help you mitigate the issue Please use their email for any future order issues as well Thank you Augusta From Ileana Olea Sent Friday September 27 2024 1219 PM To CAFB MD Partnership Coordinator Cc Jorge Amaya Supriya Mordecai Subject Produce Boxes Damaged and Expired Food Good afternoon I am just contacting you as we received two shipments yesterday and today Both shipments have been damaged in one way or another The boxes of fresh produce came very damaged towards the bottom and some of the items inside had to be thrown out due to being squished Today we received expired peanut butter and will be unable to give that to our clients I have attached photos of both things Please let me know what we can do about this situation Thank you',
 'Hi Passing this along directly to 

In [None]:
# for item in data:
#     if isinstance(item["Description"], str):
#         original_text = item["Description"]
#         # Remove unwanted tags and patterns
#         cleaned_text = remove_tags(original_text)
#         # Apply lemmatization on the cleaned text
#         lemmatized_text = lemmatization(cleaned_text)
#         # Tokenize the lemmatized text
#         tokenized_words = gen_words(lemmatized_text)
#         # Update "Description" with the joined tokenized words
#         item["Description"] = " ".join(tokenized_words)
# write_data("data_processed.json", data)

#Extract values and collect unique ones using a set
# unique_categories = set()

# for item in data:
#     if "Custom field (Request Category)" in item:
#         unique_categories.add(item["Custom field (Request Category)"])

# # Print the unique values
# for value in unique_categories:
#     print(value)


# Extract values and collect unique ones using a set
# unique_categories = set()

# for item in data:
#     if "Custom field (Cause of issue)" in item:
#         unique_categories.add(item["Custom field (Cause of issue)"])

# # Print the unique values
# for value in unique_categories:
#     print(value)


Menu -> Discrepancy
Orders - Pre delivery -> Edit Order Items
Spam / Duplicate
Menu -> General Questions
Delivery / Pickup
PartnerLink -> New Shopper
Feedback - Concern / Negative -> Transportation
Orders - Pre delivery ->  Change to Pickup/vice versa
Orders - Pre delivery -> Data/Time Change - Reschedule
Agency Administration
Delivery / Pickup -> Missing Item - Redeliver
Menu -> Product Best By / Expiration Date
Feedback - Concern / Negative -> Quality - Produce
Orders - Pre delivery -> Produce Request
Delivery / Pickup -> Return - Quality
Orders - Pre delivery ->  Request unavailable window
Billing / Grants -> Grant Support
Billing / Grants -> Discrepancy in Invoice
Billing / Grants -> Questions for Accounting
Menu -> Inventory Availability
Delivery / Pickup -> General Questions
Spam / Duplicate -> Spam / Duplicate
Orders - Pre delivery -> Cancelation
Agency Administration -> Office Hours
Delivery / Pickup -> Pallet Pickup
Feedback - Concern / Negative -> Operations
Agency Administra

LDA

Create corpus for the LDA model

In [None]:
from gensim.models import TfidfModel
from gensim import corpora

def bigram_trigram_generator(tokenized_words, min_occurrence=5, threshold_score=50):
    bigram_phrases = gensim.models.Phrases(tokenized_words, min_count=min_occurrence, threshold=threshold_score)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[tokenized_words], threshold=threshold_score)

    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    trigram = gensim.models.phrases.Phraser(trigram_phrases)

    def make_bigrams(texts):
        return([bigram[doc] for doc in texts])

    def make_trigrams(texts):
        return ([trigram[bigram[doc]] for doc in texts])

    data_bigrams = make_bigrams(tokenized_words)
    data_bigrams_trigrams = make_trigrams(data_bigrams)
    return data_bigrams_trigrams

def corpus_generator(corpus_words, threshold_tfidf=0.04):
    texts = corpus_words

    # Step 1: Build initial dictionary and corpus
    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]

    # Step 2: Create TF-IDF model
    tfidf = TfidfModel(corpus, id2word=id2word)

    low_value = threshold_tfidf  # Try a much higher threshold

    # Step 3: Clean both texts and corpus
    filtered_texts = []
    filtered_corpus = []

    for i, bow in enumerate(corpus):
        tfidf_weights = tfidf[bow]
        tfidf_ids = {id for id, score in tfidf_weights if score >= low_value}
        bow_ids = {id for id, _ in bow}
        allowed_ids = tfidf_ids & bow_ids

        # Filtered BoW and Text
        new_bow = [b for b in bow if b[0] in allowed_ids]
        new_text = [id2word[b[0]] for b in new_bow]

        filtered_corpus.append(new_bow)
        filtered_texts.append(new_text)

    # Step 4: Rebuild dictionary and final corpus from cleaned text
    id2word = corpora.Dictionary(filtered_texts)
    corpus = [id2word.doc2bow(text) for text in filtered_texts]
    return filtered_texts, corpus, id2word

Create LDA for each category

In [None]:
from collections import defaultdict
data = load_data("data_processed.json")
grouped_tickets = defaultdict(list)
for item in data:
    key = item.get("Custom field (Request Category)", "Unknown")
    grouped_tickets[key].append(item)
grouped_dict = dict(grouped_tickets)
grouped_ticket_by_request_category = [{"category": key, "items": value} for key, value in grouped_dict.items()]

descriptions_matrix_by_reqtype = []
for ticket_type in grouped_ticket_by_request_category:
    descriptions = [str(ticket["Description"]).split() for ticket in ticket_type["items"] if "Description" in ticket]
    descriptions_matrix_by_reqtype.append(descriptions)
words_matrix_by_reqtype = []
for descriptions_by_reqtype in descriptions_matrix_by_reqtype:
    words_matrix_by_reqtype.append(bigram_trigram_generator(descriptions_by_reqtype))
corpus_matrix_by_reqtype = []
for words_by_reqtype in words_matrix_by_reqtype:
    corpus_matrix_by_reqtype.append(corpus_generator(words_by_reqtype))
import gensim
import pyLDAvis
import pyLDAvis.gensim
from IPython.display import display, HTML

lda_models = []
lda_visualizations = []
lda_models_ = []
# Loop through each request type
for i, (words_by_reqtype, (filtered_texts, corpus, id2word)) in enumerate(zip(words_matrix_by_reqtype, corpus_matrix_by_reqtype)):
    
    # Skip if corpus is too small
    if len(corpus) < 10:
        print(f"Skipping request type index {i} (too few documents)")
        continue
    else:
        # Train LDA model
        lda_model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=8,
            random_state=100,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha="auto",
            per_word_topics=True
        )
        lda_models_.append(lda_model)
        # Store model for reference
        lda_models.append((i, lda_model))

        # Create and display pyLDAvis visualization
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
        lda_visualizations.append((i, vis))

        # Display with label
        request_type = grouped_ticket_by_request_category[i]['category']
        display(HTML(f"<h2 style='color: teal;'>Request Type: {request_type}</h2>"))
        display(vis)



Create a single LDA model for all descriptions

In [None]:
data = load_data("data_processed.json")
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=13,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto",
                                           per_word_topics=True)

In [43]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis

