In [None]:
# Install all required libraries
!pip install pandas numpy nltk spacy scikit-learn gensim

In [1]:
import pandas as pd
import numpy as np
import re, string, nltk, spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from gensim import corpora
from gensim.models.ldamodel import LdaModel


In [1]:
import kagglehub

# Kaggle handle ("owner/dataset-slug") of the consumer-complaints dataset.
dataset_handle = "shashwatwork/consume-complaints-dataset-fo-nlp"

# Fetch the dataset and remember where kagglehub stored it locally.
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shashwatwork/consume-complaints-dataset-fo-nlp?dataset_version_number=1...


100%|██████████| 19.8M/19.8M [00:00<00:00, 112MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/shashwatwork/consume-complaints-dataset-fo-nlp/versions/1


In [14]:
# 1. Load data

import os

import pandas as pd

# File inside the downloaded dataset directory, and how many rows of it to
# read (only a subset of the file is loaded).
CSV_FILE_NAME = "complaints_processed.csv"
N_ROWS = 5000

# Build the CSV location from `path`, the directory returned by
# kagglehub.dataset_download() in the download cell. A hard-coded
# /root/.cache/... path only works for one user, cache location and
# dataset version.
csv_file_path = os.path.join(path, CSV_FILE_NAME)

# Read the first N_ROWS rows of the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path, nrows=N_ROWS)

# Display the first few rows of the DataFrame
display(df.head())

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [15]:

# 2. Preprocessing
nltk.download("stopwords")
nltk.download("punkt")
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))
punct_table = str.maketrans("", "", string.punctuation)

# Replace missing narratives with "" first: astype(str) alone would turn NaN
# into the literal text "nan", which then passes the filters below and ends
# up as a spurious token in the corpus.
raw_texts = df["narrative"].fillna("").astype(str)

# Lower-case and strip punctuation before handing the text to spaCy.
normalized_texts = (text.lower().translate(punct_table) for text in raw_texts)

clean_texts = []  # one space-joined string of kept lemmas per document
token_lists = []  # the same lemmas as a list per document

# nlp.pipe processes the texts in batches, which spaCy recommends over calling
# nlp() once per document. Only lemmas are needed here, so the named-entity
# recognizer is skipped for this pass; the shared `nlp` object itself is left
# unchanged for later cells.
for doc in nlp.pipe(normalized_texts, disable=["ner"]):
    # Keep lemmas of alphabetic tokens longer than two characters whose lemma
    # is not an English stopword.
    tokens = [t.lemma_ for t in doc if t.is_alpha and t.lemma_ not in stop_words and len(t) > 2]
    clean_texts.append(" ".join(tokens))
    token_lists.append(tokens)

df["clean_text"] = clean_texts
print("Example cleaned text:\n", df["clean_text"].iloc[0])




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Example cleaned text:
 purchase order day shipping amount receive product week send followup email exact verbiage pay two day shipping receive order company respond sorry inform due unusually high order volume order ship several week stock since early due high demand although continue take order guarantee receive order place due time mask order exact shipping date right however guarantee ship soon soon deliver product get small shipment shipping first come first serve basis appreciate patience fulfill order quickly recommend keeping order lose place line cancel distributor stock moment prefer cancel please note ask via email cancel accordance cancellation policy agree checkout electronic inventory online request order cancel refund issue cancel order send verification order cancel refunded item particulate respirator refunded subtotal shipping tax total usd visa end refund call dispute amount state nothing need submit address issue recharge item remove call back dispute amount transact

In [16]:
# 3. TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=5, max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df["clean_text"])
print("TF-IDF matrix shape:", X_tfidf.shape)

# 4. Word Embeddings (spaCy average vectors)
# One vector per document, computed in batches with nlp.pipe. None marks
# documents without any token so they can be zero-filled once the real
# embedding width is known.
doc_vectors = [doc.vector if len(doc) > 0 else None for doc in nlp.pipe(df["clean_text"])]

# Size the zero vector from an actual document vector. For pipelines without
# static word vectors (such as the small English model loaded here)
# nlp.vocab.vectors_length is expected to be 0 while doc.vector still has a
# non-zero width, so zero rows sized from it would make the array ragged.
reference_vec = next((vec for vec in doc_vectors if vec is not None), None)
if reference_vec is not None:
    zero_vec = np.zeros_like(reference_vec)
else:
    # No document produced a vector; fall back to the vocabulary's width.
    zero_vec = np.zeros(nlp.vocab.vectors_length)

embeddings = [vec if vec is not None else zero_vec for vec in doc_vectors]
X_emb = np.array(embeddings)
print("Embeddings shape:", X_emb.shape)



TF-IDF matrix shape: (5000, 2810)
Embeddings shape: (5000, 96)


In [8]:
# 5. Topic Modeling - LDA
# Build the gensim vocabulary and the bag-of-words corpus from the lemma lists.
dictionary = corpora.Dictionary(token_lists)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in token_lists]
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    random_state=42,
)

print("\nLDA Topics:")
for topic_id, topic_repr in lda.print_topics(num_words=8):
    print(f"Topic {topic_id+1}: {topic_repr}")

# 6. Topic Modeling - NMF
# Factorize the TF-IDF matrix; H holds one row of term weights per topic.
nmf = NMF(n_components=5, random_state=42)
W = nmf.fit_transform(X_tfidf)
H = nmf.components_

feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nNMF Topics:")
for topic_no, weights in enumerate(H, start=1):
    # Indices of the eight largest weights, highest first.
    top_indices = weights.argsort()[::-1][:8]
    top_terms = [feature_names[j] for j in top_indices]
    print(f"Topic {topic_no}: {' '.join(top_terms)}")




LDA Topics:
Topic 1: 0.031*"well" + 0.027*"fargo" + 0.020*"clearly" + 0.020*"fraudulent" + 0.016*"account" + 0.016*"card" + 0.016*"agency" + 0.012*"legal"
Topic 2: 0.043*"experian" + 0.033*"credit" + 0.016*"fee" + 0.013*"report" + 0.013*"company" + 0.013*"limit" + 0.013*"comenity" + 0.013*"bank"
Topic 3: 0.004*"report" + 0.003*"date" + 0.003*"account" + 0.003*"show" + 0.003*"last" + 0.003*"payment" + 0.003*"experian" + 0.003*"credit"
Topic 4: 0.027*"payment" + 0.026*"new" + 0.025*"issue" + 0.022*"history" + 0.022*"fix" + 0.019*"order" + 0.018*"report" + 0.018*"forbearance"
Topic 5: 0.069*"report" + 0.051*"date" + 0.051*"show" + 0.045*"account" + 0.042*"payment" + 0.038*"last" + 0.026*"credit" + 0.023*"information"

NMF Topics:
Topic 1: report date show account payment company credit make
Topic 2: issue history payment report get ask credit account
Topic 3: remove call one due issue request company get
Topic 4: credit company report year change name fact account
Topic 5: account also r

In [11]:
# Number of top words listed per topic for both models.
TOP_N_WORDS = 8

# Collect LDA topics.
# The topic count is read from the trained model rather than hard-coded, so
# this cell keeps working if the model is retrained with another num_topics.
lda_topics = [
    [word for word, _ in topic]
    for _, topic in lda.show_topics(
        num_topics=lda.num_topics, num_words=TOP_N_WORDS, formatted=False
    )
]

# Collect NMF topics: for each row of H, the terms with the largest weights,
# highest first.
nmf_topics = []
for topic in H:
    top_indices = topic.argsort()[::-1][:TOP_N_WORDS]
    nmf_topics.append([feature_names[j] for j in top_indices])

# Print side by side.
# NOTE: topics are paired by index only; LDA topic i and NMF topic i are not
# aligned and need not describe the same theme.
# zip stops at the shorter list, so differing topic counts cannot raise an
# IndexError.
print("\n=== LDA vs NMF Topic Comparison ===")
for i, (lda_words, nmf_words) in enumerate(zip(lda_topics, nmf_topics), start=1):
    print(f"Topic {i}:")
    print(f"  LDA: {', '.join(lda_words)}")
    print(f"  NMF: {', '.join(nmf_words)}")
    print()



=== LDA vs NMF Topic Comparison ===
Topic 1:
  LDA: well, fargo, clearly, fraudulent, account, card, agency, legal
  NMF: report, date, show, account, payment, company, credit, make

Topic 2:
  LDA: experian, credit, fee, report, company, limit, comenity, bank
  NMF: issue, history, payment, report, get, ask, credit, account

Topic 3:
  LDA: report, date, account, show, last, payment, experian, credit
  NMF: remove, call, one, due, issue, request, company, get

Topic 4:
  LDA: payment, new, issue, history, fix, order, report, forbearance
  NMF: credit, company, report, year, change, name, fact, account

Topic 5:
  LDA: report, date, show, account, payment, last, credit, information
  NMF: account, also, request, need, payment, time, year, remove

