<a href="https://colab.research.google.com/github/ACCMouli/chandu/blob/main/topicmodelling/02_LDA_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LDA Topic Modeling (Bag-of-Words)


## Step 0: Install & Imports

In [1]:
# !pip install scikit-learn pandas
import pandas as pd

# Bag-of-words vectorizer that turns text into word-count features.
from sklearn.feature_extraction.text import CountVectorizer

# scikit-learn's LDA implementation, imported under the shorter alias LDA.
from sklearn.decomposition import LatentDirichletAllocation as LDA
import numpy as np, os, random
# Seeds only Python's built-in `random` module. The LDA model fitted further
# down is made reproducible through its own random_state argument instead.
random.seed(42)

## Step 1: Load the CSV

In [2]:
# Local alternative, if the CSV sits next to the notebook:
#csv_path = r"topics_100.csv"
# Remote copy of the dataset; reading it requires network access.
csv_path = "https://github.com/giridhar276/genai/raw/refs/heads/main/topicmodelling/topics_100.csv"
# Later cells read the "text" column of this frame.
df = pd.read_csv(csv_path)
# Preview the first rows.
df.head()

Unnamed: 0,id,text
0,79,IPv6 users cannot reach the upload endpoint
1,4,Login form shows captcha error even for first ...
2,55,Audit logs missing entries for sensitive actions
3,3,SSO login loops back to the sign-in page repea...
4,72,Corporate proxy strips authorization headers


## Step 2: Counts Vectorizer

In [3]:
# CountVectorizer settings:
#   stop_words="english" – drop very common English stopwords (e.g., the, and).
#   max_features=20000   – cap on vocabulary size; set high enough not to bind for 100 short docs.
#   min_df=2             – keep only words/bigrams that appear in at least 2 documents.
#   ngram_range=(1, 2)   – use unigrams and bigrams (e.g., "login", "login error").
vec = CountVectorizer(stop_words="english", max_features=20000, min_df=2, ngram_range=(1, 2))

# Blank out missing texts before casting: astype(str) alone turns NaN into the
# literal string "nan", which would then be tokenised as if it were a real word.
docs = df["text"].fillna("").astype(str).tolist()

# Learn the vocabulary from the texts and build the sparse matrix X of shape
# (n_docs, vocab_size) holding raw counts.
X = vec.fit_transform(docs)

# Vocabulary strings in the same order as X's columns.
terms = vec.get_feature_names_out()
X.shape

(100, 81)

## Step 3: Fit LDA

In [4]:
# n_topics – how many topics to extract (6 in this run).
n_topics = 6


# LDA(...) configuration:
#   n_components=n_topics   – number of topics.
#   learning_method="batch" – batch variational updates that use all documents in every
#                             EM step (fine for small/medium data). For very large
#                             corpora, try "online".
#   random_state=42         – reproducibility.
#   max_iter=50             – maximum number of optimization iterations.

lda = LDA(n_components=n_topics, learning_method="batch", random_state=42, max_iter=50)

# W (docs × topics): how much of each topic per doc.
# Fits the model to X and returns the document–topic matrix:
W = lda.fit_transform(X)

# H (topics × words): which words define each topic.
# The topic–word matrix; values are unnormalized weights in counts space:
H = lda.components_


# Sanity check on the two factor shapes.
W.shape, H.shape

((100, 6), (6, 81))

## Step 4: Top Words

In [5]:
def top_words_per_topic(H, terms, topn=12):
    """Print the highest-weighted terms of every topic, one line per topic.

    Parameters
    ----------
    H : 2-D array of topic-term weights, one row per topic.
    terms : sequence of term strings, indexed like the columns of H.
    topn : int, number of terms to list per topic. Zero (or a negative
        value) lists no terms.

    Returns
    -------
    None. Each topic is printed as ``LDA Topic <k>: term, term, ...`` with
    terms ordered from highest to lowest weight.
    """
    # Clamp so that topn=0 lists nothing. A raw `[-topn:]` slice would become
    # `[-0:]`, i.e. the whole row, and print the entire vocabulary instead.
    count = max(topn, 0)
    for k, row in enumerate(H):
        # argsort is ascending: reverse it, then keep the first `count` indices.
        top_idx = row.argsort()[::-1][:count]
        print(f"LDA Topic {k}: " + ", ".join(terms[i] for i in top_idx))
# Print the 12 highest-weighted terms for each fitted topic.
top_words_per_topic(H, terms, topn=12)

LDA Topic 0: fails, profile, update, vpn, missing, large, codes, area, upload, sync, ip, fields
LDA Topic 1: login, does, date, change, invoice, page, processed, user, alert, cause, mobile, password
LDA Topic 2: shows, report, login, error, form, custom, headers, fails, duplicate, sso, save, resets
LDA Topic 3: notifications, api, users, arrive, responses, checkout, charged, plan, payment, wrong, webhook, app
LDA Topic 4: links, slowly, mode, older, devices, frequently, dashboard, limiting, retries, requests, triggers, push
LDA Topic 5: login, quickly, link, logout, pagination, charts, file, oauth, unexpectedly, returns, app, password


## Step 5: Assign & Save

In [6]:
# Label each document with the column index of the largest value in its row of W.
df["dominant_topic"] = W.argmax(axis=1)
# Write the labelled frame to disk; index=False leaves the row index out of the file.
df.to_csv("lda_topics_assigned.csv", index=False)
# Preview the first rows with the new column.
df.head()

Unnamed: 0,id,text,dominant_topic
0,79,IPv6 users cannot reach the upload endpoint,3
1,4,Login form shows captcha error even for first ...,2
2,55,Audit logs missing entries for sensitive actions,0
3,3,SSO login loops back to the sign-in page repea...,1
4,72,Corporate proxy strips authorization headers,2


In [7]:
# Display the labelled frame; the rendered view is abbreviated in the middle.
df

Unnamed: 0,id,text,dominant_topic
0,79,IPv6 users cannot reach the upload endpoint,3
1,4,Login form shows captcha error even for first ...,2
2,55,Audit logs missing entries for sensitive actions,0
3,3,SSO login loops back to the sign-in page repea...,1
4,72,Corporate proxy strips authorization headers,2
...,...,...,...
95,67,Embedded webview zooms unexpectedly on forms,5
96,38,Form labels overlap with placeholder text,2
97,33,Date picker resets value when switching months,1
98,17,Subscription renewal date does not match invoi...,1
