### Install and import required packages

In [19]:
!pip install pyLDAvis==3.4.0

Collecting pyLDAvis==3.4.0
  Downloading pyLDAvis-3.4.0-py3-none-any.whl.metadata (4.2 kB)
Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   -------- ------------------------------- 0.5/2.6 MB 4.2 MB/s eta 0:00:01
   -------------------- ------------------- 1.3/2.6 MB 3.7 MB/s eta 0:00:01
   -------------------------------- ------- 2.1/2.6 MB 3.7 MB/s eta 0:00:01
   -------------------------------- ------- 2.1/2.6 MB 3.7 MB/s eta 0:00:01
   -------------------------------- ------- 2.1/2.6 MB 3.7 MB/s eta 0:00:01
   ------------------------------------ --- 2.4/2.6 MB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 2.6/2.6 MB 1.8 MB/s eta 0:00:00
Installing collected packages: pyLDAvis
Successfully installed pyLDAvis-3.4.0


In [23]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pyLDAvis
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CHELSA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CHELSA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load and Preprocess the Data

In [27]:
# Load 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data

def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

processed_docs = [preprocess(doc) for doc in documents[:2000]] 

### Vectorization (Count Vectorizer)

In [28]:
vectorizer = CountVectorizer(max_df=0.9, min_df=10, stop_words='english')
X = vectorizer.fit_transform(processed_docs)

### Fit LDA Model

In [29]:
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(X)

### Show Top Keywords per Topic

In [45]:
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    print(f"\nTopic #{idx + 1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))


Topic #1:
file image use key internet user information data address gif

Topic #2:
like right article bike really van think big shot new

Topic #3:
book history text bible read copy university greek school year

Topic #4:
ed fbi police agent arab batf said israel jewish april

Topic #5:
game year team player know like really good going think

Topic #6:
car time like problem know little way good light thing

Topic #7:
window card use drive work thanks program problem know using

Topic #8:
armenian space year state muslim people new station palestinian russian

Topic #9:
people think know say make right thing way like time

Topic #10:
god law jesus christian point say church human sin christ


### Monkey-patch CountVectorizer to make PyLDAvis work

In [55]:
# Monkey-patch for compatibility
from sklearn.feature_extraction.text import CountVectorizer

if not hasattr(CountVectorizer, 'get_feature_names'):
    CountVectorizer.get_feature_names = CountVectorizer.get_feature_names_out


### Visualize Topics with pyLDAvis

In [57]:
import pyLDAvis
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings("ignore")

pyLDAvis.enable_notebook()

# Prepare the visualization panel
panel = pyLDAvis.sklearn.prepare(
    lda_model=lda, 
    dtm=X, 
    vectorizer=vectorizer
)
panel


### Save pyLDAvis --> HTML file

In [59]:
pyLDAvis.save_html(panel, 'lda_visualization.html')

### Testing the LDA Model

In [63]:
sample_texts = [
    "The resurrection of Jesus is a fundamental belief in Christianity.",
    "Intel just released their latest i9 processor with improved thermal performance.",
    "NASA’s new rover has landed on Mars to search for signs of life.",
    "The government’s decision on abortion laws has sparked protests."
]

def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(tokens)

processed_samples = [preprocess(text) for text in sample_texts]

X_new = vectorizer.transform(processed_samples)

topic_probs = lda.transform(X_new)

feature_names = vectorizer.get_feature_names_out()

for i, probs in enumerate(topic_probs):
    top_topic = probs.argmax()
    print(f"\nDocument {i+1}: {sample_texts[i]}")
    print(f"→ Most likely Topic #{top_topic} with probability {probs[top_topic]:.4f}")
    print(f"Top keywords in this topic:")
    print(" ".join([feature_names[i] for i in lda.components_[top_topic].argsort()[:-11:-1]]))


Document 1: The resurrection of Jesus is a fundamental belief in Christianity.
→ Most likely Topic #9 with probability 0.5004
Top keywords in this topic:
god law jesus christian point say church human sin christ

Document 2: Intel just released their latest i9 processor with improved thermal performance.
→ Most likely Topic #6 with probability 0.6833
Top keywords in this topic:
window card use drive work thanks program problem know using

Document 3: NASA’s new rover has landed on Mars to search for signs of life.
→ Most likely Topic #7 with probability 0.6382
Top keywords in this topic:
armenian space year state muslim people new station palestinian russian

Document 4: The government’s decision on abortion laws has sparked protests.
→ Most likely Topic #8 with probability 0.6097
Top keywords in this topic:
people think know say make right thing way like time
