In [None]:
# Import necessary libraries
import pandas as pd  # Data manipulation
import re  # Regular expressions for text cleaning
import gensim  # For LDA and word embeddings
import nltk  # Natural Language Toolkit
from nltk.corpus import stopwords  # Common stopwords (e.g., "the", "is")
from nltk.stem import WordNetLemmatizer  # Reduce words to base form (e.g., "running" → "run")
from string import punctuation  # Punctuation marks (e.g., ".", ",")
from gensim.corpora import Dictionary  # Create a word-to-id mapping for LDA
from nltk.tokenize import word_tokenize  # Split text into words
from gensim.models.ldamodel import LdaModel, CoherenceModel  # LDA model and evaluation
import pyLDAvis  # Interactive topic visualization
import pyLDAvis.gensim  # Gensim integration for pyLDAvis
import matplotlib.pyplot as plt  # Plotting (not used here but often helpful)
%matplotlib inline  # Display plots in Jupyter Notebook

# Load the dataset (20 Newsgroups, JSON format) directly from GitHub.
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print("Dataset Preview:")
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it explicitly so the preview really follows the header.
print(df.head())  # Show first 5 rows

# Text Preprocessing Functions

def removing_email(text):
    """Replace every e-mail-like token in *text* with a single space.

    A token is any run of non-whitespace characters that contains ``@``
    (e.g. ``user@domain.com``). One whitespace character directly after the
    token, if present, is consumed together with it.
    """
    email_pattern = re.compile(r'\S*@\S*\s?')
    return email_pattern.sub(' ', text)

def only_words(text):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    Digits, punctuation and other symbols are blanked out (not kept), so the
    result contains only the letters a-z / A-Z and whitespace. Each removed
    character becomes one space, which keeps the string length unchanged.
    """
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)  # Anything but letters/whitespace -> space
    return text

# Tokens to discard during cleaning, kept as one flat list:
#   - NLTK's English stopwords (de-duplicated through a set),
#   - every single punctuation character,
#   - a few extra noise strings (e.g., line breaks and dashes).
stop_words = [
    *set(stopwords.words('english')),
    *punctuation,
    '\n',
    '----',
    '---\n\n\n\n\n',
]

# Shared WordNet lemmatizer instance used by the cleaning step.
lem = WordNetLemmatizer()

def cleaning(text):
    """Turn a raw document string into a list of normalised tokens.

    Pipeline:
    1. Lowercase the text.
    2. Tokenize it with ``word_tokenize``.
    3. Drop tokens that appear in the module-level ``stop_words``.
    4. Drop tokens shorter than 3 characters.
    5. Lemmatize every remaining token as a verb (e.g., "running" -> "run").

    Returns the lemmatized tokens as a list, in their original order.
    """
    # `stop_words` is a list, so `w not in stop_words` would scan it once per
    # token. Build a set once per call for O(1) lookups; doing it here rather
    # than at module level also picks up any later edits to `stop_words`.
    excluded = frozenset(stop_words)
    return [
        lem.lemmatize(token, 'v')
        for token in word_tokenize(text.lower())
        if token not in excluded and len(token) >= 3
    ]

# Apply the preprocessing pipeline. Each stage is stored in its own column so
# the intermediate results stay available for inspection:
#   content -> without email -> only words -> clean content (token lists)
df['without email'] = df['content'].apply(removing_email)
df['only words'] = df['without email'].apply(only_words)
df['clean content'] = df['only words'].apply(cleaning)
print("\nProcessed Data Preview:")
# Printed explicitly: a bare `df.head()` in the middle of a cell shows nothing.
print(df.head())

# Prepare data for LDA: one token list per document.
clean_doc = df['clean content'].tolist()

# Gensim Dictionary:
# - Maps each word to a unique integer ID.
# - Extremes can optionally be filtered (usually improves model quality).
dictionary = Dictionary(documents=clean_doc)
# dictionary.filter_extremes(no_below=5, no_above=0.5)  # Optional: Remove rare/common words

# Corpus creation:
# - Converts every document into Bag-of-Words (BoW) format.
# - Each document becomes a list of (word_id, frequency) tuples.
corpus = list(map(dictionary.doc2bow, clean_doc))

# LDA model training:
# - num_topics=5: number of latent topics to extract
# - random_state=42: reproducibility
# - passes=50: number of full passes over the corpus (more passes -> better convergence)
# - chunksize=100: number of documents processed per training chunk
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=50,
    chunksize=100,
    update_every=1,
    alpha='auto',  # Let the model learn the document-topic prior
    eta='auto',  # Let the model learn the topic-word prior
    random_state=42,
)

# Display topics (each topic: top 10 words + weights).
print("\nDiscovered Topics:")
print(ldamodel.print_topics())

# Model evaluation
# 1. Perplexity: lower = better.
#    - log_perplexity() returns the per-word likelihood bound, NOT the
#      perplexity itself; for the bound, higher (closer to 0) is better.
#    - Perplexity is derived from it as 2 ** (-bound).
#    - The score below is computed on the training corpus, so it does not
#      measure how well the model predicts unseen documents.
log_perplexity_bound = ldamodel.log_perplexity(corpus)
print("\nLog Perplexity:", log_perplexity_bound)
print("Perplexity:", 2 ** (-log_perplexity_bound))

# 2. Coherence scores:
#    - c_v: higher = better (typically between 0 and 1, reflects topic interpretability)
#    - u_mass: closer to 0 = better (values are usually negative)
coherence_cv = CoherenceModel(
    model=ldamodel,
    texts=clean_doc,
    dictionary=dictionary,
    coherence='c_v'
)
print("\nCoherence (c_v):", coherence_cv.get_coherence())

coherence_umass = CoherenceModel(
    model=ldamodel,
    texts=clean_doc,
    dictionary=dictionary,
    coherence='u_mass'
)
print("Coherence (u_mass):", coherence_umass.get_coherence())

# Interactive visualization with pyLDAvis:
# - The λ (lambda) slider adjusts term relevance.
# - Bubble size = topic prevalence.
# - Distance between topics ≈ dissimilarity.
# NOTE(review): newer pyLDAvis releases appear to expose this helper as
# `pyLDAvis.gensim_models` instead of `pyLDAvis.gensim` — confirm against the
# installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
vis  # Last expression of the cell, so the notebook renders the visualization