In [None]:
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Download NLTK data (run once)
nltk.download('punkt')
# Newer NLTK releases make word_tokenize() load the 'punkt_tab' resource
# instead of the pickled 'punkt' models; fetch both so tokenization works
# regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Toy corpus: five short sentences stored in a single 'tweets' column
data = dict(
    tweets=[
        "Machine learning is fascinating.",
        "Natural language processing is a complex field.",
        "Deep learning is a subfield of machine learning.",
        "Artificial intelligence is a broad field.",
        "Supervised learning is a type of machine learning.",
    ],
)

# One row per sentence
df = pd.DataFrame(data, columns=['tweets'])

# Preprocessing function
_STOP_WORDS = None  # lazily-filled cache of the English stop-word set


def _english_stop_words():
    """Return the NLTK English stop words, reading the word list only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess(text):
    """Tokenize *text* into lowercase alphanumeric, non-stop-word tokens.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or a ``NaN`` cell from a DataFrame column) is treated
            as an empty document.

    Returns:
        A list of tokens in their original order. Tokens are lowercased;
        tokens that are not purely alphanumeric (punctuation and the like)
        or that appear in the English stop-word list are dropped.
    """
    # Missing values in a text column would otherwise crash on .lower().
    if not isinstance(text, str):
        return []

    # The stop-word set is cached so it is not rebuilt for every document.
    stop_words = _english_stop_words()
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

# Tokenize every tweet and keep the token lists in a new 'processed' column
df['processed'] = df['tweets'].apply(preprocess)

# Map each distinct token to an integer id
id2word = corpora.Dictionary(documents=df['processed'])

# Bag-of-words corpus: one doc2bow() result per tokenized document
corpus = list(map(id2word.doc2bow, df['processed']))

# Train the LDA topic model on the bag-of-words corpus
num_topics = 2
lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = LdaModel(**lda_settings)

# Report each topic id together with its formatted word list
for idx, topic in lda_model.print_topics(num_topics=-1):
    print(f"Topic: {idx}\nWords: {topic}\n")

# Convert the fitted model, corpus and dictionary into pyLDAvis input data
vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)

# Open the interactive topic visualization
# NOTE(review): this file looks like a notebook export; pyLDAvis also offers
# enable_notebook()/display() for inline rendering -- confirm show() is the
# intended entry point here.
pyLDAvis.show(data=vis_data)
