### LDA

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation #Imports scikit-learn’s LDA implementation. 

# Tiny toy corpus: three short sentences to run topic modelling on.
documents = [
    "Neural networks are used for deep learning",
    "Machine learning models require data",
    "Deep learning uses neural networks and data"
]

# Bag-of-words vectorizer configured to drop common English stop words.
vectorizer = CountVectorizer(stop_words='english')

# A single call both learns the vocabulary and returns the document-term
# count matrix (one row per document, one column per vocabulary term).
X = vectorizer.fit_transform(documents)

# Two-topic LDA; the fixed random_state keeps repeated runs reproducible.
# fit() returns the estimator itself, so construction and fitting are chained.
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)

# Vocabulary terms, in the same order as the columns of X.
feature_names = vectorizer.get_feature_names_out()

# Each row of lda.components_ holds the per-term weights of one topic.
for topic_idx, weights in enumerate(lda.components_):
    # argsort orders term indices by ascending weight, so the last five
    # indices are the five heaviest terms (the heaviest is printed last).
    strongest = weights.argsort()[-5:]
    print(f"\nTopic {topic_idx}:")
    print([feature_names[term_id] for term_id in strongest])




Topic 0:
['used', 'learning', 'neural', 'deep', 'networks']

Topic 1:
['machine', 'models', 'require', 'learning', 'data']
