In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF #Imports Non-Negative Matrix Factorization (NMF) model for topic extraction

# Toy corpus: five short sentences, one document per list entry.
documents = [
    "Neural networks are used for deep learning",
    "Machine learning models require data",
    "Deep learning uses neural networks and data",
    "Data analysis and statistics are essential in data science",
    "Statistical models help understand patterns in data"
]

# TF-IDF featuriser; stop_words='english' filters out common English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary from the corpus (fit) and converts the
# text into a TF-IDF weighted document-term matrix (transform) in one call.
X = vectorizer.fit_transform(documents)


In [3]:
# Factorise the TF-IDF matrix into two non-negative factors, W and H.
nmf = NMF(
    n_components=2,   # number of topics to extract
    random_state=0,   # fixed seed so reruns give the same factorisation
)
# Rows of W are documents, columns are topics (document -> topic weights).
W = nmf.fit_transform(X)
# Rows of H are topics, columns are vocabulary terms (topic -> word weights).
# Must be read after fitting, which is when components_ is populated.
H = nmf.components_


In [4]:
# Vocabulary terms of the TF-IDF model, indexed the same way as the columns of H.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(H):
    # argsort orders indices by ascending weight, so the last six are the
    # heaviest terms for this topic, listed from lightest to heaviest.
    top_terms = topic.argsort()[-6:]
    print(f"\nTopic {topic_idx}:")
    # Look up all six terms in one indexing step and print them as a plain list.
    print(feature_names[top_terms].tolist())



Topic 0:
['uses', 'used', 'learning', 'deep', 'neural', 'networks']

Topic 1:
['understand', 'help', 'machine', 'require', 'models', 'data']


In [5]:
import pandas as pd
# Document-topic weights as a table: one row per document, one column per topic.
# Column labels are derived from W's width so they stay in sync with the
# n_components passed to NMF, instead of being hard-coded for exactly two topics
# (a hard-coded list raises a shape-mismatch error as soon as that number changes).
topic_labels = [f"Topic {k}" for k in range(W.shape[1])]
doc_topic_df = pd.DataFrame(W, columns=topic_labels)
doc_topic_df


Unnamed: 0,Topic 0,Topic 1
0,0.675686,0.0
1,0.070122,0.638651
2,0.657539,0.079626
3,0.0,0.543569
4,0.0,0.660142
