In [8]:
from gensim import corpora, models
from pprint import pprint

In [9]:
# Sample documents
documents = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Topic modeling is used to identify topics within a collection of documents.",
    "Latent Dirichlet Allocation is a popular algorithm for topic modeling.",
    "Python is a programming language commonly used in machine learning.",
    "Machine learning involves the use of algorithms to make predictions.",
    "Artificial intelligence is a broad field that aims to create intelligent agents.",
]

# Tokenize and preprocess the documents
tokenized_documents = [document.lower().split() for document in documents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(tokenized_documents)

# Create a bag-of-words corpus
corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

# Apply LDA (Latent Dirichlet Allocation) model
lda_model = models.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)

# Print the topics and associated words
pprint(lda_model.print_topics())

[(0,
  '0.053*"topic" + 0.053*"dirichlet" + 0.053*"popular" + 0.053*"latent" + '
  '0.053*"modeling." + 0.053*"algorithm" + 0.053*"allocation" + 0.053*"for" + '
  '0.053*"a" + 0.053*"is"'),
 (1,
  '0.074*"is" + 0.074*"a" + 0.040*"artificial" + 0.040*"language" + '
  '0.040*"used" + 0.040*"to" + 0.040*"of" + 0.023*"intelligent" + '
  '0.023*"field" + 0.023*"agents."'),
 (2,
  '0.053*"machine" + 0.053*"of" + 0.053*"to" + 0.053*"algorithms" + '
  '0.053*"learning" + 0.053*"predictions." + 0.053*"make" + 0.053*"use" + '
  '0.053*"the" + 0.053*"involves"')]


In [10]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(0, 1),
  (3, 1),
  (6, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1)],
 [(0, 1),
  (3, 1),
  (14, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1)],
 [(0, 1),
  (3, 1),
  (4, 1),
  (16, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1)],
 [(6, 1),
  (13, 1),
  (28, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(0, 1),
  (1, 1),
  (3, 1),
  (13, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1)]]