In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
import pyLDAvis.gensim_models


# Download the NLTK resources this notebook needs.
# - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Newer NLTK
#   releases load 'punkt_tab' instead of the pickled 'punkt' model, so both are
#   fetched to keep the notebook runnable across NLTK versions.
# - 'stopwords': the English stop-word list used during preprocessing.
# nltk.download skips packages that are already up to date, so re-running
# this cell is cheap.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Sample documents: a tiny corpus of five one-sentence documents.
# Each parenthesised group below is ONE document; adjacent string literals
# are joined by Python's implicit concatenation purely to keep lines short.
documents = [
    (
        "Machine learning is the study of computer algorithms "
        "that improve automatically through experience."
    ),
    (
        "Artificial intelligence is intelligence demonstrated by machines, "
        "in contrast to the natural intelligence displayed by humans and animals."
    ),
    (
        "Deep learning is part of a broader family of machine learning methods "
        "based on artificial neural networks with representation learning."
    ),
    (
        "Natural language processing (NLP) is a subfield of linguistics, "
        "computer science, and artificial intelligence concerned with "
        "the interactions between computers and human language."
    ),
    (
        "Data science is an interdisciplinary field that uses scientific methods, "
        "processes, algorithms and systems to extract knowledge and insights "
        "from structured and unstructured data."
    ),
]

# Preprocess documents
# English stop words, held in a set so membership checks are fast.
stop_words = set(stopwords.words('english'))

def preprocess(doc):
    """Turn a raw document string into a list of cleaned tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``. A token
    is kept only if it is purely alphabetic (``str.isalpha``) and is not in
    the module-level ``stop_words`` set.

    Args:
        doc: The document text to process.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    kept_tokens = []
    for token in word_tokenize(doc.lower()):
        # Drop punctuation/numbers (non-alphabetic) and stop words.
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Tokenise and clean every sample document.
processed_docs = list(map(preprocess, documents))

# Build the gensim dictionary (token <-> integer id mapping) from the
# cleaned documents.
dictionary = corpora.Dictionary(processed_docs)

# Build the corpus: one bag-of-words vector per document, produced by
# the dictionary's doc2bow.
corpus = list(map(dictionary.doc2bow, processed_docs))

# Set the number of topics
num_topics = 3

# Fixed seed for LDA training. Without it the model is initialised randomly,
# so the topics (and their order) change on every run and the notebook's
# results cannot be reproduced with Restart & Run All.
RANDOM_SEED = 42

# Build LDA model: 15 training passes over the bag-of-words corpus,
# using the dictionary to map token ids back to words.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)

# Print the topics (-1 requests all of them); topic ids are 0-based,
# so they are shown 1-based for readability.
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")




Topic 1: 0.124*"intelligence" + 0.050*"natural" + 0.049*"artificial" + 0.049*"contrast" + 0.049*"demonstrated" + 0.049*"animals" + 0.049*"displayed" + 0.049*"humans" + 0.049*"machines" + 0.012*"algorithms"
Topic 2: 0.082*"learning" + 0.044*"machine" + 0.044*"language" + 0.044*"computer" + 0.044*"artificial" + 0.025*"neural" + 0.025*"family" + 0.025*"based" + 0.025*"broader" + 0.025*"representation"
Topic 3: 0.073*"data" + 0.042*"algorithms" + 0.042*"science" + 0.042*"methods" + 0.042*"systems" + 0.042*"structured" + 0.042*"interdisciplinary" + 0.042*"uses" + 0.042*"unstructured" + 0.042*"scientific"


[nltk_data] Downloading package punkt to /home/nivedita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nivedita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Classify a new document
new_doc = "Neural networks and deep learning are gaining popularity in artificial intelligence."

# Apply the same preprocessing used for the training documents, then
# convert the tokens to a bag-of-words vector with the trained dictionary.
new_doc_tokens = preprocess(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_tokens)

# Infer the (topic id, probability) pairs for the new document.
topics = lda_model.get_document_topics(new_doc_bow)

print(f"New Document Topic Distribution: {topics}")

New Document Topic Distribution: [(0, 0.07574748), (1, 0.8762026), (2, 0.048049875)]
