In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [8]:
from gensim import corpora
from gensim.models import LdaModel

In [11]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azif\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azif\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
documents = [
"Rafael Nadal Joins Roger Federer in Missing U.S. Open", "Rafael Nadal Is Out of the Australian Open",
"Biden Announces Virus Measures",
"Biden's Virus Plans Meet Reality",
"Where Biden's Virus Plan Stands"]

In [37]:
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)  # Corrected: Now we're calling word_tokenize with the input text
    tokens = [token for token in tokens if token.isalnum()]  # Remove punctuation
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatize tokens
    return tokens

# Apply the preprocessing to each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]
print(preprocessed_documents)

[['Rafael', 'Nadal', 'Joins', 'Roger', 'Federer', 'Missing', 'Open'], ['Rafael', 'Nadal', 'Is', 'Out', 'Australian', 'Open'], ['Biden', 'Announces', 'Virus', 'Measures'], ['Biden', 'Virus', 'Plans', 'Meet', 'Reality'], ['Where', 'Biden', 'Virus', 'Plan', 'Stands']]


In [39]:
dictionary = corpora.Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [41]:
#corpus: bag-of-words representation of the documents
#num_topics: number of topics to be extracted by the model
#id2word=dictionary: dictionary mapping from word IDs to word
#passes: number of passes through the corpus during training 
# Train an LDA model on the corpus with 4 topics using Gensim’s LdaModel class
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

In [43]:
article_labels = []

for i, doc in enumerate (preprocessed_documents):
	bow = dictionary.doc2bow(doc)
	topics = lda_model.get_document_topics(bow)
	dominant_topic = max(topics, key=lambda x: x[1])[0]
	article_labels.append(dominant_topic)

In [49]:
import pandas as pd

In [51]:
df = pd.DataFrame({'Article': documents, 'Topic': article_labels})

print('Table with Articles and Topic: ')
print(df)
print()

Table with Articles and Topic: 
                                             Article  Topic
0  Rafael Nadal Joins Roger Federer in Missing U....      0
1         Rafael Nadal Is Out of the Australian Open      1
2                     Biden Announces Virus Measures      0
3                   Biden's Virus Plans Meet Reality      1
4                    Where Biden's Virus Plan Stands      0



In [61]:
print('Top Terms for Each Topic:')
for idx, topic in lda_model.print_topics():
	print(f'Topic {idx}:')
	terms = [term.strip() for term in topic.split('+')]
	for term in terms:
		weight, word = term.split('*')
		print(f'- {word.strip()} (weight:  {weight.strip()}')
	print()

Top Terms for Each Topic:
Topic 0:
- "Virus" (weight:  0.097
- "Biden" (weight:  0.097
- "Missing" (weight:  0.057
- "Joins" (weight:  0.057
- "Where" (weight:  0.057
- "Roger" (weight:  0.057
- "Stands" (weight:  0.057
- "Federer" (weight:  0.057
- "Plan" (weight:  0.057
- "Open" (weight:  0.057

Topic 1:
- "Nadal" (weight:  0.072
- "Rafael" (weight:  0.072
- "Open" (weight:  0.072
- "Is" (weight:  0.071
- "Australian" (weight:  0.071
- "Out" (weight:  0.071
- "Reality" (weight:  0.071
- "Meet" (weight:  0.071
- "Plans" (weight:  0.071
- "Biden" (weight:  0.070

