In [1]:
!pip install wikipedia-api





[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import os
import re
import string
import pickle
import wikipediaapi
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import numpy as np
import pandas as pd


In [3]:
# Fetch the NLTK resources used further down: the stop-word list, the two
# Punkt tokenizer packages, and WordNet (needed by WordNetLemmatizer).
for resource_name in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource_name)

# Kept as the final bare expression so the cell still displays its return value.
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexandru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alexandru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Alexandru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alexandru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# --- Corpus configuration ---------------------------------------------------
ARTICLE_DIR = "wiki_articles"   # folder that receives one .txt file per article
MAX_ARTICLE_CHARS = 5000        # every article is truncated to this many characters

wiki_api = wikipediaapi.Wikipedia(language='en', user_agent='NLPHomeworkBot/1.0 (alexandru@example.com)')

# Category label -> Wikipedia page titles that make up the corpus.
topic_articles = {
    "MachineLearning": ["Machine learning", "Deep learning", "Neural network", "Supervised learning", "Unsupervised learning"],
    "Astronomy": ["Star", "Galaxy", "Black hole", "Solar System", "Exoplanet"],
    "History": ["World War II", "Roman Empire", "Industrial Revolution", "Ancient Greece", "French Revolution"]
}

os.makedirs(ARTICLE_DIR, exist_ok=True)

# Parallel lists: raw_texts[i] is the truncated article text and
# topic_labels[i] is the category it was requested under.
raw_texts = []
topic_labels = []

print("Starting Wikipedia article download...")

for category, titles in topic_articles.items():
    for title in titles:
        page = wiki_api.page(title)
        if not page.exists():
            print(f"Skipped missing page: {title}")
            continue

        text = page.text[:MAX_ARTICLE_CHARS]
        raw_texts.append(text)
        topic_labels.append(category)

        # Spaces become underscores as before; path separators and characters
        # that Windows rejects in file names are replaced too, so a title such
        # as "AC/DC" cannot escape ARTICLE_DIR or make open() fail.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title.replace(' ', '_'))
        filename = f"{ARTICLE_DIR}/{safe_title}.txt"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)

        print(f"Downloaded and saved article: {title} (Category: {category}, Length: {len(text)} chars)")

print(f"\nTotal downloaded documents: {len(raw_texts)}")
print("Articles saved inside /wiki_articles/")


Starting Wikipedia article download...
Downloaded and saved article: Machine learning (Category: MachineLearning, Length: 5000 chars)
Downloaded and saved article: Deep learning (Category: MachineLearning, Length: 5000 chars)
Downloaded and saved article: Neural network (Category: MachineLearning, Length: 4185 chars)
Downloaded and saved article: Supervised learning (Category: MachineLearning, Length: 5000 chars)
Downloaded and saved article: Unsupervised learning (Category: MachineLearning, Length: 5000 chars)
Downloaded and saved article: Star (Category: Astronomy, Length: 5000 chars)
Downloaded and saved article: Galaxy (Category: Astronomy, Length: 5000 chars)
Downloaded and saved article: Black hole (Category: Astronomy, Length: 5000 chars)
Downloaded and saved article: Solar System (Category: Astronomy, Length: 5000 chars)
Downloaded and saved article: Exoplanet (Category: Astronomy, Length: 5000 chars)
Downloaded and saved article: World War II (Category: History, Length: 5000 c

In [14]:
# English stop words held in a set; clean_text uses it for membership tests.
stop_words_set = set(stopwords.words('english'))
# Single lemmatizer instance reused by every clean_text call.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps: lowercase the text, replace ASCII punctuation with spaces, tokenize
    with ``nltk.word_tokenize``, keep only purely alphabetic tokens that are
    not in ``stop_words_set``, lemmatize each remaining token with
    ``lemmatizer``, and join the result with single spaces.

    Args:
        text: Raw document text.

    Returns:
        The cleaned document as one string (empty if no token survives).
    """
    text = text.lower()
    # Escape the punctuation before placing it in a character class so that
    # regex metacharacters in it are always treated literally, rather than
    # relying on the order in which string.punctuation happens to list them.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    tokens = nltk.word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words_set
    )

# Apply clean_text to every downloaded article, keeping the original order.
print("\nStarting preprocessing...")
processed_texts = list(map(clean_text, raw_texts))
print(f"Finished preprocessing {len(processed_texts)} documents.")

# Show the start of the first cleaned document as a quick sanity check.
first_doc_preview = processed_texts[0][:250]
print(f"Example of processed text (first 250 chars):\n{first_doc_preview}")



Starting preprocessing...
Finished preprocessing 15 documents.
Example of processed text (first 250 chars):
machine learning ml field study artificial intelligence concerned development study statistical algorithm learn data generalise unseen data thus perform task without explicit instruction within subdiscipline machine learning advance field deep learni


In [15]:
print("\nCreating vector representations...")

# Two views of the same cleaned corpus, each capped at 2000 features:
# raw term counts and TF-IDF weights.
bow_vectorizer = CountVectorizer(max_features=2000)
tfidf_vectorizer = TfidfVectorizer(max_features=2000)

bow_matrix = bow_vectorizer.fit_transform(processed_texts)
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_texts)

for matrix_label, matrix in (("Bag-of-Words", bow_matrix), ("TF-IDF", tfidf_matrix)):
    print(f"{matrix_label} matrix shape: {matrix.shape}")

vocabulary_sample = list(bow_vectorizer.get_feature_names_out()[:20])
print(f"Vocabulary sample (first 20 terms):\n{vocabulary_sample}")



Creating vector representations...
Bag-of-Words matrix shape: (15, 2000)
TF-IDF matrix shape: (15, 2000)
Vocabulary sample (first 20 terms):
['able', 'absorbing', 'abstraction', 'accelerating', 'acceleration', 'accession', 'accompanied', 'according', 'account', 'accountability', 'accretion', 'accuracy', 'accurately', 'achaemenid', 'across', 'act', 'action', 'actium', 'activation', 'active']


In [18]:
num_topics = 5
print("\nPerforming Latent Semantic Analysis (SVD)...")

# Truncated SVD on the TF-IDF matrix; fit() returns the fitted estimator.
lsa_model = TruncatedSVD(n_components=num_topics, random_state=42).fit(tfidf_matrix)

lsa_terms = tfidf_vectorizer.get_feature_names_out()
for topic_number, weights in enumerate(lsa_model.components_, start=1):
    # argsort is ascending, so reverse it and keep the ten largest weights.
    top_indices = weights.argsort()[::-1][:10]
    print(f"\nLSA Topic {topic_number}:")
    print("Top words: " + ", ".join(lsa_terms[idx] for idx in top_indices))

print("\nLSA transformation complete.")



Performing Latent Semantic Analysis (SVD)...

LSA Topic 1:
Top words: learning, network, neural, algorithm, machine, layer, data, deep, unsupervised, training

LSA Topic 2:
Top words: star, planet, solar, galaxy, system, mass, sun, black, bc, hole

LSA Topic 3:
Top words: bc, roman, war, empire, century, classical, period, revolution, greece, industrial

LSA Topic 4:
Top words: revolution, industrial, planet, crisis, tax, solar, growth, economic, public, financial

LSA Topic 5:
Top words: galaxy, star, black, hole, revolution, catalogue, industrial, milky, way, light

LSA transformation complete.


In [19]:
print("\nPerforming Non-Negative Matrix Factorization...")

# NMF on the TF-IDF matrix; fit() returns the fitted estimator.
nmf_model = NMF(n_components=num_topics, random_state=42).fit(tfidf_matrix)

nmf_terms = tfidf_vectorizer.get_feature_names_out()
for topic_number, weights in enumerate(nmf_model.components_, start=1):
    # argsort is ascending, so reverse it and keep the ten largest weights.
    top_indices = weights.argsort()[::-1][:10]
    print(f"\nNMF Topic {topic_number}:")
    print("Top words: " + ", ".join(nmf_terms[idx] for idx in top_indices))

print("\nNMF topic extraction complete.")



Performing Non-Negative Matrix Factorization...

NMF Topic 1:
Top words: learning, network, neural, algorithm, machine, layer, data, deep, unsupervised, training

NMF Topic 2:
Top words: planet, solar, system, sun, star, mass, exoplanets, exoplanet, object, orbit

NMF Topic 3:
Top words: bc, roman, war, empire, classical, greece, period, century, emperor, rome

NMF Topic 4:
Top words: revolution, industrial, crisis, tax, growth, economic, political, public, financial, production

NMF Topic 5:
Top words: star, galaxy, black, hole, catalogue, milky, way, astronomer, light, known

NMF topic extraction complete.


In [20]:
print("\nPerforming Latent Dirichlet Allocation (sklearn)...")

# LDA is fitted on the raw count matrix rather than the TF-IDF one.
lda_model_sklearn = LatentDirichletAllocation(
    n_components=num_topics,
    learning_method='batch',
    random_state=42,
).fit(bow_matrix)

lda_terms = bow_vectorizer.get_feature_names_out()
for topic_number, weights in enumerate(lda_model_sklearn.components_, start=1):
    # argsort is ascending, so reverse it and keep the ten largest weights.
    top_indices = weights.argsort()[::-1][:10]
    print(f"\nLDA Topic {topic_number}:")
    print("Top words: " + ", ".join(lda_terms[idx] for idx in top_indices))

print("\nLDA model training complete.")



Performing Latent Dirichlet Allocation (sklearn)...

LDA Topic 1:
Top words: war, black, hole, world, light, germany, japan, soviet, ii, star

LDA Topic 2:
Top words: bc, century, period, roman, empire, classical, political, power, state, greece

LDA Topic 3:
Top words: star, galaxy, planet, mass, milky, way, system, catalogue, astronomer, object

LDA Topic 4:
Top words: system, solar, sun, planet, cloud, orbit, body, within, object, au

LDA Topic 5:
Top words: learning, network, machine, neural, algorithm, data, layer, model, training, supervised

LDA model training complete.


In [21]:
print("\nEvaluating topic coherence using gensim...")

# gensim works on token lists, so split the cleaned strings back into tokens
# and build the dictionary and bag-of-words corpus from them.
tokenized_texts = list(map(str.split, processed_texts))
gensim_dictionary = corpora.Dictionary(tokenized_texts)
gensim_corpus = list(map(gensim_dictionary.doc2bow, tokenized_texts))

gensim_lda_model = LdaModel(
    corpus=gensim_corpus,
    id2word=gensim_dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=42,
)

# C_v coherence is computed from the tokenized texts and the same dictionary.
coherence_model = CoherenceModel(
    model=gensim_lda_model,
    texts=tokenized_texts,
    dictionary=gensim_dictionary,
    coherence='c_v',
)
lda_coherence_score = coherence_model.get_coherence()

print(f"Gensim LDA model created with {num_topics} topics.")
print(f"Topic coherence score (C_v): {lda_coherence_score:.3f}")



Evaluating topic coherence using gensim...
Gensim LDA model created with 5 topics.
Topic coherence score (C_v): 0.422


In [22]:
print("\nPreparing LDA visualization data...")

pyLDAvis.enable_notebook()

# Output locations: a pickle of the prepared data and a standalone HTML page.
results_dir = './results'
os.makedirs(results_dir, exist_ok=True)
visualization_path = os.path.join(results_dir, f'ldavis_prepared_{num_topics}.pkl')
html_path = f'{results_dir}/ldavis_prepared_{num_topics}.html'

lda_visualization = pyLDAvis.gensim.prepare(gensim_lda_model, gensim_corpus, gensim_dictionary)

with open(visualization_path, 'wb') as pickle_file:
    pickle.dump(lda_visualization, pickle_file)
pyLDAvis.save_html(lda_visualization, html_path)

print(f"Visualization prepared and saved to: {html_path}")
# Final bare expression so the notebook renders the interactive view inline.
lda_visualization



Preparing LDA visualization data...
Visualization prepared and saved to: ./results/ldavis_prepared_5.html
