# Task 1

In [102]:
import wikipediaapi
import random
import numpy as np
import string
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.preprocessing import normalize

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
folder_path = "wikipedia_content"

[nltk_data] Downloading package stopwords to C:\Users\Ardeleanu
[nltk_data]     Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [103]:
def save_wikipedia_content_to_files(topic, num_titles=5):
    wiki_wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')
    search_page = wiki_wiki.page(topic)

    linked_pages = search_page.links
    linked_titles = list(linked_pages.keys())

    random_titles = random.sample(linked_titles, num_titles)

    directory = "wikipedia_content"
    os.makedirs(directory, exist_ok=True)

    result = []
    for title in random_titles:
        page = wiki_wiki.page(title)
        content = page.text
        result.append((title, content))
        
        filename = os.path.join(directory, f"{title}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"Saved content to {filename}")

    return result

In [104]:
#save_wikipedia_content_to_files("movies")
#save_wikipedia_content_to_files("drinks")
#save_wikipedia_content_to_files("cars")

In [105]:
def process_text_files(folder_path):
    stemmer = PorterStemmer()
    count_vectorizer = CountVectorizer()
    tfidf_vectorizer = TfidfVectorizer()
    
    documents = []

    content = ""

    for file_name in os.listdir(folder_path):
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, "r", encoding="utf-8") as file:
                content += file.read()

    tokens = content.split()
    processed_tokens = [stemmer.stem(word) for word in tokens if word.lower() not in stop_words]
    processed_text = " ".join(processed_tokens)

    documents.append(processed_text)

    #Bag-of-Words
    bow_matrix = count_vectorizer.fit_transform(documents)
    bow_feature_names = count_vectorizer.get_feature_names_out()

    #TF-IDF
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Bag-of-Words vector
    print("\nBag-of-Words Vector:")
    bow_vector = bow_matrix.toarray()[0]
    bowdict = {word: bow_vector[idx] for idx, word in enumerate(bow_feature_names) if bow_vector[idx] > 0}
    #print(bow_dict)
    with open("BOW.txt", "w", encoding="utf-8") as file:
        for key, value in bowdict.items():
            #print(f"key {key}, value {value}")
            file.write(f"{key}: {value}\n")
    # TF-IDF vector
    print("\nTF-IDF Vector:")
    tfidf_vector = tfidf_matrix.toarray()[0]
    tfidfdict = {word: tfidf_vector[idx] for idx, word in enumerate(tfidf_feature_names) if tfidf_vector[idx] > 0}
    #print(tfidfdict)
    with open("TF_IDF.txt", "w", encoding="utf-8") as file:
        for key, value in tfidfdict.items():
            file.write(f"{key}: {value}\n")
    return bowdict, tfidfdict

In [106]:
folder_path = 'wikipedia_content'
bow_dict, tfidf_dict = process_text_files(folder_path)


Bag-of-Words Vector:

TF-IDF Vector:


# Task 2

In [107]:
terms = list(bow_dict.keys())  # or tfidf_dict.keys()
bow_values = np.array(list(bow_dict.values())).reshape(1, -1)  # Single-row matrix for BOW
tfidf_values = np.array(list(tfidf_dict.values())).reshape(1, -1)  # Single-row matrix for TF-IDF

In [108]:
# SVD on the single-row BOW vector
svd_bow = TruncatedSVD(n_components=3)
bow_lsa = svd_bow.fit_transform(bow_values)

# SVD on the single-row TF-IDF vector
svd_tfidf = TruncatedSVD(n_components=3)
tfidf_lsa = svd_tfidf.fit_transform(tfidf_values)

  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var


In [109]:
print("LSA result on BOW (single vector, 3 concepts):")
print(bow_lsa)

print("\nLSA result on TF-IDF (single vector, 3 concepts):")
print(tfidf_lsa)


LSA result on BOW (single vector, 3 concepts):
[[1062.42082058]]

LSA result on TF-IDF (single vector, 3 concepts):
[[1.]]


# Attempt 2

In [110]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    words = text.split()
    
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    return ' '.join(words)

In [111]:
folder_path = 'wikipedia_content'
documents = []

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        documents.append(preprocess_text(text))

num_topics = 3

In [112]:
# BOW Vectorization
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(documents)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

In [113]:
lsa_bow = TruncatedSVD(num_topics).fit_transform(bow_matrix)
lsa_tfidf = TruncatedSVD(num_topics).fit_transform(tfidf_matrix)

print("LSA result on BOW (3 topics):")
print(lsa_bow)

#normalized version
lsa_bow_normalized = normalize(lsa_bow, norm='l2', axis=1)
print()
print("Normalized LSA result on BOW (unit norm):")
print(lsa_bow_normalized)

print("\nLSA result on TF-IDF (3 topics):")
print(lsa_tfidf)

LSA result on BOW (3 topics):
[[ 1.00305373e+02  3.04051533e+02 -1.33107980e+02]
 [ 8.49288228e+01  1.43093802e+02  2.28747134e+02]
 [ 4.35814738e+02 -1.13501256e+02 -2.48573969e+01]
 [ 6.27357934e+00  8.25661747e+00  5.47861804e+00]
 [ 4.55903364e+01  4.17823726e+01  4.33690014e+01]
 [ 6.27016450e-01  4.14317495e-01  4.63677946e-01]
 [ 4.31632438e+01  3.68391819e+01  2.75433570e+01]
 [ 5.27078452e+00  5.23873518e+00  2.51240858e+00]
 [ 3.66028304e+00  1.92828088e+00  1.64052058e+00]
 [ 4.47843278e-01  2.76801087e-01  3.26206715e-01]
 [ 1.87592673e+01  1.89417629e+01  8.96718045e+00]
 [ 3.29019660e+01  3.01191328e+01  1.60391838e+01]
 [ 3.30866714e+01  5.54171412e+01  2.43024386e+01]
 [ 6.67643210e+00  5.22815271e+00  3.72729272e+00]
 [ 4.67968632e+00  3.87196476e+00  2.65146569e+00]]

Normalized LSA result on BOW (unit norm):
[[ 0.28928411  0.87689495 -0.38388794]
 [ 0.30024248  0.50586875  0.80867253]
 [ 0.96624925 -0.25164478 -0.05511159]
 [ 0.53492554  0.70401206  0.46714205]
 [ 0.

# Task 3

In [114]:
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf_matrix)

#topic-term matrix (num_topics, num_terms)
topic_term_matrix = nmf_model.components_

#document-topic matrix (num_documents, num_topics)
doc_topic_matrix = nmf_model.transform(tfidf_matrix)

print("Document-Topic Matrix (W):")
print(doc_topic_matrix)
print("\n")

print("Topic-Term Matrix (H):")
print(topic_term_matrix)
print("\n")


Document-Topic Matrix (W):
[[0.         0.         0.63759549]
 [0.         0.         0.6173946 ]
 [0.10844715 0.02265308 0.18100322]
 [0.         0.         0.42835028]
 [0.57239398 0.         0.01825651]
 [0.3451726  0.         0.        ]
 [0.5711115  0.00671449 0.00743697]
 [0.         0.         0.29572751]
 [0.00765448 0.53042129 0.00771921]
 [0.25013341 0.00410106 0.        ]
 [0.         0.09337505 0.36784966]
 [0.01111176 0.03007721 0.34233233]
 [0.00369974 0.         0.38181126]
 [0.         0.7750044  0.        ]
 [0.         0.73578363 0.        ]]


Topic-Term Matrix (H):
[[3.16045214e-04 6.32090428e-04 3.16045214e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.70825930e-05 5.41651860e-05 2.70825930e-05 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.04197636e-04 6.08395272e-04 3.04197636e-04 ... 1.81484844e-03
  1.81484844e-03 4.51497306e-03]]




In [115]:
#feature names
terms = tfidf_vectorizer.get_feature_names_out()

top_topics = 5

for topic_idx, topic in enumerate(topic_term_matrix):
    top_terms_idx = topic.argsort()[-top_topics:][::-1] #topics - top n = top_topics words, descending order
    top_terms = [terms[i] for i in top_terms_idx]
    print(f"Topic #{topic_idx}:")
    print("Top terms:", " ".join(top_terms))


Topic #0:
Top terms: film documentari dunkirk nolan best
Topic #1:
Top terms: vehicl wheelbas tyre brake displaystyl
Topic #2:
Top terms: archaeolog africa african culinari seder


In [116]:
for doc_idx, doc in enumerate(doc_topic_matrix):
    #highest value topic
    top_topic_idx = doc.argmax()
    print(f"Document #{doc_idx + 1} is most strongly associated with Topic #{top_topic_idx + 1} - weight of {doc[top_topic_idx]:.4f}")


Document #1 is most strongly associated with Topic #3 - weight of 0.6376
Document #2 is most strongly associated with Topic #3 - weight of 0.6174
Document #3 is most strongly associated with Topic #3 - weight of 0.1810
Document #4 is most strongly associated with Topic #3 - weight of 0.4284
Document #5 is most strongly associated with Topic #1 - weight of 0.5724
Document #6 is most strongly associated with Topic #1 - weight of 0.3452
Document #7 is most strongly associated with Topic #1 - weight of 0.5711
Document #8 is most strongly associated with Topic #3 - weight of 0.2957
Document #9 is most strongly associated with Topic #2 - weight of 0.5304
Document #10 is most strongly associated with Topic #1 - weight of 0.2501
Document #11 is most strongly associated with Topic #3 - weight of 0.3678
Document #12 is most strongly associated with Topic #3 - weight of 0.3423
Document #13 is most strongly associated with Topic #3 - weight of 0.3818
Document #14 is most strongly associated with T

# Task 4

In [117]:
lda = LatentDirichletAllocation(num_topics, random_state=42)
lda.fit(bow_matrix)

#feature names
terms = bow_vectorizer.get_feature_names_out()

top_topics = 5

for topic_idx, topic in enumerate(lda.components_):
    top_terms_idx = topic.argsort()[-top_topics:][::-1]
    top_terms = [terms[i] for i in top_terms_idx]
    
    print(f"Topic #{topic_idx + 1}:")
    print("Top terms:", " ".join(top_terms))
    print("\n")


Topic #1:
Top terms: film documentari ekiben use tyre


Topic #2:
Top terms: disc bluray archaeolog use film


Topic #3:
Top terms: africa seder african peopl egypt


