Step 1: Load and Preprocess the Data

In [14]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Read the article table, then the JSON mapping stored next to it.
wiki_df = pd.read_csv("people_wiki.csv")
with open("4_map_index_to_word.json", "r") as f:
    index_to_word = json.load(f)

# One entry per article: the URI identifies it, the text feeds the features.
article_ids = wiki_df['URI'].tolist()
documents = wiki_df['text'].tolist()

# Fit a TF-IDF vocabulary capped at 10,000 terms and rescale every row to
# unit L2 norm in a single expression.
vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = normalize(vectorizer.fit_transform(documents), norm='l2')

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


TF-IDF matrix shape: (59071, 10000)


Step 2: Initialization

In [16]:
import numpy as np
import scipy.sparse as sp
from sklearn.cluster import KMeans

K = 5

# Run k-means; its hard assignments seed the mixture parameters below.
print("Running k-means...")
kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
assignments = kmeans.fit_predict(tfidf_matrix)

# Component means start at the k-means centroids.
means = kmeans.cluster_centers_

# Mixing weights start at the fraction of rows assigned to each cluster.
N = tfidf_matrix.shape[0]
weights = np.bincount(assignments, minlength=K) / N

# Smallest variance allowed per feature, so later divisions and logs stay finite.
VARIANCE_FLOOR = 1e-8
variances = np.zeros((K, tfidf_matrix.shape[1]))

for k in range(K):
    cluster_points = tfidf_matrix[assignments == k]
    if cluster_points.shape[0] > 0:
        # mean((x - mu)^2) == mean(x^2) - 2*mu*mean(x) + mu^2 holds for any mu.
        # Using it keeps the data sparse: no copy of the centroid is stacked
        # for every row and no dense difference matrix is built.
        mean_x = np.asarray(cluster_points.mean(axis=0)).ravel()
        mean_x_sq = np.asarray(
            cluster_points.multiply(cluster_points).mean(axis=0)
        ).ravel()
        variances[k] = mean_x_sq - 2.0 * means[k] * mean_x + means[k] ** 2

# Clamp tiny values (and small negatives caused by rounding) to the floor.
np.maximum(variances, VARIANCE_FLOOR, out=variances)

print("Initialization complete.")
print(f"Means shape: {means.shape}")
print(f"Weights: {weights}")
print(f"Variances shape: {variances.shape}")


Running k-means...
Initialization complete.
Means shape: (5, 10000)
Weights: [0.19212473 0.21255777 0.17201334 0.26554485 0.15775931]
Variances shape: (5, 10000)


Step 3: E-Step – Compute Responsibilities and Log-Likelihood

In [18]:
import numpy as np
import scipy.sparse as sp
from scipy.special import logsumexp

def compute_log_likelihood_sparse(tfidf_matrix, means, variances, weights):
    """Weighted per-component Gaussian log-densities for every row.

    Evaluates, for each row x and component k,
    ``log(weights[k]) + log N(x | means[k], diag(variances[k]))`` without
    densifying the data: the squared distance is expanded as
    ``sum(x^2 / var) - 2 * sum(x * mu / var) + sum(mu^2 / var)``.

    Parameters
    ----------
    tfidf_matrix : sparse matrix of shape (N, D)
        Must provide ``.multiply`` and ``.dot`` (as SciPy sparse matrices do).
    means : ndarray of shape (K, D)
    variances : ndarray of shape (K, D)
        Diagonal variances; must be strictly positive because they are
        inverted and passed to ``log``.
    weights : sequence of length K
        Mixing weights; each is passed to ``log``.

    Returns
    -------
    ndarray of shape (N, K)
        Entry (i, k) is the weighted log-density of row i under component k.
    """
    N, D = tfidf_matrix.shape
    K = means.shape[0]
    log_probs = np.zeros((N, K))

    # Neither of these depends on the component, so compute them once
    # instead of once per loop iteration.
    x_sq = tfidf_matrix.multiply(tfidf_matrix)
    log_two_pi_term = np.log(2 * np.pi) * D

    for k in range(K):
        inv_var = 1.0 / variances[k]

        # Normalisation constant of a diagonal Gaussian.
        log_det = np.sum(np.log(variances[k]))
        const = -0.5 * (log_two_pi_term + log_det)

        # The three pieces of the expanded squared Mahalanobis distance.
        x_sq_scaled = x_sq.dot(inv_var)
        cross_term = tfidf_matrix.dot(means[k] * inv_var)
        mu_sq_term = np.sum((means[k] ** 2) * inv_var)

        mahalanobis = x_sq_scaled - 2 * cross_term + mu_sq_term
        log_probs[:, k] = const - 0.5 * mahalanobis + np.log(weights[k])

    return log_probs

# E-step: weighted log-density of every row under every component.
print("Computing memory-safe E-step...")
log_probs = compute_log_likelihood_sparse(tfidf_matrix, means, variances, weights)

# Normalise in log space: subtracting each row's log-sum-exp before
# exponentiating gives responsibilities that sum to one per row.
log_sum = logsumexp(log_probs, axis=1)
log_responsibilities = log_probs - log_sum.reshape(-1, 1)
responsibilities = np.exp(log_responsibilities)

# The data log-likelihood is the total of the per-row normalisers.
log_likelihood = log_sum.sum()
print(f"E-step complete. Log-likelihood: {log_likelihood:.4f}")


Computing memory-safe E-step...
E-step complete. Log-likelihood: 2290719663.5152


Step 4: M-Step (Maximization Step)

In [20]:
def m_step(tfidf_matrix, responsibilities):
    """Re-estimate mixture weights, means and diagonal variances.

    Parameters
    ----------
    tfidf_matrix : sparse matrix of shape (N, D)
        Must provide ``.multiply`` and ``.T.dot`` (as SciPy sparse matrices do).
    responsibilities : ndarray of shape (N, K)
        Soft assignment of each row to each component.

    Returns
    -------
    weights : ndarray of shape (K,)
        Column sums of ``responsibilities`` divided by N.
    means : ndarray of shape (K, D)
        Responsibility-weighted average of the rows.
    variances : ndarray of shape (K, D)
        Weighted ``E[x^2] - mean^2`` per feature, floored at 1e-8.

    Notes
    -----
    A component whose responsibilities sum to zero gets weight 0, mean 0 and
    the floor variance, instead of NaN from a division by zero.
    """
    N, D = tfidf_matrix.shape
    K = responsibilities.shape[1]

    # Effective number of rows per component.
    Nk = responsibilities.sum(axis=0)
    weights = Nk / N

    # Divide by a strictly positive count so an empty component cannot
    # produce 0/0; any count at or above the smallest normal float is unchanged.
    safe_Nk = np.maximum(Nk, np.finfo(float).tiny)

    # The element-wise square is the same for every component: build it once.
    x_sq = tfidf_matrix.multiply(tfidf_matrix)

    means = np.zeros((K, D))
    variances = np.zeros((K, D))
    for k in range(K):
        resp = responsibilities[:, k]

        # Weighted first and second moments for component k.
        means[k] = np.asarray(tfidf_matrix.T.dot(resp)).ravel() / safe_Nk[k]
        ex2 = np.asarray(x_sq.T.dot(resp)).ravel() / safe_Nk[k]

        # Var = E[x^2] - E[x]^2; the floor also absorbs small negative
        # values caused by rounding.
        variances[k] = np.maximum(ex2 - means[k] ** 2, 1e-8)

    return weights, means, variances


In [21]:
def _initial_cluster_variances(tfidf_matrix, assignments, means, floor=1e-8):
    """Per-cluster, per-feature variance of the rows around ``means``, floored."""
    K, D = means.shape
    variances = np.zeros((K, D))
    for k in range(K):
        cluster_points = tfidf_matrix[assignments == k]
        if cluster_points.shape[0] > 0:
            # mean((x - mu)^2) == mean(x^2) - 2*mu*mean(x) + mu^2 holds for any
            # mu. Using it avoids stacking one copy of the centroid per row
            # and forming a dense difference matrix.
            mean_x = np.asarray(cluster_points.mean(axis=0)).ravel()
            mean_x_sq = np.asarray(
                cluster_points.multiply(cluster_points).mean(axis=0)
            ).ravel()
            variances[k] = mean_x_sq - 2.0 * means[k] * mean_x + means[k] ** 2
    # Clusters with no rows keep zeros and therefore end up at the floor.
    return np.maximum(variances, floor)


def run_em(tfidf_matrix, K=5, max_iter=20, threshold=1e-4):
    """Fit a diagonal-covariance Gaussian mixture with EM, seeded by k-means.

    Parameters
    ----------
    tfidf_matrix : sparse matrix of shape (N, D)
    K : int
        Number of mixture components (and k-means clusters).
    max_iter : int
        Maximum number of E-steps; must be at least 1.
    threshold : float
        Stop once the absolute change in total log-likelihood between two
        consecutive E-steps falls below this value.

    Returns
    -------
    responsibilities : ndarray of shape (N, K)
    weights : ndarray of shape (K,)
    means : ndarray of shape (K, D)
    variances : ndarray of shape (K, D)
    log_likelihoods : list of float
        One entry per E-step performed.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1 (no E-step would run, so there
        would be nothing to return).

    Notes
    -----
    An M-step follows every E-step that does not trigger convergence. If the
    loop ends because ``max_iter`` was reached, the returned parameters come
    from the final M-step, while ``responsibilities`` and
    ``log_likelihoods[-1]`` come from the E-step just before it.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Step 1: K-means initialization
    kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
    assignments = kmeans.fit_predict(tfidf_matrix)
    means = kmeans.cluster_centers_
    N, D = tfidf_matrix.shape
    weights = np.bincount(assignments, minlength=K) / N

    # Initial variances from the hard k-means assignments.
    variances = _initial_cluster_variances(tfidf_matrix, assignments, means)

    log_likelihoods = []

    for iteration in range(max_iter):
        # E-step: normalise the weighted log-densities in log space.
        log_probs = compute_log_likelihood_sparse(tfidf_matrix, means, variances, weights)
        log_sum = logsumexp(log_probs, axis=1)
        log_responsibilities = log_probs - log_sum[:, np.newaxis]
        responsibilities = np.exp(log_responsibilities)
        log_likelihood = np.sum(log_sum)
        log_likelihoods.append(log_likelihood)

        print(f"EM Iteration {iteration + 1}: Log-likelihood = {log_likelihood:.4f}")

        # Convergence check on the absolute log-likelihood change.
        if iteration > 0 and abs(log_likelihoods[-1] - log_likelihoods[-2]) < threshold:
            print("Converged.")
            break

        # M-step
        weights, means, variances = m_step(tfidf_matrix, responsibilities)

    return responsibilities, weights, means, variances, log_likelihoods


In [22]:
# Fit the mixture on the full TF-IDF matrix using run_em's default settings
# (K=5, max_iter=20, threshold=1e-4).
responsibilities, weights, means, variances, log_likelihoods = run_em(tfidf_matrix)


EM Iteration 1: Log-likelihood = 2290719663.5152
EM Iteration 2: Log-likelihood = 2329189620.0526
EM Iteration 3: Log-likelihood = 2340799258.5286
EM Iteration 4: Log-likelihood = 2346019930.5890
EM Iteration 5: Log-likelihood = 2348922207.7441
EM Iteration 6: Log-likelihood = 2350580328.9211
EM Iteration 7: Log-likelihood = 2351656975.2758
EM Iteration 8: Log-likelihood = 2352472985.7280
EM Iteration 9: Log-likelihood = 2353059926.3402
EM Iteration 10: Log-likelihood = 2353577458.4542
EM Iteration 11: Log-likelihood = 2353996000.4438
EM Iteration 12: Log-likelihood = 2354236578.4606
EM Iteration 13: Log-likelihood = 2354439863.5986
EM Iteration 14: Log-likelihood = 2354592332.7663
EM Iteration 15: Log-likelihood = 2354609447.9154
EM Iteration 16: Log-likelihood = 2354619889.6355
EM Iteration 17: Log-likelihood = 2354628244.4076
EM Iteration 18: Log-likelihood = 2354631286.4642
EM Iteration 19: Log-likelihood = 2354633281.2667
EM Iteration 20: Log-likelihood = 2354633569.0225


Code to Generate cluster_assignments.txt

In [24]:
import numpy as np

# Hard-assign every article to the component with the largest responsibility.
cluster_ids = responsibilities.argmax(axis=1)

# One "<article id><TAB><cluster id>" line per article.
with open("cluster_assignments.txt", "w") as f:
    f.writelines(
        f"{article_id}\t{cluster_id}\n"
        for article_id, cluster_id in zip(article_ids, cluster_ids)
    )

print("cluster_assignments.txt written successfully.")


cluster_assignments.txt written successfully.


In [25]:
import json
import numpy as np

# The columns of `means` / `variances` are the feature columns produced by
# `vectorizer.fit_transform`, so they must be labelled with the vocabulary of
# that fitted vectorizer. The indices in 4_map_index_to_word.json were not
# produced by this vectorizer and are not guaranteed to line up with its
# columns, so the labels are taken from `vectorizer.vocabulary_` instead.
word_to_index = vectorizer.vocabulary_

# Invert term -> column into column -> term; int() normalises the key type.
index_to_word = {int(col): term for term, col in word_to_index.items()}

with open("cluster_stats.txt", "w") as f:
    for k in range(len(means)):
        f.write(f"Cluster {k}:\n")

        # Five features with the largest mean in this cluster, largest first.
        top_indices = np.argsort(means[k])[::-1][:5]

        for idx in top_indices:
            word = index_to_word.get(int(idx), f"[word_{idx}]")
            mean_val = means[k][idx]
            var_val = variances[k][idx]
            f.write(f"{word}\tmean={mean_val:.6f}\tvariance={var_val:.6f}\n")

        f.write("\n")

print("cluster_stats.txt written successfully.")


cluster_stats.txt written successfully.


In [26]:
with open("em_parameters.txt", "w") as f:
    for k, mean_row in enumerate(means):
        var_row = variances[k]

        # Header for this component: its index and mixing weight.
        f.write(f"Cluster {k}:\n")
        f.write(f"Weight: {weights[k]:.6f}\n")

        # Ten features with the largest mean, largest first.
        top_indices = np.argsort(mean_row)[::-1][:10]

        f.write("Top words (mean, variance):\n")
        for idx in top_indices:
            word = index_to_word.get(idx, f"[word_{idx}]")
            f.write(f"{word}: mean={mean_row[idx]:.6f}, var={var_row[idx]:.6f}\n")

        # Blank line between components.
        f.write("\n")

print("em_parameters.txt written successfully.")


em_parameters.txt written successfully.


In [27]:
# One line per E-step, numbered from 1.
with open("convergence_log.txt", "w") as f:
    for iteration, ll in enumerate(log_likelihoods, start=1):
        f.write(f"Iteration {iteration}: Log-likelihood = {ll:.6f}\n")

print("convergence_log.txt written successfully.")


convergence_log.txt written successfully.


In [28]:
with open("ascii_wordclouds.txt", "w") as f:
    for k, mean_row in enumerate(means):
        f.write(f"Cluster {k} ASCII Word Cloud:\n")

        # Fifteen features with the largest mean, largest first.
        for idx in np.argsort(mean_row)[::-1][:15]:
            word = index_to_word.get(idx, f"[word_{idx}]")

            # "Size" of a word is how often it is repeated on its line:
            # mean * 1000 truncated to an int, clamped to the range 1..20.
            repeats = min(int(mean_row[idx] * 1000), 20)
            if repeats < 1:
                repeats = 1
            f.write(f"{word * repeats}\n")

        # Separator between clusters.
        f.write("\n" + "=" * 40 + "\n\n")

print("ascii_wordclouds.txt written successfully.")


ascii_wordclouds.txt written successfully.


In [30]:
from output_formatter import *

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Re-read the article table and pull out identifiers and raw text.
wiki_df = pd.read_csv("people_wiki.csv")
article_ids = wiki_df['URI'].tolist()
documents = wiki_df['text'].tolist()

# TF-IDF features (vocabulary capped at 10,000 terms), rows rescaled to unit L2 norm.
vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = normalize(vectorizer.fit_transform(documents), norm='l2')


In [32]:
# Re-fit the mixture on the rebuilt TF-IDF matrix; the helper calls further
# down consume these five results.
responsibilities, weights, means, variances, log_likelihoods = run_em(tfidf_matrix)


EM Iteration 1: Log-likelihood = 2290719663.5152
EM Iteration 2: Log-likelihood = 2329189620.0526
EM Iteration 3: Log-likelihood = 2340799258.5286
EM Iteration 4: Log-likelihood = 2346019930.5890
EM Iteration 5: Log-likelihood = 2348922207.7441
EM Iteration 6: Log-likelihood = 2350580328.9211
EM Iteration 7: Log-likelihood = 2351656975.2758
EM Iteration 8: Log-likelihood = 2352472985.7280
EM Iteration 9: Log-likelihood = 2353059926.3402
EM Iteration 10: Log-likelihood = 2353577458.4542
EM Iteration 11: Log-likelihood = 2353996000.4438
EM Iteration 12: Log-likelihood = 2354236578.4606
EM Iteration 13: Log-likelihood = 2354439863.5986
EM Iteration 14: Log-likelihood = 2354592332.7663
EM Iteration 15: Log-likelihood = 2354609447.9154
EM Iteration 16: Log-likelihood = 2354619889.6355
EM Iteration 17: Log-likelihood = 2354628244.4076
EM Iteration 18: Log-likelihood = 2354631286.4642
EM Iteration 19: Log-likelihood = 2354633281.2667
EM Iteration 20: Log-likelihood = 2354633569.0225


In [33]:
# NOTE(review): these helpers are not defined in this notebook; presumably they
# come from the wildcard import of `output_formatter` - confirm.
# NOTE(review): the feature columns were produced by `vectorizer`; confirm that
# the indices in this JSON mapping actually correspond to those columns.
index_to_word = load_index_to_word("4_map_index_to_word.json")

# Write each report using the helpers' default file names.
save_cluster_assignments(article_ids, responsibilities)
save_cluster_stats(means, variances, index_to_word)
save_em_parameters(weights, means, variances, index_to_word)
save_convergence_log(log_likelihoods)
save_ascii_wordclouds(means, index_to_word)


In [54]:
# Regenerate only the ASCII word-cloud report from the current `means`.
save_ascii_wordclouds(means, index_to_word)


In [58]:
# Read the full article table from the working directory, using the same
# relative path as the other cells instead of a machine-specific absolute path.
df = pd.read_csv("people_wiki.csv")

In [60]:
# Display the loaded table as the cell output.
df

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
...,...,...,...
59066,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...
59067,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...
59068,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...
59069,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...


In [1]:
import pandas as pd

# Load the full dataset from the working directory (same relative path the
# other cells use, so the notebook does not depend on one user's home folder)
df = pd.read_csv("people_wiki.csv")

# Take a small random sample of 150 articles (fixed seed for reproducibility)
df_small = df.sample(n=150, random_state=42)

# Save to new CSV
df_small.to_csv("people_wiki_subset.csv", index=False)


In [3]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Read the sampled article table, then the JSON mapping stored next to it.
wiki_df = pd.read_csv("people_wiki_subset.csv")
with open("4_map_index_to_word.json", "r") as f:
    index_to_word = json.load(f)

# One entry per article: the URI identifies it, the text feeds the features.
article_ids = wiki_df['URI'].tolist()
documents = wiki_df['text'].tolist()

# Fit a TF-IDF vocabulary capped at 10,000 terms and rescale every row to
# unit L2 norm in a single expression.
vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = normalize(vectorizer.fit_transform(documents), norm='l2')

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


TF-IDF matrix shape: (150, 9160)


In [5]:
import pandas as pd

# Draw a reproducible 150-row random sample (seed 42) from the full table and
# save it, without the index column, as the subset CSV.
df = pd.read_csv("people_wiki.csv")
df_subset = df.sample(n=150, random_state=42)
df_subset.to_csv("people_wiki_subset.csv", index=False)


In [7]:
# Rebind `df` to the sampled subset written to people_wiki_subset.csv.
df = pd.read_csv("people_wiki_subset.csv")

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from em_algorithm import run_em

# Read the sampled articles and pull out identifiers and raw text.
df = pd.read_csv("people_wiki_subset.csv")
article_ids = df["URI"].tolist()
documents = df["text"].tolist()

# TF-IDF features (vocabulary capped at 10,000 terms), rows rescaled to unit L2 norm.
vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = normalize(vectorizer.fit_transform(documents), norm='l2')

# Fit a 5-component mixture, allowing at most 20 EM iterations.
em_result = run_em(tfidf_matrix, K=5, max_iter=20)
responsibilities, weights, means, variances, log_likelihoods = em_result


EM Iteration 1: Log-likelihood = 9040701.4236
EM Iteration 2: Log-likelihood = 9040701.4236
Converged.


In [26]:
from output_formatter import *
from visualizer import save_ascii_wordclouds

# NOTE(review): `load_index_to_word` and most save_* helpers are not defined in
# this notebook; presumably they come from the `output_formatter` wildcard
# import - confirm. `save_ascii_wordclouds` is imported from `visualizer`.
# NOTE(review): the feature columns were produced by `vectorizer`; confirm that
# the indices in this JSON mapping actually correspond to those columns.
index_to_word = load_index_to_word("4_map_index_to_word.json")

# Ensure the target folder exists; exist_ok=True makes re-running harmless.
import os
os.makedirs("sample_outputs", exist_ok=True)

# Write every report for the subset run into sample_outputs/.
save_cluster_assignments(article_ids, responsibilities, filename="sample_outputs/cluster_assignments.txt")
save_cluster_stats(means, variances, index_to_word, filename="sample_outputs/cluster_stats.txt")
save_em_parameters(weights, means, variances, index_to_word, filename="sample_outputs/em_parameters.txt")
save_convergence_log(log_likelihoods, filename="sample_outputs/convergence_log.txt")
save_ascii_wordclouds(means, index_to_word, filename="sample_outputs/ascii_wordclouds.txt")


ASCII word clouds saved to sample_outputs/ascii_wordclouds.txt


In [28]:
# Read the generated word-cloud report into memory, then echo it once the
# file handle has been released.
with open("sample_outputs/ascii_wordclouds.txt", "r") as f:
    content = f.read()
print(content)


Cluster 0 ASCII Word Cloud:
1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton1980slangton
vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988vikings1988
vivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivaciousvivacious
etcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalfetcmetcalf
biologymayorbiologymayorbiologymayorbiologymayorbiologymayorbiologymayorbiologymayorbiologymayorbiologymayorbiologymayorbiologym