In [28]:
import numpy as np
import pandas as pd
import re
from Porter_Stemmer_Python import PorterStemmer
from collections import Counter

# Step 1 - Creating the Feature Vectors

In [29]:
# Read the corpus file; every non-empty line becomes one document in `text`.
with open("./Project4_paragraphs.txt") as f:
    text = [line for line in f.read().split("\n") if line]

In [30]:
def filter_doc(doc: str, stop_words: set[str]) -> list[str]:
    """Clean, tokenize, and stem one document (steps A to F from the instructions).

    The document is stripped of punctuation, quote characters, ``<br />``
    markup, a handful of other symbols, and digits; it is then lower-cased,
    split into words, filtered against ``stop_words``, and each surviving word
    is passed through the Porter stemmer.

    Args:
        doc: Raw text of a single document.
        stop_words: Lower-case words to discard before stemming.

    Returns:
        The stemmed tokens in their original order. Empty tokens are never
        returned, whether or not ``''`` is present in ``stop_words``.
    """
    p = PorterStemmer()
    # Step B (step A is done later so that regular expressions can be used on the whole string)
    doc = re.sub('[.!?;:,()-]', ' ', doc)
    # Quotes are deleted rather than replaced by a space, so "don't" becomes "dont".
    doc = re.sub('[\'\"]', '', doc)
    doc = re.sub('<br|/><br|/>', ' ', doc)
    doc = re.sub('[<|/>@#$%^&*]', ' ', doc)
    # Step C
    doc = re.sub('[0-9]', ' ', doc)
    # Step D
    doc = doc.lower()
    # Steps A & E
    # str.split() with no argument splits on any run of whitespace (spaces, tabs, ...)
    # and never yields '' for leading/trailing/repeated separators, so no separate
    # space-collapsing pass is needed and callers need not add '' to stop_words.
    tokens = [word for word in doc.split() if word not in stop_words]
    # Step F
    # stem() is called with the word plus the first and last index to consider
    # (presumably the classic Porter reference API - confirm against Porter_Stemmer_Python).
    return [p.stem(token, 0, len(token) - 1) for token in tokens]

In [31]:
# Load the stop-word list: the file is split on newlines into a set of words.
with open('./Project4_stop_words.txt') as f:
    stop_words = set(f.read().split('\n'))
# Treat the empty string as a stop word too, so empty tokens are filtered out as well.
stop_words.add('')

In [32]:
# Run the cleaning pipeline on every document; tokens[i] holds the stems of text[i].
tokens = [filter_doc(paragraph, stop_words) for paragraph in text]
# First ten stems of the first document (Jupyter only renders this when it is
# the last expression in the cell).
tokens[0][:10]

from collections import Counter
import pandas as pd

# Full term-document matrix: one row per document, one column per distinct stem.
# A stem absent from a document comes out of the DataFrame constructor as NaN,
# so NaNs are replaced by 0 before casting the counts to int.
TDM_full = (
    pd.DataFrame([Counter(stems) for stems in tokens])
    .fillna(0)
    .astype(int)
)

print(TDM_full.shape)
TDM_full.head()


(52, 3339)


Unnamed: 0,on,review,mention,watch,oz,episod,youll,hook,right,exactli,...,priestess,timothi,carei,obtrus,visitor,gruesom,grand,guignol,fanat,breathtak
0,1,1,1,3,6,2,1,1,2,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Term Document Matrix
*note: I'm only displaying as many columns as jupyter will let me due to the sheer number of unique words*

In [33]:
# Term-document matrix over all stems; stems missing from a document are filled with 0.
TDM = (
    pd.DataFrame([Counter(stems) for stems in tokens])
    .fillna(0)
)
TDM

Unnamed: 0,on,review,mention,watch,oz,episod,youll,hook,right,exactli,...,priestess,timothi,carei,obtrus,visitor,gruesom,grand,guignol,fanat,breathtak
0,1.0,1.0,1.0,3.0,6.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
from collections import Counter

# Corpus-wide occurrence count of every stem (summed over all documents).
global_counts = Counter(stem for doc_stems in tokens for stem in doc_stems)

# Frequency threshold: only stems occurring strictly more than T times are kept.
T = 15

# Alphabetically sorted list of the stems that pass the threshold.
feature_vector = sorted(
    stem for stem, count in global_counts.items() if count > T
)

print("Threshold T:", T)
print("Feature vector size:", len(feature_vector))
print("First 20 features:", feature_vector[:20])

# Reduced TDM used for clustering & discussion: only the feature-vector columns,
# cast to float. Note that this rebinds the name TDM.
TDM = TDM_full.loc[:, feature_vector].astype(float)

print("\nReduced TDM shape:", TDM.shape)
TDM.head()


Threshold T: 15
Feature vector size: 96
First 20 features: ['act', 'actor', 'actual', 'audienc', 'back', 'bad', 'be', 'befor', 'best', 'big', 'book', 'cant', 'charact', 'come', 'comedi', 'complet', 'director', 'down', 'drink', 'end']

Reduced TDM shape: (52, 96)


Unnamed: 0,act,actor,actual,audienc,back,bad,be,befor,best,big,...,us,veri,wai,want,watch,well,without,work,world,year
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [35]:


import numpy as np

def kohonen_wta(data, k=4, lr=0.3, epochs=25, seed=0):
    """
    Cluster row vectors with a simple Kohonen Winner-Take-All network.

    Every row of `data` is scaled to unit length. k rows are drawn at random
    (without replacement) as the initial weight vectors. During training the
    weight vector with the largest dot product against the current sample is
    the winner; it is moved toward the sample by a fraction `lr` and rescaled
    to unit length.

    Parameters
    ----------
    data : 2-D numpy array, one row per document and one column per feature.
    k : number of clusters (weight vectors).
    lr : learning rate, the fraction of the distance the winner moves per update.
    epochs : number of full passes over the rows.
    seed : seed for the random generator (initial weights and visiting order).

    Returns
    -------
    (labels, W) : `labels` is an array holding the index of the winning weight
        vector for each row; `W` is the trained (k, n_features) weight matrix.
    """
    rng = np.random.default_rng(seed)
    n_docs, _ = data.shape

    # Scale rows to unit length; an all-zero row is divided by 1 and stays zero.
    row_norms = np.linalg.norm(data, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    unit_rows = data / row_norms

    # Start from k distinct, randomly selected (normalized) rows.
    W = unit_rows[rng.choice(n_docs, size=k, replace=False)].copy()

    for _epoch in range(epochs):
        # A fresh random visiting order is drawn for every epoch.
        for doc_idx in rng.permutation(n_docs):
            sample = unit_rows[doc_idx]
            winner = int(np.argmax(W @ sample))
            # Pull the winner toward the sample, then bring it back to unit length
            # (skipped when the moved vector has zero length).
            moved = W[winner] + lr * (sample - W[winner])
            length = np.linalg.norm(moved)
            W[winner] = moved / length if length != 0 else moved

    # Final assignment: each row is labelled with its best-matching weight vector.
    labels = np.array([int(np.argmax(W @ sample)) for sample in unit_rows])
    return labels, W


In [36]:
# Cluster the reduced term-document matrix into 4 groups.
labels, W = kohonen_wta(TDM.values, k=4)
# W has one row per cluster; minlength keeps one count per cluster even when the
# highest-numbered clusters received no documents (plain bincount would drop them).
print("Cluster sizes:", np.bincount(labels, minlength=W.shape[0]))


Cluster sizes: [21  7 18  6]


In [37]:
# Pair every document with its cluster label so the clusters can be inspected by eye.
clustered = pd.DataFrame({
    "cluster": labels,
    "text": text
})

# The number of clusters is read from the trained weight matrix (one row per
# cluster) instead of being hard-coded, so this cell stays correct if k changes.
for c in range(W.shape[0]):
    print(f"\n=== Cluster {c} ===")
    # Show at most the first three documents assigned to this cluster.
    display(clustered[clustered["cluster"] == c].head(3))



=== Cluster 0 ===


Unnamed: 0,cluster,text
4,0,"Petter Mattei's ""Love in the Time of Money"" is..."
14,0,"""Fate"" leads Walter Sparrow to come in possess..."
16,0,The Assignment is an outstanding thriller with...



=== Cluster 1 ===


Unnamed: 0,cluster,text
0,1,One of the other reviewers has mentioned that ...
1,1,A wonderful little production. <br /><br />The...
2,1,I thought this was a wonderful way to spend ti...



=== Cluster 2 ===


Unnamed: 0,cluster,text
6,2,"As a disclaimer, I've seen the movie 5-6 times..."
9,2,"I just watched The Dresser this evening, havin..."
10,2,"Upon viewing Tobe Hooper's gem, Crocodile, in ..."



=== Cluster 3 ===


Unnamed: 0,cluster,text
3,3,"Taut and organically gripping, Edward Dmytryk'..."
5,3,"Ever watched a movie that lost the plot? Well,..."
8,3,"***SPOILERS*** All too, in real life as well a..."


Clustering Results

Using the reduced TDM as input, we applied a Kohonen Winner-Take-All network with $k = 4$. The final cluster sizes were:

Cluster 0: 21 documents

Cluster 1: 7 documents

Cluster 2: 18 documents

Cluster 3: 6 documents

Each cluster groups reviews with similar term usage. For example, one cluster is dominated by more positive language and recommendation-oriented words, another cluster emphasizes negative sentiment and criticism, while others focus on specific aspects such as acting, plot, or technical qualities. This shows that the feature-based representation is sufficient for the network to discover meaningful structure in the review set.

You can tweak that once you see what's actually in your clusters.