In [25]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
# Fetch the Punkt tokenizer data packages. Presumably these back the
# nltk.word_tokenize call made later in this notebook -- confirm against
# the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Load a small text corpus

In [30]:
corpus = """
The psychological horror subgenre features psychological tension,
 fear, and suspense to evoke feelings of dread and unease in the audience.
 Rather than relying solely on graphic violence or jump scares, psychological horror focuses on the psychological and emotional
 experiences of characters, often exploring their inner fears, anxieties, and perceptions.
"""

# Lower-case before tokenizing so that differently-cased occurrences of a
# word collapse into one token.
tokens = nltk.word_tokenize(corpus.lower())
print("Tokens:", tokens)

# Unique tokens in sorted order form the vocabulary; a word's position in
# that ordering is its integer id.
vocab = sorted(set(tokens))
word_to_id = dict(zip(vocab, range(len(vocab))))

Tokens: ['the', 'psychological', 'horror', 'subgenre', 'features', 'psychological', 'tension', ',', 'fear', ',', 'and', 'suspense', 'to', 'evoke', 'feelings', 'of', 'dread', 'and', 'unease', 'in', 'the', 'audience', '.', 'rather', 'than', 'relying', 'solely', 'on', 'graphic', 'violence', 'or', 'jump', 'scares', ',', 'psychological', 'horror', 'focuses', 'on', 'the', 'psychological', 'and', 'emotional', 'experiences', 'of', 'characters', ',', 'often', 'exploring', 'their', 'inner', 'fears', ',', 'anxieties', ',', 'and', 'perceptions', '.']


Build Co-occurrence Matrix

In [31]:
def build_cooccurrence_matrix(tokens, vocab, word_to_id, window_size=4):
    """Count windowed co-occurrences between vocabulary words.

    For every token position, each *other* position lying at most
    ``window_size`` steps to the left or right is counted once as context.

    Args:
        tokens: Sequence of tokens in corpus order.
        vocab: Vocabulary; only its length is used, to size the matrix.
        word_to_id: Mapping from a word to its row/column index.
        window_size: Number of neighbouring positions considered per side.

    Returns:
        A ``(len(vocab), len(vocab))`` ``np.int32`` array whose entry
        ``[w, c]`` is the number of times word ``c`` appeared inside the
        window of word ``w``. Tokens absent from ``word_to_id`` are ignored,
        both as the centre word and as context.
    """
    n_tokens = len(tokens)
    dim = len(vocab)
    counts = np.zeros((dim, dim), dtype=np.int32)

    for center_pos, center in enumerate(tokens):
        if center not in word_to_id:
            continue  # out-of-vocabulary centre word contributes nothing
        row = word_to_id[center]

        # Clamp the window to the corpus boundaries.
        lo = max(0, center_pos - window_size)
        hi = min(n_tokens, center_pos + window_size + 1)

        for ctx_pos in range(lo, hi):
            if ctx_pos == center_pos:
                continue  # a position is never its own context
            neighbour = tokens[ctx_pos]
            if neighbour in word_to_id:
                counts[row, word_to_id[neighbour]] += 1

    return counts

# Count co-occurrences within 4 positions on either side of each token;
# the result has one row and one column per vocabulary entry.
co_matrix = build_cooccurrence_matrix(tokens, vocab, word_to_id, window_size=4)
print("Co-occurrence Matrix shape:", co_matrix.shape)

Co-occurrence Matrix shape: (40, 40)


Dimensionality Reduction (SVD)

In [32]:
def reduce_dimensions(matrix, k=2, random_state=42):
    """Reduce each row of ``matrix`` to ``k`` components with truncated SVD.

    Args:
        matrix: 2-D array-like with one row per item to embed.
        k: Number of components to keep.
        random_state: Seed forwarded to ``TruncatedSVD``. Its default solver
            is randomized (per the scikit-learn documentation), so without a
            fixed seed the embeddings can differ between runs. Pass ``None``
            to restore unseeded behaviour.

    Returns:
        The array returned by ``TruncatedSVD.fit_transform``: one row per
        input row, ``k`` columns.
    """
    svd = TruncatedSVD(n_components=k, random_state=random_state)
    return svd.fit_transform(matrix)

# Keep two components per word so the embeddings can be drawn on a 2-D scatter plot.
reduced_embeddings = reduce_dimensions(co_matrix, k=2)

Plot 2D word embeddings

In [33]:
import plotly.express as px
import pandas as pd

def plot_embeddings_interactive(embeddings, vocab):
    """Show a labelled, interactive 2-D scatter plot of word embeddings.

    Args:
        embeddings: 2-D array-like with exactly two columns; row ``i`` holds
            the coordinates of ``vocab[i]``.
        vocab: Word labels, one per row of ``embeddings``.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    axis_names = ['Dimension 1', 'Dimension 2']
    points = pd.DataFrame(embeddings, columns=axis_names)
    points['word'] = vocab

    scatter = px.scatter(
        points,
        x=axis_names[0],
        y=axis_names[1],
        text='word',
        title='2D Word Embeddings from Co-occurrence Matrix (Interactive)',
    )
    # Place each label above its marker.
    scatter.update_traces(textposition='top center')
    scatter.show()

# Row i of reduced_embeddings belongs to vocab[i]: word ids were assigned by
# enumerating vocab, and those ids index the co-occurrence matrix rows.
plot_embeddings_interactive(reduced_embeddings, vocab)