In [None]:


# Phase 2: SVD embeddings trial
import re
import sys
import cupy as cp
def preprocess_text(text: str):
    """Lowercase *text*, strip punctuation, and split it into tokens.

    Every character that is not a word character, whitespace, or an
    apostrophe is replaced by a space, so contractions such as "it's"
    survive as a single token.

    Returns:
        A list of non-empty token strings, in order of appearance.
    """
    cleaned = re.sub(r"[^\w\s']", " ", text.lower())
    # split() with no separator collapses whitespace runs and never yields
    # empty strings, so no extra filtering is needed.
    return cleaned.split()
def build_vocab(tokens):
    """Build a sorted vocabulary and its word-to-index lookup.

    Args:
        tokens: Iterable of hashable, mutually comparable tokens.

    Returns:
        A ``(vocab, word_to_idx)`` tuple: ``vocab`` is the list of distinct
        tokens in sorted order and ``word_to_idx`` maps each token to its
        position in that list.
    """
    unique_sorted = sorted({*tokens})
    index_of = dict(zip(unique_sorted, range(len(unique_sorted))))
    return unique_sorted, index_of

def build_cooccurrence_matrix(tokens, word_to_idx, vocab_size, window_size=2):
    """Count how often each vocabulary word appears near each other word.

    For every token position, each other position within ``window_size``
    steps to the left or right contributes one count to
    ``M[target_index, context_index]``.

    Args:
        tokens: Sequence of tokens (must support ``len`` and iteration).
        word_to_idx: Mapping from token to its row/column index.
        vocab_size: Side length of the square matrix to allocate.
        window_size: Number of neighbours considered on each side.

    Returns:
        A ``(vocab_size, vocab_size)`` float64 array of counts.

    Tokens missing from ``word_to_idx`` are ignored both as targets and as
    context words (previously an unknown context word raised ``KeyError``).
    """
    n = len(tokens)
    # Resolve every token once; None marks an out-of-vocabulary token.
    ids = [word_to_idx.get(tok) for tok in tokens]

    # Accumulate counts on the host so the device array is written once,
    # instead of issuing one tiny update per co-occurring pair.
    pair_counts = {}
    for i, t_idx in enumerate(ids):
        if t_idx is None:
            continue
        start = max(0, i - window_size)
        end = min(n, i + window_size + 1)
        for j in range(start, end):
            c_idx = ids[j]
            if j == i or c_idx is None:
                continue
            key = (t_idx, c_idx)
            pair_counts[key] = pair_counts.get(key, 0) + 1

    M = cp.zeros((vocab_size, vocab_size), dtype=cp.float64)
    if pair_counts:
        rows, cols = zip(*pair_counts)
        # Keys are unique (row, col) pairs, so a plain indexed assignment is
        # well defined; no scatter-add is required.
        M[cp.asarray(rows), cp.asarray(cols)] = cp.asarray(
            list(pair_counts.values()), dtype=cp.float64
        )
    return M
def compute_ppmi_matrix(co_matrix, epsilon=1e-8):
    """Convert a co-occurrence count matrix into a positive-PMI matrix.

    The counts are cast to float64 and smoothed by adding ``epsilon`` to
    every cell. Row, column and joint probabilities are taken from the
    smoothed matrix, and ``epsilon`` is added again to both the numerator
    and the denominator inside the logarithm. Negative PMI values are
    clipped to zero, and cells whose original count is ``<= epsilon`` are
    forced to zero.

    Args:
        co_matrix: 2-D array of co-occurrence counts.
        epsilon: Smoothing constant.

    Returns:
        A float64 array with the same shape as ``co_matrix``.
    """
    counts = co_matrix.astype(cp.float64)
    smoothed = counts + epsilon

    grand_total = smoothed.sum()
    if grand_total == 0:
        # Nothing to normalise by (e.g. an empty matrix).
        return cp.zeros_like(smoothed)

    p_row = smoothed.sum(axis=1) / grand_total
    p_col = smoothed.sum(axis=0) / grand_total
    expected = p_row[:, None] * p_col[None, :]
    observed = smoothed / grand_total

    pmi = cp.log((observed + epsilon) / (expected + epsilon))
    clipped = cp.maximum(pmi, 0.0)
    # Pairs that never co-occurred carry only smoothing mass; report them as 0.
    return cp.where(counts <= epsilon, 0.0, clipped)
def compute_svd_embeddings(ppmi, embedding_dim):
    """Derive word embeddings from a PPMI matrix via truncated SVD.

    The matrix is factorised with a reduced SVD. The first ``k`` left
    singular vectors are scaled column-wise by the square root of the
    matching singular values, where ``k`` is ``embedding_dim`` capped at
    the number of available components.

    Args:
        ppmi: 2-D array to factorise.
        embedding_dim: Requested number of embedding dimensions.

    Returns:
        An array with one row per row of ``ppmi`` and ``k`` columns. An
        empty input yields a ``(0, embedding_dim)`` float64 array.
    """
    if ppmi.size == 0:
        return cp.zeros((0, embedding_dim), dtype=cp.float64)

    try:
        left, singular, _ = cp.linalg.svd(ppmi, full_matrices=False)
    except cp.linalg.LinAlgError:
        # Retry the decomposition on the host with NumPy and move the
        # factors back afterwards.
        import numpy as np
        host_left, host_singular, _ = np.linalg.svd(
            cp.asnumpy(ppmi), full_matrices=False
        )
        left = cp.asarray(host_left)
        singular = cp.asarray(host_singular)

    rank = min(embedding_dim, left.shape[1])
    scale = cp.sqrt(singular[:rank])
    return left[:, :rank] * scale[None, :]

def main():
    """Run the demo: tokenise a fixed sample text and print SVD embeddings.

    The pipeline is preprocess -> vocabulary -> co-occurrence counts
    (window of 2) -> PPMI -> SVD embeddings. The embedding width is at most
    10 and at most ``vocab_size - 1``, but never below 1. Results are
    copied to host memory and printed one word per line.
    """
    corpus = (
        "Artificial Intelligence AI is the science of making machines think and learn like humans. "
        "It powers applications from voice assistants to self driving cars. "
        "AI learns from data recognizes patterns and makes smart decisions. "
        "It is transforming industries like healthcare education and finance. "
        "With responsible use AI can make life easier and solve complex global challenges."
    )
    print("Using CuPy version:", cp.__version__)

    tokens = preprocess_text(corpus)
    vocab, word_to_idx = build_vocab(tokens)
    vocab_size = len(vocab)
    print(f"Vocab size: {vocab_size}")

    ppmi = compute_ppmi_matrix(
        build_cooccurrence_matrix(tokens, word_to_idx, vocab_size, window_size=2)
    )

    # Cap the width at 10 and at vocab_size - 1, with a floor of one column.
    final_dim = min(10, max(1, vocab_size - 1))
    emb_cpu = cp.asnumpy(compute_svd_embeddings(ppmi, final_dim))

    print("\n=== GPU (CuPy) SVD-based Word Embeddings ===")
    for row_idx, word in enumerate(vocab):
        formatted = " ".join(f"{v: .6f}" for v in emb_cpu[row_idx])
        print(f"{word:20s}: {formatted}")

if __name__ == "__main__":
    # Script entry point: check for a CUDA device, then run the demo.
    # Any exception is reported on stderr and re-raised so the traceback
    # is still visible.
    try:
        dev_count = cp.cuda.runtime.getDeviceCount()
        if dev_count == 0:
            print("No CUDA devices detected.", file=sys.stderr)
            # main() allocates CuPy arrays straight away, so carrying on
            # without a device would only fail later with a less helpful
            # error. SystemExit is not an Exception subclass, so the
            # handler below does not intercept it.
            sys.exit(1)
        main()
    except Exception as e:
        print("Error running GPU pipeline:", str(e), file=sys.stderr)
        raise


Using CuPy version: 13.6.0
Vocab size: 47

=== GPU (CuPy) SVD-based Word Embeddings ===
ai                  : -0.753585  0.289086 -0.411306  0.611737  0.540482  0.155767 -0.536176  0.294353 -0.175743 -0.014936
and                 : -0.731931 -0.528126 -0.397059 -0.297126 -0.110417 -0.386720  0.478970 -0.096021  0.472334  0.056523
applications        : -0.513357  0.382044  0.356519 -0.413552 -0.116931 -0.096762 -0.095375 -0.331185 -0.821140 -0.126544
artificial          : -0.288899  0.068505 -0.111045  0.292250  0.289444  0.183311 -0.509606  0.335486 -0.137443 -0.135699
assistants          : -0.516722  0.936535  0.111454  0.019426 -0.359769  0.219686  0.327073 -0.181783  0.040327 -0.054465
can                 : -0.415953 -0.028510 -0.680778  0.336025  0.538639 -0.043069 -0.334698 -0.635307 -0.063687  0.770285
cars                : -0.481709  0.690057 -0.130391  0.320889 -0.022791  0.247646 -0.020438  0.245498  0.439685 -0.035320
challenges          : -0.168744 -0.250946 -0.678835 -0.335

: 