In [None]:

import re
import time
import numpy as np
import cupy as cp
import pandas as pd

# Fail fast when the CUDA runtime reports no devices. An explicit raise is used
# instead of `assert` so the check is not stripped when Python runs with -O.
if cp.cuda.runtime.getDeviceCount() <= 0:
    raise RuntimeError("No CUDA GPU detected!")
with cp.cuda.Device(0) as dev:
    props = cp.cuda.runtime.getDeviceProperties(dev.id)
    # The device name may come back as bytes or as str; normalise to str.
    gpu_name = props['name']
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"[GPU] Using: {gpu_name}")

def clean_text(s):
    """Normalise a sentence for tokenisation.

    Lower-cases ``s``, turns every character that is not a word character,
    whitespace or ``$`` into a space, collapses whitespace runs into a single
    space and trims both ends.
    """
    without_punct = re.sub(r"[^\w\s$]", " ", s.lower())
    return re.sub(r"\s+", " ", without_punct).strip()

def build_vocab(sentences):
    """Map each distinct whitespace-separated token to an integer id.

    Ids are assigned in order of first appearance across ``sentences``,
    starting at 0.
    """
    tokens = (tok for sent in sentences for tok in sent.split())
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return {tok: idx for idx, tok in enumerate(dict.fromkeys(tokens))}

# TF–IDF from scratch (GPU)

def compute_tfidf_gpu(sentences, vocab):
    """Build a TF-IDF matrix for ``sentences`` and return it as a CuPy array.

    Args:
        sentences: Sequence of strings, one per document; each is tokenised
            with ``str.split()``.
        vocab: Mapping from token to column index. Tokens that are not in
            ``vocab`` are ignored.

    Returns:
        Array of shape ``(len(sentences), len(vocab))`` holding
        ``tf * idf`` where ``tf`` is the row-normalised term count and
        ``idf = log((n_docs + 1) / (df + 1))``.
    """
    n_docs = len(sentences)
    vocab_size = len(vocab)

    # Gather (document, term) index pairs on the host. Incrementing single
    # elements of a device array costs one GPU operation per token, so the
    # counts are accumulated in host memory and uploaded once.
    doc_idx = []
    term_idx = []
    for i, s in enumerate(sentences):
        for w in s.split():
            j = vocab.get(w)
            if j is not None:
                doc_idx.append(i)
                term_idx.append(j)

    counts = np.zeros((n_docs, vocab_size), dtype=np.float32)
    if doc_idx:
        # ufunc.at accumulates repeated index pairs (plain fancy-index += does not).
        np.add.at(counts, (doc_idx, term_idx), 1.0)
    mat = cp.asarray(counts)

    # term frequency normalization (epsilon avoids 0/0 for empty documents)
    row_sums = cp.sum(mat, axis=1, keepdims=True) + 1e-9
    tf = mat / row_sums

    # inverse document frequency, with +1 smoothing on both counts
    df = cp.sum(mat > 0, axis=0) + 1.0
    idf = cp.log((n_docs + 1.0) / df)

    tfidf = tf * idf
    return tfidf

# NMF Multiplicative Updates 

def nmf_gpu(V, rank=50, iters=200):
    """Factor ``V`` (m x n) into ``W`` (m x rank) and ``H`` (rank x n).

    Runs ``iters`` rounds of multiplicative updates, alternately rescaling
    ``H`` and ``W``. Both factors start from uniform random values in [0, 1)
    cast to float32, so results vary between calls unless the CuPy random
    state is seeded by the caller.

    Args:
        V: 2-D array to factorise.
        rank: Inner dimension of the factorisation.
        iters: Number of update rounds.

    Returns:
        Tuple ``(W, H)``.

    Every 50th iteration the Frobenius norm of ``V - W @ H`` is printed.
    """
    m, n = V.shape
    W = cp.random.rand(m, rank).astype(cp.float32)
    H = cp.random.rand(rank, n).astype(cp.float32)
    eps = 1e-9  # keeps the denominators strictly positive

    for i in range(iters):
        # Update H. The denominator W^T W H is evaluated as (W^T W) H so that
        # only a rank x rank product is formed instead of the full m x n W @ H.
        H *= (W.T @ V) / ((W.T @ W) @ H + eps)

        # Update W, using the same small-Gram-matrix grouping.
        W *= (V @ H.T) / (W @ (H @ H.T) + eps)

        if (i+1) % 50 == 0:
            loss = cp.linalg.norm(V - W @ H)
            print(f"[NMF] Iter {i+1}, Loss={loss:.4f}")
    return W, H


# kernels

# CUDA source for a straightforward matrix product C = A * B.
# Each thread computes a single output element C[row, col] as the dot product
# of row `row` of A with column `col` of B; threads whose (row, col) falls
# outside the M x N output do nothing.
# The kernel indexes A as A[row*K + k], B as B[k*N + col] and C as
# C[row*N + col], i.e. it treats all three pointers as dense row-major buffers
# of float. Strides of the arrays passed in are not visible to it.
matmul_kernel_code = r'''
extern "C" __global__
void matmul_kernel(const float* A, const float* B, float* C, int M, int K, int N){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    if(row < M && col < N){
        float sum = 0.0f;
        for(int k = 0; k < K; ++k){
            sum += A[row*K + k] * B[k*N + col];
        }
        C[row*N + col] = sum;
    }
}
'''
# Wrap the source as a launchable kernel; called as kernel(grid, block, args).
matmul_kernel = cp.RawKernel(matmul_kernel_code, 'matmul_kernel')

def matmul_manual(A, B):
    """Return ``A @ B`` as a float32 array using the hand-written CUDA kernel.

    Args:
        A: 2-D array of shape ``(M, K)``.
        B: 2-D array of shape ``(K, N)``.

    Returns:
        New float32 array of shape ``(M, N)``.

    Raises:
        ValueError: If the inner dimensions of ``A`` and ``B`` differ.
    """
    # The kernel receives bare pointers and indexes them as dense row-major
    # float buffers, so transposed/sliced views or other dtypes must be
    # materialised first. This is a no-op for arrays that already qualify.
    A = cp.ascontiguousarray(A, dtype=cp.float32)
    B = cp.ascontiguousarray(B, dtype=cp.float32)

    M, K = A.shape
    K2, N = B.shape
    if K != K2:
        # Explicit raise (not assert): a mismatch would otherwise make the
        # kernel read past the end of B when Python runs with -O.
        raise ValueError(f"inner dimensions differ: {A.shape} vs {B.shape}")

    C = cp.zeros((M, N), dtype=cp.float32)
    if M == 0 or N == 0:
        # Nothing to compute; avoid launching a grid with a zero dimension.
        return C

    block = (16, 16, 1)
    grid = ((N+15)//16, (M+15)//16, 1)
    # The kernel declares its sizes as C `int`; pass 32-bit scalars to match.
    matmul_kernel(grid, block, (A, B, C, cp.int32(M), cp.int32(K), cp.int32(N)))
    return C

# CUDA source for an in-place row-wise softmax over an M x N float buffer.
# One thread handles one whole row (row index taken from the x dimension);
# threads with row >= M do nothing. Per row it makes three passes:
#   1. find the row maximum (starting from -1e20f),
#   2. overwrite each element with __expf(x - max) and accumulate the sum,
#   3. multiply each element by 1 / sum.
# Elements are addressed as X[row*N + j], i.e. X is treated as a dense
# row-major buffer.
softmax_kernel_code = r'''
extern "C" __global__
void row_softmax(float* X, int M, int N){
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if(row < M){
        float max_val = -1e20f;
        for(int j=0;j<N;++j){
            float v = X[row*N+j];
            max_val = v > max_val ? v : max_val;
        }
        float sum_exp=0.0f;
        for(int j=0;j<N;++j){
            float e = __expf(X[row*N+j]-max_val);
            X[row*N+j] = e;
            sum_exp += e;
        }
        float inv=1.0f/sum_exp;
        for(int j=0;j<N;++j){
            X[row*N+j]*=inv;
        }
    }
}
'''
# Wrap the source as a launchable kernel; called as kernel(grid, block, args).
softmax_kernel = cp.RawKernel(softmax_kernel_code, 'row_softmax')

def softmax_manual(X):
    """Apply a row-wise softmax to ``X`` in place and return ``X``.

    Args:
        X: 2-D, C-contiguous float32 array. It is overwritten.

    Returns:
        The same array object ``X``, now holding the softmax of each row.

    Raises:
        ValueError: If ``X`` is not C-contiguous float32. The kernel writes
            through a bare pointer assuming that layout, so anything else
            would be silently corrupted; because the operation is in place,
            the input cannot be transparently copied instead.
    """
    if X.dtype != cp.float32 or not X.flags.c_contiguous:
        raise ValueError("softmax_manual requires a C-contiguous float32 array")
    M,N = X.shape
    if X.size == 0:
        # No elements to normalise; avoid launching an empty grid.
        return X
    block=(128,1,1)
    grid=((M+127)//128,1,1)
    # The kernel declares its sizes as C `int`; pass 32-bit scalars to match.
    softmax_kernel(grid,block,(X,cp.int32(M),cp.int32(N)))
    return X

# CUDA source for an in-place element-wise ReLU over a flat float buffer of
# MN elements. Each thread handles one element X[idx], replacing it with 0
# when it is not greater than 0; threads with idx >= MN do nothing.
relu_kernel_code = r'''
extern "C" __global__
void relu(float* X, int MN){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx < MN){
        float v=X[idx];
        X[idx]=v>0.0f?v:0.0f;
    }
}
'''
# Wrap the source as a launchable kernel; called as kernel(grid, block, args).
relu_kernel = cp.RawKernel(relu_kernel_code,'relu')

def relu_manual(X):
    """Apply ReLU (``max(x, 0)``) to every element of ``X`` in place.

    Args:
        X: Contiguous float32 array of any shape. It is overwritten.

    Returns:
        The same array object ``X``.

    Raises:
        ValueError: If ``X`` is not a contiguous float32 array. The kernel
            walks ``X.size`` consecutive floats from the data pointer, so a
            strided view or another dtype would touch the wrong memory.
    """
    contiguous = X.flags.c_contiguous or X.flags.f_contiguous
    if X.dtype != cp.float32 or not contiguous:
        raise ValueError("relu_manual requires a contiguous float32 array")
    MN=X.size
    if MN == 0:
        # Nothing to do; avoid launching an empty grid.
        return X
    block=(256,1,1)
    grid=((MN+255)//256,1,1)
    # The kernel declares its size as C `int`; pass a 32-bit scalar to match.
    relu_kernel(grid,block,(X,cp.int32(MN)))
    return X

# Transformer  
class TransformerClassifierGPU:
    """Single-head self-attention classifier built on the hand-written kernels.

    The rows of the input matrix play the role of sequence positions: the
    attention scores form a ``(rows, rows)`` matrix, so every row attends to
    every other row of the same batch.

    Only the output layer (``W_out``, ``b_out``) is changed by ``update``;
    the embedding, attention and feed-forward weights keep their random
    initial values.
    """

    def __init__(self, input_dim, n_classes, d_model=64):
        """Create the weights.

        Weight matrices are standard-normal samples scaled by 0.1 (float32);
        biases start at zero.

        Args:
            input_dim: Number of input features per row.
            n_classes: Number of output classes.
            d_model: Width of the internal representation.
        """
        self.W_embed = cp.random.randn(input_dim, d_model).astype(cp.float32)*0.1
        self.W_Q = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_K = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_V = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_ff= cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.b_ff= cp.zeros(d_model,dtype=cp.float32)
        self.W_out=cp.random.randn(d_model,n_classes).astype(cp.float32)*0.1
        self.b_out=cp.zeros(n_classes,dtype=cp.float32)

    def forward(self,X):
        """Run the network on ``X``.

        Args:
            X: Array of shape ``(rows, input_dim)``.

        Returns:
            Tuple ``(logits, X_ff)``: ``logits`` has shape
            ``(rows, n_classes)``; ``X_ff`` has shape ``(rows, d_model)`` and
            is the input to the output layer, needed by ``update``.
        """
        X_emb=matmul_manual(X,self.W_embed)
        Q=matmul_manual(X_emb,self.W_Q)
        K=matmul_manual(X_emb,self.W_K)
        V=matmul_manual(X_emb,self.W_V)
        # The raw matmul kernel reads dense row-major buffers, so K^T has to
        # be materialised rather than passed as a strided view.
        K_t=cp.ascontiguousarray(K.T)
        scores=matmul_manual(Q,K_t)/cp.sqrt(cp.float32(Q.shape[1]))
        # softmax_manual works in place; hand it a copy so `scores` survives.
        attn=softmax_manual(scores.copy())
        attn_out=matmul_manual(attn,V)
        # Residual connection followed by per-row standardisation.
        X_res=X_emb+attn_out
        mean=cp.mean(X_res,axis=1,keepdims=True)
        std=cp.std(X_res,axis=1,keepdims=True)+1e-6
        X_norm=(X_res-mean)/std
        # Feed-forward layer with ReLU and a second residual connection.
        FF=matmul_manual(X_norm,self.W_ff)+self.b_ff
        FF=relu_manual(FF)
        X_ff=X_norm+FF
        logits=matmul_manual(X_ff,self.W_out)+self.b_out
        return logits,X_ff

    def update(self,X_ff,y,logits,lr):
        """Take one gradient-descent step on the output layer only.

        Uses the gradient of the mean softmax cross-entropy with respect to
        ``W_out`` and ``b_out``.

        Args:
            X_ff: Second value returned by ``forward``, shape ``(rows, d_model)``.
            y: Integer class ids, one per row.
            logits: First value returned by ``forward``.
            lr: Learning rate.
        """
        exp_logits=cp.exp(logits-cp.max(logits,axis=1,keepdims=True))
        probs=exp_logits/cp.sum(exp_logits,axis=1,keepdims=True)
        one_hot=cp.zeros_like(probs)
        one_hot[cp.arange(y.size),y]=1.0
        grad_logits=(probs-one_hot)/y.size
        # X_ff.T is only a strided view of X_ff's memory. The raw kernel gets
        # a bare pointer and would read that memory as if it were a row-major
        # (d_model, rows) matrix, producing a wrong gradient. Materialise the
        # transpose before handing it over.
        X_ff_t=cp.ascontiguousarray(X_ff.T)
        grad_W=matmul_manual(X_ff_t,grad_logits)
        grad_b=cp.sum(grad_logits,axis=0)
        self.W_out-=lr*grad_W
        self.b_out-=lr*grad_b

def cross_entropy_loss(logits,y):
    """Return the mean softmax cross-entropy of ``logits`` against labels ``y``.

    ``logits`` is indexed as ``(row, class)`` and ``y`` holds one integer
    class id per row. The row maximum is subtracted before exponentiating.
    """
    shifted=logits-cp.max(logits,axis=1,keepdims=True)
    log_norm=cp.log(cp.sum(cp.exp(shifted),axis=1,keepdims=True))
    # Log-probability assigned to the true class of each row.
    picked=(shifted-log_norm)[cp.arange(y.size),y]
    return -cp.mean(picked)

if __name__=="__main__":
    # End-to-end run: load the CSV, build TF-IDF features, reduce them with
    # NMF, then train the classifier's output layer with full-batch steps.
    # NOTE: the CSV path below is an absolute, machine-specific location.
    df=pd.read_csv(r"C:\Users\Pavani Akshaya\Downloads\fin_data_1.csv")
    sentences=[clean_text(s) for s in df["Sentence"].astype(str).tolist()]

    label_map={"positive":1,"pos":1,"1":1,
               "negative":0,"neg":0,"0":0}
    raw_labels=df["Sentiment"].astype(str).str.lower()
    mapped=raw_labels.map(label_map)
    unmapped=mapped.isna()
    if unmapped.any():
        # Unrecognised labels are still folded into class 0 as before, but
        # this is now reported instead of happening silently.
        print(f"[DATA] WARNING: {int(unmapped.sum())} rows have unrecognised "
              f"Sentiment values {sorted(raw_labels[unmapped].unique())}; "
              f"treating them as class 0")
    labels=mapped.fillna(0).astype(int).to_numpy()

    vocab=build_vocab(sentences)
    print(f"[DATA] {len(sentences)} docs, {len(vocab)} vocab")

    tfidf=compute_tfidf_gpu(sentences,vocab)
    W,H=nmf_gpu(tfidf,rank=64,iters=200)
    # L2-normalise each document's NMF coefficients to use as features.
    X_gpu=W/(cp.linalg.norm(W,axis=1,keepdims=True)+1e-9)

    # Size the output layer from the largest class id so that every label is
    # a valid column index even if some lower id never occurs in the data.
    n_classes=int(labels.max())+1 if labels.size else 0
    model=TransformerClassifierGPU(X_gpu.shape[1],n_classes)

    y_gpu=cp.asarray(labels)
    for epoch in range(500):
        logits,X_ff=model.forward(X_gpu)
        # Loss and accuracy are measured on the logits from before this
        # epoch's parameter update.
        loss=cross_entropy_loss(logits,y_gpu)
        model.update(X_ff,y_gpu,logits,0.05)
        if (epoch+1)%5==0:
            acc=cp.mean((cp.argmax(logits,axis=1)==y_gpu).astype(cp.float32))
            print(f"Epoch {epoch+1} Loss={loss:.4f} Acc={float(acc)*100:.2f}%")


[GPU] Using: NVIDIA GeForce RTX 3060 Laptop GPU
[DATA] 5842 docs, 11554 vocab
[NMF] Iter 50, Loss=90.5770
[NMF] Iter 100, Loss=90.5362
[NMF] Iter 150, Loss=90.5292
[NMF] Iter 200, Loss=90.5245
Epoch 5 Loss=0.7491 Acc=61.26%
Epoch 10 Loss=0.7466 Acc=62.08%
Epoch 15 Loss=0.7448 Acc=62.60%
Epoch 20 Loss=0.7434 Acc=62.89%
Epoch 25 Loss=0.7421 Acc=63.08%
Epoch 30 Loss=0.7410 Acc=63.09%
Epoch 35 Loss=0.7401 Acc=62.96%
Epoch 40 Loss=0.7393 Acc=62.89%
Epoch 45 Loss=0.7388 Acc=62.89%
Epoch 50 Loss=0.7384 Acc=62.86%
Epoch 55 Loss=0.7382 Acc=62.86%
Epoch 60 Loss=0.7382 Acc=62.96%
Epoch 65 Loss=0.7384 Acc=62.96%
Epoch 70 Loss=0.7387 Acc=62.87%
Epoch 75 Loss=0.7393 Acc=62.86%
Epoch 80 Loss=0.7400 Acc=62.79%
Epoch 85 Loss=0.7410 Acc=62.68%
Epoch 90 Loss=0.7421 Acc=62.63%
Epoch 95 Loss=0.7434 Acc=62.31%
Epoch 100 Loss=0.7449 Acc=62.24%
Epoch 105 Loss=0.7466 Acc=62.14%
Epoch 110 Loss=0.7485 Acc=62.03%
Epoch 115 Loss=0.7506 Acc=61.90%
Epoch 120 Loss=0.7529 Acc=61.74%
Epoch 125 Loss=0.7553 Acc=61.74%
Ep