In [10]:
# Trial -3: SVD-based embeddings with GPU Transformer classifier

import cupy as cp
import pandas as pd
import re
import time
from collections import Counter
assert cp.cuda.runtime.getDeviceCount() > 0, "No CUDA GPU detected!"
with cp.cuda.Device(0) as dev:
    props = cp.cuda.runtime.getDeviceProperties(dev.id)
    print(f"[GPU] Using: {props['name'].decode()} (SMs={props['multiProcessorCount']})")

#  Text Preprocessing 
def clean_text(s: str) -> str:
    """Lowercase text, remove special characters, normalize spaces."""
    s = s.lower()
    s = re.sub(r"[^\w\s$]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_vocab(sentences):
    """Build vocabulary and mapping from word to index."""
    cleaned = [clean_text(s) for s in sentences]
    all_tokens = " ".join(cleaned).split()
    vocab = sorted(set(all_tokens))
    word_to_idx = {w: i for i, w in enumerate(vocab)}
    return cleaned, all_tokens, word_to_idx

#  GPU Kernels -

matmul_kernel_code = r'''
extern "C" __global__
void matmul_kernel(const float* A, const float* B, float* C, int M, int K, int N){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    if(row < M && col < N){
        float sum = 0.0f;
        for(int k = 0; k < K; ++k){
            sum += A[row*K + k] * B[k*N + col];
        }
        C[row*N + col] = sum;
    }
}
'''
matmul_kernel = cp.RawKernel(matmul_kernel_code, 'matmul_kernel')

def matmul(A, B):
    M, K = A.shape
    K2, N = B.shape
    assert K == K2
    C = cp.zeros((M, N), dtype=cp.float32)
    block = (16,16,1)
    grid  = ((N + block[0]-1)//block[0], (M + block[1]-1)//block[1], 1)
    matmul_kernel(grid, block, (A,B,C,M,K,N))
    return C

softmax_kernel_code = r'''
extern "C" __global__
void row_softmax(float* X, int M, int N){
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if(row < M){
        float max_val = -1e20f;
        for(int j=0;j<N;j++){
            float v = X[row*N + j];
            max_val = v>max_val?v:max_val;
        }
        float sum_exp = 0.0f;
        for(int j=0;j<N;j++){
            float e = __expf(X[row*N+j]-max_val);
            X[row*N+j] = e;
            sum_exp += e;
        }
        float inv = 1.0f / sum_exp;
        for(int j=0;j<N;j++) X[row*N+j] *= inv;
    }
}
'''
softmax_kernel = cp.RawKernel(softmax_kernel_code, 'row_softmax')

def softmax(X):
    M, N = X.shape
    block = (128,1,1)
    grid  = ((M+block[0]-1)//block[0],1,1)
    softmax_kernel(grid, block, (X,M,N))
    return X

relu_kernel_code = r'''
extern "C" __global__
void relu(float* X, int MN){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx < MN){
        float v = X[idx];
        X[idx] = v>0.0f?v:0.0f;
    }
}
'''
relu_kernel = cp.RawKernel(relu_kernel_code,'relu')

def relu(X):
    MN = X.size
    block = (256,1,1)
    grid  = ((MN+block[0]-1)//block[0],1,1)
    relu_kernel(grid, block, (X,MN))
    return X

#  Co-occurrence & PPMI 
cooc_kernel_code = r'''
extern "C" __global__
void cooc_kernel(const int* tokens, float* co_matrix, int len, int window, int vocab_size){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx < len){
        int target = tokens[idx];
        for(int offset=-window;offset<=window;offset++){
            int j = idx+offset;
            if(j>=0 && j<len && j!=idx){
                int context = tokens[j];
                atomicAdd(&co_matrix[target*vocab_size+context],1.0f);
            }
        }
    }
}
'''
cooc_kernel = cp.RawKernel(cooc_kernel_code,'cooc_kernel')

def build_co_matrix(tokens, word_to_idx, window=2):
    token_ids = cp.asarray([word_to_idx[t] for t in tokens], dtype=cp.int32)
    vocab_size = len(word_to_idx)
    co_matrix = cp.zeros((vocab_size,vocab_size), dtype=cp.float32)
    block = (256,1,1)
    grid  = ((token_ids.size+block[0]-1)//block[0],1,1)
    cooc_kernel(grid, block, (token_ids,co_matrix,token_ids.size,window,vocab_size))
    return co_matrix

ppmi_kernel_code = r'''
extern "C" __global__
void ppmi_kernel(const float* co_matrix, float* ppmi, const float* row_sum, const float* col_sum, float total, int N){
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    if(i<N && j<N){
        float cij = co_matrix[i*N+j];
        if(cij>0.0f){
            float pij = cij/total;
            float pi = row_sum[i]/total;
            float pj = col_sum[j]/total;
            float val = logf(pij/(pi*pj));
            ppmi[i*N+j] = val>0.0f?val:0.0f;
        }else{
            ppmi[i*N+j] = 0.0f;
        }
    }
}
'''
ppmi_kernel = cp.RawKernel(ppmi_kernel_code,'ppmi_kernel')

def compute_ppmi(co_matrix):
    N = co_matrix.shape[0]
    row_sum = cp.sum(co_matrix, axis=1).astype(cp.float32)
    col_sum = cp.sum(co_matrix, axis=0).astype(cp.float32)
    total   = cp.sum(co_matrix).astype(cp.float32)
    ppmi = cp.zeros_like(co_matrix,dtype=cp.float32)
    block = (16,16,1)
    grid  = ((N+block[0]-1)//block[0],(N+block[1]-1)//block[1],1)
    ppmi_kernel(grid, block, (co_matrix,ppmi,row_sum,col_sum,float(total),N))
    return ppmi

#  SVD Embeddings 
def svd_embeddings(ppmi, dim=10, iters=30):
    N = ppmi.shape[0]
    U = cp.random.randn(N, dim).astype(cp.float32)
    for _ in range(iters):
        U = matmul(ppmi, matmul(ppmi.T,U))
        norms = cp.sqrt(cp.sum(U*U, axis=0, keepdims=True)+1e-8)
        U /= norms
    return U

def sentence_to_emb(sentence, word_to_idx, embeddings):
    words = sentence.split()
    idxs = [word_to_idx[w] for w in words if w in word_to_idx]
    if len(idxs)==0:
        return cp.zeros((embeddings.shape[1],), dtype=cp.float32)
    return cp.mean(embeddings[idxs], axis=0)

#  Transformer Classifier 
class TransformerGPU:
    def __init__(self,input_dim,n_classes,d_model=32):
        self.W_embed = cp.random.randn(input_dim,d_model).astype(cp.float32)*0.1
        self.W_Q = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_K = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_V = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_ff= cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.b_ff= cp.zeros(d_model,dtype=cp.float32)
        self.W_out = cp.random.randn(d_model,n_classes).astype(cp.float32)*0.1
        self.b_out = cp.zeros(n_classes,dtype=cp.float32)
        self.d_model = d_model

    def forward(self,X):
        X_emb = matmul(X,self.W_embed)
        Q = matmul(X_emb,self.W_Q)
        K = matmul(X_emb,self.W_K)
        V = matmul(X_emb,self.W_V)
        scores = matmul(Q,K.T)/cp.sqrt(cp.float32(self.d_model))
        attention = softmax(scores.copy())
        out = matmul(attention,V)
        X_res = X_emb + out
        mean = cp.mean(X_res, axis=1, keepdims=True)
        std  = cp.std(X_res, axis=1, keepdims=True)+1e-6
        X_norm = (X_res-mean)/std
        FF = matmul(X_norm,self.W_ff)+self.b_ff
        FF = relu(FF)
        X_ff = X_norm + FF
        logits = matmul(X_ff,self.W_out)+self.b_out
        return logits, X_ff

    def update(self,X_ff,y,logits,lr=0.05):
        exp_logits = cp.exp(logits - cp.max(logits,axis=1,keepdims=True))
        probs = exp_logits/cp.sum(exp_logits,axis=1,keepdims=True)
        probs[cp.arange(y.size),y] -=1.0
        probs /= y.size
        self.W_out -= lr*matmul(X_ff.T,probs)
        self.b_out -= lr*cp.sum(probs,axis=0)

def cross_entropy(logits,y):
    max_logits = cp.max(logits, axis=1, keepdims=True)
    log_probs = logits - max_logits - cp.log(cp.sum(cp.exp(logits-max_logits),axis=1,keepdims=True))
    return -cp.mean(log_probs[cp.arange(y.size),y])


def oversample_gpu(X,y):
    counts = Counter(cp.asnumpy(y))
    max_count = max(counts.values())
    X_new, y_new = [X],[y]
    for label in counts:
        idxs = cp.where(y==label)[0]
        n_add = max_count - counts[label]
        if n_add>0:
            reps = cp.random.choice(idxs,n_add)
            X_new.append(X[reps])
            y_new.append(y[reps])
    return cp.concatenate(X_new,axis=0), cp.concatenate(y_new,axis=0)

def train_gpu(sentences, labels, epochs=100, lr=0.1, embedding_dim=32):
    start_time = time.time()
    cleaned, all_tokens, word_to_idx = build_vocab(sentences)
    co_matrix = build_co_matrix(all_tokens, word_to_idx)
    ppmi = compute_ppmi(co_matrix)
    embeddings = svd_embeddings(ppmi, dim=embedding_dim)

    X = cp.stack([sentence_to_emb(s, word_to_idx, embeddings) for s in cleaned])
    y = cp.asarray(labels,dtype=cp.int32)

    print("[INFO] Original class counts:", Counter(cp.asnumpy(y)))
    X, y = oversample_gpu(X, y)
    print("[INFO] After oversampling:", Counter(cp.asnumpy(y)))

    n_classes = len(set(cp.asnumpy(y)))
    model = TransformerGPU(X.shape[1], n_classes, d_model=embedding_dim)

    for epoch in range(epochs):
        perm = cp.random.permutation(X.shape[0])
        X_shuff = X[perm]
        y_shuff = y[perm]

        logits, X_ff = model.forward(X_shuff)
        loss = cross_entropy(logits, y_shuff)
        model.update(X_ff, y_shuff, logits, lr)

        pred = cp.argmax(logits, axis=1)
        acc = cp.mean((pred==y_shuff).astype(cp.float32))
        print(f"Epoch {epoch+1}/{epochs} - Loss: {float(loss):.4f} - Acc: {float(acc)*100:.2f}%")

    logits, _ = model.forward(X)
    pred = cp.argmax(logits, axis=1)
    total_acc = cp.mean((pred==y).astype(cp.float32))

    total_time = time.time() - start_time
    print(f"[INFO] Training finished in {total_time:.2f}s")
    print(f"[RESULT] Total Accuracy: {float(total_acc)*100:.2f}%")
    return model, embeddings, word_to_idx

if __name__=="__main__":
    df = pd.read_csv(r"C:\Users\Pavani Akshaya\Downloads\fin_data_1.csv")
    sentences = df["Sentence"].astype(str).tolist()
    lab = df["Sentiment"].astype(str).str.lower().str.strip()
    label_map = {"positive":1,"neg":0,"negative":0,"pos":1,"1":1,"0":0}
    labels = [label_map.get(x,1 if x in ("1","true") else 0) for x in lab]

    model, embeddings, word_to_idx = train_gpu(
        sentences, labels, epochs=50, lr=0.01, embedding_dim=32)  

[GPU] Using: NVIDIA GeForce RTX 3060 Laptop GPU (SMs=30)
[INFO] Original class counts: Counter({np.int32(0): 3990, np.int32(1): 1852})
[INFO] After oversampling: Counter({np.int32(1): 3990, np.int32(0): 3990})
Epoch 1/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 2/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 3/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 4/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 5/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 6/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 7/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 8/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 9/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 10/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 11/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 12/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 13/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 14/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 15/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 16/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 17/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 18/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 19/50 - Loss: 0.6931 - Acc: 50.00%
Epoch 20/50 - Loss: 