In [1]:

# trial -1 

#Shallow Transformer network   with the onehot encoding embeddings 
import re
import cupy as cp
import pandas as pd
import numpy as np

def clean_text(s):
    s = s.lower()
    s = re.sub(r"[^a-z\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_vocab(sentences):
    vocab = {}
    for s in sentences:
        for w in s.split():
            if w not in vocab:
                vocab[w] = len(vocab)
    return vocab

def encode_sentences_onehot(sentences, vocab):
    n_docs = len(sentences)
    vocab_size = len(vocab)
    mat = np.zeros((n_docs, vocab_size), dtype=np.float32)
    for i, s in enumerate(sentences):
        for w in s.split():
            if w in vocab:
                mat[i, vocab[w]] = 1.0
    return mat

def balance_classes(X, y):
    unique, counts = np.unique(y, return_counts=True)
    max_count = np.max(counts)
    new_X, new_y = [], []
    for cls in unique:
        cls_idx = np.where(y == cls)[0]
        cls_samples = X[cls_idx]
        cls_labels  = y[cls_idx]
        reps = max_count // len(cls_samples)
        rem  = max_count % len(cls_samples)
        new_X.append(np.tile(cls_samples, (reps,1)))
        new_y.append(np.tile(cls_labels, reps))
        if rem > 0:
            choice = np.random.choice(len(cls_samples), rem, replace=True)
            new_X.append(cls_samples[choice])
            new_y.append(cls_labels[choice])
    return np.vstack(new_X), np.hstack(new_y)

# CUDA kernels
matmul_kernel_code = r'''
extern "C" __global__
void matmul_kernel(const float* A, const float* B, float* C, int M, int K, int N){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    if(row < M && col < N){
        float sum = 0.0f;
        for(int k = 0; k < K; ++k){
            sum += A[row*K + k] * B[k*N + col];
        }
        C[row*N + col] = sum;
    }
}
'''
matmul_kernel = cp.RawKernel(matmul_kernel_code, 'matmul_kernel')

def matmul_manual(A, B):
    M, K = A.shape
    K2, N = B.shape
    assert K == K2
    C = cp.zeros((M, N), dtype=cp.float32)
    block = (16, 16, 1)
    grid = ((N+15)//16, (M+15)//16, 1)
    matmul_kernel(grid, block, (A, B, C, M, K, N))
    return C

softmax_kernel_code = r'''
extern "C" __global__
void row_softmax(float* X, int M, int N){
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if(row < M){
        float max_val = -1e20f;  
        for(int j=0;j<N;++j){
            float v = X[row*N+j];
            max_val = v > max_val ? v : max_val;
        } 
        float sum_exp=0.0f;
        for(int j=0;j<N;++j){
            float e = __expf(X[row*N+j]-max_val);
            X[row*N+j] = e;
            sum_exp += e;
        }
        float inv=1.0f/sum_exp;
        for(int j=0;j<N;++j){
            X[row*N+j]*=inv;
        }
    }
}
'''
softmax_kernel = cp.RawKernel(softmax_kernel_code, 'row_softmax')

def softmax_manual(X):
    M,N = X.shape
    block=(128,1,1)
    grid=((M+127)//128,1,1)
    softmax_kernel(grid,block,(X,M,N))
    return X

relu_kernel_code = r'''
extern "C" __global__
void relu(float* X, int MN){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx < MN){
        float v=X[idx];
        X[idx]=v>0.0f?v:0.0f;
    }
}
'''
relu_kernel = cp.RawKernel(relu_kernel_code,'relu')

def relu_manual(X):
    MN=X.size
    block=(256,1,1)
    grid=((MN+255)//256,1,1)
    relu_kernel(grid,block,(X,MN))
    return X

# Transformer Classifier 
class TransformerClassifierGPU:
    def __init__(self, input_dim, n_classes, d_model=64):
        self.W_embed = cp.random.randn(input_dim, d_model).astype(cp.float32)*0.1
        self.W_Q = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_K = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_V = cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.W_ff= cp.random.randn(d_model,d_model).astype(cp.float32)*0.1
        self.b_ff= cp.zeros(d_model,dtype=cp.float32)
        self.W_out=cp.random.randn(d_model,n_classes).astype(cp.float32)*0.1
        self.b_out=cp.zeros(n_classes,dtype=cp.float32)

    def forward(self,X):
        X_emb=matmul_manual(X,self.W_embed)
        Q=matmul_manual(X_emb,self.W_Q)
        K=matmul_manual(X_emb,self.W_K)
        V=matmul_manual(X_emb,self.W_V)
        scores=matmul_manual(Q,K.T.copy())/cp.sqrt(cp.float32(Q.shape[1]))
        attn=softmax_manual(scores.copy())
        attn_out=matmul_manual(attn,V)
        X_res=X_emb+attn_out
        mean=cp.mean(X_res,axis=1,keepdims=True)
        std=cp.std(X_res,axis=1,keepdims=True)+1e-6
        X_norm=(X_res-mean)/std
        FF=matmul_manual(X_norm,self.W_ff)+self.b_ff
        FF=relu_manual(FF)
        X_ff=X_norm+FF
        logits=matmul_manual(X_ff,self.W_out)+self.b_out
        return logits,X_ff
    
    def update(self,X_ff,y,logits,lr):
        exp_logits=cp.exp(logits-cp.max(logits,axis=1,keepdims=True))
        probs=exp_logits/cp.sum(exp_logits,axis=1,keepdims=True)
        one_hot=cp.zeros_like(probs)
        one_hot[cp.arange(y.size),y]=1.0
        grad_logits=(probs-one_hot)/y.size
        grad_W=matmul_manual(X_ff.T,grad_logits)
        grad_b=cp.sum(grad_logits,axis=0)
        self.W_out-=lr*grad_W
        self.b_out-=lr*grad_b
#loss 
def cross_entropy_loss(logits,y):
    maxl=cp.max(logits,axis=1,keepdims=True)
    log_probs=logits-maxl-cp.log(cp.sum(cp.exp(logits-maxl),axis=1,keepdims=True))
    return -cp.mean(log_probs[cp.arange(y.size),y])

if __name__=="__main__":
    df=pd.read_csv(r"C:\Users\Pavani Akshaya\Downloads\fin_data_1.csv")
    sentences=[clean_text(s) for s in df["Sentence"].astype(str).tolist()]

    label_map={
        "positive":2,"pos":2,"1":2,
        "neutral":1,"neu":1,"2":1,
        "negative":0,"neg":0,"0":0
    }
    labels=df["Sentiment"].astype(str).str.lower().map(label_map)
    
    # Fallback: if unmapped, assign neutral (1)
    labels=labels.fillna(1).astype(int).to_numpy()

    idx = np.arange(len(sentences))
    np.random.seed(42)
    np.random.shuffle(idx)
    split = int(0.8*len(sentences))
    train_idx, test_idx = idx[:split], idx[split:]
    train_sent = [sentences[i] for i in train_idx]
    test_sent  = [sentences[i] for i in test_idx]
    train_lab  = labels[train_idx]
    test_lab   = labels[test_idx]

    vocab=build_vocab(train_sent)
    X_train_cpu=encode_sentences_onehot(train_sent,vocab)
    X_test_cpu=encode_sentences_onehot(test_sent,vocab)

    print("\n[Before Oversampling] Class distribution:")
    unique, counts = np.unique(train_lab, return_counts=True)
    for u,c in zip(unique,counts):
        print(f"Class {u}: {c}")

    X_train_bal, y_train_bal = balance_classes(X_train_cpu, train_lab)

    print("\n[After Oversampling] Class distribution:")
    unique, counts = np.unique(y_train_bal, return_counts=True)
    for u,c in zip(unique,counts):
        print(f"Class {u}: {c}")

    # Move to GPU
    X_train=cp.asarray(X_train_bal)
    X_test=cp.asarray(X_test_cpu)
    y_train=cp.asarray(y_train_bal)
    y_test=cp.asarray(test_lab)

    model=TransformerClassifierGPU(X_train.shape[1],3)

    # Training
    for epoch in range(100):   
        logits,X_ff=model.forward(X_train)
        loss=cross_entropy_loss(logits,y_train)
        model.update(X_ff,y_train,logits,0.01)
        if (epoch+1)%50==0:
            preds=cp.argmax(logits,axis=1)
            acc=cp.mean((preds==y_train).astype(cp.float32))
            print(f"Epoch {epoch+1} Loss={loss:.4f} Train Acc={float(acc)*100:.2f}%")

    # Final test evaluation
    logits,_=model.forward(X_test)
    y_pred=cp.argmax(logits,axis=1)
    acc=cp.mean((y_pred==y_test).astype(cp.float32))
    print(f"\n=== Final Test Accuracy: {float(acc)*100:.2f}% ===")



[Before Oversampling] Class distribution:
Class 0: 679
Class 1: 2537
Class 2: 1457

[After Oversampling] Class distribution:
Class 0: 2537
Class 1: 2537
Class 2: 2537
Epoch 50 Loss=1.2835 Train Acc=34.24%
Epoch 100 Loss=1.2746 Train Acc=34.29%

=== Final Test Accuracy: 34.39% ===
