In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
df = pd.read_csv('Sentiment_data\\train.csv', encoding='latin-1', usecols=['selected_text', 'sentiment'] , nrows=5000)
df = df.dropna()
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0, 'neutral': -1})
df.selected_text = df.selected_text.str.lower()
df.columns = ['text', 'sentiment']
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,text,sentiment
0,thanks,1
1,miss,0
2,"_jayr oh, i know how budgeting is.",-1
3,things aren`t just as easy and simple as they ...,0
4,i agree. everybody would`ve been excited and t...,-1


In [3]:
df.shape

(4999, 2)

In [4]:
import re

# Function to clean text
def clean_text(text):
    # Remove special characters and excessive punctuation
    text = re.sub(r'[^\w\s]', '', text)  # Keep only words and spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text.strip()

# Apply cleaning to the dataset
df['text'] = df['text'].apply(clean_text)

# Rebuild the vocabulary
from collections import Counter

# Tokenize and count word frequencies
all_words = ' '.join(df['text']).split()
word_counts = Counter(all_words)

# Filter out rare or unwanted tokens
min_frequency = 2  # Keep words that appear at least twice
vocab = [word for word, count in word_counts.items() if count >= min_frequency]
vocab.append('<UNK>')  # Add a placeholder for unknown words

# Create word2idx mapping
word2idx = {word: idx for idx, word in enumerate(vocab)}

In [5]:
idx2word = {idx: word for word, idx in word2idx.items()}
idx2word["<UNK>"] = len(idx2word)

print(len(word2idx), len(idx2word))

def one_hot_encode(word, word2idx):
    if word not in word2idx:
        word = "<UNK>"
    one_hot = np.zeros(len(word2idx))
    one_hot[word2idx[word]] = 1
    return one_hot
  

2124 2125


In [6]:
import numpy as np

# Initialize weights and biases
input_size = len(vocab)
hidden_size = 10
output_size = 3



In [7]:


np.random.seed(0)
whx = np.random.randn(hidden_size , input_size)  # (10,33)
whh = np.random.randn(hidden_size, hidden_size)  # (10,10)
why = np.random.randn(output_size, hidden_size)  # (3,10)
bh = np.zeros((hidden_size, 1))  # (10,1)
by = np.zeros((output_size, 1))  # (3,1)
o0 = np.zeros((hidden_size, 1))  # (10,1)


In [8]:
# forward pass

def forward_pass(sentence, whx, whh, why, bh, by, o0):
    for word in sentence.split():
        x = one_hot_encode(word, word2idx)
        x = x.reshape(-1, 1)
        h = np.tanh(np.dot(whx, x) + np.dot(whh, o0) + bh)  # (10,33) * (33,1) + (10,10) * (10,1) + (10,1) = (10,1)
    y = np.dot(why, h) + by  # (3,10) * (10,1) + (3,1) = (3,1)
    o0 = h
    return y, h, o0

In [11]:
def softmax(y):
    exp_y = np.exp(y - np.max(y))  # Subtract max for numerical stability
    return exp_y / np.sum(exp_y)

def cross_entropy_loss(y_true, y_pred):
    return -np.sum(y_true * np.log(y_pred + 1e-9))  # Add small value to avoid log(0)

def backward_pass(y_true, y_pred, o0, h, x):
    # Gradients for why and by
    dy = y_pred - y_true
    dwhy = np.dot(dy, h.T)
    dby = dy

    # Gradients for whh, bh, and whx
    dh = np.dot(why.T, dy) * (1 - h ** 2)  # Derivative of tanh
    dwhh = np.dot(dh, o0.T)
    dbh = dh
    dwhx = np.dot(dh, x.T)

    return dwhy, dby, dwhh, dbh, dwhx

learning_rate = 0.001
epochs = 200

# Corrected Training Loop
for epoch in tqdm(range(epochs)):
    for sentence, sentiment in zip(df["text"], df["sentiment"]):
        o0 = np.zeros((hidden_size, 1))  # Reset hidden state for each sentence
        y_true = np.zeros((output_size, 1))
        y_true[sentiment] = 1

        # Forward pass
        for word in sentence.split():
          
            x = one_hot_encode(word, word2idx).reshape(-1, 1)
            y_pred, h, o0 = forward_pass(word, whx, whh, why, bh, by, o0)

        y_pred = softmax(y_pred)
        loss = cross_entropy_loss(y_true, y_pred)

        # Backward pass
        dwhy, dby, dwhh, dbh, dwhx = backward_pass(y_true, y_pred, o0, h, x)

        # Update weights and biases
        whx -= learning_rate * dwhx
        whh -= learning_rate * dwhh
        why -= learning_rate * dwhy
        bh -= learning_rate * dbh
        by -= learning_rate * dby

    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")

  0%|          | 1/200 [00:01<05:48,  1.75s/it]

Epoch 0, Loss: 0.47590100240935584


 10%|█         | 21/200 [00:55<07:58,  2.67s/it]

Epoch 20, Loss: 0.482069393321903


 20%|██        | 41/200 [01:39<06:39,  2.51s/it]

Epoch 40, Loss: 0.48051310529828056


 30%|███       | 61/200 [02:33<06:08,  2.65s/it]

Epoch 60, Loss: 0.49077761084591165


 40%|████      | 81/200 [03:22<03:55,  1.98s/it]

Epoch 80, Loss: 0.489367847663648


 50%|█████     | 101/200 [04:09<03:34,  2.16s/it]

Epoch 100, Loss: 0.4856410858320649


 60%|██████    | 121/200 [04:59<03:28,  2.65s/it]

Epoch 120, Loss: 0.47991401877410705


 70%|███████   | 141/200 [05:48<02:28,  2.51s/it]

Epoch 140, Loss: 0.4736745456482739


 80%|████████  | 161/200 [06:38<01:46,  2.74s/it]

Epoch 160, Loss: 0.4750914531926067


 90%|█████████ | 181/200 [07:35<00:47,  2.51s/it]

Epoch 180, Loss: 0.4744818231062563


100%|██████████| 200/200 [08:27<00:00,  2.54s/it]


In [10]:
# testin the model
class_names = ["negative", "neutral", "positive"]
def predict(sentence, whx, whh, why, bh, by, o0):
    o0 = np.zeros((hidden_size, 1))
    valid = False
    for word in sentence.split():
        if word not in word2idx:
            continue
        valid = True
        x = one_hot_encode(word, word2idx).reshape(-1, 1)
        y_pred, h, o0 = forward_pass(word, whx, whh, why, bh, by, o0)

    if not valid:
        return "Invalid sentence"

    y_pred = softmax(y_pred)
    return class_names[np.argmax(y_pred)]

predict(' i do not know', whx, whh, why, bh, by, o0)  

'positive'