# Sentiment Analysis Notebook

Sentiment analysis notebook with a simple RNN by Basel.

## 1. Includes

In [34]:
# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re as reg

# NLP refs
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# THE BIG GUNS
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout


# Eval
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report



pd.set_option('display.max_colwidth', 100)
plt.style.use('ggplot')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\basel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\basel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\basel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\basel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Dataset info

In [35]:

splits = {'train': 'train_df.csv', 'validation': 'val_df.csv', 'test': 'test_df.csv'}
df = pd.read_csv("hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset/" + splits["train"])
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()

Dataset shape: (31232, 4)

First few rows:


Unnamed: 0,id,text,label,sentiment
0,9536,"Cooking microwave pizzas, yummy",2,positive
1,6135,Any plans of allowing sub tasks to show up in the widget?,1,neutral
2,17697,"I love the humor, I just reworded it. Like saying 'group therapy' instead`a 'gang banging'. Kee...",2,positive
3,14182,naw idk what ur talkin about,1,neutral
4,17840,That sucks to hear. I hate days like that,0,negative


In [36]:
print("nulls:")
df.isnull().sum()

nulls:


id           0
text         0
label        0
sentiment    0
dtype: int64

In [37]:
print("has links?")
df[df['text'].str.contains('https?')].head()

has links?


Unnamed: 0,id,text,label,sentiment
21,14653,(bye) plurk out muna. will be back when im done reading the book http://plurk.com/p/rq9c7,1,neutral
24,2900,http://twitpic.com/3ernb - Loving The Jumper,2,positive
54,19253,Download movie 'Ben 10: Alien Force' http://tinyurl.com/czb7b2 cool #movie,2,positive
64,172,URL in previous post (to timer job) should be http://bit.ly/a4Fdb. I`d removed space which messe...,0,negative
102,13643,_nicole http://twitpic.com/4t6qx - i have the exact same pic except it`s my hubby & my chihuahua...,2,positive


In [38]:
print("all english letters?")
df[df['text'].str.contains('[^\x00-\x7F]')].head()

all english letters?


Unnamed: 0,id,text,label,sentiment
91,6059,"The app is great, and I´ve started to use it rather than wunderlist. Just 3 stars because I don´...",1,neutral
174,8112,"💩👎💩👎💩👎💩 👻☠️ IMPOSSIBLE TO DELETE LIST, OR SUB LISTS (TABS)!!! ☠️👻 REGRET v2.0 UPGRADE!! 👎💩👎💩👎💩👎💩...",0,negative
185,4095,Es muy útil para mantener las aplicaciones del celular cerradas mientras trabajas.,2,positive
241,2091,the app and widɡet are very ɡood. Will change to 5 start when I can be able to add task using go...,1,neutral
414,26054,"ohhhh, how sad...I didnï¿½t get it!",0,negative


## 3. The Prep Kitchen (Preprocessing)

In [39]:
def cleaner(text):
    text = text.lower()
    text = reg.sub(r'http\S+|www\S+|https\S+', '', text)
    text = reg.sub(r'@\w+', '', text)
    text = reg.sub(r'[^a-zA-Z\s]', '', text)    
    text = reg.sub(r'\s+', ' ', text).strip()
    return text
df['cleaned_text'] = df['text'].apply(cleaner)
print("Any stragglers?")
df[df['cleaned_text'].str.contains('[^\x00-\x7F]')].head()


Any stragglers?


Unnamed: 0,id,text,label,sentiment,cleaned_text


In [None]:
def preprocesser(text):
    # tokenize
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # stem
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    processed_text = ' '.join(stemmed_tokens)
    
    return processed_text

df['processed_text'] = df['cleaned_text'].apply(preprocesser)
df['processed_text'].head()

0                                                           cook microwav pizza yummi
1                                                     plan allow sub task show widget
2    love humor reword like say group therapi insteada gang bang keep mom back hahaha
3                                                                   naw idk ur talkin
4                                                             suck hear hate day like
Name: processed_text, dtype: object

## 4. Divorcing the dataset (Splitting)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], df['label'], test_size=0.2, random_state=42
)
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (24985,)
Testing set shape: (6247,)


## 5. CBOWing and stuff (Tokenization/Padding)

In [None]:
tokenizer = Tokenizer(num_words=1000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

sequences = tokenizer.texts_to_sequences(X_train)

padded_sequences = pad_sequences(sequences, 
                                maxlen=150,
                                padding='post',
                                truncating='post')

print("Original text:", X_train[0])
print("Tokenized sequence:", sequences[0])
print("Shape of padded sequences:", padded_sequences.shape)
print("Example of padded sequence:", padded_sequences[0])

X = padded_sequences
y = df['label']

Original text: cook microwav pizza yummi
Tokenized sequence: [1, 1, 921, 1]
Shape of padded sequences: (31232, 150)
Example of padded sequence: [  1   1 921   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0]


## 6. ***IT'S ALIVE*** (Training)

In [43]:
# Params
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128  
rnn_units = 64      
max_length = 150     

# The real deal
model = Sequential([
    # Embedding layer
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    
    SimpleRNN(rnn_units, activation='tanh', return_sequences=False),
    
    Dropout(0.5),
    
    # Output layer with sigmoid activation for binary classification
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Display model summary
model.summary()

# Train the model (uncomment when ready to train)
# history = model.fit(
#     padded_sequences,
#     labels,
#     epochs=10,
#     batch_size=32,
#     validation_split=0.2,
#     verbose=1
# )



## 7. Is he reaally tho? (Eval)

In [None]:
plt.figure(figsize=(12, 4), dpi=150)
plt.plot(history.history['accuracy'], label='train_accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Get model predictions (probabilities)
y_probs = model.predict(padded_sequences)

# Converting probs into Binary values
y_pred = np.argmax(y_probs, axis=1)

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(padded_sequences, y_test)
print(f"Loss: {loss}, Accuracy: {accuracy}")

In [None]:
# classification report
report = classification_report(y_test, y_pred)

# the report
print("Classification Report:")
print(report)

In [None]:
plt.figure(figsize=(6, 5))
sns.heatmap(cfm, annot=True, fmt="d", cmap="Blues", xticklabels=['Negative' , 'Positive' , 'Neutral'], yticklabels=['Negative' , 'Positive','Neutral'])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()