<a href="https://colab.research.google.com/github/AnasTarek25/My-Projects/blob/main/Sentiment_Test_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recurrent Neural Network: Sentiment Analysis Project

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout

from tensorflow.keras.optimizers import Adam

In [3]:
df = pd.read_csv('/content/Sentiment Project Dataset.csv', encoding='latin-1')
df.head()

Unnamed: 0,index,id,date,query,username,tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [4]:
df['index'].value_counts()

Unnamed: 0_level_0,count
index,Unnamed: 1_level_1
0,799999
4,248576


our data is unbalanced

In [5]:
df['index'].replace({ 4:1}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['index'].replace({ 4:1}, inplace=True)


In [6]:
df.shape

(1048575, 6)

In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df.isnull().sum()

Unnamed: 0,0
index,0
id,0
date,0
query,0
username,0
tweet,0


# Clean The Text:  (get rid od @ ! ? / ...etc )

In [9]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
stop_words = set(stopwords.words('english'))  # remove is, a, the, etc...
stemmer = PorterStemmer()  # converts the word to its original form

In [12]:
def smart_clean_text(text, stem=False):
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r"http\S+", "", text)           # Remove anything starting with http
    text = re.sub(r"www\.\S+", "", text)          # Remove anything like www.xyz.com

    # Remove @mentions
    text = re.sub(r"@\w+", "", text)

    # Remove punctuation, numbers, and special chars — keep only letters
    text = re.sub(r"[^a-z\s]", " ", text)

    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize and remove stopwords
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)

    return " ".join(tokens)

In [13]:
df['tweet'] = df['tweet'].apply(smart_clean_text)

In [14]:
df['tweet']

Unnamed: 0,tweet
0,upset update facebook texting might cry result...
1,dived many times ball managed save rest go bounds
2,whole body feels itchy like fire
3,behaving mad see
4,whole crew
...,...
1048570,grandma making dinenr mum
1048571,mid morning snack time bowl cheese noodles yum
1048572,say like terminiator movies comes like words
1048573,im great thaanks wbuu


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   index     1048575 non-null  int64 
 1   id        1048575 non-null  int64 
 2   date      1048575 non-null  object
 3   query     1048575 non-null  object
 4   username  1048575 non-null  object
 5   tweet     1048575 non-null  object
dtypes: int64(2), object(4)
memory usage: 48.0+ MB


In [16]:
df.isnull().sum()

Unnamed: 0,0
index,0
id,0
date,0
query,0
username,0
tweet,0


# Trainning RNN model:

In [17]:
X = df['tweet'].values
y = df['index'].values

# Tokenizers:

**Tried to enhance my vocab by using a stronger tokenizer (tranform) from hugging face**

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
# obv my model needs numbers to work so we will tokenize our text: giving each word a value

vocab_size = 50000  # use only the top 50k words in my dataset
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")  # if data uses words out of the 30k it replaces it with (oov) out of vocab
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)

# Padding Text

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 100
X_padded = pad_sequences(X_seq, maxlen=max_length, # my models needs all datasets to have a certain lengths  by adding 0s to the short datas
                         padding='post',
                         truncating='post')  # cuts longer sequences

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.4, random_state=42)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Bidirectional

# My data was inbalanced so i had to balance the weights

In [22]:
from sklearn.utils import class_weight
import numpy as np

class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(zip(np.unique(y_train), class_weights))

print("Class Weights:", class_weights)

Class Weights: {np.int64(0): np.float64(0.6551832944896059), np.int64(1): np.float64(2.1109981478498954)}


In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout

# Model architecture (binary)
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=32,
              input_length=max_length),

    SimpleRNN(16), # 16 is no of neurons              # i tried to add dense RELU (more neurons) but it lessened the accuracy
    Dense(1, activation='sigmoid')  # Binary output
])



In [24]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [25]:
history = model.fit(
    X_train, y_train,
    epochs=4,
    batch_size=512,
    validation_split=0.1,
    class_weight=class_weights
)

Epoch 1/4
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 56ms/step - accuracy: 0.6404 - loss: 0.6268 - val_accuracy: 0.7032 - val_loss: 0.5586
Epoch 2/4
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 59ms/step - accuracy: 0.7303 - loss: 0.5341 - val_accuracy: 0.6947 - val_loss: 0.5709
Epoch 3/4
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 57ms/step - accuracy: 0.7551 - loss: 0.5017 - val_accuracy: 0.7731 - val_loss: 0.4800
Epoch 4/4
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 55ms/step - accuracy: 0.7786 - loss: 0.4716 - val_accuracy: 0.7773 - val_loss: 0.4772


i tried to enhance my vocab by increasing the token limits to maybe convert more words to tokens so my model can learn more words

# Model Testing

In [26]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_sentiment(text, model, tokenizer, max_len=100):
    # Preprocess
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')

    # Predict
    pred = model.predict(padded)[0]

    if len(pred) == 2:
        sentiment_label = "Positive" if np.argmax(pred) == 1 else "Negative"
        confidence = float(np.max(pred))  # ✅ Convert to float
    else:
        sentiment_label = "Positive" if pred > 0.5 else "Negative"
        confidence = float(pred) if pred > 0.5 else float(1 - pred)  # ✅

    return sentiment_label, confidence


In [28]:
text = input("Enter a sentence for sentiment analysis: ")
sentiment, confidence = predict_sentiment(text, model, tokenizer)
print(f"Text: '{text}'")
print(f"Prediction: {sentiment} (Confidence: {confidence:.0%})")

Enter a sentence for sentiment analysis: my mouth smells
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Text: 'my mouth smells'
Prediction: Negative (Confidence: 53%)


  confidence = float(pred) if pred > 0.5 else float(1 - pred)  # ✅


#  UI:

In [30]:
import gradio as gr

def analyze_sentiment(text):
    sentiment, confidence = predict_sentiment(text, model, tokenizer)
    if sentiment == "Positive":
        color = "green"
        emoji = "😊"
        message = f"<div style='color:{color}; font-size:1.3em; font-weight:bold;'>\
{emoji} Positive<br>Confidence: {confidence:.0%}</div>"
    else:
        color = "red"
        emoji = "😞"
        message = f"<div style='color:{color}; font-size:1.3em; font-weight:bold;'>\
{emoji} Negative<br>Confidence: {confidence:.0%}</div>"
    return message

custom_css = """
.gradio-container {background: linear-gradient(120deg, #f8fafc 0%, #e0e7ff 100%);}
footer {display:none !important;}
"""

with gr.Blocks(css=custom_css, title="Sentiment Analysis") as demo:
    gr.Markdown(
        """
        # 🌟 Sentiment Analysis UI
        Enter a sentence below to see if it's <span style='color:green;font-weight:bold;'>Positive</span> or <span style='color:red;font-weight:bold;'>Negative</span>!
        """
    )
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(lines=2, placeholder="Type your sentence here...", label="Input Sentence")
            btn = gr.Button("Analyze Sentiment", elem_id="analyze-btn")
        with gr.Column():
            out = gr.HTML(label="Result")

    btn.click(analyze_sentiment, inputs=inp, outputs=out)
    inp.submit(analyze_sentiment, inputs=inp, outputs=out)

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ecbdcf654c68ba1451.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


