In [51]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [52]:
# Step 1: Load and Merge Datasets
# Read the two CSV files from the paths below and stack them row-wise into one
# frame; ignore_index=True renumbers the combined rows from 0.
print("----- Downloading and Merging Datasets -----")
url_set1 = '/content/drive/MyDrive/deep_learning/Set-I.csv'
url_set2 = '/content/drive/MyDrive/deep_learning/Set-II.csv'
set1, set2 = (pd.read_csv(csv_path) for csv_path in (url_set1, url_set2))
dataset = pd.concat(objs=[set1, set2], ignore_index=True)
dataset.head(n=10)  # last expression of the cell -> rendered as its output

----- Downloading and Merging Datasets -----


Unnamed: 0,Tweets,label
0,Asked #ChatGPT about what it thinks are the pr...,Positive
1,#ChatGPT tornado has already traveled around t...,Neutral
2,This is a great explanation of why #EVs are mo...,Positive
3,‘if you need to write a box-ticking social med...,Positive
4,Just saw an AI tool making my coffee for me. \...,Positive
5,Do I trust #chatgpt to write bug free code?\n\...,Negative
6,Tonight a friend wondered about a science topi...,Positive
7,The rise and popularity of ChatGPT do signal a...,Positive
8,ChatGPT may not replace programmers but it did...,Positive
9,1.4 billion people's work lives are about to c...,Negative


In [53]:
# Step 2: Data Preprocessing
print("----- Preprocessing Data -----")

# Map sentiment labels to numerical values (multi-class mapping).
label_to_id = {'Positive': 0, 'Neutral': 1, 'Negative': 2}

# Values that are not keys of the mapping (e.g. labels that were already
# encoded by a previous run of this cell) pass through unchanged, so re-running
# the cell does not turn every label into NaN the way a plain `.map(dict)` does.
encoded_labels = dataset['label'].map(lambda value: label_to_id.get(value, value))

# Fail loudly on anything outside the three known classes (including missing
# values) instead of letting NaN leak into the training targets.
unknown_mask = ~encoded_labels.isin(list(label_to_id.values()))
if unknown_mask.any():
    unexpected = sorted(set(map(str, encoded_labels[unknown_mask])))
    raise ValueError(f"Unexpected sentiment labels in 'label' column: {unexpected}")

dataset['label'] = encoded_labels.astype('int64')

# Extract features (tweets) and labels
X = dataset['Tweets'].values  # Feature: Tweets
y = dataset['label'].values   # Target: Labels (0, 1, 2)


# Verify label conversion
print("Mapped labels:")
print(dataset['label'].values)



----- Preprocessing Data -----
Mapped labels:
[0 1 0 ... 2 0 2]


In [54]:
# Split Data
# Hold out 20% of the samples as a test set; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [55]:
# Tokenization and Padding
print("----- Tokenizing and Padding -----")
tokenizer = tf.keras.preprocessing.text.Tokenizer()
# The vocabulary is fitted on the training tweets only.
tokenizer.fit_on_texts(X_train)
# Convert both splits to integer sequences using that same vocabulary.
encoded_train, encoded_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
max_length = 40  # sequence length passed to pad_sequences in the next cell

----- Tokenizing and Padding -----


In [56]:
# Bring every sequence to exactly `max_length` tokens, adding zeros at the end
# of shorter ones (padding='post'). Both splits share the same options.
pad_options = dict(maxlen=max_length, padding='post')
padded_train = tf.keras.preprocessing.sequence.pad_sequences(encoded_train, **pad_options)
padded_test = tf.keras.preprocessing.sequence.pad_sequences(encoded_test, **pad_options)
print(padded_train[:2])  # peek at the first two padded training rows

[[2558  505   10  589  957    1    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [ 297    5    2    8  401   43  342    9  788   18   14  104    3  506
  1185 1186 2563    2  681    6   18  279  297 2564   44   97   18    5
   958  207  682   20  297  298   43    5  280  683  507    1]]


In [57]:
# Size of the embedding input: one slot per word known to the tokenizer, plus
# one extra slot for index 0, which the padded sequences use as filler.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)




6883


In [58]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the *training* labels only. The model is fitted on
# `padded_train`, so the target matrix must have exactly one row per training
# sample; encoding the full label vector would give a row-count mismatch with
# `padded_train` and would mix test labels into the training target.
# Deriving from `y_train` (instead of re-encoding `y` in place) also keeps the
# cell safe to re-run. The name `y` is kept because the fit cell passes `y=y`.
y = to_categorical(y_train, num_classes=3)

In [63]:
# Step 3: Define the RNN Model for Multi-Class Classification
# The fixed sequence length is declared with an explicit Input layer rather
# than Embedding's `input_length` argument, which is deprecated (and ignored)
# in Keras 3. With the Input layer the model is built immediately, so
# model.summary() can report concrete output shapes.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(max_length,)),                            # One padded token-id sequence per sample
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=24), # 24-dimensional word embeddings
    tf.keras.layers.SimpleRNN(24, return_sequences=False),  # RNN layer with 24 units, last state only
    tf.keras.layers.Dense(64, activation='relu'),           # Dense layer with 64 units
    tf.keras.layers.Dropout(0.7),                           # Dropout for regularization
    tf.keras.layers.Dense(32, activation='relu'),           # Dense layer with 32 units
    tf.keras.layers.Dropout(0.7),                           # Dropout for regularization
    tf.keras.layers.Dense(3, activation='softmax')          # Output layer for 3 classes
])

# categorical_crossentropy expects one-hot encoded targets with 3 columns,
# matching the 3-unit softmax output.
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Stop once validation accuracy has not improved for 20 epochs and roll the
# model back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True)



In [64]:
# Train the model. The last 20% of the supplied samples are set aside by Keras
# as the validation set that `early_stop` monitors; `history` keeps the
# per-epoch metrics returned by fit().
history = model.fit(
    padded_train,               # padded integer sequences (model input)
    y,                          # one-hot encoded labels (model target)
    batch_size=32,
    epochs=100,                 # upper bound; early stopping may end training sooner
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1,                  # show the per-epoch progress bar
)


Epoch 1/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.4769 - loss: 1.0525 - val_accuracy: 0.6039 - val_loss: 0.9806
Epoch 2/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.6081 - loss: 0.9964 - val_accuracy: 0.6039 - val_loss: 0.9786
Epoch 3/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.6372 - loss: 0.9045 - val_accuracy: 0.6039 - val_loss: 0.9814
Epoch 4/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6294 - loss: 0.8824 - val_accuracy: 0.6039 - val_loss: 0.9763
Epoch 5/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6291 - loss: 0.8509 - val_accuracy: 0.6039 - val_loss: 1.0485
Epoch 6/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.6547 - loss: 0.7264 - val_accuracy: 0.6039 - val_loss: 1.1180
Epoch 7/100
[1m32/32[0m [

In [65]:
# Step 5: Save the Model
# Create the directory that will actually receive the file. Deriving it from
# the save path (rather than creating a relative "models" folder) makes the
# save work regardless of the notebook's current working directory.
model_path = "/content/models/sentiment_model.h5"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)




In [70]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def predict_sentiment(sentence):
    """Predict the sentiment label of a single sentence.

    Relies on the notebook-level `tokenizer`, `max_length` and `model`
    objects created in earlier cells.

    Args:
        sentence: Raw text to classify.

    Returns:
        One of 'Positive', 'Neutral' or 'Negative'.
    """
    # Convert the sentence to token ids with the tokenizer fitted on the
    # training tweets.
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])

    # Pad exactly as the training data was padded (zeros appended at the end).
    # pad_sequences defaults to padding='pre', which would feed the model
    # inputs laid out differently from what it was trained on.
    padded_sentence = pad_sequences(tokenized_sentence, maxlen=max_length, padding='post')

    # Predict class probabilities and pick the most likely class (0, 1, or 2).
    prediction = model.predict(padded_sentence)
    predicted_class = np.argmax(prediction, axis=1)

    # Map the predicted class back to the original label (inverse of the
    # encoding used during preprocessing).
    label_map = {0: 'Positive', 1: 'Neutral', 2: 'Negative'}
    return label_map[int(predicted_class[0])]

# Example prediction: run one hand-written sentence through the helper and
# report the label it returns.
sentence = "I love this product!"
predicted_sentiment = predict_sentiment(sentence)
print("Predicted sentiment: {}".format(predicted_sentiment))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Predicted sentiment: Positive
