In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from joblib import dump
import re

In [2]:
# Load the data
data = pd.read_csv("Twitter_Data.csv")

In [3]:
# Preprocess the data
data['text'] = data['text'].astype(str)
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s.]+', '', x).lower())

In [4]:
# Split data into features (x) and labels (y)
x = data['text']
y = data['sentiment']

In [5]:
# Replace sentiment labels with integers
y = y.replace({'negative': 0, 'neutral': 1, 'positive': 2})

In [6]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


In [7]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=9000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [8]:
# Pad sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=250, padding='pre')
X_test_pad = pad_sequences(X_test_seq, maxlen=250, padding='pre')

In [9]:
# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [10]:
# Convert labels to one-hot encoding
num_classes = len(y.unique())
y_train_onehot = tf.keras.utils.to_categorical(y_train_encoded, num_classes=num_classes)
y_test_onehot = tf.keras.utils.to_categorical(y_test_encoded, num_classes=num_classes)


In [12]:
# Build the model
model = tf.keras.Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

In [13]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [14]:
# Train the model
model.fit(X_train_pad, y_train_onehot, epochs=12, batch_size=64, validation_data=(X_test_pad, y_test_onehot))

Epoch 1/12
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 627ms/step - accuracy: 0.4855 - loss: 0.9982 - val_accuracy: 0.6957 - val_loss: 0.7122
Epoch 2/12
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 577ms/step - accuracy: 0.7544 - loss: 0.6146 - val_accuracy: 0.7240 - val_loss: 0.6640
Epoch 3/12
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 612ms/step - accuracy: 0.8240 - loss: 0.4735 - val_accuracy: 0.7198 - val_loss: 0.7118
Epoch 4/12
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 459ms/step - accuracy: 0.8600 - loss: 0.3898 - val_accuracy: 0.7109 - val_loss: 0.8057
Epoch 5/12
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 582ms/step - accuracy: 0.8814 - loss: 0.3300 - val_accuracy: 0.7022 - val_loss: 0.8338
Epoch 6/12
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 502ms/step - accuracy: 0.9078 - loss: 0.2676 - val_accuracy: 0.7008 - val_loss: 0.9774
Epoc

<keras.src.callbacks.history.History at 0x1a80ee81b50>

In [15]:
# Save the model and tokenizer
model.save('sentiment_model.h5')
dump(tokenizer, 'tokenizer.joblib')



['tokenizer.joblib']