<a href="https://colab.research.google.com/github/Ftaj03/FYP-Sentiment-analysis/blob/main/LSTM_with_Fastext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

Load DataSet

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/twitter sentiment analysis.csv')
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

Preprocess Data

In [None]:
import re
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text
df['Text'] = df['Text'].apply(clean_text)

Split Dataset Into Training Set and Test Set

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Label'], test_size=0.2, random_state=42)


Tokenize Text

In [None]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform input size
max_sequence_length = 50
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')


Load Fastext Embedding

In [None]:
def load_fasttext_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

fasttext_path = '/content/drive/MyDrive/Colab Notebooks/crawl-300d-2M-subword.vec'
embeddings_index = load_fasttext_embeddings(fasttext_path)

Create Embedding Matrix

In [None]:
# Create an embedding matrix
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Build LSTM Model

In [None]:
# Build the LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=True),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')  # 3 classes: positive, negative, neutral
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])




In [None]:
# Hyperparameters
batch_size = 128  # Start with 64, experiment with 32 or 128
epochs = 5  # Start with 20, use early stopping to avoid overfitting
callbacks = [
    EarlyStopping(patience=3, restore_best_weights=True),  # Stop if no improvement for 3 epochs
    ReduceLROnPlateau(factor=0.1, patience=2)  # Reduce learning rate if no improvement for 2 epochs
]
# Train the model with callbacks
history = model.fit(
    X_train_padded, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_padded, y_test),
    callbacks=callbacks
)
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Save Model
model.save("lstm_fasttext_sentiment.h5")

print("Training Complete & Model Saved!")


Epoch 1/5
[1m1073/1073[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 224ms/step - accuracy: 0.3547 - loss: nan - val_accuracy: 0.3543 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 2/5
[1m1073/1073[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 218ms/step - accuracy: 0.3534 - loss: nan - val_accuracy: 0.3543 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 3/5
[1m1073/1073[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 220ms/step - accuracy: 0.3510 - loss: nan - val_accuracy: 0.3543 - val_loss: nan - learning_rate: 1.0000e-06
Epoch 4/5
[1m1073/1073[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 220ms/step - accuracy: 0.3548 - loss: nan - val_accuracy: 0.3543 - val_loss: nan - learning_rate: 1.0000e-06
[1m1073/1073[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 32ms/step - accuracy: 0.3533 - loss: nan




Test Accuracy: 0.3543
Training Complete & Model Saved!


In [None]:
def predict_sentiment(text, tokenizer, model, max_length=100):
  seq = tokenizer.texts_to_sequences([text])
  padded = pad_sequences(seq, maxlen=max_length, padding='post')
  pred = model.predict(padded)
  labels = ["negative", "positive", "neutral"]
  return labels[np.argmax(pred)]

# Example Predictions
print(predict_sentiment("I hate this movie!", tokenizer, model))
print(predict_sentiment("This is the worst experience ever.", tokenizer, model))
print(predict_sentiment("The product is special.", tokenizer, model))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
negative
