In [2]:
import numpy as np
import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('twitter_samples')

# Load the Twitter Samples dataset
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Preprocess the data
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_data(data):
    processed_data = []
    for tweet in data:
        tweet = word_tokenize(tweet)
        tweet = ' '.join([lemmatizer.lemmatize(word.lower()) for word in tweet if word.isalpha()])
        tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
        processed_data.append(tweet)
    return processed_data

positive_tweets = preprocess_data(positive_tweets)
negative_tweets = preprocess_data(negative_tweets)

# Combine positive and negative tweets and create labels
tweets = positive_tweets + negative_tweets
labels = [1] * len(positive_tweets) + [0] * len(negative_tweets)

# Tokenize the tweets
vocabulary_size = 5000
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)

# Pad sequences to the same length
max_sequence_length = 250
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert to numpy arrays
padded_sequences = np.array(padded_sequences)
labels = np.array(labels)

# Build the model
embedding_size = 32
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model
batch_size = 64
epochs = 10
model.fit(padded_sequences, labels, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(padded_sequences, labels)
print("Loss:", loss)
print("Accuracy:", accuracy)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss: 0.3314696252346039
Accuracy: 0.8806999921798706
