In [None]:
import nltk
import random
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.classify.util import accuracy as nltk_accuracy

# Download the NLTK data.
# 'punkt_tab' is what word_tokenize loads on newer NLTK releases (3.9+), where
# it replaced the pickled 'punkt' models; without it tokenization fails with a
# LookupError. Older releases do not know the package: nltk.download() then
# just reports the error and returns False, so requesting both is harmless.
for resource in ('twitter_samples', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Raw tweet texts of the two labelled sample files, one string per tweet.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

import re
import string
# English stopwords held in a set: preprocess() checks every token of every
# tweet for membership, so constant-time lookups avoid rescanning a list.
stopwords_english = set(stopwords.words('english'))
# Single Porter stemmer instance reused by every preprocess() call.
stemmer = PorterStemmer()

def preprocess(tweet):
    """Clean one raw tweet and return its stemmed content tokens.

    The text is stripped of stock tickers, a leading old-style "RT" marker,
    hyperlinks and hash signs, then tokenized with ``word_tokenize``.
    Stopwords and punctuation-only tokens are dropped and the remaining
    tokens are stemmed with the module-level ``stemmer``.

    Args:
        tweet: Raw tweet text.

    Returns:
        List of stemmed tokens, in the order they appear in the tweet.
    """
    # Remove stock market tickers like $GE
    tweet = re.sub(r'\$\w+', '', tweet)

    # Remove old style retweet text "RT"
    tweet = re.sub(r'^RT\s+', '', tweet)

    # Remove hyperlinks. Only the URL itself (up to the next whitespace) is
    # matched; a greedy `.*` here would also delete every word that follows
    # the link on the same line.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # Remove hashtags (only removing the hash # sign from the word)
    tweet = tweet.replace('#', '')

    # Tokenize the tweet
    tweet_tokens = word_tokenize(tweet)

    # Remove stopwords and punctuation, then stem
    tweets_clean = []
    for word in tweet_tokens:
        # The stopword list is lowercase, so compare case-insensitively;
        # otherwise capitalised stopwords such as "I" or "The" are kept.
        if word.lower() in stopwords_english:
            continue
        # Drop tokens made up solely of punctuation characters. A character
        # check also catches multi-character tokens such as "..." or "``",
        # which a substring test against string.punctuation lets through.
        if all(char in string.punctuation for char in word):
            continue
        tweets_clean.append(stemmer.stem(word))

    return tweets_clean

# Run every raw tweet through preprocess(); each class keeps its own list of
# token lists, in the same order as the raw tweets.
positive_tweets_clean = [preprocess(raw_tweet) for raw_tweet in positive_tweets]
negative_tweets_clean = [preprocess(raw_tweet) for raw_tweet in negative_tweets]

def get_tweets_for_model(cleaned_tweets):
    """Lazily turn token lists into bag-of-words feature dicts.

    Args:
        cleaned_tweets: Iterable of token sequences, one per tweet.

    Yields:
        One dict per tweet mapping each distinct token to ``True``.
    """
    yield from (dict.fromkeys(tokens, True) for tokens in cleaned_tweets)

# Positive class: lazy feature dicts, then (features, label) pairs.
# Building the list consumes the generator.
positive_tweets_model = get_tweets_for_model(positive_tweets_clean)
positive_dataset = [(features, 'Positive') for features in positive_tweets_model]

# Negative class, built the same way.
negative_tweets_model = get_tweets_for_model(negative_tweets_clean)
negative_dataset = [(features, 'Negative') for features in negative_tweets_model]

# Merge both classes, shuffle in place, then split into train and test sets:
# the first 7000 examples train, the remainder test.
dataset = [*positive_dataset, *negative_dataset]
random.shuffle(dataset)
train_data, test_data = dataset[:7000], dataset[7000:]

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Combine positive and negative cleaned tweets (positives first, so that the
# label vector built below lines up row for row).
all_cleaned_tweets = positive_tweets_clean + negative_tweets_clean

# Create tokenizer and fit its vocabulary on the token lists
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_cleaned_tweets)

# Convert tweets to sequences of integer word indices
sequences = tokenizer.texts_to_sequences(all_cleaned_tweets)

# Pad sequences at the end so every row has the length of the longest tweet
max_length = max(len(sequence) for sequence in sequences)
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Prepare labels: 1 = positive, 0 = negative
labels = np.array([1] * len(positive_tweets_clean) + [0] * len(negative_tweets_clean))

# Split the dataset into train and test sets
TRAIN_SIZE = 7000
SPLIT_SEED = 42
# A dedicated, seeded generator makes the split reproducible across runs
# without touching the state of the global `random` module.
split_rng = random.Random(SPLIT_SEED)
train_indices = split_rng.sample(range(len(X)), TRAIN_SIZE)
# Membership is tested once per row; a set keeps that O(1) instead of
# rescanning the whole train index list for each row.
train_index_set = set(train_indices)
test_indices = [i for i in range(len(X)) if i not in train_index_set]

X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = labels[train_indices], labels[test_indices]


In [None]:
# Create the LSTM model
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding value
embedding_dim = 100
lstm_units = 128

model = Sequential()
# mask_zero=True makes the LSTM skip the zero padding appended to each
# sequence; without it the final state is computed after a run of padding
# steps rather than right after the last real token.
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, mask_zero=True))
model.add(LSTM(lstm_units))
# Single sigmoid unit: output is the probability of the positive class (label 1)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
# restore_best_weights=True rolls the model back to the epoch with the lowest
# validation loss; otherwise the weights evaluated below are those from
# `patience` epochs after the best one.
callbacks = [EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1, callbacks=callbacks)

# Evaluate the model on the held-out test rows
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy:.2f}")


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Test accuracy: 0.76


In [None]:
def predict_sentiment(text):
    """Classify a piece of text with the trained model.

    The text goes through the same steps as the training data: preprocess(),
    conversion to word indices with the fitted ``tokenizer``, and post-padding
    to ``max_length``.

    Args:
        text: Raw input text.

    Returns:
        "Positive" if the predicted probability is at least 0.5, otherwise
        "Negative".
    """
    cleaned_text = preprocess(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    # For a single input row the model returns a (1, 1) array; extract the
    # scalar explicitly instead of relying on the truth value of an array.
    # verbose=0 suppresses the per-call progress bar.
    probability = float(model.predict(padded_sequence, verbose=0)[0][0])
    return "Positive" if probability >= 0.5 else "Negative"

# Smoke test: classify one hand-written sentence and print the predicted label
test_string = "I love this new phone!"
sentiment = predict_sentiment(test_string)
print(f"The sentiment of the test string '{test_string}' is {sentiment}.")


The sentiment of the test string 'I love this new phone!' is Positive.
