In [None]:
import nltk
import random
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.classify.util import accuracy as nltk_accuracy

# Download the NLTK data
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load 'punkt_tab' instead of
# the pickled 'punkt' models; fetch both so tokenization works on old and new
# versions alike.
nltk.download('punkt_tab')

# Raw tweet texts, one string per tweet, for each sentiment class
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

import re
import string
# Module-level helpers read by preprocess(): created once here so they are
# not rebuilt for every tweet.
stemmer = PorterStemmer()
stopwords_english = stopwords.words('english')

def preprocess(tweet):
    """Clean a raw tweet and return its stemmed tokens.

    The tweet is stripped of stock tickers, a leading "RT" marker, hyperlinks
    and '#' signs, then tokenized with ``word_tokenize``. Stopwords and
    punctuation tokens are dropped and the remaining tokens are stemmed with
    the module-level ``stemmer``.

    Args:
        tweet: Raw tweet text.

    Returns:
        A list of stemmed tokens in their original order.
    """
    # Remove stock market tickers like $GE
    tweet = re.sub(r'\$\w+', '', tweet)

    # Remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # Remove hyperlinks. Match only the URL itself (up to the next
    # whitespace); a greedy '.*' here would also delete every word that
    # follows the link on the same line.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # Remove hashtags (only removing the hash # sign from the word)
    tweet = re.sub(r'#', '', tweet)

    # Tokenize the tweet
    tweet_tokens = word_tokenize(tweet)

    # Remove stopwords and punctuation, then stem
    tweets_clean = []
    for word in tweet_tokens:
        # NLTK's English stopword list is lowercase, so compare on the
        # lowercased token; otherwise "I" or "The" would slip through.
        if word.lower() in stopwords_english or word in string.punctuation:
            continue
        tweets_clean.append(stemmer.stem(word))

    return tweets_clean

# Run every tweet of each class through preprocess(), keeping corpus order
positive_tweets_clean = [preprocess(raw_tweet) for raw_tweet in positive_tweets]
negative_tweets_clean = [preprocess(raw_tweet) for raw_tweet in negative_tweets]

def get_tweets_for_model(cleaned_tweets):
    """Lazily yield one feature dict per tweet, mapping each token to True.

    Args:
        cleaned_tweets: Iterable of token sequences, one per tweet.

    Yields:
        A dict whose keys are the tweet's tokens and whose values are all True.
    """
    for tokens in cleaned_tweets:
        yield dict.fromkeys(tokens, True)

# Lazy streams of token -> True feature dicts, one dict per cleaned tweet
positive_tweets_model = get_tweets_for_model(positive_tweets_clean)
negative_tweets_model = get_tweets_for_model(negative_tweets_clean)

# Attach the class label to every feature dict (this consumes the generators)
positive_dataset = [(features, 'Positive') for features in positive_tweets_model]
negative_dataset = [(features, 'Negative') for features in negative_tweets_model]

# Merge both classes, shuffle in place, then split: the first 7000 examples
# are for training and the remainder for testing
dataset = [*positive_dataset, *negative_dataset]
random.shuffle(dataset)
train_data, test_data = dataset[:7000], dataset[7000:]

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Combine positive and negative tweets
all_tweets = positive_tweets + negative_tweets

# Create labels (1 = positive, 0 = negative), aligned with all_tweets
labels = np.array([1] * len(positive_tweets) + [0] * len(negative_tweets))

# Shuffle the raw texts and their labels together *before* tokenizing, so the
# train/test boundary is already fixed when the vocabulary is built.
indices = np.arange(len(all_tweets))
np.random.shuffle(indices)

shuffled_tweets = [all_tweets[i] for i in indices]
labels = labels[indices]

train_size = 7000

# Tokenize and pad the tweets
max_length = 50
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
# Fit the vocabulary on the training tweets only. Fitting on every tweet
# would leak test-set words into the model's vocabulary; unseen test words
# are mapped to the '<OOV>' token instead.
tokenizer.fit_on_texts(shuffled_tweets[:train_size])

sequences = tokenizer.texts_to_sequences(shuffled_tweets)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Split the dataset into train and test sets
X_train = padded_sequences[:train_size]
y_train = labels[:train_size]
X_test = padded_sequences[train_size:]
y_test = labels[train_size:]

# Create a simple RNN model
model = Sequential([
    # mask_zero=True treats index 0 as padding. The inputs are post-padded
    # with zeros (pad_sequences' default fill value, which the Tokenizer never
    # assigns to a word), so without the mask the SimpleRNN would keep
    # updating its state over the trailing padding steps after the real
    # tokens have ended.
    Embedding(vocab_size, 32, input_length=max_length, mask_zero=True),
    SimpleRNN(32),
    # Single sigmoid unit: output is the probability of the positive class
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model; the held-out split is passed as validation data so its
# loss and accuracy are reported after every epoch
model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

# Test string
test_string = "I am very happy to learn about RNN for twitter data!"

# Encode the test string with the same tokenizer and padding as the training data
test_string_sequence = tokenizer.texts_to_sequences([test_string])
test_string_padded = pad_sequences(test_string_sequence, maxlen=max_length, padding='post')

# Predict the sentiment of the test string
predicted_sentiment = model.predict(test_string_padded)
# One input and one sigmoid output unit give a (1, 1) array; pull out the
# scalar explicitly instead of relying on the truth value of an array
# comparison, which raises as soon as more than one string is predicted.
predicted_probability = float(predicted_sentiment[0][0])
predicted_label = 'Positive' if predicted_probability > 0.5 else 'Negative'
print("The predicted sentiment for the test string is:", predicted_label)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The predicted sentiment for the test string is: Positive
