In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Tokenizer models needed by word_tokenize() in preprocess_text().
# Older NLTK releases load the pickled 'punkt' models, while newer releases
# look up the 'punkt_tab' resource instead, so fetch both to keep the
# notebook runnable on a fresh environment regardless of the NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Labelled movie-review corpus read by load_imdb_dataset() via nltk.corpus.movie_reviews.
nltk.download('movie_reviews')
# Stop-word lists; preprocess_text() uses the 'english' list to filter tokens.
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def load_imdb_dataset():
    """Load the NLTK ``movie_reviews`` corpus as labelled raw texts.

    Despite the function name, the data is read from
    ``nltk.corpus.movie_reviews`` (categories ``'pos'`` and ``'neg'``).

    Returns:
        A list of ``(review_text, label)`` tuples where ``label`` is the
        string ``'positive'`` or ``'negative'``. All positive reviews come
        first, followed by all negative reviews.
    """
    corpus = nltk.corpus.movie_reviews
    labelled_reviews = []
    # Walk the two corpus categories in a fixed order: positives, then negatives.
    for category, label in (('pos', 'positive'), ('neg', 'negative')):
        for file_id in corpus.fileids(category):
            labelled_reviews.append((corpus.raw(file_id), label))
    return labelled_reviews

_STOP_WORDS = None  # English stop-word set, filled on first use by preprocess_text()


def preprocess_text(text):
    """Lower-case, tokenize and filter a piece of text.

    Args:
        text: The raw string to clean.

    Returns:
        A list of lower-cased tokens, keeping only tokens that are purely
        alphabetic (``str.isalpha``) and not in NLTK's English stop-word list.
    """
    global _STOP_WORDS
    tokens = word_tokenize(text.lower())
    if _STOP_WORDS is None:
        # Build the stop-word set once instead of on every call; this function
        # runs once per review, and the list does not change between calls.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return [token for token in tokens if token.isalpha() and token not in _STOP_WORDS]

# Seed every RNG involved (Python, NumPy, TensorFlow) so that weight
# initialisation and batch shuffling are as repeatable as possible between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load and preprocess the dataset
dataset = load_imdb_dataset()
texts, labels = zip(*dataset)
texts = [' '.join(preprocess_text(text)) for text in texts]

# Tokenize and pad the sequences
max_length = 100
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Sequences are truncated / zero-padded at the END so each has max_length ids.
padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating='post', padding='post')

# Encode the labels
label_map = {'positive': 1, 'negative': 0}
encoded_labels = [label_map[label] for label in labels]

# Split the data into training and testing sets.
# stratify keeps the positive/negative ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    encoded_labels,
    test_size=0.2,
    random_state=SEED,
    stratify=encoded_labels,
)

# Create the LSTM model.
# mask_zero=True marks the padding id 0 as "no data" so the LSTM layers skip
# those timesteps. Without it the recurrent state keeps being updated over the
# trailing zeros, which swamps short inputs such as the two-token sample below.
# (The deprecated `input_length` argument is omitted; the model is built on the
# first call to fit().)
model = Sequential([
    Embedding(vocab_size, 128, mask_zero=True),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(1, activation='sigmoid')
])

# Compile and train the model.
# NOTE: the test split doubles as validation data here, so the reported
# val_* metrics should not be used for model selection.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, np.array(y_train), validation_data=(X_test, np.array(y_test)), epochs=5, batch_size=32)

# Test the model with a sample string
test_string = "The movie was great!"
test_tokens = preprocess_text(test_string)
test_sequence = tokenizer.texts_to_sequences([' '.join(test_tokens)])
test_padded_sequence = pad_sequences(test_sequence, maxlen=max_length, truncating='post', padding='post')
prediction = model.predict(test_padded_sequence)
# predict() returns an array of shape (1, 1); pull out the scalar probability
# explicitly instead of relying on the truthiness of a one-element array.
positive_probability = float(prediction[0][0])
print("Prediction: ", "positive" if positive_probability > 0.5 else "negative")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Prediction:  negative
