In [137]:
#!pip install nltk
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import os
import tensorflow as tf
from tensorflow import keras


In [138]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')  # new addition
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [139]:
# 1. Parse review_text from file
def load_reviews_from_file(filepath, label=None):
    with open(filepath, encoding="utf-8") as f:
        text = f.read()
    # Extract all <review_text>...</review_text> blocks
    reviews = re.findall(r"<review_text>(.*?)</review_text>", text, re.S)
    cleaned_reviews = [clean_text(r) for r in reviews]
    labels = [label] * len(cleaned_reviews) if label is not None else None
    return cleaned_reviews, labels


In [140]:

# 2. Clean text function
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())  # remove punctuation, lowercase
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Remove outliers (reviews < 5 words or > 500 words)
def remove_outliers(reviews, labels):
    cleaned_reviews, cleaned_labels = [], []
    for review, label in zip(reviews, labels):
        word_count = len(review.split())
        if 5 <= word_count <= 500:
            cleaned_reviews.append(review)
            cleaned_labels.append(label)
    return cleaned_reviews, cleaned_labels


In [141]:
# 3. Load all categories
categories = ["books", "electronics", "dvd", "kitchen_&_housewares"]

all_reviews = []
all_labels = []

for category in categories:
    pos_file = os.path.join(category, "positive.review")
    neg_file = os.path.join(category, "negative.review")
    unlabeled_file = os.path.join(category, "unlabeled.review")

    pos_reviews, pos_labels = load_reviews_from_file(pos_file, label=1)
    neg_reviews, neg_labels = load_reviews_from_file(neg_file, label=0)

    all_reviews.extend(pos_reviews + neg_reviews)
    all_labels.extend(pos_labels + neg_labels)

In [142]:
# 4. Tokenization and padding
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer

vocab_size = 20000
oov_token = "<OOV>"


tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(all_reviews)
sequences = tokenizer.texts_to_sequences(all_reviews)

# Save tokenizer
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=sequence_length, padding='post')


In [143]:
# In training code, after you set sequence_length
with open("sequence_length.txt", "w") as f:
    f.write(str(sequence_length))


In [144]:
# 5. Train/validation/test split
X_train, X_temp, y_train, y_temp = train_test_split(
    padded_sequences, all_labels, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

X_train = np.array(X_train, dtype=np.int32)
X_val   = np.array(X_val, dtype=np.int32)
X_test  = np.array(X_test, dtype=np.int32)

y_train = np.array(y_train, dtype=np.float32)
y_val   = np.array(y_val, dtype=np.float32)
y_test  = np.array(y_test, dtype=np.float32)

In [145]:
# 6. Model
model = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=sequence_length),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [146]:
# 7. Train
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val)
)

# 8. Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")



Epoch 1/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 103ms/step - accuracy: 0.4915 - loss: 0.6986 - val_accuracy: 0.4875 - val_loss: 0.6960
Epoch 2/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 83ms/step - accuracy: 0.4935 - loss: 0.6955 - val_accuracy: 0.5125 - val_loss: 0.6940
Epoch 3/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 106ms/step - accuracy: 0.4953 - loss: 0.6946 - val_accuracy: 0.4875 - val_loss: 0.6961
Epoch 4/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 77ms/step - accuracy: 0.5030 - loss: 0.6973 - val_accuracy: 0.4875 - val_loss: 0.6935
Epoch 5/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 95ms/step - accuracy: 0.5012 - loss: 0.6963 - val_accuracy: 0.5125 - val_loss: 0.6928
Epoch 6/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 95ms/step - accuracy: 0.5008 - loss: 0.6961 - val_accuracy: 0.4875 - val_loss: 0.6934
Epoch 7/10
[1

In [147]:


# Save model
model.save('sentiment_model.h5')

# Inference function
def predict_sentiment(text):
    cleaned_text = clean_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(sequence, maxlen=sequence_length)
    prediction = model.predict(padded)[0][0]
    return 'Positive review' if prediction > 0.5 else 'Negative review'



In [150]:
predict_sentiment("I hated this product")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step


'Negative review'