In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout, Input, concatenate
from keras.utils import to_categorical
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm





In [26]:

# Load the dataset
data = pd.read_csv("amazon_reviews.csv")

# Data Pre-processing: download the NLTK 'punkt' and 'stopwords' packages
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Set stopwords (a set, so membership checks during filtering are cheap)
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Function for text preprocessing
def preprocess_text(text):
    """Reduce one piece of text to a space-separated string of lowercase words.

    Args:
        text: The value to clean. Anything that is not a ``str`` is treated
            as having no text.

    Returns:
        ``""`` for non-string input. Otherwise the tokens produced by
        ``word_tokenize`` that are purely alphabetic and whose lowercase form
        is not in ``stop_words``, lowercased and joined with single spaces.
    """
    if not isinstance(text, str):
        return ""

    kept_words = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # The alphabetic check is made on the token as it appears in the text;
        # the stop-word check is made on its lowercase form.
        if token.isalpha() and lowered not in stop_words:
            kept_words.append(lowered)
    return " ".join(kept_words)
    
def save_model(model, model_name):
    """Write a model's architecture and weights to two files.

    Args:
        model: Object providing ``to_json()`` (returns the architecture as a
            string) and ``save_weights(path)``.
        model_name: Path prefix; the architecture is written to
            ``<model_name>.json`` and the weights to ``<model_name>.h5``.

    Returns:
        None. Prints a confirmation line once both files have been written.
    """
    architecture_path = f"{model_name}.json"
    weights_path = f"{model_name}.h5"

    # Architecture first, as JSON text
    architecture = model.to_json()
    with open(architecture_path, "w") as architecture_file:
        architecture_file.write(architecture)

    # serialize weights to HDF5
    model.save_weights(weights_path)
    print("Saved model to disk")


In [28]:

# Loop over different splitting ratios and sequence padding lengths

# --- Experiment configuration -------------------------------------------------
SPLIT_RATIOS = [0.7, 0.8]       # fraction of the data used for training
PADDING_LENGTHS = [50, 100]     # sequence length after padding/truncation
RANDOM_STATE = 42               # seed for the train/test split
max_words = 10000               # vocabulary size for the tokenizer and embedding
EMBEDDING_DIM = 128
RECURRENT_UNITS = 128
EPOCHS = 5
BATCH_SIZE = 64
SAVE_MODELS = False             # set True to persist every trained model via save_model()

class_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
NUM_CLASSES = len(class_mapping)


def _fit_and_score(recurrent_layer, input_name, padding_length,
                   X_train, y_train, X_test, y_test):
    """Build, train and evaluate one Embedding -> recurrent -> softmax classifier.

    Args:
        recurrent_layer: Keras recurrent layer class to use (``SimpleRNN`` or ``LSTM``).
        input_name: Name given to the model's input layer.
        padding_length: Length of every padded input sequence.
        X_train, X_test: Padded integer sequences for training / evaluation.
        y_train, y_test: One-hot label arrays with ``NUM_CLASSES`` columns.

    Returns:
        ``(model, accuracy)``: the trained model and its accuracy on the test data.
    """
    model_input = Input(shape=(padding_length,), dtype='int32', name=input_name)
    embedded = Embedding(max_words, EMBEDDING_DIM, input_length=padding_length)(model_input)
    encoded = recurrent_layer(RECURRENT_UNITS)(embedded)
    probabilities = Dense(NUM_CLASSES, activation='softmax')(encoded)

    model = Model(inputs=model_input, outputs=probabilities)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0)

    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    return model, accuracy


# --- Loop-invariant preparation -----------------------------------------------
# Cleaning is applied row by row, so doing it once before splitting gives the
# same text as cleaning each split separately, without repeating the work for
# every configuration.
cleaned_reviews = data['cleaned_review'].apply(preprocess_text)

# Encode the labels once and fail early, with a readable message, if the data
# contains a label that class_mapping does not know about.
encoded_labels = data['sentiments'].map(class_mapping)
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unknown_labels = sorted(data.loc[unmapped_rows, 'sentiments'].astype(str).unique())
    raise ValueError(f"Sentiment labels missing from class_mapping: {unknown_labels}")
encoded_labels = encoded_labels.astype(int)

# --- Experiments --------------------------------------------------------------
results = []        # kept for backward compatibility: duplicates the LSTM records
results_rnn = []
results_lstm = []
for split_ratio in SPLIT_RATIOS:
    # Data Splitting (depends on the split ratio only, not on the padding length)
    X_text_train, X_text_test, y_train_labels, y_test_labels = train_test_split(
        cleaned_reviews, encoded_labels,
        test_size=(1 - split_ratio), random_state=RANDOM_STATE)

    # Tokenization: the vocabulary is learned from the training text only
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X_text_train)

    X_train_seq = tokenizer.texts_to_sequences(X_text_train)
    X_test_seq = tokenizer.texts_to_sequences(X_text_test)

    # Convert sentiments to categorical. num_classes is fixed so the one-hot
    # width always matches the Dense output layer, even if a split happens to
    # contain no sample of the highest-numbered class.
    y_train = to_categorical(y_train_labels, num_classes=NUM_CLASSES)
    y_test = to_categorical(y_test_labels, num_classes=NUM_CLASSES)

    for padding_length in PADDING_LENGTHS:
        # Padding for text
        X_train_padded = pad_sequences(X_train_seq, maxlen=padding_length)
        X_test_padded = pad_sequences(X_test_seq, maxlen=padding_length)

        # Model Training - Simple RNN
        model_rnn, rnn_accuracy = _fit_and_score(
            SimpleRNN, 'text_input', padding_length,
            X_train_padded, y_train, X_test_padded, y_test)

        # Save Simple RNN results
        results_rnn.append({'split_ratio': split_ratio,
                            'padding_length': padding_length,
                            'accuracy': rnn_accuracy})

        # Model Training - LSTM
        model_lstm, lstm_accuracy = _fit_and_score(
            LSTM, 'lstm_input', padding_length,
            X_train_padded, y_train, X_test_padded, y_test)

        # Save LSTM results
        lstm_record = {'split_ratio': split_ratio,
                       'padding_length': padding_length,
                       'accuracy': lstm_accuracy}
        results_lstm.append(lstm_record)

        # Save results (same content as the LSTM record, stored as a separate dict)
        results.append(dict(lstm_record))

        # Save the model (opt-in; off by default)
        if SAVE_MODELS:
            save_model(model_lstm, f"model_lstm_{split_ratio}_{padding_length}")
            save_model(model_rnn, f"model_rnn_{split_ratio}_{padding_length}")



Saved model to disk
Saved model to disk
Saved model to disk
Saved model to disk
Saved model to disk
Saved model to disk
Saved model to disk
Saved model to disk


In [29]:
# Convert results to DataFrame
# The LSTM table is built from results_lstm, the list filled specifically with
# the LSTM records, rather than from the generic `results` list.
results_df_lstm = pd.DataFrame(results_lstm)
results_df_rnn = pd.DataFrame(results_rnn)

# Print results
print("Results Summary (LSTM):")
print(results_df_lstm)

print("\nResults Summary (RNN):")
print(results_df_rnn)







Results Summary (LSTM):
   split_ratio  padding_length  accuracy
0          0.7              50  0.864693
1          0.7             100  0.857390
2          0.8              50  0.868224
3          0.8             100  0.874279

Results Summary (RNN):
   split_ratio  padding_length  accuracy
0          0.7              50  0.853546
1          0.7             100  0.848741
2          0.8              50  0.860150
3          0.8             100  0.854095


In [30]:

# Find the best performing model: order each results table by accuracy,
# highest first, and keep its first row.
best_model_lstm, best_model_rnn = (
    summary.sort_values(by=['accuracy'], ascending=False).iloc[0]
    for summary in (results_df_lstm, results_df_rnn)
)

# Print the best performing model
print(f"Best LSTM model: {best_model_lstm}",
      f"Best RNN model: {best_model_rnn}",
      sep="\n")

Best LSTM model: split_ratio         0.800000
padding_length    100.000000
accuracy            0.874279
Name: 3, dtype: float64
Best RNN model: split_ratio        0.80000
padding_length    50.00000
accuracy           0.86015
Name: 2, dtype: float64
