In [1]:
#import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

#download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
#mount Google Drive
from google.colab import drive
from tabulate import tabulate
drive.mount('/content/drive')
csv = pd.read_csv("/content/drive/MyDrive/Applied_Data_Science/amazon.csv")

#load the dataset and print its info
print(tabulate(csv.head(), headers='keys', tablefmt='grid'), csv.info())

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Product Name  413840 non-null  object 
 1   Brand Name    348669 non-null  object 
 2   Price         407907 non-null  float64
 3   Rating        413840 non-null  int64  
 4   Reviews       413770 non-null  object 
 5   Review Votes  401544 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 18.9+ MB
+----+-----------------------------------------------------------------------------------------------------------+--------------+---------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [3]:
#preprocess the text data
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    text = text.lower()  # Convert to lowercase
    tokens = text.split()  # Tokenize
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    return ' '.join(tokens)

csv.dropna(subset=['Reviews', 'Rating'], inplace=True)
csv['Reviews'] = csv['Reviews'].apply(preprocess_text)

#labels (1 for positive, 0 for negative)
csv = csv[csv['Rating'] != 3]  # Drop neutral reviews
csv['Sentiment'] = (csv['Rating'] > 3).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csv['Sentiment'] = (csv['Rating'] > 3).astype(int)


In [4]:
#tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(csv['Reviews'])
X = tokenizer.texts_to_sequences(csv['Reviews'])
X = pad_sequences(X, maxlen=100)
y = csv['Sentiment'].values

In [5]:
#data split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#build the model
embedding_dim = 100  # Dimensionality of the embedding space
model = Sequential([
    Embedding(input_dim=10000, output_dim=embedding_dim, input_length=100),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])



In [None]:
#compile and train the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

#model evaluation
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f'Test Accuracy: {accuracy}')
print('ROC AUC Score:', roc_auc_score(y_test, y_pred))

Epoch 1/5
[1m9551/9551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1790s[0m 187ms/step - accuracy: 0.8984 - loss: 0.2520 - val_accuracy: 0.9315 - val_loss: 0.1809
Epoch 2/5
[1m6090/9551[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m10:09[0m 176ms/step - accuracy: 0.9367 - loss: 0.1686