<a href="https://colab.research.google.com/github/CSSamarasinghe/SE4050_Assignment/blob/IT21222740/%F0%9F%9B%925_class_Classification_Amazon_Reviews_%5BGRU_Three_Layer%5D_45_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'amazon-reviews-for-sentianalysis-finegrained-csv:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2078107%2F3499094%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240930%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240930T150806Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0384286882e57ed5979a453d3b36e2d5ac7c5683c21364a514272ce697b7a7430176301d285f6982230aa12ded3cf24f63136365e5939a1b4cf6a627f90b21b558772ac2b852ceeda8c9115147d5d45101662dbded2dc256b64c8607b23b8a2e138946585e223ab27c2e6ced66542d64b390b7a29fb5f28667574314b9b5d6a1e728981f43b957023ed099b2adfc9609101a051eb013f254b576ffd5c986f20ab81845f69f45dcde2f9c5e05f1e3fc15e3aed90b0112a274ad37e14053eb8e6d8635f649c8362c1478bc8b587d074f6474c488f679d16695f5ed6a20ddea00f6dc13ab4b1746b75013f78e36b46727d42ca74c2ba6b6d35bd214cd27c5751ebb'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading amazon-reviews-for-sentianalysis-finegrained-csv, 654512809 bytes compressed
Downloaded and uncompressed: amazon-reviews-for-sentianalysis-finegrained-csv
Data source import complete.


In [3]:
# Step 1: Load and Preprocess the Data
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [4]:
# Load the dataset
data = pd.read_csv('../input/amazon-reviews-for-sentianalysis-finegrained-csv/amazon_review_fine-grained_5_classes_csv/train.csv')
data = data.sample(n=120000, random_state=42)

In [5]:
# Step 2: Check for Missing Values
missing_values = data.isnull().sum()
print("Missing Values in Dataset:\n", missing_values)

# Optionally, remove missing values if any
data.dropna(inplace=True)

Missing Values in Dataset:
 class_index     0
review_title    8
review_text     0
dtype: int64


In [6]:
# Step 3: Clean the Text Data
# Function to clean the text
def clean_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

In [7]:
# Apply the clean_text function to both review_title and review_text columns
data['review_text'] = data['review_text'].apply(clean_text)
data['review_title'] = data['review_title'].apply(clean_text)

In [8]:
# Preview cleaned text
print("Cleaned Text Preview:\n", data[['review_title', 'review_text']].head())

Cleaned Text Preview:
                                     review_title  \
2945667           not koontzs best but good read   
2352586            addictiveexcept for the price   
1531260                             same problem   
941910   just say no to slimfast low carb shakes   
2582125                happy thanksgiving turkey   

                                               review_text  
2945667  really i should give it three and a half stars...  
2352586  mmmmmmm love this balm smells divine and makes...  
1531260  pitch black plays perfectly but for some reaso...  
941910   i am currently on a slimfast diet and found it...  
2582125  im not much on writing reviews so i will be br...  


In [9]:
# Step 4: Tokenize and Pad Sequences
# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=5000)  # Limit to top 5000 words
tokenizer.fit_on_texts(data['review_text'])  # Fit on review text

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(data['review_text'])

# Pad the sequences to ensure uniform length
maxlen = 100  # Choose an appropriate max length
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post')

# Convert class index to labels
labels = data['class_index'] - 1  # Assuming class index ranges from 1 to 5

# Display shape of padded sequences and labels
print(f'Shape of Padded Sequences: {padded_sequences.shape}')
print(f'Shape of Labels: {labels.shape}')

Shape of Padded Sequences: (119992, 100)
Shape of Labels: (119992,)


In [10]:
# Step 5: Split the Data into Training and Validation Sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [11]:
# Step 6: Build the GRU Model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=maxlen))
model.add(GRU(128, return_sequences=True))  # First GRU layer
model.add(GRU(64, return_sequences=False))  # Second GRU layer
model.add(Dropout(0.5))  # Dropout layer to prevent overfitting
model.add(Dense(5, activation='softmax'))  # Output layer for 5 classes

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()  # Show the model summary




In [12]:
# Step 7: Train the Model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m883s[0m 585ms/step - accuracy: 0.2359 - loss: 1.5797 - val_accuracy: 0.4738 - val_loss: 1.1992
Epoch 2/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m827s[0m 551ms/step - accuracy: 0.4902 - loss: 1.1657 - val_accuracy: 0.4989 - val_loss: 1.1452
Epoch 3/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m833s[0m 556ms/step - accuracy: 0.5276 - loss: 1.0843 - val_accuracy: 0.5049 - val_loss: 1.1279
Epoch 4/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m865s[0m 558ms/step - accuracy: 0.5613 - loss: 1.0208 - val_accuracy: 0.5075 - val_loss: 1.1419
Epoch 5/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m854s[0m 553ms/step - accuracy: 0.5872 - loss: 0.9598 - val_accuracy: 0.5051 - val_loss: 1.1665


In [15]:
# Step 7: Continue training the model with more epochs
# Add more epochs to continue training
history_more = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), initial_epoch=5)


Epoch 6/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m836s[0m 558ms/step - accuracy: 0.6198 - loss: 0.8922 - val_accuracy: 0.5037 - val_loss: 1.1980
Epoch 7/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m925s[0m 599ms/step - accuracy: 0.6528 - loss: 0.8303 - val_accuracy: 0.4923 - val_loss: 1.2704
Epoch 8/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m870s[0m 565ms/step - accuracy: 0.6924 - loss: 0.7543 - val_accuracy: 0.4858 - val_loss: 1.3419
Epoch 9/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m855s[0m 560ms/step - accuracy: 0.7330 - loss: 0.6724 - val_accuracy: 0.4781 - val_loss: 1.4624
Epoch 10/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m864s[0m 561ms/step - accuracy: 0.7715 - loss: 0.5939 - val_accuracy: 0.4697 - val_loss: 1.6554


In [17]:
# Step 7: Continue training the model with more epochs
# Add more epochs to continue training
history_more = model.fit(X_train, y_train, epochs=13, batch_size=64, validation_data=(X_val, y_val), initial_epoch=10)

Epoch 11/13
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m694s[0m 462ms/step - accuracy: 0.8051 - loss: 0.5207 - val_accuracy: 0.4685 - val_loss: 1.8135
Epoch 12/13
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m737s[0m 459ms/step - accuracy: 0.8383 - loss: 0.4469 - val_accuracy: 0.4609 - val_loss: 1.9282
Epoch 13/13
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m693s[0m 462ms/step - accuracy: 0.8636 - loss: 0.3901 - val_accuracy: 0.4545 - val_loss: 2.0959


In [18]:
# Step 7: Continue training the model with more epochs
# Add more epochs to continue training
history_more = model.fit(X_train, y_train, epochs=15, batch_size=64, validation_data=(X_val, y_val), initial_epoch=13)

Epoch 14/15
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m691s[0m 461ms/step - accuracy: 0.8848 - loss: 0.3387 - val_accuracy: 0.4497 - val_loss: 2.2556
Epoch 15/15
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m684s[0m 456ms/step - accuracy: 0.8996 - loss: 0.3007 - val_accuracy: 0.4546 - val_loss: 2.3946


In [19]:
# Step 8: Load the Test Dataset
test = pd.read_csv('../input/amazon-reviews-for-sentianalysis-finegrained-csv/amazon_review_fine-grained_5_classes_csv/test.csv')

# Step 9: Check for Missing Values in Test Dataset
missing_values_test = test.isnull().sum()
print("Missing Values in Test Dataset:\n", missing_values_test)

# Optionally, remove missing values if any
test.dropna(inplace=True)

# Step 10: Clean the Text Data in Test Dataset
# Apply the same clean_text function to test dataset
test['review_text'] = test['review_text'].apply(clean_text)
test['review_title'] = test['review_title'].apply(clean_text)

# Step 11: Tokenize and Pad Sequences for Test Data
# Convert test review text to sequences using the same tokenizer
test_sequences = tokenizer.texts_to_sequences(test['review_text'])

# Pad the test sequences to ensure uniform length (using the same maxlen)
test_padded_sequences = pad_sequences(test_sequences, maxlen=maxlen, padding='post')

# Convert class index to labels for test data
test_labels = test['class_index'] - 1  # Assuming class index ranges from 1 to 5

# Display shape of padded test sequences and test labels
print(f'Shape of Test Padded Sequences: {test_padded_sequences.shape}')
print(f'Shape of Test Labels: {test_labels.shape}')

# Step 12: Evaluate the Model on Test Data
test_loss, test_accuracy = model.evaluate(test_padded_sequences, test_labels)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

Missing Values in Test Dataset:
 class_index      0
review_title    26
review_text      0
dtype: int64
Shape of Test Padded Sequences: (649974, 100)
Shape of Test Labels: (649974,)
[1m20312/20312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1384s[0m 68ms/step - accuracy: 0.4547 - loss: 2.3944
Test Loss: 2.3891024589538574
Test Accuracy: 0.4544120132923126
