<a href="https://colab.research.google.com/github/CSSamarasinghe/SE4050_Assignment/blob/IT21263194/Model_change_to_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'amazon-reviews-for-sentianalysis-finegrained-csv:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2078107%2F3499094%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241003%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241003T164549Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D25a30a2ad799910d48f45f67793fb1d09f31560ae3773fa14870108018fa213261ae62aeb020867952a30d8094a21359c2546a6475193ac986bc2645a191d848c7223dbd5c68600aeb8f17d5a979be9750827328816b6743cf0d74ca79e5a28878f8606f4d3cd2176678d18b8a70a20bbf9be8037aa238d4b56bd1a0ba1a8d1fb4003bbd12f51fa6d438aaa8a7e8fa90a9291e62969f7df78c34dc6a71c76fb2f1c11e2964127136ec4d5554e9f1bdd7c530f8ddf987a08dd6cb91a849e4e98a2089149d22cb00ca687688c756071af4270e392fd79699c1f4a35ca16597523ca8bfe37fea11d5911465ea51d6eb17c1e3663da806763c8ef4e96e1f0a9ae9ab'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')

Downloading amazon-reviews-for-sentianalysis-finegrained-csv, 654512809 bytes compressed
Downloaded and uncompressed: amazon-reviews-for-sentianalysis-finegrained-csv
Data source import complete.


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.regularizers import l1_l2
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load and preprocess data
train_data = pd.read_csv('../input/amazon-reviews-for-sentianalysis-finegrained-csv/amazon_review_fine-grained_5_classes_csv/train.csv')
test_data = pd.read_csv('../input/amazon-reviews-for-sentianalysis-finegrained-csv/amazon_review_fine-grained_5_classes_csv/test.csv')

# Sample the dataset for manageable size (adjust as necessary)
train_data = train_data.sample(n=60000, random_state=42)
test_data = test_data.sample(n=25000, random_state=42)

# Initialize the tokenizer and fit on training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['review_text'])

# Convert text to sequences
X_train = tokenizer.texts_to_sequences(train_data['review_text'])
X_test = tokenizer.texts_to_sequences(test_data['review_text'])

# Pad sequences to ensure uniform input size
max_length = max(max(len(seq) for seq in X_train), max(len(seq) for seq in X_test))
X_train_padded = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test, maxlen=max_length, padding='post')

# Prepare labels (one-hot encoding)
Y_train = train_data['class_index'].values.reshape(-1, 1)
Y_test = test_data['class_index'].values.reshape(-1, 1)

encoder = OneHotEncoder(sparse_output=False)
Y_train_onehot = encoder.fit_transform(Y_train)
Y_test_onehot = encoder.transform(Y_test)

# Split training data into training and validation sets
X_train_padded, X_val_padded, Y_train_onehot, Y_val_onehot = train_test_split(
    X_train_padded, Y_train_onehot, test_size=0.2, random_state=42)

# Set hyperparameters
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size (plus padding)
embedding_dim = 100  # Dimension of embeddings

# Build the enhanced LSTM model
model = Sequential()

# Embedding Layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))

# First LSTM Layer with Batch Normalization and Dropout
model.add(LSTM(128, return_sequences=True, kernel_regularizer=l1_l2(l1=0.01, l2=0.01)))
model.add(BatchNormalization())
model.add(Activation('relu'))  # Adding ReLU after LSTM
model.add(Dropout(0.5))  # Add dropout for regularization

# Second LSTM Layer with Batch Normalization and Dropout
model.add(LSTM(128, return_sequences=True, kernel_regularizer=l1_l2(l1=0.01, l2=0.01)))
model.add(BatchNormalization())
model.add(Activation('relu'))  # Adding ReLU after LSTM
model.add(Dropout(0.5))  # Add dropout for regularization

# Third LSTM Layer with Batch Normalization (not returning sequences)
model.add(LSTM(128, kernel_regularizer=l1_l2(l1=0.01, l2=0.01)))
model.add(BatchNormalization())
model.add(Activation('relu'))  # Adding ReLU after LSTM

# Dense Layer with Dropout
model.add(Dense(64, kernel_regularizer=l1_l2(l1=0.01, l2=0.01)))
model.add(Activation('relu'))  # Adding ReLU before output layer
model.add(Dropout(0.5))  # Add dropout for regularization

# Output Layer with Softmax for multi-class classification (5 classes)
model.add(Dense(5, activation='softmax'))

# Compile the model
optimizer = Adam(learning_rate=0.0001)  # Reduce learning rate for more stable updates
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Define ReduceLROnPlateau callback to adjust learning rate based on validation loss
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

# Train the model with validation data and callbacks
history = model.fit(
    X_train_padded,
    Y_train_onehot,
    epochs=10,
    batch_size=64,
    validation_data=(X_val_padded, Y_val_onehot),
    callbacks=[reduce_lr]
)

# Evaluate on test data and print results
test_loss, test_accuracy = model.evaluate(X_test_padded, Y_test_onehot)

print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')



Epoch 1/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1386s[0m 2s/step - accuracy: 0.2005 - loss: 75.4813 - val_accuracy: 0.2027 - val_loss: 16.1173 - learning_rate: 1.0000e-04
Epoch 2/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1342s[0m 2s/step - accuracy: 0.1996 - loss: 10.1182 - val_accuracy: 0.1962 - val_loss: 3.0701 - learning_rate: 1.0000e-04
Epoch 3/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1352s[0m 2s/step - accuracy: 0.1990 - loss: 2.4733 - val_accuracy: 0.1989 - val_loss: 1.6706 - learning_rate: 1.0000e-04
Epoch 4/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1345s[0m 2s/step - accuracy: 0.2053 - loss: 1.6461 - val_accuracy: 0.1989 - val_loss: 1.6297 - learning_rate: 1.0000e-04
Epoch 5/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1343s[0m 2s/step - accuracy: 0.2011 - loss: 1.6297 - val_accuracy: 0.1989 - val_loss: 1.6297 - learning_rate: 1.0000e-04
Epoch 6/10
[1m750/750[0m [32m━━━━━