In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Load the dataset from a JSON Lines file (one JSON object per line).
file_path = '/Users/aqib/Desktop/YEAR3/project/2500.json'

# Stream the file instead of reading every line into memory first, and decode
# explicitly as UTF-8 so the result does not depend on the platform's default
# locale encoding. Blank lines are skipped because they are not valid JSON.
with open(file_path, 'r', encoding='utf-8') as f:
    records = [
        json.loads(stripped)
        for stripped in map(str.strip, f)
        if stripped
    ]

# Convert to DataFrame: one row per review record, one column per JSON key.
df = pd.DataFrame(records)

# Extract review texts and labels.
# A record without a 'reviewText' key becomes NaN (a float) in the DataFrame;
# the tokenizer expects strings, so replace missing texts with an empty string
# and coerce everything to str.
texts = df['reviewText'].fillna('').astype(str).values

# Preprocess labels: shift the ratings down by one so they can be used as
# class indices with sparse_categorical_crossentropy, and store them as
# integers (ratings may load as floats such as 5.0).
# assumes 'overall' is a 1-5 star rating with no missing values — TODO confirm
labels = df['overall'].values.astype('int32') - 1

# Tokenize and pad sequences.
# Only the 10000 most frequent words are kept when converting to sequences;
# this must stay in sync with the Embedding layer's input dimension.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Every review is padded/truncated to exactly `maxlen` token ids.
maxlen = 100
data = pad_sequences(sequences, maxlen=maxlen)

# Build the model: embedding -> two stacked LSTMs -> 5-way softmax.
model = Sequential([
    # Maps each of the 10000 token ids to a 128-dimensional vector.
    Embedding(10000, 128, input_length=maxlen),
    # Drops whole embedding channels rather than individual elements.
    SpatialDropout1D(0.4),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    # One output unit per rating class.
    Dense(5, activation='softmax'),
])

# Compile the model. The sparse loss takes integer class ids directly,
# so the labels do not need to be one-hot encoded.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Hold out a test set the model never sees during training or validation.
# Evaluating on the same rows used for fitting would report an optimistic
# figure that is not a test-set metric. The split also shuffles the rows, so
# the validation_split below (which takes the LAST 20% of what it is given)
# becomes a random sample rather than the tail of the file.
# NOTE(review): the tokenizer vocabulary was fitted on all texts upstream, so
# a small amount of vocabulary leakage remains; fit it on the training texts
# only if a strictly clean test estimate is required.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

# Callbacks for early stopping and saving the best model.
# restore_best_weights=True makes the in-memory model end up with the weights
# of the best val_loss epoch instead of the last (possibly overfitted) one,
# so the evaluation below matches what ModelCheckpoint saved to disk.
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        min_delta=0.0001,
        restore_best_weights=True,
    ),
    ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss'),
]

# Train the model on the training portion only; 20% of it is used for
# validation (monitored by both callbacks).
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
)

# Evaluate the model on the held-out test set; returns [loss, accuracy].
accr = model.evaluate(x_test, y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

2024-07-11 13:23:16.948633: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/20




[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 201ms/step - accuracy: 0.5893 - loss: 1.3324 - val_accuracy: 0.6575 - val_loss: 0.9940
Epoch 2/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 202ms/step - accuracy: 0.6410 - loss: 1.0419 - val_accuracy: 0.6575 - val_loss: 1.0029
Epoch 3/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 189ms/step - accuracy: 0.6183 - loss: 1.0262 - val_accuracy: 0.6678 - val_loss: 0.9101
Epoch 4/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 185ms/step - accuracy: 0.6488 - loss: 0.8535 - val_accuracy: 0.6747 - val_loss: 0.9061
Epoch 5/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 182ms/step - accuracy: 0.7483 - loss: 0.7009 - val_accuracy: 0.6370 - val_loss: 0.9729
Epoch 6/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 200ms/step - accuracy: 0.7463 - loss: 0.6307 - val_accuracy: 0.6336 - val_loss: 1.0434
Epoch 7/20
[1m37/37[0m [32m━━━━━━━