In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Load the dataset. The file is read as JSON Lines: one JSON object per line.
file_path = '/Users/aqib/Desktop/YEAR3/project/2500.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform's default text encoding, and stream the file line by line rather
# than materialising every line with readlines() first.
with open(file_path, 'r', encoding='utf-8') as f:
    # Blank / whitespace-only lines are skipped; json.loads() itself tolerates
    # surrounding whitespace such as the trailing newline.
    data = [json.loads(line) for line in f if line.strip()]

# Convert to DataFrame: one row per parsed JSON object
df = pd.DataFrame(data)

# Extract review texts and labels.
# Rows lacking either field are dropped first: a record without 'reviewText'
# shows up in the DataFrame as NaN (a float, not a string), which the tokenizer
# cannot process, and a record without 'overall' has no usable class label.
# When every record is complete this is a no-op.
labelled = df.dropna(subset=['reviewText', 'overall'])
texts = labelled['reviewText'].astype(str).values

# Preprocess labels: shift the ratings down by one so they become the 0-4
# class indices of the 5-class classifier, stored as integers because they are
# class ids rather than quantities.
# NOTE(review): assumes 'overall' only holds whole-number ratings 1-5 - confirm.
labels = labelled['overall'].astype('int32').values - 1

# Vectorise the reviews: text -> word-index sequences -> fixed-length matrix
maxlen = 100  # length every sequence is padded/truncated to

# Fit the word index on the review texts; num_words caps the vocabulary
# that texts_to_sequences() will emit.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)

# NOTE: `data` is rebound here from the parsed JSON records to the padded
# integer matrix that is fed to the model.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=maxlen)

# Build the model: token ids -> embeddings -> LSTM -> softmax over 5 classes.
# The sequence length is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(maxlen,)),
    # 10000 is the embedding vocabulary size; it has to cover the `num_words`
    # cap given to the Tokenizer so every emitted index has an embedding row.
    Embedding(10000, 128),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per class (labels 0-4)
    Dense(5, activation='softmax'),
])

# Compile the model. The sparse variant of the loss takes integer class indices
# directly, so the labels are not one-hot encoded.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model.
# Hold out 20% for validation with a shuffled, seeded split. Keras'
# `validation_split` takes the *last* 20% of the arrays without shuffling,
# which is only representative if the file happens to be randomly ordered.
x_train, x_val, y_train, y_val = train_test_split(
    data, labels, test_size=0.2, random_state=42)

# Stop once validation loss has not improved for 2 epochs and roll back to the
# best weights seen, so the model saved below is the best-validating one rather
# than whatever the final epoch produced.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=2,
                               restore_best_weights=True)

# epochs=10 is now an upper bound; early stopping may end training sooner.
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val),
                    callbacks=[early_stopping])

# Save the model (file name kept unchanged for anything that loads it later)
model.save('sentiment_model.h5')

2024-07-11 12:54:43.978996: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
37/37 ━━━━━━━━━━━━━━━━━━━━ 7s 97ms/step - accuracy: 0.5489 - loss: 1.4208 - val_accuracy: 0.6575 - val_loss: 0.9929
Epoch 2/10
37/37 ━━━━━━━━━━━━━━━━━━━━ 4s 76ms/step - accuracy: 0.6407 - loss: 1.0508 - val_accuracy: 0.6575 - val_loss: 0.9817
Epoch 3/10
37/37 ━━━━━━━━━━━━━━━━━━━━ 3s 75ms/step - accuracy: 0.6194 - loss: 1.0192 - val_accuracy: 0.6541 - val_loss: 0.9370
Epoch 4/10
37/37 ━━━━━━━━━━━━━━━━━━━━ 3s 76ms/step - accuracy: 0.6996 - loss: 0.8205 - val_accuracy: 0.6541 - val_loss: 0.9507
Epoch 5/10
37/37 ━━━━━━━━━━━━━━━━━━━━ 3s 73ms/step - accuracy: 0.7664 - loss: 0.6519 - val_accuracy: 0.6164 - val_loss: 0.9822
Epoch 6/10
37/37 ━━━━━━━━━━━━━━━━━━━━ 3s 80ms/step - accuracy: 0.8332 - loss: 0.5072 - val_accuracy: 0.6507 - val_loss: 1.0763
Epoch 7/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━

