<a href="https://colab.research.google.com/github/Becky0214/COMP8240-Major-Project-Individual-Dataset-Yelp/blob/main/Individual_dataset_Classification_on_Yelp_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the required packages
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers


In [None]:
#!pip install datasets
# Installs the `datasets` library that provides load_dataset (uncomment the line above if it is missing)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored under
# /content/drive/MyDrive (such as the GloVe vectors loaded later in this
# notebook) can be opened like local files.
drive.mount('/content/drive')

Mounted at /content/drive


**C-LSTM BINARY CLASSIFICATION ON THE YELP DATASET**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Step 1: Load and preprocess the Yelp dataset
# Only the "train" split is loaded; the train/test partition used below is
# carved out of it with train_test_split.
dataset = load_dataset("yelp_review_full", split="train")

# Extract reviews and ratings
texts = dataset['text']
ratings = dataset['label']  # Labels are integers from 0 to 4

# Convert ratings to binary labels. Labels are 0-based, so `rating >= 3`
# selects labels 3-4 (i.e. 4–5 stars) as positive (1); labels 0-2
# (1–3 stars) become negative (0).
binary_labels = [1 if rating >= 3 else 0 for rating in ratings]

# Tokenization and padding
VOCAB_SIZE = 10000  # passed to Tokenizer(num_words=...) and used as the embedding input_dim
MAX_LEN = 300       # passed to pad_sequences(maxlen=...)

# NOTE(review): the tokenizer is fitted on every review before the split
# below, so its vocabulary is built from test-set text as well.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)
y_data = np.array(binary_labels)

# Split data into training and test sets (80% / 20%, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Step 2: Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim=300):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is expected to hold a token followed by
    ``embedding_dim`` whitespace-separated floats. The last ``embedding_dim``
    fields are taken as the vector and everything before them as the token,
    so a token that itself contains whitespace does not break the float
    conversion (its pieces are re-joined with single spaces).

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Number of vector components expected on every line.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array of
        length ``embedding_dim``.

    Raises:
        ValueError: If ``embedding_dim`` is not positive, or if a line has
            too few fields or non-numeric vector components.
    """
    if embedding_dim < 1:
        raise ValueError(f"embedding_dim must be positive, got {embedding_dim}")

    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline).
                continue
            if len(fields) <= embedding_dim:
                # Fail here, with the line number, instead of later when the
                # vector is copied into a fixed-width embedding matrix.
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: expected a token "
                    f"followed by {embedding_dim} values, found "
                    f"{len(fields)} field(s)"
                )
            word = ' '.join(fields[:-embedding_dim])
            try:
                embedding_vector = np.asarray(fields[-embedding_dim:], dtype='float32')
            except ValueError as err:
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: "
                    f"non-numeric vector component"
                ) from err
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim=300):
    """Build the initial weight matrix for an embedding layer.

    Row ``i`` receives the pre-trained vector of the word whose index in
    ``word_index`` is ``i``. Rows stay all-zero for indices that have no
    word, and for words missing from ``glove_embeddings``. Words whose index
    is ``vocab_size`` or larger are ignored.

    Args:
        word_index: Mapping from word to integer index.
        glove_embeddings: Mapping from word to its pre-trained vector.
        vocab_size: Number of rows in the returned matrix.
        embedding_dim: Number of columns in the returned matrix.

    Returns:
        NumPy array of shape ``(vocab_size, embedding_dim)``.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in word_index.items():
        # Indices beyond the vocabulary cut-off have no row in the matrix.
        if not row < vocab_size:
            continue
        vector = glove_embeddings.get(token)
        if vector is None:
            # No pre-trained vector for this word: keep the zero row.
            continue
        matrix[row] = vector
    return matrix

# Load pre-trained GloVe embeddings (update this path)
# The path points into the Google Drive mount; the file name suggests the
# 300-dimensional vectors, matching the embedding_dim=300 defaults used by the
# helper functions and the model.
glove_file_path = "/content/drive/MyDrive/glove.6B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)
# Rows of the matrix are aligned with the tokenizer's word indices, limited to VOCAB_SIZE rows.
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE)

# Step 3: Define the C-LSTM Model for Binary Classification
class CLSTMBinaryClassifierYelp(tf.keras.Model):
    """C-LSTM text classifier with a single sigmoid output (binary labels).

    Pipeline: pre-trained (trainable) embedding -> dropout -> one convolution
    over windows of 3 consecutive tokens -> batch normalization -> LSTM ->
    dropout -> 1-unit sigmoid dense layer.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: Initial embedding weights, shape
            ``(vocab_size, embedding_dim)``.
        max_length: Length of the padded input sequences. Stored for
            reference only; the layers infer the sequence length from the
            input, so it is no longer passed to the Embedding layer.
        num_filters: Number of convolution filters.
        lstm_units: Size of the LSTM state.
        embedding_dim: Width of each embedding vector.
        dropout_rate: Rate used by both dropout layers.
        l2_reg_lambda: L2 penalty on the output layer kernel.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, vocab_size, embedding_matrix, max_length, num_filters=150, lstm_units=150, embedding_dim=300, dropout_rate=0.5, l2_reg_lambda=0.001):
        super().__init__()

        # A Constant initializer does not by itself reject a wrongly shaped
        # value in every Keras version, so check the shape explicitly.
        matrix_shape = tuple(np.shape(embedding_matrix))
        if matrix_shape != (vocab_size, embedding_dim):
            raise ValueError(
                f"embedding_matrix has shape {matrix_shape}, expected "
                f"{(vocab_size, embedding_dim)}"
            )

        self.max_length = max_length

        # `input_length` is deprecated in Keras 3 and the legacy `weights=`
        # constructor kwarg is not accepted by every Keras 3 release; the
        # Constant initializer loads the same pre-trained values portably.
        self.embedding = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=True,
        )
        self.embedding_dropout = layers.Dropout(rate=dropout_rate)

        # Convolutional layer with filter size 3: the kernel spans the full
        # embedding width, so it slides along the token axis only.
        self.conv_layer = layers.Conv2D(filters=num_filters,
                                        kernel_size=(3, embedding_dim),
                                        activation='relu', padding='valid')
        self.batch_norm = layers.BatchNormalization()

        # LSTM layer to capture dependencies; only the final state is returned.
        self.lstm = layers.LSTM(lstm_units, return_sequences=False)
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Output layer for binary classification
        self.fc = layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.L2(l2_reg_lambda))

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of integer token-id sequences.
            training: Whether dropout and batch normalization run in
                training mode.

        Returns:
            Tensor of sigmoid outputs, one value per input sequence.
        """
        x = self.embedding(inputs)
        x = self.embedding_dropout(x, training=training)
        # Add a trailing channel axis so the sequence can be fed to Conv2D.
        x = tf.expand_dims(x, -1)

        # Apply convolutional layer and batch normalization
        conv_out = self.conv_layer(x)
        conv_out = self.batch_norm(conv_out, training=training)
        # With 'valid' padding and a kernel as wide as the embedding, axis 2
        # has size 1; drop it to get a (batch, steps, filters) sequence.
        conv_out = tf.squeeze(conv_out, 2)

        # Pass through LSTM
        rnn_outputs = self.lstm(conv_out)
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output for binary classification
        binary_output = self.fc(rnn_outputs)
        return binary_output

# Step 4: Initialize and compile the model
model = CLSTMBinaryClassifierYelp(vocab_size=VOCAB_SIZE,
                              embedding_matrix=embedding_matrix,
                              max_length=MAX_LEN)

# binary_crossentropy pairs with the model's single sigmoid output unit.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Step 5: Train the model
# NOTE(review): the held-out test split is also passed as validation_data, so
# the per-epoch val_* metrics and the final evaluation below are computed on
# the same examples.
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Step 6: Evaluate the model
# evaluate() returns the loss followed by the metrics given to compile() (here: accuracy).
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Accuracy: {test_acc}')

Epoch 1/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 24ms/step - accuracy: 0.8322 - loss: 0.3679 - val_accuracy: 0.8928 - val_loss: 0.2503
Epoch 2/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 24ms/step - accuracy: 0.8907 - loss: 0.2593 - val_accuracy: 0.8938 - val_loss: 0.2480
Epoch 3/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 24ms/step - accuracy: 0.8963 - loss: 0.2478 - val_accuracy: 0.8918 - val_loss: 0.2575
Epoch 4/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 24ms/step - accuracy: 0.9001 - loss: 0.2411 - val_accuracy: 0.9018 - val_loss: 0.2360
Epoch 5/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 24ms/step - accuracy: 0.9031 - loss: 0.2343 - val_accuracy: 0.9029 - val_loss: 0.2315
Epoch 6/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 23ms/step - accuracy: 0.9038 - loss: 0.2310 - val_accuracy: 0.9039 - val_loss: 0.233

**C-LSTM FINE-GRAINED CLASSIFICATION ON THE YELP DATASET**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Step 1: Load and preprocess the Yelp dataset
# Only the "train" split is loaded; the train/test partition used below is
# carved out of it with train_test_split.
dataset = load_dataset("yelp_review_full", split="train")

# Extract reviews and ratings
texts = dataset['text']
# Labels are 0-based integers from 0 to 4 (one class per star rating), which
# is the class-index form expected by sparse_categorical_crossentropy with the
# 5-unit softmax output used below.
fine_grained_labels = dataset['label']

# Tokenization and padding
VOCAB_SIZE = 10000  # passed to Tokenizer(num_words=...) and used as the embedding input_dim
MAX_LEN = 300       # passed to pad_sequences(maxlen=...)

# NOTE(review): the tokenizer is fitted on every review before the split
# below, so its vocabulary is built from test-set text as well.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)
y_data = np.array(fine_grained_labels)

# Split data into training and test sets (80% / 20%, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Step 2: Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim=300):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is expected to hold a token followed by
    ``embedding_dim`` whitespace-separated floats. The last ``embedding_dim``
    fields are taken as the vector and everything before them as the token,
    so a token that itself contains whitespace does not break the float
    conversion (its pieces are re-joined with single spaces).

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Number of vector components expected on every line.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array of
        length ``embedding_dim``.

    Raises:
        ValueError: If ``embedding_dim`` is not positive, or if a line has
            too few fields or non-numeric vector components.
    """
    if embedding_dim < 1:
        raise ValueError(f"embedding_dim must be positive, got {embedding_dim}")

    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline).
                continue
            if len(fields) <= embedding_dim:
                # Fail here, with the line number, instead of later when the
                # vector is copied into a fixed-width embedding matrix.
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: expected a token "
                    f"followed by {embedding_dim} values, found "
                    f"{len(fields)} field(s)"
                )
            word = ' '.join(fields[:-embedding_dim])
            try:
                embedding_vector = np.asarray(fields[-embedding_dim:], dtype='float32')
            except ValueError as err:
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: "
                    f"non-numeric vector component"
                ) from err
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim=300):
    """Build the initial weight matrix for an embedding layer.

    Row ``i`` receives the pre-trained vector of the word whose index in
    ``word_index`` is ``i``. Rows stay all-zero for indices that have no
    word, and for words missing from ``glove_embeddings``. Words whose index
    is ``vocab_size`` or larger are ignored.

    Args:
        word_index: Mapping from word to integer index.
        glove_embeddings: Mapping from word to its pre-trained vector.
        vocab_size: Number of rows in the returned matrix.
        embedding_dim: Number of columns in the returned matrix.

    Returns:
        NumPy array of shape ``(vocab_size, embedding_dim)``.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in word_index.items():
        # Indices beyond the vocabulary cut-off have no row in the matrix.
        if not row < vocab_size:
            continue
        vector = glove_embeddings.get(token)
        if vector is None:
            # No pre-trained vector for this word: keep the zero row.
            continue
        matrix[row] = vector
    return matrix

# Load pre-trained GloVe embeddings (update this path)
# The path points into the Google Drive mount; the file name suggests the
# 300-dimensional vectors, matching the embedding_dim=300 defaults used by the
# helper functions and the model.
glove_file_path = "/content/drive/MyDrive/glove.6B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)
# Rows of the matrix are aligned with the tokenizer's word indices, limited to VOCAB_SIZE rows.
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE)

# Step 3: Define the C-LSTM Model for Fine-Grained Classification (5 classes)
class CLSTMFineGrainedClassifierYelp(tf.keras.Model):
    """C-LSTM text classifier with a softmax output over ``num_classes``.

    Pipeline: pre-trained (trainable) embedding -> dropout -> one convolution
    over windows of 3 consecutive tokens -> batch normalization -> LSTM ->
    dropout -> ``num_classes``-unit softmax dense layer.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: Initial embedding weights, shape
            ``(vocab_size, embedding_dim)``.
        max_length: Length of the padded input sequences. Stored for
            reference only; the layers infer the sequence length from the
            input, so it is no longer passed to the Embedding layer.
        num_filters: Number of convolution filters.
        lstm_units: Size of the LSTM state.
        num_classes: Number of output classes.
        embedding_dim: Width of each embedding vector.
        dropout_rate: Rate used by both dropout layers.
        l2_reg_lambda: L2 penalty on the output layer kernel.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, vocab_size, embedding_matrix, max_length, num_filters=150, lstm_units=150, num_classes=5, embedding_dim=300, dropout_rate=0.5, l2_reg_lambda=0.001):
        super().__init__()

        # A Constant initializer does not by itself reject a wrongly shaped
        # value in every Keras version, so check the shape explicitly.
        matrix_shape = tuple(np.shape(embedding_matrix))
        if matrix_shape != (vocab_size, embedding_dim):
            raise ValueError(
                f"embedding_matrix has shape {matrix_shape}, expected "
                f"{(vocab_size, embedding_dim)}"
            )

        self.max_length = max_length

        # `input_length` is deprecated in Keras 3 and the legacy `weights=`
        # constructor kwarg is not accepted by every Keras 3 release; the
        # Constant initializer loads the same pre-trained values portably.
        self.embedding = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=True,
        )
        self.embedding_dropout = layers.Dropout(rate=dropout_rate)

        # Convolutional layer with filter size 3: the kernel spans the full
        # embedding width, so it slides along the token axis only.
        self.conv_layer = layers.Conv2D(filters=num_filters,
                                        kernel_size=(3, embedding_dim),
                                        activation='relu', padding='valid')
        self.batch_norm = layers.BatchNormalization()

        # LSTM layer to capture dependencies; only the final state is returned.
        self.lstm = layers.LSTM(lstm_units, return_sequences=False)
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Output layer for fine-grained classification
        self.fc = layers.Dense(num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(l2_reg_lambda))

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of integer token-id sequences.
            training: Whether dropout and batch normalization run in
                training mode.

        Returns:
            Tensor of softmax outputs with ``num_classes`` values per input
            sequence.
        """
        x = self.embedding(inputs)
        x = self.embedding_dropout(x, training=training)
        # Add a trailing channel axis so the sequence can be fed to Conv2D.
        x = tf.expand_dims(x, -1)

        # Apply convolutional layer and batch normalization
        conv_out = self.conv_layer(x)
        conv_out = self.batch_norm(conv_out, training=training)
        # With 'valid' padding and a kernel as wide as the embedding, axis 2
        # has size 1; drop it to get a (batch, steps, filters) sequence.
        conv_out = tf.squeeze(conv_out, 2)

        # Pass through LSTM
        rnn_outputs = self.lstm(conv_out)
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output for fine-grained classification
        multi_class_output = self.fc(rnn_outputs)
        return multi_class_output

# Step 4: Initialize and compile the model
model = CLSTMFineGrainedClassifierYelp(vocab_size=VOCAB_SIZE,
                                  embedding_matrix=embedding_matrix,
                                  max_length=MAX_LEN,
                                  num_classes=5)

# sparse_categorical_crossentropy takes the integer class labels directly
# (no one-hot encoding) against the model's softmax output.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 5: Train the model
# NOTE(review): the held-out test split is also passed as validation_data, so
# the per-epoch val_* metrics and the final evaluation below are computed on
# the same examples.
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Step 6: Evaluate the model
# evaluate() returns the loss followed by the metrics given to compile() (here: accuracy).
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Accuracy: {test_acc}')


Epoch 1/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 24ms/step - accuracy: 0.5242 - loss: 1.0898 - val_accuracy: 0.6268 - val_loss: 0.8569
Epoch 2/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 23ms/step - accuracy: 0.6252 - loss: 0.8667 - val_accuracy: 0.6445 - val_loss: 0.8207
Epoch 3/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 23ms/step - accuracy: 0.6411 - loss: 0.8351 - val_accuracy: 0.6503 - val_loss: 0.8155
Epoch 4/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 23ms/step - accuracy: 0.6494 - loss: 0.8186 - val_accuracy: 0.6518 - val_loss: 0.8090
Epoch 5/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 23ms/step - accuracy: 0.6540 - loss: 0.8067 - val_accuracy: 0.6554 - val_loss: 0.8000
Epoch 6/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 23ms/step - accuracy: 0.6573 - loss: 0.7999 - val_accuracy: 0.6575 - val_loss: 0.792