In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import keras
import joblib

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, LSTM
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torcheval.metrics import MultilabelAccuracy
import torch
from prettytable import PrettyTable

from matplotlib import pyplot as plt

In [None]:
# Load the pre-built train/test tables (presumably pandas DataFrames) from joblib dumps.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
train_df = joblib.load('data/train_df.joblib')
test_df = joblib.load('data/test_df.joblib')

In [None]:
# Index -> name lookup tables for the three label spaces.
category_idx_to_text = dict(enumerate(['ambience', 'food', 'other', 'price', 'service']))
polarity_idx_to_text = dict(enumerate(['negative', 'neutral', 'positive']))

# Joint labels are every "category#polarity" pair, numbered category-major:
# index = category_index * number_of_polarities + polarity_index.
joint_idx_to_text = {
    cat_idx * len(polarity_idx_to_text) + pol_idx: f'{category}#{polarity}'
    for cat_idx, category in category_idx_to_text.items()
    for pol_idx, polarity in polarity_idx_to_text.items()
}

In [None]:
# Count the training reviews whose `text` entry has len() above 180
# (characters, assuming the column holds plain strings).
train_df['text'].map(len).gt(180).sum()

6

In [None]:
# Length every token sequence is padded/truncated to in get_features().
# NOTE(review): the 180 cut-off was chosen from len() of the raw text (characters),
# but pad_sequences applies maxlen to *token* sequences — confirm this is intended.
maxlen = 180   # only 6 reviews longer than 180 char
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding input size.
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, lower=True)
# The word index is built from the training texts only.
tokenizer.fit_on_texts(train_df["text"])

def get_features(text_series):
    """
    Convert raw texts into fixed-length integer sequences for the model.

    Relies on two module-level names: the already-fitted ``tokenizer`` and
    ``maxlen`` (the length every sequence is padded/truncated to).
    """
    token_ids = tokenizer.texts_to_sequences(text_series)
    padded = pad_sequences(token_ids, maxlen=maxlen)
    return padded


def prediction_to_label(prediction, labels=None):
    """
    Map a vector of per-class scores to a ``{label: score}`` dict, highest score first.

    Parameters
    ----------
    prediction : object exposing ``.tolist()`` (e.g. a 1-D numpy array)
        One score per class.
    labels : iterable of str, optional
        Class names; position ``i`` names score ``i``. Any iterable is accepted,
        including ``dict.values()`` views. When omitted, a module-level ``labels``
        variable is used if one exists (the behaviour of the original version).

    Returns
    -------
    dict
        Label -> score, in descending score order.

    Raises
    ------
    NameError
        If ``labels`` is neither passed nor defined at module level.
    IndexError
        If there are more scores than labels.
    """
    if labels is None:
        # Backward compatibility: earlier versions read a global `labels`.
        labels = globals().get('labels')
        if labels is None:
            raise NameError(
                "name 'labels' is not defined: pass labels=... to prediction_to_label"
            )

    # dict views (e.g. category_idx_to_text.values()) are not subscriptable.
    names = list(labels)
    tag_prob = [(names[i], prob) for i, prob in enumerate(prediction.tolist())]
    return dict(sorted(tag_prob, key=lambda kv: kv[1], reverse=True))

In [None]:
class LSTMmodel:
    """
    Embedding -> Dropout -> LSTM -> global max pooling -> Dense(sigmoid) classifier.

    The compiled Keras model is exposed as ``self.model``. The vocabulary size and
    input length come from the module-level ``max_words`` and ``maxlen``.
    """

    def __init__(self, num_classes, embedding_dim=20, lstm_units=300, dropout_rate=0.1):
        """
        Build, compile and print a summary of the network.

        Parameters
        ----------
        num_classes : int
            Number of output units (one sigmoid score per label).
        embedding_dim : int, default 20
            Size of each token embedding vector.
        lstm_units : int, default 300
            Number of LSTM units (previously the hard-coded ``filter_length``).
        dropout_rate : float, default 0.1
            Dropout applied to the embedded sequence.
        """
        model = Sequential()
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        model.add(Dropout(dropout_rate))
        # return_sequences=True keeps every timestep so the pooling layer can
        # take the maximum over the whole sequence.
        model.add(LSTM(lstm_units, return_sequences=True))
        model.add(GlobalMaxPool1D())
        model.add(Dense(num_classes))
        model.add(Activation('sigmoid'))

        # NOTE(review): 'categorical_accuracy' compares the arg-max of prediction
        # and target; with sigmoid outputs and binary cross-entropy a per-label
        # metric may be more informative. Kept unchanged because plot_history
        # reads the 'categorical_accuracy' / 'val_categorical_accuracy' keys.
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
        model.summary()

        self.model = model

In [None]:
def model_metrics(test_labels, predictions):
    """
    Compute summary metrics for binarised multi-label predictions.

    Parameters
    ----------
    test_labels : array-like
        Ground-truth label indicators.
    predictions : array-like
        Predicted label indicators, same layout as ``test_labels``.

    Returns
    -------
    dict
        Keys ``accuracy`` (sklearn ``accuracy_score``), ``exact_match_accuracy``
        and ``overlap_accuracy`` (torcheval ``MultilabelAccuracy``), plus
        ``macro_f1`` and ``micro_f1`` (sklearn ``f1_score``). All values are
        plain Python/numpy scalars.
    """
    # Convert once; both torcheval metrics consume the same float tensors.
    target = torch.as_tensor(test_labels, dtype=torch.float32)
    predicted = torch.as_tensor(predictions, dtype=torch.float32)

    exact_accuracy = MultilabelAccuracy(criteria='exact_match')
    exact_accuracy.update(target=target, input=predicted)

    overlap_accuracy = MultilabelAccuracy(criteria='overlap')
    overlap_accuracy.update(target=target, input=predicted)

    return {
        'accuracy': accuracy_score(test_labels, predictions),
        'exact_match_accuracy': exact_accuracy.compute().detach().item(),
        'overlap_accuracy': overlap_accuracy.compute().detach().item(),
        'macro_f1': f1_score(test_labels, predictions, average='macro'),
        'micro_f1': f1_score(test_labels, predictions, average='micro'),
    }

def pretty_table(dict):
    """
    Print a two-column (metric, value) table for a mapping of names to values.

    The parameter name shadows the built-in ``dict``; it is left unchanged so
    existing keyword callers keep working.
    """
    table = PrettyTable(['metric', 'value'])
    for name, value in dict.items():
        table.add_row([name, value])
    print(table)

In [None]:
def plot_history(history, path):
    """
    Plot training vs. validation curves side by side, save the figure, then show it.

    Left panel: categorical accuracy. Right panel: loss.

    Parameters
    ----------
    history : object with a ``.history`` mapping
        Must contain ``categorical_accuracy``, ``loss`` and their ``val_`` variants.
    path : str or path-like
        Where the figure is written via ``plt.savefig``.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

    # (history key, panel title, y-axis label), in left-to-right order.
    panels = [
        ('categorical_accuracy', 'model categorical accuracy', 'accuracy'),
        ('loss', 'model loss', 'loss'),
    ]
    for ax, (key, title, ylabel) in zip(axes, panels):
        ax.plot(history.history[key])
        ax.plot(history.history['val_' + key])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('epoch')
        ax.legend(['train', 'val'], loc='upper left')

    # Tidy the spacing before writing the file and displaying the figure.
    plt.tight_layout()
    plt.savefig(path)
    plt.show()

## Category classification

In [None]:
# Names of the category classes (a dict view, in index order).
category_labels = category_idx_to_text.values()

# Padded token-id sequences for the train/test texts.
x_train_category = get_features(train_df["text"])
x_test_category = get_features(test_df["text"])

# Targets stacked from the per-row `category_labels` column
# (presumably one 0/1 indicator per category — TODO confirm).
y_train_category = np.array(train_df["category_labels"].tolist())
y_test_category = np.array(test_df["category_labels"].tolist())

In [None]:
# Derive the output size from the label table instead of hard-coding it,
# matching the polarity and joint sections.
num_classes = len(category_labels)

# Default ReduceLROnPlateau, early stopping with patience 4, and a checkpoint
# that only overwrites the file when the monitored value improves.
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='category_model.model', save_best_only=True),
]

category_model = LSTMmodel(num_classes)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 180, 20)           100000    
                                                                 
 dropout (Dropout)           (None, 180, 20)           0         
                                                                 
 lstm (LSTM)                 (None, 180, 300)          385200    
                                                                 
 global_max_pooling1d (Glob  (None, 300)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 5)                 1505      
                                                                 
 activation (Activation)     (None, 5)                 0         
                                                        

In [None]:
# Train the category model; 10% of the training data is held out for validation,
# and the callbacks defined above may stop training before 20 epochs.
history_category = category_model.model.fit(x_train_category, y_train_category,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


In [None]:
# Reload the checkpoint written by ModelCheckpoint(save_best_only=True), so evaluation
# uses the best saved model rather than the weights from the final epoch.
lstm_model_category = keras.models.load_model('category_model.model')

In [None]:
# Test-set evaluation; returns [loss, categorical_accuracy] as configured in compile().
lstm_model_category.evaluate(x_test_category, y_test_category)



[0.31499937176704407, 0.6862483024597168]

In [None]:
# Decision threshold applied to every sigmoid output.
threshold = 0.5

# Raw scores from the reloaded model, then binarised to 0/1 per label.
category_scores = lstm_model_category.predict(x_test_category)
predict_category = (category_scores > threshold).astype(int)



In [None]:
# model_metrics expects (test_labels, predictions): ground truth first.
metrics_category = model_metrics(y_test_category, predict_category)

In [None]:
# Print the category metrics as a metric/value table.
pretty_table(metrics_category)

+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.5834445927903872 |
| exact_match_accuracy | 0.5834445953369141 |
|   overlap_accuracy   | 0.7636849284172058 |
|       macro_f1       | 0.5011922015182886 |
|       micro_f1       | 0.7412333736396615 |
+----------------------+--------------------+


In [None]:
# Save next to the notebook, like the polarity and joint plots; the previous
# leading '/' pointed at the filesystem root.
plot_history(history_category, 'lstm_category_plot.png')

## Sentiment polarity

In [None]:
# Names of the polarity classes (a dict view, in index order).
polarity_labels = polarity_idx_to_text.values()

# Padded token-id sequences for the train/test texts.
x_train_polarity = get_features(train_df["text"])
x_test_polarity = get_features(test_df["text"])

# Targets stacked from the per-row `polarity_labels` column
# (presumably one 0/1 indicator per polarity — TODO confirm).
y_train_polarity = np.array(train_df["polarity_labels"].tolist())
y_test_polarity = np.array(test_df["polarity_labels"].tolist())

In [None]:
# Display the polarity class names.
polarity_labels

dict_values(['negative', 'neutral', 'positive'])

In [None]:
# One output unit per polarity label.
num_classes  = len(polarity_labels)

# Default ReduceLROnPlateau, early stopping with patience 4, and a checkpoint
# that only overwrites the file when the monitored value improves.
callbacks = [
            ReduceLROnPlateau(),
            EarlyStopping(patience=4),
            ModelCheckpoint(filepath='polarity_model.model', save_best_only=True)
        ]

polarity_model = LSTMmodel(num_classes)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 180, 20)           100000    
                                                                 
 dropout_1 (Dropout)         (None, 180, 20)           0         
                                                                 
 lstm_1 (LSTM)               (None, 180, 300)          385200    
                                                                 
 global_max_pooling1d_1 (Gl  (None, 300)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_1 (Dense)             (None, 3)                 903       
                                                                 
 activation_1 (Activation)   (None, 3)                 0         
                                                      

In [None]:
# Train the polarity model; 10% of the training data is held out for validation,
# and the callbacks defined above may stop training before 20 epochs.
history_polarity = polarity_model.model.fit(x_train_polarity, y_train_polarity,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [None]:
# Reload the checkpoint written by ModelCheckpoint(save_best_only=True), so evaluation
# uses the best saved model rather than the weights from the final epoch.
lstm_model_polarity = keras.models.load_model('polarity_model.model')

In [None]:
# Test-set evaluation; returns [loss, categorical_accuracy] as configured in compile().
lstm_model_polarity.evaluate(x_test_polarity, y_test_polarity)



[0.3941097855567932, 0.7356475591659546]

In [None]:
# Decision threshold applied to every sigmoid output.
threshold = 0.5

# Raw scores from the reloaded model, then binarised to 0/1 per label.
polarity_scores = lstm_model_polarity.predict(x_test_polarity)
predict_polarity = (polarity_scores > threshold).astype(int)



In [None]:
# model_metrics expects (test_labels, predictions): ground truth first.
metrics_polarity = model_metrics(y_test_polarity, predict_polarity)

In [None]:
# Print the polarity metrics as a metric/value table.
pretty_table(metrics_polarity)

+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.6261682242990654 |
| exact_match_accuracy | 0.6261682510375977 |
|   overlap_accuracy   | 0.644859790802002  |
|       macro_f1       | 0.4078508319753456 |
|       micro_f1       | 0.7129151291512915 |
+----------------------+--------------------+


In [None]:
# Plot and save the polarity training curves (path is relative to the working directory).
plot_history(history_polarity, 'lstm_polarity_plot.png')

## Joint classification

In [None]:
# Names of the joint category#polarity classes (a dict view, in index order).
joint_labels = joint_idx_to_text.values()

# Padded token-id sequences for the train/test texts.
x_train_joint = get_features(train_df['text'])
x_test_joint = get_features(test_df['text'])

# Targets stacked from the per-row `joint_labels` column
# (presumably one 0/1 indicator per joint label — TODO confirm).
y_train_joint = np.array(train_df['joint_labels'].tolist())
y_test_joint = np.array(test_df['joint_labels'].tolist())

In [None]:
# Display the joint class names.
joint_labels

dict_values(['ambience#negative', 'ambience#neutral', 'ambience#positive', 'food#negative', 'food#neutral', 'food#positive', 'other#negative', 'other#neutral', 'other#positive', 'price#negative', 'price#neutral', 'price#positive', 'service#negative', 'service#neutral', 'service#positive'])

In [None]:
# One output unit per joint category#polarity label.
num_classes  = len(joint_labels)

# Default ReduceLROnPlateau, early stopping with patience 4, and a checkpoint
# that only overwrites the file when the monitored value improves.
callbacks = [
            ReduceLROnPlateau(),
            EarlyStopping(patience=4),
            ModelCheckpoint(filepath='joint_model.model', save_best_only=True)
        ]

joint_model = LSTMmodel(num_classes)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 180, 20)           100000    
                                                                 
 dropout_2 (Dropout)         (None, 180, 20)           0         
                                                                 
 lstm_2 (LSTM)               (None, 180, 300)          385200    
                                                                 
 global_max_pooling1d_2 (Gl  (None, 300)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_2 (Dense)             (None, 15)                4515      
                                                                 
 activation_2 (Activation)   (None, 15)                0         
                                                      

In [None]:
# Train the joint model; 10% of the training data is held out for validation,
# and the callbacks defined above may stop training before 20 epochs.
history_joint = joint_model.model.fit(x_train_joint, y_train_joint,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


In [None]:
# Reload the checkpoint written by ModelCheckpoint(save_best_only=True), so evaluation
# uses the best saved model rather than the weights from the final epoch.
lstm_model_joint = keras.models.load_model('joint_model.model')

In [None]:
# Test-set evaluation; returns [loss, categorical_accuracy] as configured in compile().
lstm_model_joint.evaluate(x_test_joint, y_test_joint)



[0.1909979283809662, 0.49799734354019165]

In [None]:
# Decision threshold applied to every sigmoid output.
threshold = 0.5

# Raw scores from the reloaded model, then binarised to 0/1 per label.
joint_scores = lstm_model_joint.predict(x_test_joint)
predict_joint = (joint_scores > threshold).astype(int)



In [None]:
# model_metrics expects (test_labels, predictions): ground truth first.
metrics_joint = model_metrics(y_test_joint, predict_joint)

In [None]:
# Print the joint-label metrics as a metric/value table.
pretty_table(metrics_joint)

+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       |  0.3164218958611482 |
| exact_match_accuracy | 0.31642189621925354 |
|   overlap_accuracy   |  0.4419225752353668 |
|       macro_f1       | 0.17767885291063154 |
|       micro_f1       |  0.504225352112676  |
+----------------------+---------------------+


In [None]:
# Plot and save the joint-model training curves (path is relative to the working directory).
plot_history(history_joint, 'lstm_joint_plot.png')