In [111]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
import matplotlib.pyplot as plt
import time
import random
import requests
import io
from collections import defaultdict
from itertools import product

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate, BatchNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold

# Loading the data


In [112]:
url = 'https://raw.githubusercontent.com/Aero-Sol/Deep-Learning-Exam/main/input_data.pkl'
response = requests.get(url)
data = pd.read_pickle(io.BytesIO(response.content))

In [113]:
data.shape

(13772, 10)

In [114]:
data.head()

Unnamed: 0,Hotel_Address,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Hotel_number_reviews,Reviewer_number_reviews,Review_Score,Review,Review_Type
88526,Scarsdale Scarsdale Place Kensington Kensingto...,5/2/2017,8.1,Copthorne Tara Hotel London Kensington,United Kingdom,7105,2,6.7,Expensive room rate that didn t include parki...,Bad_review
42019,53 53 59 Kilburn High Road Maida Vale London C...,8/4/2016,7.1,BEST WESTERN Maitrise Hotel Maida Vale,United Kingdom,1877,8,5.8,Bedroom in the basement No windows Very small...,Bad_review
80574,Pelai Pelai 28 Ciutat Vella 08002 Barcelona Spain,11/17/2016,8.6,Catalonia Ramblas 4 Sup,United Kingdom,4276,2,6.3,Room ready for a makeover Location,Bad_review
27131,3 3 Place du G n ral Koenig 17th arr 75017 Par...,2/4/2016,7.1,Hyatt Regency Paris Etoile,United Kingdom,3973,3,5.8,Firstly the lady at the check in desk was qui...,Bad_review
63857,Epping Epping Forest 30 Oak Hill London IG8 9N...,7/27/2016,7.5,Best Western PLUS Epping Forest,United Kingdom,587,7,3.3,Not being able to park my vehicle due to the ...,Bad_review


# Important:
Using the full dataframe the total time the notebook takes to run is around 25 minutes. To avoid having long training times only a subset of the full data will be used.
Because of this some values dependent on the number of samples will be different than the ones we would get when using the full dataset.
For example when talking about the number of samples the value will be 2754, while if we were using the full dataset that would be 13772.

In [115]:
df = data.sample(frac=0.2, random_state=42).reset_index(drop=True)


In [116]:
df.shape

(2754, 10)

# INPUT (2)

## (a) Preprocessing

First we can drop the column 'Average_score' as instructed. We can also drop the 'Hotel_Address' as written in the exam, since it does not add any useful information.

In [117]:
df.drop(['Average_Score'], axis=1, inplace=True)
df.drop(['Hotel_Address'], axis=1, inplace=True)

As for the preprocessing, I will first handle the preprocessing of the 'Review' feature, since it is the longest.
The initial step is defining a function that cleans the text as explained on the exam.

In [118]:
def preprocess_text(text):
    text = text.replace('--', ' ')  # replace double dashes with space
    words = text.split() # split into tokens by white space
    table = str.maketrans('', '', string.punctuation) # remove punctuation from each token
    words = [w.translate(table) for w in words]
    words = [w.lower() for w in words if w.isalpha()] # remove remaining tokens that are not alphabetic
    words = [word.lower() for word in words] # make lower case
    return words

# Apply to 'Review'
df['cleaned_review'] = df['Review'].apply(preprocess_text)

Then I create a dictionary of unique words, associating an integer to each.
After this I pad the resulting sequences to a reasonable lenght.

In [119]:
# Build a dictionary of unique words
word_freq = defaultdict(int)
for tokens in df['cleaned_review']:
    for word in tokens:
        word_freq[word] += 1

word2idx = {word: idx + 1 for idx, word in enumerate(sorted(word_freq))}  # Reserve 0 for padding


In [120]:
# calculating the 90th percentile
percentile = np.percentile(df['Review'].str.len(), 90)
mean = np.mean(df['Review'].str.len())
print(f"90th percentile: {percentile}")
print(f"mean: {mean}")

90th percentile: 366.8000000000011
mean: 139.44589687726943


Due to the mean being substantially smaller than the 90th percentile I will pad the sequences at 200, which seems to be the most efficient option.

In [121]:
# Convert tokens to integer sequences and pad
df['review_sequence'] = df['cleaned_review'].apply(lambda tokens: [word2idx[word] for word in tokens])
MAX_SEQUENCE_LENGTH = 200
df['padded_sequence'] = pad_sequences(df['review_sequence'], maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post').tolist()

Let's see how many integers have been assigned in total:

In [122]:
dict_size = len(word2idx) + 1
dict_size

5194

The +1 is the extra 0, used for padding, besides it \we have a total of 5193 unique integers.

Now we preprocess the 'Review_Date' feature. First we convert the column from string to datetime object, and then scale each value chronologically.

In [123]:
# Convert Review_Date to datetime and scale chronologically
df['Review_Date'] = pd.to_datetime(df['Review_Date'], format='%m/%d/%Y')
df['review_time_scaled'] = (df['Review_Date'] - df['Review_Date'].min()) / (df['Review_Date'].max() - df['Review_Date'].min())

Now we can preprocess the rest of the values. They will be divided in 2 groups based on if they are categorical or numerical.
For categorical features ('Hotel_Name', 'Reviewer_Nationality') we one-hot encode them, while the numerical features ('Hotel_number_reviews', 'Reviewer_number_reviews') will be scaled with a minmaxscaler, to have a float value between 0 and 1. The feature 'review_time_scaled' will be added to the numerical ones, since it also consists of a value between 0 and 1.
At the end we an concatenate both groups in an unique array, called 'structured_features'.

In [124]:
# One-hot encode categorical features
categorical_features = ['Hotel_Name', 'Reviewer_Nationality']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
categorical_encoded = ohe.fit_transform(df[categorical_features])

# Scale numerical features
numerical_features = ['Hotel_number_reviews', 'Reviewer_number_reviews']
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(df[numerical_features])
review_time_array = df[['review_time_scaled']].values
numerical_scaled = np.hstack([numerical_scaled, review_time_array])

# Concatenate structured features
structured_features = np.hstack([categorical_encoded, numerical_scaled])

In [125]:
structured_features

array([[0.        , 0.        , 0.        , ..., 0.12481185, 0.        ,
        0.86575342],
       [0.        , 0.        , 0.        , ..., 0.21361912, 0.04895105,
        0.42739726],
       [0.        , 0.        , 0.        , ..., 0.09266061, 0.        ,
        0.21643836],
       ...,
       [0.        , 0.        , 0.        , ..., 0.10897706, 0.        ,
        0.70958904],
       [0.        , 0.        , 0.        , ..., 0.13480643, 0.06993007,
        0.00958904],
       [0.        , 0.        , 0.        , ..., 0.03335541, 0.04195804,
        0.2630137 ]])

## (b) Input of the model

### Structured features

Let's check if the shape of the array does have sense:
Its lenght should be composed of a 3 singular values (one for each numerical feature) summed with the unique values of the categorical features.

In [126]:
for column in categorical_features:
    print(f"Number of unique values for column '{column}': {df[column].nunique()}")
    print("-" * 20)

Number of unique values for column 'Hotel_Name': 894
--------------------
Number of unique values for column 'Reviewer_Nationality': 106
--------------------


Thus the array shoud have a shape = ( num_samples, 894 + 106 + 3 ) = (2754, 1003). Let's see if it's true:

In [127]:
structured_features.shape

(2754, 1003)

The value domain of the input is [0, 1]

### Review

After preprocessing, each review is transformed into a fixed lenght sequence of integers.
Thus the shape of this will be (2754, 200), since we defined 200 as the sequence lenght. The integers will have a value domain equal to  dict_size = [0, 5193], since there are a total of 5194 unique values.

## Preparing the dataset

In [128]:
# Inputs
X_text = np.array(df['padded_sequence'].tolist())
X_struct = structured_features

# Targets
y_type = (df['Review_Type'] == 'Good_review').astype(int).values
y_type = np.array(y_type)
score_min = 2.5
score_max = 10.0
y_score = (df['Review_Score'] - score_min) / (score_max - score_min)
y_score = np.array(y_score)

Splitting the dataset:

In [129]:
X_text_train_full, X_text_test, X_struct_train_full, X_struct_test, y_type_train_full, y_type_test, y_score_train_full, y_score_test = train_test_split(
    X_text, X_struct, y_type, y_score, test_size=0.2, random_state=42
)


# MODEL, OUTPUT, LOSS and MODEL CONFIGURATION (1, 3, 4, 5)

The model used will be a multi-layer perceptron, with multiple inputs and multiple outputs.
There will be 2 inputs: for the text input the encoded array of shape (2754, 200), with values in the domain [0, 5193], is passed through an Embedding layer, transforming each review into a dense matrix of shape (200, 50), with 50 the value of the embedding dimension, where each token is represented by a real-valued vector.
The resulting matrix is then flattened into a single vector and concatenated with the structured input features to be used as input to the MLP.



## Output
As written in the exam there will be 2 outputs:


*   Review_score : regression task, it will be a dense layer with a singular unit using a sigmoid activation function. To do this the target review scores are scaled to the range [0, 1] before training. Predictions are later rescaled back to their original possible values range [2.5, 10].
*   Review_type : binary classification task, it will also be a dense layer with only one unit, using a sigmoid activation function with a threshold = 0.5. To make this work I first will have to transoform the 2 possible output labels to binary, with *bad_review* = 0 and *good_review* = 1.



## Loss
Since we have 2 outputs, we will also have 2 loss functions:


*   Mean Squared Error for Review_score, since it is a regression task.
*   Binary Cross Entropy for Review_type, due to it being a binary classification.



## Model Configuration
### Layers

Using the padded sequence as the first input (X_text) and the array of structured features as the second (X_struct).
The first output will be the review type (y_type) with the 2 possible values encoded as integers (0 and 1), while the second will be the scaled version of the review score (y_score) using as min and max the respective possible values of the review (2.5 and 10).

Now it's time to  create a function that represents the model, that then will be called during the hyperparameter optimization phase to see which values have the best score.

In [None]:
def build_model( learning_rate= 0.001, dropout_rate=0.5, num_dense_layers=1):
    text_input = Input(shape=(MAX_SEQUENCE_LENGTH,), name='text_input')
    struct_input = Input(shape=(X_struct.shape[1],), name='struct_input')

    x = Embedding(input_dim=dict_size, output_dim=50)(text_input)
    x = Flatten()(x)   # Embedding and then flattening the text input

    combined = Concatenate()([x, struct_input]) # Concatenating the 2 inputs into an unique vector

    for _ in range(num_dense_layers):                                     # The number of layers changes with the hyperparameter Num_dense_layers,
        combined = Dense(64, kernel_initializer='he_uniform')(combined)   # where each will have 64 units, with He_uniform inizialization,
        combined = BatchNormalization()(combined)                         # a batch normalization bfeore the activation function (relu) and then a dropout
        combined = tf.keras.layers.Activation('relu')(combined)
        combined = Dropout(dropout_rate)(combined)

    out1 = Dense(1, activation='sigmoid', name='review_type')(combined) # Output for the review type
    out2 = Dense(1, activation='sigmoid', name='review_score')(combined) # Output for the review score

    model = tf.keras.Model(inputs=[text_input, struct_input], outputs=[out1, out2])   # Defining the model
    optimizer = Adam(learning_rate=learning_rate)   # Adam is used as the optimizer, with the learning rate as an hyperparameter
    model.compile(
        optimizer=optimizer,
        loss={'review_type': 'binary_crossentropy', 'review_score': 'mse'},   # Binary cross entropy for the binary classification, MSE for the regression
        loss_weights={'review_type': 0.5, 'review_score': 0.5},
        metrics={'review_type': 'accuracy', 'review_score': 'mse'}    # Metrics to evaluate the model
    )
    return model


In [None]:
build_model().summary()

## Hyperparameters Optimization

### Hyperparameter tuning
The hyperparameters I will tune are:
*   Learning Rate
*   Batch Size
*   Dropout rate
*   N of hidden layers
*   Training epochs


To select the model with the best hyperparameters I will train the possible models using only 5 epochs to make it faster. After selecting the best model I will also select the optimal number of epochs.


## Important:
for the dropout rate I will only use one value (0.3) to reduce the total time it takes to run the cross validation. If time was not a problem more values could be added, both for the dropout rate and for the other hyperparameters.

# Model Evaluation (6)

As written in the exam I would use a stratified k-fold cross validation to select the best hyperparameters.
After selecting the best configuration I would evaluate the performance of the final model on the test set, using Accuracy for 'Review_type' and MSE for 'Review_score'.

*Total time to run cv: around 4 minutes*

In [132]:
# Hyperparameter grid
param_grid = {
    'learning_rate': [0.01, 0.001],
    'batch_size': [32, 64],
    'dropout_rate': [0.3],  # 0.5 another possible value, skipped to reduce the time it takes to run this cell
    'num_dense_layers': [1, 2]
    }

kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  #Defining the k-fold cross validation
best_val_acc = 0      #storing the best accuracy on the validation set
best_config = None    # storing the best hyperparameters configuration

# Grid search with cross-validation
for learning_rate, batch_size, dropout_rate, num_dense_layers in product(
    param_grid['learning_rate'],
    param_grid['batch_size'],
    param_grid['dropout_rate'],
    param_grid['num_dense_layers']):

    print(f"\nTraining with learning_rate={learning_rate}, batch_size={batch_size}, dropout_rate={dropout_rate}, num_dense_layers={num_dense_layers}")

    # Store validation metrics for each fold
    fold_accuracies = []
    fold_mses = []

    for train_idx, val_idx in kf.split(X_text_train_full, y_type_train_full):  # Create training and validation sets for each fold
        X_text_train, X_text_val = X_text_train_full[train_idx], X_text_train_full[val_idx]
        X_struct_train, X_struct_val = X_struct_train_full[train_idx], X_struct_train_full[val_idx]
        y_type_train, y_type_val = y_type_train_full[train_idx], y_type_train_full[val_idx]
        y_score_train, y_score_val = y_score_train_full[train_idx], y_score_train_full[val_idx]

        model = build_model(  # Building the model using the current configuration
            learning_rate=learning_rate,
            dropout_rate=dropout_rate,
            num_dense_layers=num_dense_layers,

        )

        history = model.fit(  # Train the model on the current fold, with only 5 epochs to peed up the process
            [X_text_train, X_struct_train],
            {'review_type': y_type_train, 'review_score': y_score_train},
            validation_data=([X_text_val, X_struct_val],
                             {'review_type': y_type_val, 'review_score': y_score_val}),
            epochs=5,
            batch_size=batch_size,
            verbose=0
        )

        acc = history.history['val_review_type_accuracy'][-1] # Validation accuracy of last epoch
        mse = history.history['val_review_score_mse'][-1] # Validation MSE of last epoch
        fold_accuracies.append(acc)
        fold_mses.append(mse)

    # Compute the average metrics
    avg_acc = np.mean(fold_accuracies)
    avg_mse = np.mean(fold_mses)
    print(f"Average Val Accuracy: {avg_acc:.4f}, Average Val MSE: {avg_mse:.4f}")

    if avg_acc > best_val_acc: # Update the best configuration if another gets better results
        best_val_acc = avg_acc
        best_config = {
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'dropout_rate': dropout_rate,
            'num_dense_layers': num_dense_layers
        }
# Printing the best configuration and its metrics
print("\nBest configuration:", best_config)
print(f"Best average validation accuracy: {best_val_acc:.4f}")
print(f"Best average validation MSE: {avg_mse:.4f}")


Training with learning_rate=0.01, batch_size=32, dropout_rate=0.3, num_dense_layers=1
Average Val Accuracy: 0.7762, Average Val MSE: 0.1246

Training with learning_rate=0.01, batch_size=32, dropout_rate=0.3, num_dense_layers=2
Average Val Accuracy: 0.6573, Average Val MSE: 0.1014

Training with learning_rate=0.01, batch_size=64, dropout_rate=0.3, num_dense_layers=1
Average Val Accuracy: 0.7327, Average Val MSE: 0.1521

Training with learning_rate=0.01, batch_size=64, dropout_rate=0.3, num_dense_layers=2
Average Val Accuracy: 0.7903, Average Val MSE: 0.1253

Training with learning_rate=0.001, batch_size=32, dropout_rate=0.3, num_dense_layers=1
Average Val Accuracy: 0.8470, Average Val MSE: 0.0638

Training with learning_rate=0.001, batch_size=32, dropout_rate=0.3, num_dense_layers=2
Average Val Accuracy: 0.7058, Average Val MSE: 0.0553

Training with learning_rate=0.001, batch_size=64, dropout_rate=0.3, num_dense_layers=1
Average Val Accuracy: 0.8493, Average Val MSE: 0.0675

Training 

Now we can train the best model with a higher number of epochs and using the whole training set.

In [133]:
# Use the best hyperparameters configuration to create the final model
final_model = build_model(
    learning_rate=best_config['learning_rate'],
    dropout_rate=best_config['dropout_rate'],
    num_dense_layers=best_config['num_dense_layers']
)

#Train the final model with an higher number of epochs, using the whole training data
final_model.fit(
    [X_text_train_full, X_struct_train_full],
    {'review_type': y_type_train_full, 'review_score': y_score_train_full},
    epochs=20,
    batch_size=best_config['batch_size'],
    verbose=1
)

Epoch 1/20
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - loss: 0.2712 - review_score_loss: 0.0675 - review_score_mse: 0.0676 - review_type_accuracy: 0.7978 - review_type_loss: 0.4750
Epoch 2/20
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1475 - review_score_loss: 0.0445 - review_score_mse: 0.0445 - review_type_accuracy: 0.9057 - review_type_loss: 0.2504
Epoch 3/20
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0947 - review_score_loss: 0.0356 - review_score_mse: 0.0356 - review_type_accuracy: 0.9464 - review_type_loss: 0.1539
Epoch 4/20
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0667 - review_score_loss: 0.0283 - review_score_mse: 0.0283 - review_type_accuracy: 0.9662 - review_type_loss: 0.1051
Epoch 5/20
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0371 - review_score_loss: 0.0227 - review_score_mse: 0.02

<keras.src.callbacks.history.History at 0x7ba6db470050>

As it can be seen, around the 15th epoch the model loss and accuracy start oscillating, meaning that 15 epochs should be a good stopping point for the training.

Now we evaluate the performance of the model on the test set, which represents the unseen data.

In [134]:
# Evaluate the best model performance on unseen data (test set)
results = final_model.evaluate(
    [X_text_test, X_struct_test], # Test inputs
    {'review_type': y_type_test, 'review_score': y_score_test}, # True Labels
    verbose=0,
    return_dict=True
)

review_type_accuracy = results['review_type_accuracy'] # Binary classification accuracy
review_score_mse = results['review_score_mse']  # Regression MSE (on [0, 1] scale)

print(f"Final Review Type Accuracy: {review_type_accuracy:.4f}")
print(f"Final Review Score MSE: {review_score_mse:.4f}")


Final Review Type Accuracy: 0.8240
Final Review Score MSE: 0.1023


For comparison, when training with the full dataset and using more hyperparameters, the final model accuracy on the review type was ~0.94 and the review score MSE was ~0.03.

## Example of outputs

Let's check the model performance on the first 3 samples from the test set:

In [135]:
# Predict both outputs on the test set
y_pred_type, y_pred_score = final_model.predict([X_text_test, X_struct_test])

# Convert classification predictions to binary (0 or 1)
y_pred_type_binary = (y_pred_type > 0.5).astype(int)

# Rescale regression predictions back to original [2.5, 10.0] range
y_pred_score_rescaled = y_pred_score * (score_max - score_min) + score_min
y_score_test_rescaled = y_score_test * (score_max - score_min) + score_min


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step


In [136]:
for i in range(3):
    print(f"Example {i+1}")
    print(f"  Predicted Review Type: {y_pred_type_binary[i][0]} (Raw: {y_pred_type[i][0]:.2f})") # Predicted review type. Also shows the raw output of the model
    print(f"  Actual Review Type:    {y_type_test[i]}") # True review type
    print(f"  Predicted Score:       {y_pred_score_rescaled[i][0]:.2f}") # Predicted review score
    print(f"  Actual Score:          {y_score_test_rescaled[i]:.2f}") # True review score
    print("-" * 40)


Example 1
  Predicted Review Type: 0 (Raw: 0.35)
  Actual Review Type:    1
  Predicted Score:       5.06
  Actual Score:          9.60
----------------------------------------
Example 2
  Predicted Review Type: 0 (Raw: 0.00)
  Actual Review Type:    0
  Predicted Score:       5.26
  Actual Score:          4.20
----------------------------------------
Example 3
  Predicted Review Type: 1 (Raw: 0.99)
  Actual Review Type:    1
  Predicted Score:       6.40
  Actual Score:          9.20
----------------------------------------
