In [1]:
!pip install -r requirements.txt

Collecting pandas==2.2.3 (from -r requirements.txt (line 1))
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langdetect==1.0.9 (from -r requirements.txt (line 2))
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting matplotlib==3.10.1 (from -r requirements.txt (line 4))
  Downloading matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting tens

In [20]:
import tensorflow as tf

# Limit GPU memory usage
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.set_logical_device_configuration(
                gpu,
                [tf.config.LogicalDeviceConfiguration(memory_limit=(12 * 1024))])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [21]:
from tensorflow.keras import layers, models
import numpy as np

In [22]:
rating_dim = 3          # e.g., service, cleanliness, value
embedding_dim = 128
lstm_units = 256
vocab_size = 5000       # change depending on tokenizer
max_seq_len = 20        # maximum length of output text


In [23]:
def build_model(rating_dim, embedding_dim, lstm_units, vocab_size, max_seq_len):
    # Inputs
    rating_input = layers.Input(shape=(rating_dim,), name="ratings")
    text_input = layers.Input(shape=(max_seq_len,), name="text")

    # Process ratings
    rating_proj = layers.Dense(lstm_units, activation="relu")(rating_input)
    rating_proj = layers.RepeatVector(max_seq_len)(rating_proj)  # [batch, seq_len, lstm_units]

    # Process tokens
    text_embed = layers.Embedding(vocab_size, embedding_dim)(text_input)

    # Combine ratings and text
    lstm_input = layers.Concatenate()([text_embed, rating_proj])

    # LSTM Decoder
    lstm_output = layers.LSTM(lstm_units, return_sequences=True)(lstm_input)
    output = layers.TimeDistributed(layers.Dense(vocab_size, activation="softmax"))(lstm_output)

    model = models.Model(inputs=[rating_input, text_input], outputs=output)
    return model


In [24]:
model = build_model(rating_dim, embedding_dim, lstm_units, vocab_size, max_seq_len)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [25]:
sample_data = [
    {"ratings": [5.0, 5.0, 5.0], "review": "Excellent service and very clean room."},
    {"ratings": [4.0, 3.5, 4.5], "review": "Good experience, but the room could be cleaner."},
    {"ratings": [2.0, 2.5, 2.0], "review": "Dirty room and poor service."},
    {"ratings": [3.0, 4.0, 3.5], "review": "Room was okay and fairly clean."},
    {"ratings": [1.0, 1.5, 1.0], "review": "Terrible experience. Not worth the money."},
    {"ratings": [4.5, 4.5, 4.0], "review": "Very clean and staff were friendly."},
]

In [26]:
import json
import os

import pandas as pd
import numpy as np

def read_json_to_df(file_name):
    data = []
    with open(file_name) as data_file:
        for line in data_file:
            # Load each line of the JSON file as a dictionary
            data.append(json.loads(line))

    # Form a Pandas DataFrame from the dictionaries
    return pd.json_normalize(data)

# Load the training and test data
raw_train_df = read_json_to_df("hotel_reviews_train.json")
raw_test_df = read_json_to_df("hotel_reviews_test.json")

ratings_columns = [col for col in raw_train_df.columns if col.startswith("ratings.")]

# Select the title, text and overall rating columns to make a new dataframe
train_df = raw_train_df[["title", "text"] + ratings_columns]
test_df = raw_test_df[["title", "text"] + ratings_columns]

# Save the English reviews to a CSV file to save time filtering when running again (NumFOCUS, Inc. 2024)
if os.path.exists("english_hotel_reviews_train.csv"):
    train_df = pd.read_csv("english_hotel_reviews_train.csv")

if os.path.exists("english_hotel_reviews_test.csv"):
    test_df = pd.read_csv("english_hotel_reviews_test.csv")

train_df_2 = train_df.fillna(0)

inputs = train_df_2[ratings_columns]
# outputs = train_df_2['title'] + ' ' + train_df_2['text']
outputs = train_df_2['text']

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf

# Ensure 'inputs' has no NaNs
inputs = train_df_2[ratings_columns].fillna(0).astype(np.float32).values

# Use review text as output
outputs = train_df_2['text'].astype(str).values  # Ensure string type

# Add special tokens
texts_with_tokens = ["<start> " + text + " <end>" for text in outputs]

# Tokenize text with special tokens
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts_with_tokens)

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(texts_with_tokens)

# Define max length for padding
max_len = 21  # or use max_len = max(len(seq) for seq in sequences)

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Create input and target sequences (for teacher forcing in seq2seq models)
input_seq = padded_sequences[:, :-1]
target_seq = padded_sequences[:, 1:]


In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, RepeatVector, TimeDistributed

# Key dimensions
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
seq_length = input_seq.shape[1]
num_ratings = inputs.shape[1]

# Ratings input branch
ratings_input = Input(shape=(num_ratings,), name="ratings_input")
ratings_dense = Dense(64, activation='relu')(ratings_input)
ratings_repeated = RepeatVector(seq_length)(ratings_dense)

# Text input branch
text_input = Input(shape=(seq_length,), name="text_input")
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=seq_length)(text_input)

# Combine both inputs
merged = Concatenate()([embedding, ratings_repeated])

# Decoder LSTM
lstm_out = LSTM(256, return_sequences=True)(merged)
output = TimeDistributed(Dense(vocab_size, activation='softmax'))(lstm_out)

# Build and compile model
model = Model(inputs=[ratings_input, text_input], outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()




In [13]:
target_seq = np.expand_dims(target_seq, -1)  # shape: (samples, timesteps, 1)

In [14]:
model.fit(
    [inputs, input_seq],  # ratings + input tokens
    target_seq,
    batch_size=32,
    epochs=10,
    validation_split=0.2
)


Epoch 1/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 66ms/step - loss: 6.5427 - val_loss: 5.9132
Epoch 2/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 69ms/step - loss: 5.8318 - val_loss: 5.6353
Epoch 3/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 69ms/step - loss: 5.5285 - val_loss: 5.1602
Epoch 4/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 66ms/step - loss: 5.0238 - val_loss: 4.8165
Epoch 5/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 69ms/step - loss: 4.7142 - val_loss: 4.6438
Epoch 6/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 66ms/step - loss: 4.5384 - val_loss: 4.5348
Epoch 7/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 66ms/step - loss: 4.3983 - val_loss: 4.4527
Epoch 8/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 66ms/step - loss: 4.2774 - val_loss: 4.3943
Epoch 9/10
[1m600/600[

<keras.src.callbacks.history.History at 0x7a8b60236090>

In [15]:
def generate_review(model, tokenizer, ratings_input, max_len=20, temperature=0.7, max_repeats=3):
    # Get start/end token IDs safely
    start_token = tokenizer.word_index.get('start')
    end_token = tokenizer.word_index.get('end')

    if start_token is None or end_token is None:
        raise ValueError("Tokenizer is missing 'start' or 'end' tokens. Make sure you added them during training.")

    input_seq = [start_token]
    generated_words = []  # To store the generated words
    generated_token_ids = set()  # To track generated token IDs and avoid repetition

    reverse_word_index = {v: k for k, v in tokenizer.word_index.items()}  # Reverse index for decoding tokens

    # Debugging: Check the reverse word index
    print(f"Reverse word index size: {len(reverse_word_index)}")
    print(f"Reverse word index sample: {list(reverse_word_index.items())[:20]}")  # Print first 20 items

    repeat_count = 0  # Track repetition of words
    for _ in range(max_len):
        padded_seq = tf.keras.preprocessing.sequence.pad_sequences([input_seq], maxlen=max_len, padding='post')

        preds = model.predict([ratings_input, padded_seq], verbose=0)

        # Apply temperature to the predictions
        preds = preds[0][len(input_seq) - 1]  # Get prediction for the next word
        preds = np.asarray(preds).astype('float64')
        preds = np.exp(preds / temperature)  # Apply temperature
        preds = preds / np.sum(preds)  # Normalize to sum to 1 (probabilities)

        # Limit predictions to valid token IDs (tokens in the word index)
        valid_tokens = list(tokenizer.word_index.values())  # Get list of all valid token IDs
        valid_preds = preds[valid_tokens]  # Get the prediction probabilities for valid tokens
        valid_preds /= np.sum(valid_preds)  # Normalize the valid tokens' probabilities

        # Sample a token from the valid predictions
        next_token_id = np.random.choice(valid_tokens, p=valid_preds)

        # Map token ID to word using reverse_word_index
        next_word = reverse_word_index.get(next_token_id, None)

        # Debugging line: Check predicted token and its word
        print(f"Predicted token ID: {next_token_id} -> Word: {next_word}")

        # If the predicted word is invalid or None, skip this iteration or stop early
        if next_word is None:
            print("Prediction is None, stopping early...")
            break

        # Avoid repetition of the same token (in case of overly repetitive predictions)
        if next_token_id in generated_token_ids:
            repeat_count += 1
        else:
            repeat_count = 0

        if repeat_count > max_repeats:  # Stop if the model repeats the same word too much
            print("Model is repeating tokens too often. Stopping early...")
            break

        # Stop at end token
        if next_token_id == end_token:
            break

        generated_words.append(next_word)
        input_seq.append(next_token_id)
        generated_token_ids.add(next_token_id)

    # Convert list of words back to a string
    generated_review = ' '.join(generated_words).strip()

    print(f"Generated review: {generated_review}")
    return generated_review


In [16]:
test_rating = np.array([[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0]])  # Example rating
print(seq_length)
generated_review = generate_review(model, tokenizer, test_rating, max_len=20)

20
Reverse word index size: 41122
Reverse word index sample: [(1, '<OOV>'), (2, 'the'), (3, 'and'), (4, 'a'), (5, 'to'), (6, 'was'), (7, 'i'), (8, 'in'), (9, 'we'), (10, 'of'), (11, 'is'), (12, 'for'), (13, 'hotel'), (14, 'it'), (15, 'room'), (16, 'at'), (17, 'but'), (18, 'were'), (19, 'on'), (20, 'with')]
Predicted token ID: 11730 -> Word: wrought
Predicted token ID: 20086 -> Word: eatt
Predicted token ID: 17257 -> Word: 632
Predicted token ID: 37004 -> Word: bog
Predicted token ID: 16661 -> Word: thom
Predicted token ID: 20665 -> Word: hula
Predicted token ID: 21073 -> Word: repose
Predicted token ID: 9907 -> Word: applying
Predicted token ID: 14456 -> Word: caviar
Predicted token ID: 2072 -> Word: considered
Predicted token ID: 36881 -> Word:  rather
Predicted token ID: 31773 -> Word: hechts
Predicted token ID: 33386 -> Word: boreing
Predicted token ID: 16210 -> Word: moaned
Predicted token ID: 1550 -> Word: 1st
Predicted token ID: 22476 -> Word: napresso
Predicted token ID: 2513 ->

In [30]:
inputs

array([[1., 2., 1., ..., 1., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [4., 5., 4., ..., 4., 0., 0.],
       ...,
       [4., 4., 5., ..., 4., 0., 0.],
       [4., 5., 4., ..., 0., 0., 0.],
       [5., 5., 5., ..., 5., 0., 0.]], dtype=float32)

In [28]:
# Text summarisation example
from transformers import TFBartForConditionalGeneration, AutoTokenizer
import tensorflow as tf
import numpy as np

def format_input(ratings):
    return f"service: {ratings[0]} cleanliness: {ratings[1]} value: {ratings[2]}"

x = [format_input(inputs for d in inputs)]

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Tokenize
input_encodings = tokenizer(x, padding="max_length", truncation=True, max_length=32, return_tensors="tf")
target_encodings = tokenizer(outputs, padding="max_length", truncation=True, max_length=32, return_tensors="tf")

IGNORE_INDEX = -100
labels = tf.where(
    target_encodings.input_ids == tokenizer.pad_token_id,
    tf.constant(IGNORE_INDEX, dtype=tf.int32),
    target_encodings.input_ids,
)

# Prepare dataset
dataset = tf.data.Dataset.from_tensor_slices((
    {
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "decoder_input_ids": target_encodings.input_ids,
        "decoder_attention_mask": target_encodings.attention_mask,
    },
    labels,
)).shuffle(10).batch(2)

# Compile model manually
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

@tf.function
def train_step(batch_inputs, batch_labels):
    with tf.GradientTape() as tape:
        outputs = model(
            input_ids=batch_inputs["input_ids"],
            attention_mask=batch_inputs["attention_mask"],
            labels=batch_labels,
        )
        loss = tf.reduce_mean(outputs.loss)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Train
EPOCHS = 100
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    for step, (batch_inputs, batch_labels) in enumerate(dataset):
        loss = train_step(batch_inputs, batch_labels)
        print(f"  Step {step + 1}: loss = {loss.numpy():.4f}")

# Test generation
test_input = "service: 2.0 cleanliness: 1.0 value: 2.5"
test_encoding = tokenizer([test_input], return_tensors="tf", truncation=True)
output_ids = model.generate(
    input_ids=test_encoding["input_ids"],
    attention_mask=test_encoding["attention_mask"],
    max_length=50
)
generated_review = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated review:", generated_review)

TypeError: 'generator' object is not subscriptable