In [7]:
import json
import os

import pandas as pd
import numpy as np

def read_json_to_df(file_name):
    data = []
    with open(file_name) as data_file:
        for line in data_file:
            # Load each line of the JSON file as a dictionary
            data.append(json.loads(line))

    # Form a Pandas DataFrame from the dictionaries
    return pd.json_normalize(data)

# Load the training and test data
raw_train_df = read_json_to_df("hotel_reviews_train.json")
raw_test_df = read_json_to_df("hotel_reviews_test.json")

ratings_columns = [col for col in raw_train_df.columns if col.startswith("ratings.")]

# Select the title, text and overall rating columns to make a new dataframe
train_df = raw_train_df[["title", "text"] + ratings_columns]
test_df = raw_test_df[["title", "text"] + ratings_columns]

# Save the English reviews to a CSV file to save time filtering when running again (NumFOCUS, Inc. 2024)
if os.path.exists("english_hotel_reviews_train.csv"):
    train_df = pd.read_csv("english_hotel_reviews_train.csv")

if os.path.exists("english_hotel_reviews_test.csv"):
    test_df = pd.read_csv("english_hotel_reviews_test.csv")

train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

inputs = train_df[ratings_columns]
outputs = train_df['text']

inputs_test = test_df[ratings_columns]
outputs_test = test_df['text']

In [2]:
def format_input(rating) -> str:
    input = "Generate a hotel review based on the following ratings:\n"
    service = rating.iloc[0]
    cleanliness = rating.iloc[1]
    overall = rating.iloc[2]
    value = rating.iloc[3]
    location = rating.iloc[4]
    sleep_quality = rating.iloc[5]
    rooms = rating.iloc[6]
    check_in_front_desk = rating.iloc[7]
    business_service = rating.iloc[8]

    if service != 0:
      input += f"- service: {service}\n"
    if cleanliness != 0:
      input += f"- cleanliness: {cleanliness}\n"
    if overall != 0:
      input += f"- overall: {overall}\n"
    if value != 0:
      input += f"- value: {value}\n"
    if location != 0:
      input += f"- location: {location}\n"
    if sleep_quality != 0:
      input += f"- sleep quality: {sleep_quality}\n"
    if rooms != 0:
      input += f"- rooms: {rooms}\n"
    if check_in_front_desk != 0:
      input += f"- check in: {check_in_front_desk}\n"
    if business_service != 0:
      input += f"- business service: {business_service}\n"

    input += "Review: "

    return input

In [15]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

from datasets import Dataset

dataset = Dataset.from_dict({"ratings": inputs.apply(format_input, axis=1).to_list(), "review_text": outputs.to_list()})

dataset = dataset.select(range(100))

# Preprocessing function for dataset
def preprocess(examples):
    model_inputs = tokenizer(examples["ratings"], truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(examples["review_text"], truncation=True, padding="max_length", max_length=512)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

tokenized = dataset.map(preprocess, batched=True)
training_args = TrainingArguments(output_dir='./results', per_device_train_batch_size=4, num_train_epochs=10)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized)
trainer.train()

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=250, training_loss=2.229955322265625, metrics={'train_runtime': 64.5607, 'train_samples_per_second': 15.489, 'train_steps_per_second': 3.872, 'total_flos': 33835450368000.0, 'train_loss': 2.229955322265625, 'epoch': 10.0})

In [17]:
def generate_review(ratings_str):
    device = model.device
    inputs = tokenizer(ratings_str, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_length=512)
    return tokenizer.decode(output[0], skip_special_tokens=True)


test_dataset = Dataset.from_dict({"ratings": inputs_test.apply(format_input, axis=1).to_list(), "review_text": outputs_test.to_list()})

test_dataset = test_dataset.select(range(100))  # Select first 100 rows


# Example
for example in test_dataset.select(range(5)):
    print("INPUT RATINGS:", example["ratings"])
    print("GENERATED REVIEW:", generate_review(example["ratings"]))
    print("REFERENCE REVIEW:", example["review_text"])
    print("="*80)

INPUT RATINGS: Generate a hotel review based on the following ratings:
- service: 5.0
- cleanliness: 5.0
- overall: 5.0
- value: 5.0
- location: 5.0
- sleep quality: 5.0
- rooms: 5.0
Review: 
GENERATED REVIEW: Hotel is a great place to stay in the hotel. The staff were very friendly and helpful and helpful. The room was clean and comfortable. The room was very clean and comfortable. The room was very clean and clean. The staff were very helpful and the room was clean and comfortable, clean and the room was very comfortable, the room was very clean, the room was very clean and the location is very close to the hotel. The location is perfect. The hotel is a short walk to the hotel was excellent. The hotel is very close to the city center. The hotel is a short walk from the u.k. - - The staff were very helpful and friendly. The location is very close to the city center and the a bit of a bit of a bit of the hotel. The location was a little bit - the room was very comfortable. The room was