In [1]:
import os
import json
from pathlib import Path

# Folder containing the JSON files: the directory the notebook is run from.
# (The former placeholder `folder_path = ''` was a dead store, overwritten
# immediately, so it has been dropped.)
folder_path = os.getcwd()

# Raw synthetic-dialogue files to merge, one list for both domains.
file_names = [
    'processed_synthetic_dataset_hotel_iterative_ids_40.json',
    'processed_synthetic_dataset_hotel_iterative_ids_big_2.json',
    'processed_synthetic_dataset_hotel_iterative_ids_big.json',
    'processed_synthetic_dataset_train_iterative_1.json',
    'processed_synthetic_dataset_train_iterative_2.json',
    'processed_synthetic_dataset_train_iterative_3.json'
]

# Split files into two categories based on a substring of their names.
train_files = [fn for fn in file_names if 'train' in fn]
hotel_files = [fn for fn in file_names if 'hotel' in fn]

# Function to combine JSON files
def combine_json_files(file_list, output_file, base_dir=None):
    """Concatenate several JSON list files into a single JSON file.

    Args:
        file_list: File names, relative to the base directory. Each file's
            top-level JSON value is appended to the result with
            ``list.extend``, in the order given.
        output_file: Name of the file to write, relative to the base directory.
        base_dir: Directory holding the inputs and the output. When omitted,
            the notebook-level ``folder_path`` is used, so existing
            two-argument calls behave exactly as before.

    Returns:
        None. The combined list is written to ``output_file`` with indent=4.
    """
    # The global is only looked up when the caller does not supply a directory,
    # which lets the function be used (and tested) without notebook state.
    root = Path(folder_path if base_dir is None else base_dir)

    combined_data = []
    for file_name in file_list:
        with open(root / file_name, 'r', encoding='utf-8') as f:
            combined_data.extend(json.load(f))

    with open(root / output_file, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, indent=4)

# Build one combined file per domain from the name-based groups above.
# combine_json_files resolves both the inputs and the output against folder_path.
combine_json_files(train_files, 'combined_train.json')
combine_json_files(hotel_files, 'combined_hotel.json')

print("Files combined successfully!")


Files combined successfully!


In [3]:
import json
import random

def split_data(file_path, train_ratio=0.8, validation_ratio=0.05, test_ratio=0.15,
               output_prefix='train', seed=None):
    """Shuffle a JSON list of dialogues and write train/validation/test splits.

    Three files are written to the current working directory:
    ``<output_prefix>_train_set.json``, ``<output_prefix>_validation.json`` and
    ``<output_prefix>_test_set.json``.

    Args:
        file_path: Path of a JSON file whose top-level value is a list.
        train_ratio: Fraction of items placed in the train split (rounded down).
        validation_ratio: Fraction placed in the validation split (rounded down).
        test_ratio: Accepted for backward compatibility but not used: the test
            split is always whatever remains after train and validation.
        output_prefix: Prefix of the three output file names. The default
            ``'train'`` reproduces the original hard-coded names; pass e.g.
            ``'hotel'`` so a second dataset does not overwrite the first.
        seed: Optional seed for a reproducible shuffle. ``None`` keeps the
            original behaviour of shuffling with the global ``random`` state.

    Returns:
        None.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Shuffle the data to ensure randomness; a private generator is used when
    # a seed is given so the global random state is left untouched.
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    # Calculate split indices
    total_dialogues = len(data)
    train_end = int(total_dialogues * train_ratio)
    validation_end = train_end + int(total_dialogues * validation_ratio)

    # Output name -> slice. Note the validation file has no "_set" suffix,
    # exactly as in the original naming.
    splits = {
        f'{output_prefix}_train_set.json': data[:train_end],
        f'{output_prefix}_validation.json': data[train_end:validation_end],
        f'{output_prefix}_test_set.json': data[validation_end:],
    }

    # Save the splits to new JSON files
    for output_name, subset in splits.items():
        with open(output_name, 'w', encoding='utf-8') as file:
            json.dump(subset, file, ensure_ascii=False, indent=4)

# Split the combined train-domain dialogues using the default ratios.
# NOTE(review): only combined_train.json is split here, yet a later cell reads
# 'hotel_test_set.json' - presumably this cell was re-run with edited names; confirm.
split_data('combined_train.json')



In [7]:
import json

def combine_files(output_file_name, *input_files):
    """Merge the JSON lists stored in ``input_files`` into one output file.

    Args:
        output_file_name: Path of the JSON file to create or overwrite.
        *input_files: Paths of JSON files; their top-level values are
            concatenated with ``list.extend`` in argument order.

    Returns:
        None. The merged list is written as UTF-8 with ``ensure_ascii=False``
        and a four-space indent.
    """
    merged = []

    for source_path in input_files:
        with open(source_path, 'r', encoding='utf-8') as source:
            merged.extend(json.load(source))

    with open(output_file_name, 'w', encoding='utf-8') as sink:
        json.dump(merged, sink, ensure_ascii=False, indent=4)

# Merge the per-domain test splits (train domain first, then hotel) into one test set.
combine_files('synth_test_set.json', 'train_test_set.json',"hotel_test_set.json")


In [8]:
import json

def extract_unique_values(file_path):
    """Print the distinct services, active intents and slot keys in a dataset.

    The file is read as a list of entries; every frame of every turn of every
    scenario is visited. ``service`` is read from each frame unconditionally,
    whereas ``state.active_intent`` and ``state.slot_values`` are only read
    when present.

    Args:
        file_path: Path of the JSON dataset file.

    Returns:
        None. The three sets are printed to stdout.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    services, intents, slot_keys = set(), set(), set()

    # Flatten the entry -> scenario -> turn -> frame nesting into one stream.
    all_frames = (
        frame
        for entry in entries
        for scenario in entry['scenarios']
        for turn in scenario['turns']
        for frame in turn['frames']
    )

    for frame in all_frames:
        services.add(frame['service'])
        if 'state' not in frame:
            continue
        state = frame['state']
        if 'active_intent' in state:
            intents.add(state['active_intent'])
        if 'slot_values' in state:
            # Iterating the slot mapping yields its keys.
            slot_keys.update(state['slot_values'])

    print("Unique services:", services)
    print("Unique active intents:", intents)
    print("Unique slot keys:", slot_keys)

# Survey the raw label vocabulary of the synthetic train set; the printed sets
# show which service / intent / slot names occur in the data.
extract_unique_values('synth_train_set.json')


Unique services: {'train_amenities', 'train_schedule', 'train_service', 'hotel_service', 'travel', 'train_station', 'station_facilities', 'Concierge_Service', 'train_station_facilities', 'hotel_reservation', 'train', 'hotel_1', 'rail_service', 'tourist_information', 'hotel'}
Unique active intents: {'', 'Check-in for reservation', 'Inquire menu prices', 'Provide information on train delays', 'Find out the next stop', 'Inquire about connecting trains', 'Buy a ticket for the next stop', 'Inquire about breakfast options', 'Inform about the next stop and delays', 'Inquire about journey details', 'Provide information on local eateries', 'Provide detailed information on attractions and confirm additional room amenities', 'Assist the traveler with the schedule', 'Find out the next station stop', 'Find out train departure time', 'Assist passengers with connections', 'Inform about seat selection process', 'get_train_info', "Find out the train's next stop", 'Know about hotel facilities', 'Arrange

In [13]:
# Mapping from every raw service label to its canonical service name.
# It is written grouped by target for readability and flattened into the
# alias -> canonical form that the lookup code expects. "remove" is a
# sentinel meaning "drop frames with this service" rather than a real name.
service_replacements = {
    alias: canonical
    for canonical, aliases in (
        ("train", (
            "train_amenities",
            "train_schedule",
            "train_service",
            "travel",
            "train_station",
            "station_facilities",
            "train_station_facilities",
            "train",
            "rail_service",
        )),
        ("hotel", (
            "hotel_service",
            "Concierge_Service",
            "hotel_reservation",
            "hotel_1",
            "hotel",
        )),
        ("remove", (
            "tourist_information",  # To indicate removal of entries
        )),
    )
    for alias in aliases
}

# Function to update service values based on the mapping
def update_service_values(data, replacements):
    """Normalise ``frame['service']`` in place and drop frames marked "remove".

    Every scenario of every entry is processed. Previously only
    ``entry['scenarios'][0]`` was touched, which raised IndexError on an empty
    scenario list and left any further scenarios un-normalised.

    Args:
        data: List of entries, each holding a ``'scenarios'`` list whose items
            have ``'turns'``; each turn has a ``'frames'`` list of dicts with a
            ``'service'`` key.
        replacements: Mapping raw service -> canonical service. The special
            value ``"remove"`` deletes the frame. Services that are not in the
            mapping are left unchanged.

    Returns:
        None. ``data`` is modified in place.
    """
    for entry in data:
        for scenario in entry['scenarios']:
            for turn in scenario['turns']:
                kept_frames = []
                for frame in turn['frames']:
                    service = frame['service']
                    if service in replacements:
                        target = replacements[service]
                        if target == "remove":
                            continue
                        frame['service'] = target
                    kept_frames.append(frame)
                # Slice-assign so the existing list object is updated in place
                # (other references to turn['frames'] stay valid) without the
                # equality-based, quadratic list.remove() calls.
                turn['frames'][:] = kept_frames

# Normalise the service labels of the validation split:
# load the file, rewrite the services in place, then save under a new name.
file_path="synth_validation_set.json"

with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Mutates json_data in place; nothing is returned.
update_service_values(json_data, service_replacements)

# Save the modified JSON data to a new file
updated_file_path = 'updated_service_synth_validation_set.json'
# NOTE(review): opened without encoding='utf-8', unlike the other writers in this notebook.
with open(updated_file_path, 'w') as file:
    json.dump(json_data, file, indent=4)

In [None]:
import json

def remove_duplicates_and_empty_utterances(input_file, output_file):
    """Drop blank-utterance turns and repeated ``turn_id``s from a dataset file.

    Within each scenario the first turn seen for a given ``turn_id`` is kept,
    later ones with the same id are discarded, and turns whose utterance is
    empty or whitespace-only are removed. Turn order is preserved.

    The dataset files produced earlier in this notebook hold a *list* of
    entries, each with its own ``'scenarios'`` list, so indexing the loaded
    value with ``['scenarios']`` raised TypeError. Both layouts are now
    accepted: a list of entries, or a single entry dict.

    Args:
        input_file: Path of the JSON file to clean.
        output_file: Path of the cleaned JSON file to write.

    Returns:
        None.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Treat a lone entry dict as a one-element list so one loop serves both shapes.
    entries = data if isinstance(data, list) else [data]

    for entry in entries:
        for scenario in entry['scenarios']:
            unique_turns = {}
            # Collect turns, skipping duplicates and empty/blank utterances.
            # Dicts keep insertion order, so the original turn order survives.
            for turn in scenario['turns']:
                if turn['utterance'].strip() and turn['turn_id'] not in unique_turns:
                    unique_turns[turn['turn_id']] = turn

            # Replace original turns with filtered ones
            scenario['turns'] = list(unique_turns.values())

    # Write the updated data (same top-level shape as the input) to a new file
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

# Clean the validation split: the input is the file written by the
# service-normalisation cell; the result goes to a separate "cleaned_" file.
input_path = 'updated_service_synth_validation_set.json'
output_path = 'cleaned_service_synth_validation_set.json'
remove_duplicates_and_empty_utterances(input_path, output_path)

In [6]:
def process_data(input_file,output_path):
    """Remove duplicate-``turn_id`` and blank-utterance turns from a dataset file.

    The input is a JSON list of entries, each with a ``'scenarios'`` list.
    For every scenario, the first turn seen for each ``turn_id`` is kept and
    turns whose utterance is empty or whitespace-only are dropped; order is
    preserved.

    All scenarios of an entry are processed. Previously only
    ``entry['scenarios'][0]`` was, which raised IndexError for an entry with
    no scenarios and left any additional scenarios uncleaned.

    Args:
        input_file: Path of the JSON file to clean.
        output_path: Path of the cleaned JSON file to write.

    Returns:
        None.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        input_data = json.load(file)

    for entry in input_data:
        for scenario in entry['scenarios']:
            seen_turn_ids = set()
            filtered_turns = []

            for turn in scenario['turns']:
                # A blank turn does not "claim" its id: a later non-blank turn
                # with the same turn_id is still kept.
                if turn['utterance'].strip() and turn['turn_id'] not in seen_turn_ids:
                    filtered_turns.append(turn)
                    seen_turn_ids.add(turn['turn_id'])

            scenario['turns'] = filtered_turns

    # Save the processed data to a new JSON file
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(input_data, file, indent=4, ensure_ascii=False)

# Clean the service-normalised train split and write it to a "cleaned_" file.
input_path = 'updated_service_synth_train_set.json'
output_path = 'cleaned_service_synth_train_set.json'
process_data(input_path,output_path)



In [7]:
import json

def adjust_speaker(input_file, output_file):
    """Relabel even-numbered turns as USER in dialogues that open with SYSTEM.

    For each scenario whose first turn has ``speaker == "SYSTEM"``, every turn
    with an even ``turn_id`` gets ``speaker = "USER"``. Odd-numbered turns and
    scenarios opening with any other speaker are left untouched.

    Scenarios with an empty ``'turns'`` list are skipped; previously
    ``turns[0]`` raised IndexError for them. All scenarios of an entry are
    processed, not only the first one.

    Args:
        input_file: Path of a JSON file holding a list of entries, each with a
            ``'scenarios'`` list.
        output_file: Path of the adjusted JSON file to write.

    Returns:
        None.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for entry in data:
        for scenario in entry['scenarios']:
            turns = scenario['turns']
            # Nothing to inspect in an empty dialogue; only SYSTEM-first
            # dialogues are rewritten.
            if not turns or turns[0]['speaker'] != "SYSTEM":
                continue
            for turn in turns:
                if turn['turn_id'] % 2 == 0:  # even turn_id
                    turn['speaker'] = "USER"

    # Write the modified data to a new file
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

# Fix speaker labels on the cleaned train split and write the result to a new file.
input_path = 'cleaned_service_synth_train_set.json'
output_path = 'adjusted_speaker_service_synth_train_set.json'
adjust_speaker(input_path, output_path)


In [12]:
import json

def load_slot_replacements(file_path):
    """Read a ``key: value`` text file into a dict of slot-name replacements.

    Each line is stripped and split at the first ``': '`` (colon followed by a
    space). Lines without that separator are ignored. Keys and values are
    stripped of surrounding whitespace; a key that appears more than once
    keeps its last value.

    Args:
        file_path: Path of the UTF-8 text file.

    Returns:
        dict mapping original slot key -> replacement slot key.
    """
    mapping = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # partition() leaves the separator empty when ': ' is absent.
            old_key, separator, new_key = raw_line.strip().partition(': ')
            if separator:
                mapping[old_key.strip()] = new_key.strip()
    return mapping


def update_slot_keys(data, slot_replacements):
    """Rename the keys of every frame's ``state['slot_values']`` in place.

    Frames lacking ``'state'`` or ``'slot_values'`` are skipped instead of
    raising KeyError. All scenarios of an entry are processed; previously only
    ``entry['scenarios'][0]`` was, which also raised IndexError on an empty
    scenario list.

    Args:
        data: List of entries, each with a ``'scenarios'`` list whose items
            have ``'turns'``; each turn has a ``'frames'`` list.
        slot_replacements: Mapping old slot key -> new slot key. Keys that are
            not in the mapping are kept as they are. If several old keys map
            to the same new key, the last one in iteration order wins.

    Returns:
        None. ``data`` is modified in place.
    """
    for entry in data:
        for scenario in entry['scenarios']:
            for turn in scenario['turns']:
                for frame in turn['frames']:
                    if 'state' not in frame or 'slot_values' not in frame['state']:
                        continue
                    state = frame['state']
                    state['slot_values'] = {
                        slot_replacements.get(key, key): value
                        for key, value in state['slot_values'].items()
                    }

# Normalise slot names of the cleaned test split using the replacement
# table stored in slot_dictionary.txt ("old: new" per line).
slot_dict_file = "slot_dictionary.txt"
json_file_path = "cleaned_service_synth_test_set.json"
updated_json_file_path = 'slot_normalised_synth_test_set.json'

# Load slot replacements from file
slot_replacements = load_slot_replacements(slot_dict_file)

# Load JSON data
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Apply the function to the JSON data (json_data is modified in place)
update_slot_keys(json_data, slot_replacements)

# Save the modified JSON data to a new file
with open(updated_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(json_data, file, indent=4)


In [15]:
import json

def load_intent_replacements(file_path):
    """Parse an ``old intent: new intent`` text file into a dict.

    Every line is stripped and cut at its first ``': '``; lines that do not
    contain that separator contribute nothing. Both sides are stripped again
    before being stored, and a repeated key is overwritten by its later value.

    Args:
        file_path: Path of the UTF-8 text file.

    Returns:
        dict mapping original intent -> replacement intent.
    """
    replacements = {}
    with open(file_path, 'r', encoding='utf-8') as source:
        for text_line in source:
            original, found, replacement = text_line.strip().partition(': ')
            if not found:
                # No "key: value" pair on this line.
                continue
            replacements[original.strip()] = replacement.strip()
    return replacements


def update_intents(data, intent_replacements):
    """Replace ``state['active_intent']`` values in place using a mapping.

    Frames lacking ``'state'`` or ``'active_intent'`` are skipped instead of
    raising KeyError. All scenarios of an entry are processed; previously only
    ``entry['scenarios'][0]`` was, which also raised IndexError on an empty
    scenario list.

    Args:
        data: List of entries, each with a ``'scenarios'`` list whose items
            have ``'turns'``; each turn has a ``'frames'`` list.
        intent_replacements: Mapping original intent -> normalised intent.
            Intents that are not in the mapping are left unchanged.

    Returns:
        None. ``data`` is modified in place.
    """
    for entry in data:
        for scenario in entry['scenarios']:
            for turn in scenario['turns']:
                for frame in turn['frames']:
                    if 'state' not in frame or 'active_intent' not in frame['state']:
                        continue
                    state = frame['state']
                    current_intent = state['active_intent']
                    # The membership test already guarantees the key exists,
                    # so a plain lookup replaces the redundant .get() fallback.
                    if current_intent in intent_replacements:
                        state['active_intent'] = intent_replacements[current_intent]

# Normalise active intents of the slot-normalised train split using the
# replacement table stored in intent_dictionary.txt ("old: new" per line).
intent_dict_file = "intent_dictionary.txt"
json_file_path = "slot_normalised_synth_train_set.json"
# NOTE(review): this output name ("..._train_test_set") does not match the
# 'normalised_intent_synth_train_set.json' that the slot-prefixing cell reads
# - possibly a typo; confirm which name the pipeline should use.
updated_json_file_path = 'normalised_intent_train_test_set.json'

# Load intent replacements from file
intent_replacements = load_intent_replacements(intent_dict_file)

# Load JSON data
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Apply the function to the JSON data (json_data is modified in place)
update_intents(json_data, intent_replacements)

# Save the modified JSON data to a new file
with open(updated_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(json_data, file, indent=4)


In [4]:
import json

def update_slot_values(json_file, output_file):
    """Prefix every slot key with its frame's service name and save the result.

    For each frame, ``state['slot_values']`` is rebuilt so that a key ``k``
    becomes ``"<service>-k"``; values are kept unchanged.

    Frames lacking ``'state'`` or ``'slot_values'`` are skipped instead of
    raising KeyError. All scenarios of an entry are processed; previously only
    ``entry['scenarios'][0]`` was, which also raised IndexError on an empty
    scenario list.

    Args:
        json_file: Path of a JSON file holding a list of entries, each with a
            ``'scenarios'`` list.
        output_file: Path of the JSON file to write.

    Returns:
        None.
    """
    # Load JSON data
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for entry in data:
        for scenario in entry['scenarios']:
            for turn in scenario['turns']:
                for frame in turn['frames']:
                    if 'state' not in frame or 'slot_values' not in frame['state']:
                        continue
                    service = frame['service']
                    state = frame['state']
                    # Build a fresh dict: keys cannot be renamed while iterating.
                    state['slot_values'] = {
                        f"{service}-{key}": value
                        for key, value in state['slot_values'].items()
                    }

    # Save the modified data to a new JSON file
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

# Input: the intent-normalised train split; output: the same data with
# service-prefixed slot keys, written to a separate file.
json_file = 'normalised_intent_synth_train_set.json'
output_file = 'normalised_intent_train_slotfixed_set.json'

# Prefix the slot keys and write the result
update_slot_values(json_file, output_file)


In [3]:
import json
def count_unique_dialogue_ids(data):
    """Return how many distinct ``dialogue_id`` values occur in ``data``.

    Args:
        data: Iterable of entries, each with a ``"scenarios"`` list whose
            items carry a ``"dialogue_id"``.

    Returns:
        int: Number of distinct dialogue ids across all scenarios of all entries.
    """
    return len({
        scenario["dialogue_id"]
        for item in data
        for scenario in item["scenarios"]
    })

# Sanity check on the combined hotel data: how many distinct dialogues does it hold?
with open('combined_hotel.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
# Count unique dialogue IDs
unique_dialogue_count = count_unique_dialogue_ids(data)
print(f"Unique dialogue IDs count: {unique_dialogue_count}")

Unique dialogue IDs count: 500
