Import the CSV file containing all of the entries as a data frame to inspect it.

In [None]:
import os
import pandas as pd
import pickle
import numpy as np

In [None]:
# `sys` is not brought in by the notebook's import cell, so import it here;
# without it the sys.path call below raises NameError on a fresh kernel.
import sys

# Directory expected to contain the dataset_utils module; put it on the module search path.
utils_dir = os.path.abspath('/path/to/LLM-Reconfiguration/Dataset-Notebooks/utils')
# Guard keeps re-runs of this cell from appending the same entry over and over.
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

# Star import: exposes every public name of dataset_utils in the notebook namespace.
# The helper functions called in later cells are not defined in this notebook, so they
# presumably come from here — check dataset_utils when tracing a helper.
from dataset_utils import *

In [None]:
# Location of the raw samples CSV (samples_136bus.csv).
file_path = '/path/to/LLM-Reconfiguration/Dataset-Notebooks/csv_files/samples_136bus.csv'  # Replace with your file path

# Read the CSV into a DataFrame, then preview its first five rows as plain text.
data = pd.read_csv(filepath_or_buffer=file_path)
print(data.head(n=5))

Convert the data frame into a list of dictionaries, where each dictionary has a key for every column of the data frame together with its corresponding value.

In [None]:
# Turn the CSV DataFrame into a list of dictionaries (see the markdown above).
# convert_to_dict is a project helper that is not defined in this notebook.
data_dict_list = convert_to_dict(data)

# Display the first few dictionaries
# (left commented out; uncomment to print the first entry for a quick visual check)
# for entry in data_dict_list[:1]:
#     print(entry)

Save the list of dictionaries in a pickle (pkl) file so that we can load and process it faster, more easily, and natively in Python. We avoided JSON due to serialization issues with complex numbers.

The reason we first create a pickle file and then create the dataset as a text file, even though we will be using purely text prompts, is to preserve the numeric data format in one of the files in case we want to calculate any power laws in the future.

In [None]:
# Save the list of dictionaries to a Pickle file
# Destination .pkl path; the same path is read back by the load_pickle cell further down.
output_file_path = '/path/to/LLM-Reconfiguration/Dataset-Notebooks/pkl_files/samples_136bus.pkl'
# dict_to_pickle is a project helper not defined in this notebook — presumably it
# serializes data_dict_list to output_file_path; verify in dataset_utils.
dict_to_pickle(data_dict_list, output_file_path)


We then iterate through each dictionary in the list and create a text prompt structured as graph, connectivity, task description, and input statistics, using the various keys in the dictionary. We then save the result to a file so we don't have to recreate the pickle and text files every time we want to fine-tune the model.

In [None]:
# Load the dataset
# Same .pkl path that the earlier save cell wrote to.
file_path = '/path/to/LLM-Reconfiguration/Dataset-Notebooks/pkl_files/samples_136bus.pkl'

# NOTE: this rebinds `data`. Earlier it held the CSV DataFrame; from here on it is
# whatever load_pickle returns (indexed with [0] below, so a sequence-like object).
data = load_pickle(file_path)

# Display the structure of the dataset
print(type(data))
print(len(data))
print(data[0])  # Display the first entry for inspection
    
# Instruction text for the reconfiguration task. It is passed to
# create_task_descriptions in a later cell, so it presumably ends up in the generated
# prompts — treat the wording (including spelling and line breaks) as part of the dataset.
task_description = """ 
Find the optimal configuration, i.e. the optimal connectivity and optimal open lines of these buses and lines 
so as to ensure energy distribution to the whole system while minimizing the power loss. The number given for the busses indicates the 
total number of busses starting from 1 going all the way to the given number in increments of 1. Make sure the Open Lines 
in the output include ONLY Lines that are given in the input and that you take into account their given properties. 
The Available Lines WITHOUT the Open Lines should form a network graph that is a single graph, i.e. no subgraphs or 
multiple connected components lists and the graph should NOT contain any cycles i.e. the number of available lines WITHOUT 
the number of open lines should EQUAL the number of busses minus one. If you predict the system loss and the value is greater 
than the current system loss, DO NOT reconfigure the network and return the same configuration as in the input. ONLY 
return a reconfiguration if and only if the system loss you predict is less than the original one since that is the ultimate goal.
The output format should be strictly as follows Output: Open Lines=[List all predicted open lines as a list of tuples, i.e. pairs in 
brackets separated by a comma], Node Voltages=[List the updated node voltages as a comma separated list], System Loss=predicted system loss.
Do not output anything that is not of this exact format. Stop the output the moment you finish with the system loss, i.e. after you give the 
system loss to 3-4 decimal points print an eos token and stop generating more output. Adhere to the format 100%.
"""
            
# Generate inputs
# (create_inputs / create_outputs are project helpers not defined in this notebook)
inputs = create_inputs(data)

# Generate outputs
outputs = create_outputs(data)

# Display how many inputs were generated
print(len(inputs))

# Display how many outputs were generated
print(len(outputs))

input_path = '/path/to/LLM-Reconfiguration/Dataset-Notebooks/txt_files/inputs.txt'
# Save the inputs to a text file
save_to_txt(input_path, inputs)
        
output_path = '/path/to/LLM-Reconfiguration/Dataset-Notebooks/txt_files/outputs.txt'
# Save the outputs to a text file
save_to_txt(output_path, outputs)

We use a helper function to load the prompts as a list so we can iterate through each prompt in the list during training.

In [None]:
# Load the inputs
# (same inputs.txt path the previous cell saved to; this rebinds `inputs` to what
# load_entries returns)
inputs = load_entries('/path/to/LLM-Reconfiguration/Dataset-Notebooks/txt_files/inputs.txt')

# Load the outputs
# (same outputs.txt path the previous cell saved to; this rebinds `outputs`)
outputs = load_entries('/path/to/LLM-Reconfiguration/Dataset-Notebooks/txt_files/outputs.txt')

# Display the first input as a sample
print(inputs[0])

# Display the first output as a sample
print(outputs[0])

In [None]:
# Length of the first loaded input entry, then of the first loaded output entry.
print(len(inputs[0]))
print(len(outputs[0]))

In [None]:
# Number of loaded input entries, then number of loaded output entries.
print(len(inputs))
print(len(outputs))

In [None]:
# Build task_descriptions from the single task text and the loaded inputs.
# create_task_descriptions is a project helper not defined here — presumably it yields
# one description per input; TODO confirm against the printed length.
task_descriptions = create_task_descriptions(task_description, inputs)
print(len(task_descriptions))

In [None]:
# Show the task description text defined earlier in the notebook.
print(task_description)

In [None]:
# Build the prompts from the task descriptions and the inputs (create_prompts is a
# project helper not defined in this notebook), then print how many were produced.
prompts = create_prompts(task_descriptions, inputs)
print(len(prompts))

In [None]:
# Show the first prompt as a sample.
print(prompts[0])

In [None]:
# Assemble the training table from the four collections built above. The column
# layout is decided inside create_train_df (project helper, not visible here).
train_df = create_train_df(task_descriptions, inputs, prompts, outputs)

In [None]:
# The return value is discarded, so any effect must come from generate_random_split
# changing train_df in place or from another side effect.
# NOTE(review): confirm that, and confirm whether the split is seeded so that
# Restart & Run All reproduces the same assignment.
generate_random_split(train_df)

In [None]:
# Preview the first five rows of the training table.
train_df.head(5)

In [None]:
# Write the training table to CSV; the index is written as a column labelled 'id'.
train_df.to_csv(
    path_or_buf='/path/to/LLM-Reconfiguration/Dataset-Notebooks/train_files/train_136_nodes.csv',
    index_label='id',
)

In [None]:
#Used to modify columns of the dataset

# file_path = '/path/to/LLM-Reconfiguration/Dataset-Notebooks/samples_69bus.csv'
# set_value = '69'
# df_column = 'buses'
# output_file_path = '/path/to/LLM-Reconfiguration/Dataset-Notebooks/samples_69bus_clean.csv'


# modify_and_save(file_path, set_value, df_column, output_file_path)

In [None]:
# Per-network training CSVs to merge (33-, 69- and 84-node files by name).
# NOTE(review): train_136_nodes.csv, written earlier in this notebook, is not in this
# list — confirm that is intentional.
file_paths = [
    '/path/to//LLM-Reconfiguration/Dataset-Notebooks/train_files/train_33_nodes.csv',
    '/path/to/LLM-Reconfiguration/Dataset-Notebooks/train_files/train_69_nodes.csv',
    '/path/to/LLM-Reconfiguration/Dataset-Notebooks/train_files/train_84_nodes.csv',
]

# Destination for the combined file.
output_file_path = '/path/to/LLM-Reconfiguration/Dataset-Notebooks/train_files/train_33_69_84_nodes.csv'

# combine_csv_files is a project helper not defined in this notebook — presumably it
# concatenates the listed CSVs into output_file_path; verify in dataset_utils.
combine_csv_files(file_paths, output_file_path)

In [None]:
# Get lengths of prompts and outputs

# NOTE(review): this path has no 'train_files/' directory, unlike the train_84_nodes.csv
# path used in the combine cell — confirm which location is correct.
file_path = '/path/to//LLM-Reconfiguration/Dataset-Notebooks/train_84_nodes.csv'  # Replace with your file path
# NOTE(review): `data` is loaded here but never used below.
data = pd.read_csv(file_path)

# NOTE(review): these lengths come from the `prompts` / `outputs` variables already in
# the kernel from earlier cells, not from the CSV just loaded. If the intent is to
# measure the loaded file, read the values from `data` instead (column names are
# defined by create_train_df, which is not visible here).
print(len(prompts[0]))
print(len(outputs[0]))