# Finetuning GPT 3.5 for solving OR problems

In [None]:
import json
import tiktoken 
import numpy as np
from collections import defaultdict
from openai import OpenAI

In [None]:
FULL_TRAIN_DATA_PATH = '/workspaces/da-atlante-engine/atlante/data/sources/llm-for-or/task3_train_public.json'

In [None]:
system_prompt = """You are an expert in Operations Research, Python programming language, PuLP library and JSON file format.
I will give you a description in natural language of an optimization problem <problem_instance> and you just have to write a Python script using PuLP to solve that specific instance of the problem.
Don't write anything before and after the code, it will be simple for me to parse your answer. 
Please remember to solve the problem with status = problem.solve(PULP_CBC_CMD(msg=0)) in order to suppress solver's log.
The code must only print a single string to stdout, formatted in JSON, which contains the answers to the question I will give you in input. 
Use the <output_keys_dict> to format the stdout and use them as keys of the JSON output."""


### Load data

In [140]:
f = open(FULL_TRAIN_DATA_PATH)
train_data = json.load(f)


In [116]:
train_data[0:2]

[{'id': 0,
  'question': 'A company is organizing a team-building event and needs to assign participants to different activities. They have a total of 100 employees. Activity A requires 5 employees a team, activity B requires 3 employees a team, and activity C requires 7 employees a team. The company has a total of 100 employees available for the event. The company has a limitation on the number of teams in activity B, which cannot exceed 20. The company wants to maximize participation teams and decides to allocate different weights to each activity: activity A has a weight of 3, activity B has a weight of 2, and activity C has a weight of 4. The objective is to maximize the total participation weighted by the assigned weights.',
  'code': '# Import PuLP library\nfrom pulp import *\n\n# Define the decision variables\nnum_participants_A = LpVariable("NumParticipantsA", lowBound=0, cat=\'Integer\') # number of participants in activity A\nnum_participants_B = LpVariable("NumParticipantsB"

In [142]:
data_formatted = []
for ex in train_data:
    user_content = "<problem_instance>: {} \n <output_keys_dict> {}".format(ex['question'], ex['results'].keys())
    formatted_ex = {"messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_content}, {"role": "assistant", "content": ex['code']}]}
    data_formatted.append(formatted_ex)
    

In [143]:
formatted_ex

{'messages': [{'role': 'system',
   'content': "You are an expert in Operations Research, Python programming language, PuLP library and JSON file format.\nI will give you a description in natural language of an optimization problem <problem_instance> and you just have to write a Python script using PuLP to solve that specific instance of the problem.\nDon't write anything before and after the code, it will be simple for me to parse your answer.\nThe code must only print a single string to stdout, formatted in JSON, which contains the answers to the question I will give you in input. \nUse the <output_keys_dict> to format the stdout and use them as keys of the JSON output."},
  {'role': 'user',
   'content': "<problem_instance>: A body builder buys pre prepared meals, a turkey dinner and a tuna salad sandwich. The turkey dinner contains 20 grams of protein, 30 grams of carbs, and 12 grams of fat. The tuna salad sandwich contains 18 grams of protein, 25 grams of carbs, and 8 grams of fat

In [144]:
# Example to remove
ex_to_delate = [0,2,539, 634, 660, 661, 684, 687, 720, 723, 741, 768, 775, 793, 823, 831, 860, 910, 928, 940, 948, 999]
new_list = [j for i, j in enumerate(data_formatted) if i not in ex_to_delate]

In [146]:
data_formatted = new_list

In [148]:
full_data_path = '/workspaces/da-atlante-engine/atlante/data/sources/llm-for-or/full_data.jsonl'

with open(full_data_path, 'w') as f:
    for item in data_formatted:
        f.write(json.dumps(item) + "\n")

In [149]:
# Load the dataset
with open(full_data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 978
First example:
{'role': 'system', 'content': "You are an expert in Operations Research, Python programming language, PuLP library and JSON file format.\nI will give you a description in natural language of an optimization problem <problem_instance> and you just have to write a Python script using PuLP to solve that specific instance of the problem.\nDon't write anything before and after the code, it will be simple for me to parse your answer.\nThe code must only print a single string to stdout, formatted in JSON, which contains the answers to the question I will give you in input. \nUse the <output_keys_dict> to format the stdout and use them as keys of the JSON output."}
{'role': 'user', 'content': "<problem_instance>: A company is organizing a team-building event and needs to assign participants to different activities. They have a total of 100 employees. The company has a total of 100 employees available for the event. Each participant in activity A consumes 2 unit

In [150]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [151]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [152]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 445, 1096
mean / median: 612.3098159509202, 595.5
p5 / p95: 516.0, 738.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 201, 562
mean / median: 273.5838445807771, 263.0
p5 / p95: 225.7, 330.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [153]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~598839 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~1796517 tokens


## Train and test splitting

In [155]:
import random
random.seed(42)

In [156]:
random.shuffle(data_formatted)

train_data = data_formatted[:800]
validation_data = data_formatted[800:]

In [157]:
train_data_path = '/workspaces/da-atlante-engine/atlante/data/sources/llm-for-or/train_data.jsonl'

with open(train_data_path, 'w') as f:
    for item in train_data:
        f.write(json.dumps(item) + "\n")

In [158]:
validation_data_path = '/workspaces/da-atlante-engine/atlante/data/sources/llm-for-or/validation_data.jsonl'

with open(validation_data_path, 'w') as f:
    for item in validation_data:
        f.write(json.dumps(item) + "\n")

## Finetuning

#### File upload

In [161]:
client = OpenAI()

# Train upload

train_file_openai = client.files.create(
  file=open(train_data_path, "rb"),
  purpose="fine-tune"
)


In [162]:
# Validation upload

validation_file_openai = client.files.create(
  file=open(validation_data_path, "rb"),
  purpose="fine-tune"
)

#### Fine tuning job

In [189]:
client.fine_tuning.jobs.create(
  training_file=train_file_openai.id, 
  validation_file=validation_file_openai.id,
  model="gpt-3.5-turbo-0125",
  hyperparameters= {"n_epochs": 4},
  suffix='llm-for-or-new'
)

FineTuningJob(id='ftjob-jSSVeg8jjqcPZwx18mkcPHId', created_at=1716902661, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=4, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-GBbGKB5R2ohK7iGc0XliSbnr', result_files=[], seed=1117995693, status='validating_files', trained_tokens=None, training_file='file-FJmekHS1TFMf0FHCbpyfiY5u', validation_file='file-4rVYmEfHQbg5hwPwaSywQYjN', estimated_finish=None, integrations=[], user_provided_suffix='llm-for-or-new')

In [200]:
# run this line for checking status
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-hQYrRfDMEw4CrYCczMIZRGO1", limit=10)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-OM5BsmvtBPQHcuGmcv5J7rmw', created_at=1716903221, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-fMrpBAGXUqd8SreXSYqU80mm', created_at=1716903214, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0125:spindox-spa:llm-for-or-new:9TrFGGza', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-Z8qLUO8GjTEWCgL1jcQYoha1', created_at=1716903214, level='info', message='Checkpoint created at step 1200 with Snapshot ID: ft:gpt-3.5-turbo-0125:spindox-spa:llm-for-or-new:9TrFF4a8:ckpt-step-1200', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-owuP3qF1BYaVjKI52gyWjtJO', created_at=1716903214, level='info', message='Checkpoint created at step 800 with Snapshot ID: ft:gpt-3.5-turbo-0125:spindox-spa:llm-for-or-new:9TrFFQZR:ckpt-step-

## Testing

In [201]:
f = open('/workspaces/da-atlante-engine/atlante/data/sources/llm-for-or/task3_test_public.json')
test_data = json.load(f)

data_test_formatted = []
for ex in test_data:
    user_content = "<problem_instance>: {} \n <output_keys_dict> {}".format(ex['question'], ex['results'].keys())
    formatted_ex = {"messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_content}]}
    data_test_formatted.append(formatted_ex)
    

In [212]:
# change it for running on a new test istance
index = 2

In [207]:
data_test_formatted[index]

{'messages': [{'role': 'system',
   'content': "You are an expert in Operations Research, Python programming language, PuLP library and JSON file format.\nI will give you a description in natural language of an optimization problem <problem_instance> and you just have to write a Python script using PuLP to solve that specific instance of the problem.\nDon't write anything before and after the code, it will be simple for me to parse your answer.\nThe code must only print a single string to stdout, formatted in JSON, which contains the answers to the question I will give you in input. \nUse the <output_keys_dict> to format the stdout and use them as keys of the JSON output."},
  {'role': 'user',
   'content': "<problem_instance>: A furniture factory makes two products: bedside tables and bookcases. Both products have to go through two processes: crafting and polishing. For each bedside table, the workers spend 2.5 hours crafting and 1.5 hours polishing. For each bookcase, the workers spe

In [208]:
# ft:gpt-3.5-turbo-0125:spindox-spa:llm-for-or:9TouHarF   ---> first fine tuning
# ft:gpt-3.5-turbo-0125:spindox-spa:llm-for-or-new:9TrFGGza ---> last fine tuning

completion = client.chat.completions.create(
  model="ft:gpt-3.5-turbo-0125:spindox-spa:llm-for-or-new:9TrFGGza",
  messages=data_test_formatted[index]['messages']
)

In [210]:
result = completion.choices[0].message.content

In [211]:
result.replace('problem.solve()', 'problem.solve(PULP_CBC_CMD(msg=0))')

'# Import PuLP library\nfrom pulp import *\n\n# Define the decision variables\nnum_bedside_tables = LpVariable("NumBedsideTables", lowBound=0, cat=\'Continuous\') # number of bedside tables produced\nnum_bookcases = LpVariable("NumBookcases", lowBound=0, cat=\'Continuous\') # number of bookcases produced\n\n# Define the question as a maximum or minimum problem\nproblem = LpProblem("FurnitureFactory", LpMaximize)\n\n# Define the objective function\nobjective = 200 * num_bedside_tables + 500 * num_bookcases\nproblem += objective # maximize the total profit\n\n# Define the constraints\nproblem += 2.5 * num_bedside_tables + 5 * num_bookcases <= 30 # crafting hours constraint\nproblem += 1.5 * num_bedside_tables + 3 * num_bookcases <= 20 # polishing hours constraint\n\n# Solve the problem\nstatus = problem.solve(PULP_CBC_CMD(msg=0))\n\n# Output the answer\nprint("The number of bedside tables produced:", num_bedside_tables.value())\nprint("The number of bookcases produced:", num_bookcases.va