In [1]:
import html
import os

import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import login

from generator_flow import generator
from generator_flow import normal_generator

Load examples

In [2]:
# Load environment variables, then authenticate with the token stored under HF_API_KEY.
load_dotenv()
hf_token = os.getenv("HF_API_KEY")
login(hf_token)

# Fetch the dataset and keep its 'test' split as a DataFrame for the rest of the notebook.
dataset = load_dataset("Tamiza/zimpl_data")
test_split = dataset['test']
df_manual = test_split.to_pandas()

# Echo the DatasetDict summary as the cell output.
dataset

DatasetDict({
    test: Dataset({
        features: ['input', 'output', 'parameter', 'solution'],
        num_rows: 67
    })
})

Table for manual tests

In [3]:
# Show the manual-test table built from the 'test' split (the rendered output abbreviates the middle rows).
df_manual


Unnamed: 0,input,output,parameter,solution
0,A bakery produces two types of cookies: chocol...,"set CookieTypes := {""Caramel"", ""ChocolateChip""...",1,objective value: ...
1,"""The factory produces two products, W1 and W2,...","set Products := {""W1"", ""W2""};\nset Resources :...",1,objective value: ...
2,A mining company produces lignite and anthraci...,"set CoalTypes := {""Lignite"", ""Anthracite""};\ns...",1,objective value: ...
3,"A company has two warehouses, Warehouse A and ...","set Warehouses := {""A"", ""B""};\nset Stores := {...",1,objective value: ...
4,A nutritionist wants to create a diet plan for...,"set Foods := {""Food1"", ""Food2"", ""Food3""};\nset...",1,objective value: ...
...,...,...,...,...
62,A health-conscious family wants to have a very...,var x; # number of banana servings taken\nvar ...,0,objective value: 11.363636...
63,A campsite for caravans and tents has an area ...,var x integer; # number of caravans\nvar y ...,0,objective value: ...
64,A factory manufactures doodads and whirligigs....,var x integer; # Each doodad costs $2 to make ...,0,objective value: ...
65,The annual subscription for a tennis club is £...,var x integer; # number of adult members\nv...,0,objective value: ...


Generate responses using the AI ZIMPL Generator (run on every row of the test table)

In [4]:
def _generate_advanced(row):
    """Call the advanced generator with one row's task text and its 'parameter' flag."""
    return generator(row['input'], row['parameter'])


# Row-wise apply: one generator call per task.
df_manual['ai_advanced'] = df_manual.apply(_generate_advanced, axis=1)

Show answers

In [None]:
# Render the first five rows as left-aligned text that keeps line breaks, so multi-line code stays readable.
cell_css = {'text-align': 'left', 'white-space': 'pre-wrap'}
df_manual.head(5).map(str).style.set_properties(**cell_css)

In [6]:
# Baseline: feed each task description straight to normal_generator, one call per row.
task_descriptions = df_manual['input']
df_manual['ai_normal'] = task_descriptions.apply(normal_generator)

In [None]:
from pandas import DataFrame
# Side-by-side view of the dataset's 'output' column and both generated columns for the first five rows.
comparison_columns = ['output', 'ai_advanced', 'ai_normal']
wrap_css = {'text-align': 'left', 'white-space': 'pre-wrap'}
df_manual.head(5)[comparison_columns].map(str).style.set_properties(**wrap_css)

## Solver results

In [None]:
from solver import run_scip_and_extract_solution
# Directory passed to run_scip_and_extract_solution as the SCIP location.
# Override it with the SCIP_PATH environment variable; the default is the path previously hard-coded here.
# Note: inside a raw string a backslash must NOT be doubled -- r"C:\\..." yields two literal backslashes.
SCIP_PATH = os.getenv("SCIP_PATH", r"C:\Program Files\SCIPOptSuite 9.1.0\bin")

# Cast both generated-code columns to str before they are handed to the solver wrapper.
for generated_column in ("ai_advanced", "ai_normal"):
    df_manual[generated_column] = df_manual[generated_column].astype(str)

def process_zimpl_code(zimpl_code):
    """Run one ZIMPL source through the solver wrapper without letting failures propagate.

    Args:
        zimpl_code: ZIMPL source text forwarded to ``run_scip_and_extract_solution``
            together with ``SCIP_PATH``.

    Returns:
        The ``(valid, result)`` pair produced by the wrapper, or ``(False, None)``
        when anything inside the attempt raises (the error is printed, not re-raised).
    """
    try:
        outcome = run_scip_and_extract_solution(zimpl_code, SCIP_PATH)
        valid, result = outcome
        # Logging stays inside the try so a failure while formatting is reported the same way.
        print(f"Valid: {valid}, Result: {result}")
    except Exception as e:
        print(f"Error processing code: {e}")
        return False, None
    return valid, result



def _solve_as_series(code):
    """Wrap the (valid, result) pair from process_zimpl_code in a Series so apply() expands it into two columns."""
    return pd.Series(process_zimpl_code(code))


# Advanced generator first, then the baseline -- same order as the solver log output.
for source_column, target_columns in (
    ("ai_advanced", ["valid_advanced", "result_advanced"]),
    ("ai_normal", ["valid_normal", "result_normal"]),
):
    df_manual[target_columns] = df_manual[source_column].apply(_solve_as_series)

## Compare validation

In [None]:
import pandas as pd

def _validity_summary(categories, masks, relative_to_all_rows):
    """Build a Category / Count / Percentage table from boolean row masks.

    Args:
        categories: Row labels, one per mask.
        masks: Boolean Series over ``df_manual``; each is summed to get a count.
        relative_to_all_rows: If True, percentages are taken against ``len(df_manual)``;
            otherwise against the sum of the counts in this table.
    """
    summary = pd.DataFrame({
        "Category": categories,
        "Count": [mask.sum() for mask in masks],
    })
    denominator = len(df_manual) if relative_to_all_rows else summary["Count"].sum()
    summary["Percentage"] = (summary["Count"] / denominator * 100).round(2)
    return summary


# Explicit True/False masks: rows whose flag is neither value fall into no bucket.
advanced_ok = df_manual["valid_advanced"] == True
advanced_bad = df_manual["valid_advanced"] == False
normal_ok = df_manual["valid_normal"] == True
normal_bad = df_manual["valid_normal"] == False
is_hardcoded = df_manual["parameter"] == 0
is_with_params = df_manual["parameter"] == 1

print("ZIMPL generator results")
comparison_table = _validity_summary(
    ["Valid (True)", "Not Valid (False)"],
    [advanced_ok, advanced_bad],
    relative_to_all_rows=True,
)
print(comparison_table)

print("ZIMPL generator results - hardcoded")
comparison_table = _validity_summary(
    ["Valid hardcoded (True)", "Not Valid hardcoded (False)"],
    [advanced_ok & is_hardcoded, advanced_bad & is_hardcoded],
    relative_to_all_rows=False,
)
print(comparison_table)

print("ZIMPL generator results - params")
comparison_table = _validity_summary(
    ["Valid with params (True)", "Not Valid with params (False)"],
    [advanced_ok & is_with_params, advanced_bad & is_with_params],
    relative_to_all_rows=False,
)
print(comparison_table)

print("Normal OpenAI model results")
comparison_table_normal = _validity_summary(
    ["Valid (True)", "Not Valid (False)"],
    [normal_ok, normal_bad],
    relative_to_all_rows=True,
)
print(comparison_table_normal)

print("All results")
comparison_table = _validity_summary(
    [
        "Both Valid (True, True)",
        "Advanced Valid Only (True, False)",
        "Normal Valid Only (False, True)",
        "Both Invalid (False, False)",
    ],
    [
        advanced_ok & normal_ok,
        advanced_ok & normal_bad,
        advanced_bad & normal_ok,
        advanced_bad & normal_bad,
    ],
    relative_to_all_rows=True,
)
print(comparison_table)


In [None]:
import pandas as pd

# Rows whose advanced-generator output was flagged as not valid.
invalid_advanced_mask = df_manual['valid_advanced'] == False
notvalid_rows_advanced = df_manual[invalid_advanced_mask]

# Export only the dataset's 'output' column and the two generated columns for offline inspection.
export_columns = ['output', 'ai_advanced', 'ai_normal']
selected_columns = notvalid_rows_advanced[export_columns]
selected_columns.to_excel('not_valid.xlsx', index=False, engine='openpyxl')

print("Dane zostały zapisane w formacie XLSX do pliku not_valid.xlsx.")


In [11]:
from IPython.display import display, HTML

# Build an HTML-safe rendering of the generated code.
# The text is escaped first: any '<', '>' or '&' in the generated source (e.g. comparison operators)
# would otherwise be parsed by the browser as markup. Newlines become <br> only after escaping,
# so the inserted tags themselves are not escaped.
df_manual["ai_advanced_display"] = (
    df_manual["ai_advanced"]
    .astype(str)
    .map(html.escape)
    .str.replace('\n', '<br>', regex=False)
)

# One <div> per generated model, ready to be joined and passed to IPython's HTML().
html_content = df_manual["ai_advanced_display"].apply(lambda x: f'<div>{x}</div>').to_list()
#display(HTML(''.join(html_content)))

For the rows whose advanced-generator code ran successfully, compare the solver result with the reference solution

In [None]:
from obj_fun_validator import compare_solutions

# Split rows by the 'valid_advanced' flag produced by the solver step.
valid_rows_advanced = df_manual[df_manual['valid_advanced'] == True]
notvalid_rows_advanced = df_manual[df_manual['valid_advanced'] == False]

# Compare only rows flagged valid; every other row keeps NaN in 'compare_advanced'.
# Only the first element of compare_solutions' return value is stored.
df_manual.loc[valid_rows_advanced.index, 'compare_advanced'] = valid_rows_advanced.apply(
    lambda row: compare_solutions(row['solution'], row['result_advanced'])[0], axis=1
)

valid_count = len(valid_rows_advanced)

# Shares relative to the whole table (rows that were not compared count as NaN).
total_count = len(df_manual)
nan_count = df_manual['compare_advanced'].isna().sum()
true_count = (df_manual['compare_advanced'] == True).sum()
false_count = (df_manual['compare_advanced'] == False).sum()

nan_percentage = (nan_count / total_count) * 100
true_percentage = (true_count / total_count) * 100
false_percentage = (false_count / total_count) * 100

print(f"All stats:")
print(f"True: {true_count} ({true_percentage:.2f}%)")
print(f"False: {false_count} ({false_percentage:.2f}%)")
print(f"NaN: {nan_count} ({nan_percentage:.2f}%)")

# Shares relative to the rows flagged valid.
non_nan_rows = df_manual.loc[~df_manual['compare_advanced'].isna()]
true_non_nan_count = (non_nan_rows['compare_advanced'] == True).sum()
false_non_nan_count = (non_nan_rows['compare_advanced'] == False).sum()

# Guard the division: with no valid rows there is nothing to take a share of, so report 0%.
if valid_count > 0:
    true_non_nan_percentage = (true_non_nan_count / valid_count) * 100
    false_non_nan_percentage = (false_non_nan_count / valid_count) * 100
else:
    true_non_nan_percentage = 0.0
    false_non_nan_percentage = 0.0

print(f"\nStats without NaN (valid_count = {valid_count}):")
print(f"True: {true_non_nan_count} ({true_non_nan_percentage:.2f}%)")
print(f"False: {false_non_nan_count} ({false_non_nan_percentage:.2f}%)")

#notvalid_rows_advanced["ai_advanced"].head(10)


For the rows whose plain GPT model code ran successfully, compare the solver result with the reference solution

In [None]:
# Rows whose baseline-generated code was flagged valid by the solver step.
valid_rows_normal = df_manual[df_manual['valid_normal'] == True]

# Compare only rows flagged valid; every other row keeps NaN in 'compare_normal'.
# Only the first element of compare_solutions' return value is stored.
df_manual.loc[valid_rows_normal.index, 'compare_normal'] = valid_rows_normal.apply(
    lambda row: compare_solutions(row['solution'], row['result_normal'])[0], axis=1
)

valid_count = len(valid_rows_normal)

# Shares relative to the whole table (rows that were not compared count as NaN).
total_count = len(df_manual)
nan_count = df_manual['compare_normal'].isna().sum()
true_count = (df_manual['compare_normal'] == True).sum()
false_count = (df_manual['compare_normal'] == False).sum()

nan_percentage = (nan_count / total_count) * 100
true_percentage = (true_count / total_count) * 100
false_percentage = (false_count / total_count) * 100

print(f"All stats:")
print(f"NaN: {nan_count} ({nan_percentage:.2f}%)")
print(f"True: {true_count} ({true_percentage:.2f}%)")
print(f"False: {false_count} ({false_percentage:.2f}%)")

# Shares relative to the rows flagged valid.
non_nan_rows = df_manual.loc[~df_manual['compare_normal'].isna()]
true_non_nan_count = (non_nan_rows['compare_normal'] == True).sum()
false_non_nan_count = (non_nan_rows['compare_normal'] == False).sum()

# The guard used to be inverted (`== 0`): it divided only when there was nothing to divide by
# and reported 0% whenever valid rows existed. Divide only when valid_count is positive.
if valid_count > 0:
    true_non_nan_percentage = (true_non_nan_count / valid_count) * 100
    false_non_nan_percentage = (false_non_nan_count / valid_count) * 100
else:
    true_non_nan_percentage = 0
    false_non_nan_percentage = 0

print(f"\nStats without NaN (valid_count = {valid_count}):")
print(f"True: {true_non_nan_count} ({true_non_nan_percentage:.2f}%)")
print(f"False: {false_non_nan_count} ({false_non_nan_percentage:.2f}%)")

In [None]:
from generator_flow import validate_generator

# Hand validate_generator the dataset's 'output' column, the advanced generator's code and the
# 'parameter' flags as whole columns, and store whatever it returns as the score column.
reference_code = df_manual['output']
generated_code = df_manual['ai_advanced']
parameter_flags = df_manual['parameter']
df_manual['llm_score_advanced'] = validate_generator(reference_code, generated_code, parameter_flags)

# Plain-text preview of the first rows with their scores.
score_preview = df_manual[['output', 'ai_advanced', 'llm_score_advanced']].head()
print(score_preview)


TODO


- weryfikacja kodu poprzez: punktowanie przez niezależny model językowy (kolumna ai_advanced)