In [5]:
import os, json
import pandas as pd
from openai import OpenAI

In [9]:
OPENAI_API_KEY = os.getenv("OPENKEY")
assert OPENAI_API_KEY, "Missing OPENKEY in environment/.env"

df = pd.read_csv("/Users/eugeneleach/code/Eugle3/cycle_more/Notebooks/2.a)LLM Model/Data_Engineered.csv")
print(f"Loaded {len(df)} routes")

Loaded 18329 routes


In [11]:
client = OpenAI(api_key=OPENAI_API_KEY)

## Data exploration to set limits

In [13]:
df = pd.read_csv("/Users/eugeneleach/code/Eugle3/cycle_more/Notebooks/2.a)LLM Model/Data_Engineered.csv")

In [None]:
df.shape

Unnamed: 0,id,name,distance_m,duration_s,ascent_m,descent_m,steps,turns,region,Cycleway,...,Paved_Road,Pedestrian,Unknown_Way,Cycle Track,Main Road,Steep Section,Moderate Section,Flat Section,Downhill Section,Steep Downhill Section
0,6101627,Ciclopedonale Lago Ghirla,1885.5,377.1,51.8,93.8,2,0,Alps,61.29,...,38.71,0.0,0.0,0.0,0.0,0.0,75.81,0.0,9.68,14.52


In [19]:
pd. set_option('display.max_columns', None)

In [21]:
df.shape

(18329, 27)

In [20]:
print(df.describe())

                 id    distance_m     duration_s      ascent_m     descent_m  \
count  1.832900e+04   18329.00000   18329.000000  18329.000000  18329.000000   
mean   1.111211e+07    5421.80718    1126.854891     69.538567     67.662660   
std    5.773845e+06   22359.97120    4583.648965    349.195923    349.324034   
min    2.649000e+03       0.50000       0.100000      0.000000      0.000000   
25%    7.256621e+06     369.90000      77.500000      2.000000      2.000000   
50%    1.151049e+07    1144.50000     240.400000      9.000000      9.300000   
75%    1.623401e+07    3336.50000     692.400000     32.700000     32.300000   
max    1.990331e+07  807920.60000  168968.500000  19465.000000  18759.000000   

              steps         turns      Cycleway  Turn_Density       on_road  \
count  18329.000000  18329.000000  18329.000000  18329.000000  18329.000000   
mean      10.029953      6.741557     20.634499      2.653734     66.504514   
std       27.980823     23.198779     33.8

## Model

In [None]:
import csv, io, json
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()

In [25]:
OPENAI_API_KEY = os.getenv("OPENKEY")
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
COLUMNS = ["id","name","distance_m","duration_s","ascent_m","descent_m","steps","turns",
            "region","Cycleway","Turn_Density","on_road","off_road","Gravel_Tracks",
            "Paved_Paths","Other","Unknown Surface","Paved_Road","Pedestrian","Unknown_Way",
            "Cycle Track","Main Road","Steep Section","Moderate Section","Flat Section",
            "Downhill Section","Steep Downhill Section"]

In [28]:
DEFAULTS = {
    "distance_m":5421.8072,"duration_s":1126.8549,"ascent_m":69.5386,"descent_m":67.6627,
    "steps":10.029953,"turns":6.741557,"Cycleway":20.634499,"Turn_Density":2.653734,
    "on_road":66.504514,"off_road":0.901243,"Gravel_Tracks":3.495492,"Paved_Paths":5.377312,
    "Other":0.092803,"Unknown Surface":23.583983,"Paved_Road":59.350565,"Pedestrian":2.927223,
    "Unknown_Way":0.214249,"Cycle Track":11.988817,"Main Road":4.791810,"Steep Section":1.018174,
    "Moderate Section":18.202830,"Flat Section":57.723090,"Downhill Section":19.178338,
    "Steep Downhill Section":0.887704,
}

In [29]:
def clamp_pct(x):
    try:
        return max(0, min(float(x), 100))
    except Exception:
        return 0.0

def fill_and_clamp(row_dict, region_default="Unknown"):
    out = {}
    for col in COLUMNS:
        if col == "id":
            out[col] = "Synthetic_Route_1"
        elif col == "name":
            out[col] = row_dict.get(col) or "Synthetic Route"
        elif col == "region":
            out[col] = row_dict.get(col) or region_default
        elif col in DEFAULTS:
            val = row_dict.get(col, DEFAULTS[col])
            val = clamp_pct(val) if col not in ["distance_m","duration_s","ascent_m","descent_m","steps","turns"] else max(0, float(val))
            out[col] = val
        else:
            out[col] = row_dict.get(col, 0)
    return out


In [30]:
SYSTEM_PROMPT = """
You are a route-row generator. Return exactly one CSV row including header.
Columns (27): id,name,distance_m,duration_s,ascent_m,descent_m,steps,turns,region,
Cycleway,Turn_Density,on_road,off_road,Gravel_Tracks,Paved_Paths,Other,Unknown Surface,
Paved_Road,Pedestrian,Unknown_Way,Cycle Track,Main Road,Steep Section,Moderate Section,
Flat Section,Downhill Section,Steep Downhill Section
Rules:
- id must be "Synthetic_Route_1".
- name is a short descriptive title.
- region is a short category.
- distance_m, duration_s, ascent_m, descent_m, steps, turns are non-negative numbers.
- The remaining fields are percentages/fractions; keep each between 0 and 100 and sum logically.
If unsure, use the typical defaults below.
Typical defaults (use if unspecified): distance_m=5421.8072, duration_s=1126.8549,
ascent_m=69.5386, descent_m=67.6627, steps=10.029953, turns=6.741557,
Cycleway=20.634499, Turn_Density=2.653734, on_road=66.504514, off_road=0.901243,
Gravel_Tracks=3.495492,
Paved_Paths=5.377312, Other=0.092803, Unknown Surface=23.583983, Paved_Road=59.350565,
Pedestrian=2.927223, Unknown_Way=0.214249, Cycle Track=11.988817, Main Road=4.791810,
Steep Section=1.018174, Moderate Section=18.202830, Flat Section=57.723090,
Downhill Section=19.178338, Steep Downhill Section=0.887704.
Output only CSV text with the header line and one data line. No extra text.
""".strip()


In [31]:
user_text = "A flat 10 km loop around Richmond Park, mostly paved, low traffic, few turns."

In [34]:
resp = client.chat.completions.create(
    model="gpt-4o-mini",  # or your chosen GPT model
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
    ],
    temperature=0.2,
    max_tokens=300,
)

csv_text = resp.choices[0].message.content
print("Raw CSV from model:\n", csv_text)

Raw CSV from model:
 id,name,distance_m,duration_s,ascent_m,descent_m,steps,turns,region,Cycleway,Turn_Density,on_road,off_road,Gravel_Tracks,Paved_Paths,Other,Unknown Surface,Paved_Road,Pedestrian,Unknown_Way,Cycle Track,Main Road,Steep Section,Moderate Section,Flat Section,Downhill Section,Steep Downhill Section  
Synthetic_Route_1,Flat 10 km Loop,10000,2000,50,50,10,3,Park,25,1.5,70,5,2,10,0.1,15,65,3,0.2,10,5,1,20,60,15,0.5  


In [35]:
import io, csv as pycsv

reader = pycsv.DictReader(io.StringIO(csv_text))
row = next(reader)
clean = fill_and_clamp(row)  # uses your helper above

In [36]:
buf = io.StringIO()
w = pycsv.DictWriter(buf, fieldnames=COLUMNS)
w.writeheader()
w.writerow(clean)
final_csv = buf.getvalue()
print("\nFinal CSV:\n", final_csv)


Final CSV:
 id,name,distance_m,duration_s,ascent_m,descent_m,steps,turns,region,Cycleway,Turn_Density,on_road,off_road,Gravel_Tracks,Paved_Paths,Other,Unknown Surface,Paved_Road,Pedestrian,Unknown_Way,Cycle Track,Main Road,Steep Section,Moderate Section,Flat Section,Downhill Section,Steep Downhill Section
Synthetic_Route_1,Flat 10 km Loop,10000.0,2000.0,50.0,50.0,10.0,3.0,Park,25.0,1.5,70.0,5.0,2.0,10.0,0.1,15.0,65.0,3.0,0.2,10.0,5.0,1.0,20.0,60.0,15.0,0.887704

