In [1]:
import json
import openai
import os
import pandas as pd
from pprint import pprint

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "INSERT_KEY_HERE")
openai.api_key = OPENAI_API_KEY

openai.api_key


'INSERT_KEY_HERE'

## Upload training data to OpenAI

This requires two `.jsonl` files - training and validation set, containing verified, high-quality input-output pairs. They should be in this same directory.

(To learn how to create these files, refer to [other notebook]

In [7]:
os.chdir('finetune_openAI')

training_file_name = "usace_finetune_training.jsonl"

validation_file_name = "usace_finetune_validation.jsonl"


In [None]:
with open(training_file_name, "rb") as training_fd:
    training_response = openai.files.create(
        file=training_fd, purpose="fine-tune"
    )

training_file_id = training_response.id

with open(validation_file_name, "rb") as validation_fd:
    validation_response = openai.files.create(
        file=validation_fd, purpose="fine-tune"
    )
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

## Fine tuning

In [9]:
response = openai.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-3.5-turbo",
    suffix="usace-json",
)

job_id = response.id

print("Job ID:", response.id)
print("Status:", response.status)

Job ID: ftjob-sosHADFF8Aj4rCQyvmv2ONvY
Status: validating_files


In [13]:
response = openai.fine_tuning.jobs.retrieve(job_id)

print("Job ID:", response.id)
print("Status:", response.status)
print("Trained Tokens:", response.trained_tokens)

Job ID: ftjob-sosHADFF8Aj4rCQyvmv2ONvY
Status: running
Trained Tokens: None


In [14]:
response = openai.fine_tuning.jobs.list_events(job_id)

events = response.data
events.reverse()

for event in events:
    print(event.message)

Created fine-tuning job: ftjob-sosHADFF8Aj4rCQyvmv2ONvY
Validating training file: file-0u7fSKiBaE2VczJEpF0ANEc0 and validation file: file-KsnzpUCeoh4uWgxG2QRkAEku
Files validated, moving job to queued state
Fine-tuning job started


In [67]:
response = openai.fine_tuning.jobs.list_events(job_id)

events = response.data
events.reverse()

for event in events:
    print(event.message)

Files validated, moving job to queued state
Fine-tuning job started
Step 1/156: training loss=0.76, validation loss=0.29
Step 11/156: training loss=0.17, validation loss=0.23
Step 21/156: training loss=0.04, validation loss=0.06
Step 31/156: training loss=0.07, validation loss=0.05
Step 41/156: training loss=0.02, validation loss=0.22
Step 51/156: training loss=0.08, validation loss=0.18
Step 61/156: training loss=0.24, validation loss=0.03
Step 71/156: training loss=0.06, validation loss=0.03
Step 81/156: training loss=0.38, validation loss=0.11
Step 91/156: training loss=0.08, validation loss=0.11
Step 101/156: training loss=0.03, validation loss=0.04
Step 111/156: training loss=0.17, validation loss=0.04
Step 121/156: training loss=0.09, validation loss=0.36
Step 131/156: training loss=0.01, validation loss=0.17
Step 141/156: training loss=0.21, validation loss=0.03
Step 151/156: training loss=0.01, validation loss=0.03
New fine-tuned model created: ft:gpt-3.5-turbo-0613:eidc-mdi-ge

In [None]:
response = openai.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = response.fine_tuned_model

if fine_tuned_model_id is None: 
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")

print("Fine-tuned model ID:", fine_tuned_model_id)

## Inference

In [18]:
fine_tuned_model_id = 'ft:gpt-3.5-turbo-0613:eidc-mdi-georgetown:usace-json:8ait0mXa'


In [9]:
text = """'The applicant requests authorization to perform maintenance dredging in previously dredged areas in Yards 2 - 6 and 8 of the Modern American Recycling & Repair Service ( MARRS ) property. The dredging will consist of the mechanical removal of 459,796 cubic yards of material from 571,189 square feet (13.1 acres) of river bottoms located to the west of the Mobile River federal navigation channel.Dredge depths with be between - 25 to -60 feet relative to Mean Lower Low Water ( MLLW ) at each of the 8 proposed dredge sites. The dredged materials are proposed to be disposed at either Galliard Island or the Mobile Ocean Dredged Material Disposal Site (ODMDS), depending on regulatory approval. The details of each dredge area can be found in the attached drawings."""
text = """The applicant seeks authorization to fill 2,000 cubic yards (0.38 acres) of wetlands and 53.52 acres of waters (10.93 acres of dredge; 42.59 acres of fill) of ditches) for construction of a single family residential development. The plan proposes to preserve the remaining 7.86 acres of wetlands and 23.43 acres of waters on site."""

In [10]:


tdf = pd.DataFrame([text], columns=['user'])

test_row = tdf.iloc[0]

test_row


user    The applicant seeks authorization to fill 2,00...
Name: 0, dtype: object

In [26]:
sys_prompt2 = """Your task is to Extract information from a project description to create a structured dictionary, following the provided JSON schema. Focus on identifying wetland impacts based on the given criteria.

JSON Schema Overview:

wetland_type: Type or descriptor of the wetland (e.g., swamp, marsh).
impact_quantity: Numeric value of the impacted area.
impact_unit: Units of measurement (acres, sq. feet, linear feet).
impact_duration: Duration of impact (permanent, temporary, unknown).
impact_type: Nature of impact (harmful, beneficial, unknown).
Instructions:

Identify Wetlands and Impacts: Look for sentences detailing the wetland/water/land type and the impacts. Record the type, quantity, and unit. There may be multiple impacts.
Determine Impact Duration: Mark 'permanent' or 'temporary' if mentioned with wetland type and area; else, write 'unknown'.
Assess Impact Type: Identify if the impact is harmful, beneficial, or unknown.
Avoid Double Counting: Be mindful of nested projects or phrases indicating multiple projects. Do not double count impacts.
Create a Dictionary: For each wetland, provide its type, area, duration, and impact type in a structured format.
Example Input:
'The project will affect 3.5 acres of marshland permanently, leading to habitat loss...'

Expected Output:
{'wetlands': [{'wetland_type': 'marshland', 'impact_quantity': '3.5', 'impact_unit': 'acres', 'impact_duration': 'permanent', 'impact_type': 'loss'}]}
"""

In [59]:

def create_user_message(row):
    return f"""Project Description: {row['user']}\n\nExtracted Data: """

test_messages = []
test_messages.append({"role": "system", "content": sys_prompt2})
user_message = create_user_message(test_row)
test_messages.append({"role": "user", "content": create_user_message(test_row)})
test_messages

# fine_tuned_model_id = 'gpt-3.5-turbo-0613'

response = openai.chat.completions.create(
    model=fine_tuned_model_id, messages=test_messages, temperature=0.9, max_tokens=800
)
print(response.choices[0].message.content)

[{'role': 'system',
  'content': "Task: Extract information from a project description to create a structured dictionary, following the provided JSON schema. Focus on identifying wetland impacts based on the given criteria.\n\nJSON Schema Overview:\n\nwetland_type: Type or descriptor of the wetland (e.g., swamp, marsh).\nimpact_quantity: Numeric value of the impacted area.\nimpact_unit: Units of measurement (acres, sq. feet, linear feet).\nimpact_duration: Duration of impact (permanent, temporary, unknown).\nimpact_type: Nature of impact (harmful, beneficial, unknown).\nInstructions:\n\nIdentify Wetlands and Impacts: Look for sentences detailing the wetland/water/land type and the impacts. Record the type, quantity, and unit. There may be multiple impacts.\nDetermine Impact Duration: Mark 'permanent' or 'temporary' if mentioned with wetland type and area; else, write 'unknown'.\nAssess Impact Type: Identify if the impact is harmful, beneficial, or unknown.\nAvoid Double Counting: Be mi

{'wetlands': [{'wetland_type': 'wetlands', 'impact_quantity': '0.38', 'impact_unit': 'acres', 'impact_duration': 'unknown', 'impact_type': 'fill'}]}


### Coordinate-to-area case (on fly) - FAILED!

Catastrophic forgetting? General capabilities lost (adjust in prompt).

In [114]:
text = "The proposed project is to construct 217 single-family residential dwellings into Phase 4B of the Parks of Plaquemines subdivision. Construction activities include the homesites, roads, recreation features, natural buffers, and infrastructure. The project as proposed would directly impact 56.63 acres (permanently) of bottomland hardwood wetlands and 2,454 linear feet of non-tidal waters."
tdf = pd.DataFrame([text], columns=['user'])

test_row = tdf.iloc[0]


In [115]:
def create_user_message(row):
    return f"""Project Description: {row['user']}\n\nExtracted Data: """


test_messages = []
test_messages.append({"role": "system", "content": sys_prompt2})
user_message = create_user_message(test_row)
test_messages.append({"role": "user", "content": create_user_message(test_row)})
test_messages

response = openai.chat.completions.create(
    model=fine_tuned_model_id, messages=test_messages, temperature=0, max_tokens=500
)
print(response.choices[0].message.content)

[{'role': 'system',
  'content': "Task: Extract information from a project description to create a structured dictionary, following the provided JSON schema. Focus on identifying wetland impacts based on the given criteria.\n\nJSON Schema Overview:\n\nwetland_type: Type or descriptor of the wetland (e.g., swamp, marsh).\nimpact_quantity: Numeric value of the impacted area.\nimpact_unit: Units of measurement (acres, sq. feet, linear feet).\nimpact_duration: Duration of impact (permanent, temporary, unknown).\nimpact_type: Nature of impact (harmful, beneficial, unknown).\nInstructions:\n\nIdentify Wetlands and Impacts: Look for sentences detailing the wetland type and area impacted. Record the type, quantity, and unit.\nDetermine Impact Duration: Mark 'permanent' or 'temporary' if mentioned with wetland type and area; else, write 'unknown'.\nAssess Impact Type: Identify if the impact is harmful, beneficial, or unknown.\nAvoid Double Counting: Be mindful of nested projects or phrases indi

{'wetlands': [{'wetland_type': 'bottomland hardwood wetlands', 'impact_quantity': '56.63', 'impact_unit': 'acres', 'impact_duration': 'permanent', 'impact_type': 'loss'}, {'wetland_type': 'non-tidal waters', 'impact_quantity': '2,454', 'impact_unit': 'linear feet', 'impact_duration': 'unknown', 'impact_type': 'loss'}]}


## Fine tune + function calling