In [1]:
import json
from openai import OpenAI
import jq
import json

In [2]:
client = OpenAI()

## Get arbitrary data outputs

In [3]:
with open('gcal.json', 'r') as f:
    gcal = json.load(f)
    
with open('outlook.json', 'r') as f:
    outlook = json.load(f)

## Provide the schema for the data you want

Fields I want:
```
{
    "id": "string",
    "start_time": "timestamp",
    "end_time": "timestamp",
    "time_zone": "string",
    "status": "string - confirmed/busy/tentative",
    "location": "string",
    "summary": "string",
    "subject": "string",
    "recurrance": "boolean",
    "visibility":"string"
}
```

In [4]:
# desired_schema = {
#     "id": "string",
#     "start_time": "timestamp",
#     "end_time": "timestamp",
#     "time_zone": "string",
#     "status": "string - confirmed/busy/tentative",
#     "location": "string",
#     "summary": "string",
#     "subject": "string",
#     "recurrance": "boolean",
#     "visibility":"string"
# }

desired_schema = {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "id": {
      "type": "string"
    },
    "start_time": {
      "$ref": "#/$defs/timestamp"
    },
    "end_time": {
      "$ref": "#/$defs/timestamp"
    },
    "time_zone": {
      "type": "string"
    },
    "status": {
      "type": "string",
      "enum": ["confirmed", "busy", "tentative"]
    },
    "location": {
      "type": "string"
    },
    "summary": {
      "type": "string"
    },
    "subject": {
      "type": "string"
    },
    "recurrance": {
      "type": "boolean"
    },
    "visibility": {
      "type": "string"
    }
  },
  "required": ["id", "start_time", "end_time", "status", "subject", "recurrance", "visibility"],
  "$defs": {
    "timestamp": {
      "type": "string",
      "format": "date-time"
    }
  }
}

### Validation layer

In [5]:
import instructor
from openai import OpenAI
from pydantic import BaseModel

# Enables `response_model`
client = instructor.patch(OpenAI())


class UserDetail(BaseModel):
    name: str
    age: int


user = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=UserDetail,
    messages=[
        {"role": "user", "content": "Extract Jason is 25 years old"},
    ],
)

assert isinstance(user, UserDetail)
assert user.name == "Jason"
assert user.age == 25

In [6]:
from dyntamic.factory import DyntamicFactory

In [7]:
messages = [{
             "role": "system",
             "content": """You are a perfect system designed to validate and extract data from JSON files. 
For each field, you provide a short check about your reasoning. Go line by line and do a side by side comparison. For example:

"id" | "id" : The field name is identical
"time" | "timestamp": This is the same concept and is a partial match, therefore it counts.
"cats" | None: There no matching or remotely similar fields.

Some fields may not have the exact same names. Use your best judgement about the meaning of the field to determine if they should count.

You come to a definitive conclusion- True or False at the end of your response."""
            }, 
            {
             "role": "user",
             "content": f"Here is the desired schema: \n {desired_schema}\n\n Are all REQUIRED fields present in {gcal} "
            }]

reasoning_response = client.chat.completions.create(messages=messages, model="gpt-4-0125-preview")



In [8]:
reasoning = reasoning_response.choices[0].message.content
print(reasoning)

To determine if all REQUIRED fields from the desired schema are present in the provided JSON, we'll check each required field against the JSON data:

1. "id" | "id": The field name is identical and present.
2. "start_time" | "start" -> "dateTime": Although named differently, "start" contains "dateTime" which fulfills the concept of "start_time".
3. "end_time" | "end" -> "dateTime": Similarly, "end" contains "dateTime", fulfilling the concept of "end_time".
4. "status" | "status": The field name is identical and present.
5. "subject" | None: There is no direct match for "subject", however, "summary" could be considered its equivalent as it is a descriptive field about the event.
6. "recurrance" | "recurrence": The field names are similar, with a minor spelling difference ("recurrance" vs. "recurrence"), but they describe the same concept.
7. "visibility" | "visibility": The field name is identical and present.

Based on the analysis:

- The key field "subject" requires judgment. While n

In [9]:
messages = [{
             "role": "system",
             "content": "You are a perfect system designed to evaluation a statement. You reply ONLY True or False. Do not use any natural language whatsoever."
            }, 
            {
             "role": "user",
             "content": f"Evaluate this statement: {reasoning}"
            },
            {
             "role": "user",
             "content": f"Is the conclusion True or False?"
            }]

conclusion_response = client.chat.completions.create(messages=messages,
                                                     model="gpt-3.5-turbo")

In [10]:
conclusion_response.choices[0].message.content

'False.'

### 2. Devise a way to pull the fields from the data we have

**Testing it out...**

In [11]:
# Find the keys

def create_jq_reasoning(input_schema, output_schema):
    messages = [{
                 "role": "system",
                 "content": """You are a perfect jq engineer designed to validate and extract data from JSON files using jq.
                 
Your task is to provide the jq filter needed for finding the data necessary to answering a query. Before constructing the query, reason line-by-line through the requirements of the schema and propose how you will extract the data using jq. Then, provide how that would look like in jq code form on that line..

The key names many not be exact matches, do your best to find whichever key is most likely to match. For example, `eventId` may correspond with `id` or `time` may correspond with `timestamp`. 
"""
                }, 
                {
                 "role": "user",
                 "content": f"Here is the data we are querying: \n {input_schema}\n\n How will we extract all the fields from the source and transform it into {output_schema}?"
                }]

    response = client.chat.completions.create(messages=messages, model="gpt-4-0125-preview")
    return response.choices[0].message.content

In [12]:
jq_reasoning = create_jq_reasoning(desired_schema, gcal)

In [13]:
print(jq_reasoning)

To extract and transform the given data structure into the desired output structure using jq, I will:

1. Convert the `$schema` key to a static `kind` value `calendar#event`.
2. Leave `etag` as a static value, as no etag is available in the source.
3. Extract `id`, `status`, `location`, `summary`, and `visibility` directly from the source as they match directly to the desired output.
4. Substitute `start_time` and `end_time` with `created` and `updated` respectively, assuming `start_time` represents `created` and `end_time` represents `updated`. Note that these mappings might not perfectly reflect the intention of the original schema without further context.
5. Assume that `time_zone` pertains to the `timeZone` fields under `start` and `end` objects.
6. Due to the transformation's static nature and the missing data in the source that matches the specified output directly (e.g., `htmlLink`, `colorId`, among others), these will be set as static values or omitted where appropriate.

Consi

## Condense into a single filter

In [14]:
def condense_response(reasoning, query=None, error=None):
    prompt = f"Condense down this reasoning into a single jq filter string: {reasoning}"
    messages = [{
                 "role": "system",
                 "content": "You are a perfect jq engineer designed to validate and extract data from JSON files using jq. Condense down this reasoning into a single jq filter string. Do not use any natural language whatsoever. Do NOT use markdown, only provide raw strings."
                }, 
                {
                 "role": "user",
                 "content": f"Condense down this reasoning into a single jq filter string: {reasoning}"
                }]
    
    if error:
        messages.append({
            "role": "assistant",
            "content": f"""jq query: {query}
Error: {error}
"""})

        
    jq_response = client.chat.completions.create(messages=messages,
                                                         model="gpt-4-0125-preview")
    return jq_response.choices[0].message.content

In [27]:
def verify_match(result, output_schema):
    messages = [{
                "role": "system",
                "content": "You are a perfect jq engineer designed to validate and extract data from JSON files using jq. Do NOT reply in natural language. Only reply True or False"
                },
                {
                "role": "user",
                "content": f"""Does the result match the schema we provided?

Result: {result}

Schema: {output_schema}"""}]
    response = client.chat.completions.create(messages=messages,
                                              model="gpt-4-0125-preview")
    return response.choices[0].message.content    
    
    

## Try jq

In [28]:
jq_reasoning = create_jq_reasoning(desired_schema, gcal)

In [29]:
# Create reasoning
while True:
    jq_filter = condense_response(reasoning)
    try:
        transformed_data = jq.compile(jq_filter).input(gcal).all()
        
        match_results = verify_match(transformed_data, desired_schema)
        
        if "false" in match_results.lower():
            err = f"""Received different output from the schema expected.

Output: {transformed_data}

Desired: {desired_schema}
"""
            print(err)
            raise Exception()
        
        print(jq_filter)
        print(transformed_data)
        
        print('----')
        break
    except Exception as e:
        print(str(e))
        print('****')
        jq_filter = condense_response(reasoning, jq_filter, str(e))


jq: error: syntax error, unexpected '(', expecting end of file (Unix shell quoting issues?) at <top-level>, line 1:
.has("id", "status", "visibility") and (.start | has("dateTime")) and (.end | has("dateTime")) and (.has("summary") or .has("subject")) and .has("recurrence")    
jq: 1 compile error
****
jq: error: syntax error, unexpected '(', expecting end of file (Unix shell quoting issues?) at <top-level>, line 1:
.all((["id", "start.dateTime", "end.dateTime", "status", "summary", "recurrence", "visibility"] | map(has(.)) | add) == 7)    
jq: 1 compile error
****
jq: error: syntax error, unexpected '(', expecting end of file (Unix shell quoting issues?) at <top-level>, line 1:
.has("id", "status", "visibility") and (.start | has("dateTime")) and (.end | has("dateTime")) and ((.summary != null) or (.subject != null)) and ((.recurrance != null) or (.recurrence != null))    
jq: 1 compile error
****
jq: error: syntax error, unexpected '(', expecting end of file (Unix shell quoting issue

KeyboardInterrupt: 

In [None]:
print(json.dumps(transformed_data, indent=2))

## Create regex/something for converting/pulling keys from data output

In [102]:
def get_data(schema, keys):
    # Check if there's more than one key left for recursion
    if type(keys) == list and len(keys) > 1:
        # Recursively navigate down the schema
        return get_data(schema[keys[0]], keys[1:])
    else:
        # Return the value of the last key
        return schema[keys[0]]  # This should access the key at keys[0], not keys

In [103]:
get_data(gcal, nested_keys)

'2024-02-15T09:00:00Z'

### Testing it out:
Gcal

In [18]:
desired_schema

{'$schema': 'http://json-schema.org/draft-07/schema#',
 'type': 'object',
 'properties': {'id': {'type': 'string'},
  'start_time': {'$ref': '#/$defs/timestamp'},
  'end_time': {'$ref': '#/$defs/timestamp'},
  'time_zone': {'type': 'string'},
  'status': {'type': 'string', 'enum': ['confirmed', 'busy', 'tentative']},
  'location': {'type': 'string'},
  'summary': {'type': 'string'},
  'subject': {'type': 'string'},
  'recurrance': {'type': 'boolean'},
  'visibility': {'type': 'string'}},
 'required': ['id',
  'start_time',
  'end_time',
  'status',
  'subject',
  'recurrance',
  'visibility'],
 '$defs': {'timestamp': {'type': 'string', 'format': 'date-time'}}}

In [104]:
print(list(desired_schema['properties'].keys()))

['id', 'start_time', 'end_time', 'time_zone', 'status', 'location', 'summary', 'subject', 'recurrance', 'visibility']


In [155]:
for key in desired_schema['properties']:
    nested_keys = get_nested_keys(gcal, key)
    print(f"{key}: {get_data(gcal, nested_keys)}")

id: 123abc456def
start_time: 2024-02-15T09:00:00Z
end_time: 2024-02-15T10:00:00Z
time_zone: America/New_York
status: confirmed
location: Conference Room B, 123 Business Rd, City, Country
summary: Project Launch Meeting
subject: Project Launch Meeting
recurrance: ['RRULE:FREQ=WEEKLY;COUNT=4']
visibility: public
