In [1]:
import pandas as pd
import openai
import requests
from openai import OpenAI
import json
import os
import time
import warnings
import requests
import re
import csv
from datetime import datetime
from settings import API_KEY
import numpy as np
# Ignore the specific UserWarning from openpyxl
warnings.filterwarnings(action='ignore', category=UserWarning, module='openpyxl')

In [2]:
# !pip install --upgrade openai --quiet

In [3]:
# !pip show openai

# References:
Links: 
- https://platform.openai.com/docs/assistants/tools/supported-files
- https://github.com/davideuler/awesome-assistant-api/blob/main/GPT-PPT-Slides-Generator.ipynb

Goal: This notebook uses OpenAI assistants to read a mining report and fill out the structured records below (mineral site, deposit type, and mineral inventory).

In [4]:
# Shared OpenAI client used by every cell below; the key comes from the local settings module.
client = OpenAI(api_key=API_KEY)

## All Functions
Create the assistants and the needed functions and prompts.

In [5]:
def delete_assistant(assistant_id, timeout=30):
    """Delete an assistant through the OpenAI REST API.

    Args:
        assistant_id: ID of the assistant to delete.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` waits indefinitely, so a stalled connection would
            hang the notebook.

    Returns:
        int: HTTP status code of the DELETE request.
    """
    url = f"https://api.openai.com/v1/assistants/{assistant_id}"

    # Authenticated JSON request against the Assistants beta endpoint.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
        "OpenAI-Beta": "assistants=v1",
    }

    response = requests.delete(url, headers=headers, timeout=timeout)

    # Only the status code is reported; the response body is not inspected.
    return response.status_code


In [6]:
def cancel_assistant_run(thread_id, run_id, timeout=30):
    """Ask the OpenAI REST API to cancel a run on a thread.

    Args:
        thread_id: ID of the thread that owns the run.
        run_id: ID of the run to cancel.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` waits indefinitely, so a stalled connection would
            hang the notebook.

    Returns:
        The decoded JSON body of the cancel response.
    """
    url = f"https://api.openai.com/v1/threads/{thread_id}/runs/{run_id}/cancel"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
        "OpenAI-Beta": "assistants=v1",
    }

    response = requests.post(url, headers=headers, timeout=timeout)

    return response.json()

In [7]:
def get_assistant_response(thread_id, run_id):
    """Wait for a run to complete and return the newest message text on the thread.

    The run is polled every 15 seconds until its status is "completed".

    Args:
        thread_id: ID of the thread the run belongs to.
        run_id: ID of the run to wait for.

    Returns:
        str: Text of the first content item of the most recent thread message.

    Raises:
        RuntimeError: If the run ends as failed, cancelled or expired. Those
            statuses never turn into "completed", so waiting on them would
            loop forever.
    """
    dead_statuses = {"failed", "cancelled", "expired"}

    run = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
    print(f"Checking run status: {run.status}")
    while run.status != "completed":
        if run.status in dead_statuses:
            raise RuntimeError(
                f"Run {run_id} on thread {thread_id} ended with status '{run.status}'"
            )
        time.sleep(15)
        run = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)

    print("Run is completed. Printing the entire thread now in sequential order \n")
    messages = client.beta.threads.messages.list(thread_id=thread_id)

    # The first entry of the listing is treated as the latest reply.
    most_recent = messages.data[0].content[0].text.value
    print(f"Most run {run_id} response: {most_recent} ")
    return most_recent

In [8]:
def create_assistant(file_id):
    """Create a retrieval assistant and a thread that both reference one uploaded file.

    The assistant's instructions are the module-level ``instructions`` template
    with ``__COMMODITY__`` and ``__SIGN__`` filled in from the ``commodity``
    and ``sign`` environment variables.

    Args:
        file_id: ID of an already uploaded file.

    Returns:
        tuple: ``(thread_id, assistant_id)``.
    """
    commodity = os.environ.get('commodity')
    sign = os.environ.get('sign')
    filled_instructions = instructions.replace("__COMMODITY__", commodity).replace("__SIGN__", sign)

    assistant = client.beta.assistants.create(
        name="Get Extraction",
        instructions=filled_instructions,
        tools=[{"type": "retrieval"}],
        model="gpt-4-1106-preview",
        file_ids=[file_id],
    )

    # The thread starts with a single user message that carries the same file.
    opening_message = {
        "role": "user",
        "content": "You are a geology expert and you are very good in understanding mining reports, which is attached.",
        "file_ids": [file_id],
    }
    thread = client.beta.threads.create(messages=[opening_message])

    print("Created an Assistant")
    return thread.id, assistant.id

In [9]:
def check_file(thread_id, assistant_id, max_retries=3):
    """Ask the assistant whether it can read its file, re-uploading when it cannot.

    A run is started with ``file_instructions``. If the reply is "no", the
    assistant is deleted, the report named by the ``file_path`` environment
    variable is uploaded again, a fresh assistant and thread are created, and
    the check is repeated.

    Args:
        thread_id: ID of the thread to run the check on.
        assistant_id: ID of the assistant to check.
        max_retries: Maximum number of re-uploads before giving up, so a file
            that can never be read does not trigger uploads without end.

    Returns:
        tuple: ``(thread_id, assistant_id)`` of the assistant that confirmed
        the file is readable. These differ from the arguments after a re-upload.

    Raises:
        RuntimeError: If the file is still unreadable after ``max_retries``
            re-uploads.
    """
    for attempt in range(max_retries + 1):
        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistant_id,
            instructions=file_instructions,
        )
        print(f"Current run id = {run.id} thread_id = {thread_id}")

        ans = get_assistant_response(thread_id, run.id)
        print(f"Response: {ans}")

        # Tolerate surrounding whitespace and a trailing period in the reply.
        if ans.strip().rstrip(".").lower() != "no":
            print("File was correctly uploaded")
            return thread_id, assistant_id

        if attempt == max_retries:
            break

        print("We need to reload file.")
        delete_assistant(assistant_id)
        # The context manager closes the handle once the upload call returns.
        with open(f"./reports/{os.environ.get('file_path')}", "rb") as report_file:
            file = client.files.create(file=report_file, purpose='assistants')
        # Carry the new ids into the next check (thread first, assistant second).
        thread_id, assistant_id = create_assistant(file.id)

    raise RuntimeError(
        f"File could not be read by the assistant after {max_retries} re-upload(s)"
    )

In [10]:
def extract_json_strings(input_string):
    """Parse the JSON object that starts at the first '{' in a string.

    Surrounding text (for example a Markdown code fence) before the first
    brace and after the end of the object is ignored. Parsing is done by the
    JSON decoder itself, so braces that appear inside string values do not
    end the object early.

    Args:
        input_string: Text that may contain a JSON object.

    Returns:
        The decoded object, or None when the string has no '{' or the text
        starting at the first '{' is not valid JSON.
    """
    start = input_string.find('{')
    if start == -1:
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value.
        parsed, _end = json.JSONDecoder().raw_decode(input_string, start)
    except json.JSONDecodeError:
        return None
    return parsed

In [11]:
def read_csv_to_dict(file_path):
    """Read a CSV file into a list of dicts keyed by the header row.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list[dict]: One dict per data row, in file order.
    """
    # newline='' lets the csv module do its own line handling, which it needs
    # in order to keep newlines embedded in quoted fields intact.
    with open(file_path, mode='r', newline='') as csv_file:
        return [dict(row) for row in csv.DictReader(csv_file)]

In [12]:
def clean_mineral_site_json(json_str):
    """Remove top-level entries whose value is blank or the "POINT()" placeholder.

    Each value is converted with ``str()`` and stripped before the comparison.
    Only the top level of the dict is inspected; nested values are untouched.

    Args:
        json_str: Dict to clean. It is modified in place.

    Returns:
        dict: The same dict object, after cleaning.
    """
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating it directly raises RuntimeError.
    for key in list(json_str):
        key_string = str(json_str[key]).strip()
        if key_string == "" or key_string == "POINT()":
            del json_str[key]
    return json_str
    

In [13]:
def clean_mineral_inventory_json(json_str, commodities_dict, unit_dict):
    """Normalise the entries of ``json_str["MineralInventory"]`` in place.

    For every inventory entry:
      * top-level values equal to "NA", "Not Provided" or "N/A" become "";
      * inside "ore", "grade" and "cutoff_grade", any value found in
        ``unit_dict`` is replaced by the resource URL of its mapped id;
      * non-empty values of keys containing "category" are prefixed with the
        resource URL;
      * non-empty values of keys containing "commodity" are replaced by the
        resource URL of their id in ``commodities_dict``;
      * placeholder values inside "cutoff_grade" are removed, and a
        "cutoff_grade" that ends up empty is dropped.

    Args:
        json_str: Dict with a "MineralInventory" list of entry dicts.
        commodities_dict: Maps commodity names to ids.
        unit_dict: Maps unit names and aliases to ids.

    Returns:
        dict: The same ``json_str`` object, after cleaning.

    Raises:
        KeyError: If a non-empty commodity value is missing from
            ``commodities_dict``.
    """
    url_str = "https://minmod.isi.edu/resource/"
    placeholders = {"NA", "Not Provided", "N/A"}

    def is_placeholder(candidate):
        # Only strings can be placeholders; the type check also keeps
        # unhashable values such as dicts out of the set lookup.
        return isinstance(candidate, str) and candidate in placeholders

    for inner_json in json_str["MineralInventory"]:
        for key in list(inner_json):
            value = inner_json[key]
            if is_placeholder(value):
                # Keep the local copy in sync so later checks see the cleaned value.
                value = inner_json[key] = ""

            if key in {"ore", "grade", "cutoff_grade"} and isinstance(value, dict):
                # check units
                for new_key, new_value in value.items():
                    print(new_key, new_value)
                    if isinstance(new_value, str) and new_value in unit_dict:
                        value[new_key] = url_str + unit_dict[new_value]

            if "category" in key and value:
                inner_json[key] = url_str + value

            if "commodity" in key and value:
                inner_json[key] = url_str + commodities_dict[value]

            if "cutoff_grade" in key and isinstance(value, dict):
                # Collect the keys first; deleting during iteration is an error.
                for new_key in [k for k, v in value.items() if is_placeholder(v)]:
                    del value[new_key]

        # Drop a missing-in-practice cutoff: empty dict or blanked placeholder.
        if not inner_json.get('cutoff_grade'):
            inner_json.pop('cutoff_grade', None)
    return json_str

## Initial Prompts

In [14]:
# Run instructions used by check_file to confirm the assistant can read the uploaded file.
file_instructions = """If the file was correctly uploaded and can be read return YES otherwise return NO. 
Only return the Yes or No answer.
"""

In [15]:
# Assistant-level instructions. create_assistant fills in the __COMMODITY__ and
# __SIGN__ placeholders from the environment before creating the assistant.
instructions = """You are a geology expert and you are very good in understanding mining reports. You will be given 
a text from a mining report and a table name. You have to find out what are the different combinations of
classification (which is either indicated, inferred, measured, proven, probable, or total ), cut-off (represented as a decimal), tonnage (in Tonnes) and 
grade (given in %) from the given table in the text. Please extract the name of the element and place it in the output below without any additional text
Note we only care about the mineral __COMMODITY__ represented by __SIGN__
"""

## Set Up
Goal: attach a file and ask it a series of questions

In [16]:
# Sub-folders of ./reports (one per report collection) and the report to process.
# NOTE(review): only mvt_path is used in the cells shown here; au_path and zinc_path appear unused.
au_path = "au_papers/"
mvt_path = "mvt_zinc/"
zinc_path = "zinc/"
file_name = "BYP Au 6-2011.pdf"

In [17]:
# Run configuration, shared through environment variables: the target commodity,
# its chemical symbol, and the report path relative to ./reports.
os.environ.update({
    'commodity': 'zinc',
    'sign': 'Zn',
    'file_path': mvt_path + file_name,
})

In [18]:
# Upload the report for use by assistants. The context manager closes the
# local file handle once the upload call returns instead of leaking it.
with open(f"./reports/{os.environ.get('file_path')}", "rb") as report_file:
    file = client.files.create(
      file=report_file,
      purpose='assistants'
    )

In [19]:
# Create the assistant and its thread, both attached to the uploaded report.
thread_id, assistant_id =create_assistant(file.id)

Created an Assistant


In [20]:
# Confirm the assistant can read the file; the ids are reassigned because check_file returns a (thread_id, assistant_id) pair.
thread_id, assistant_id = check_file(thread_id, assistant_id)

Current run id = run_xSEQA72fmPU1hzfOVfjxLuhH thread_id = thread_LW1JHqa7jYCLxX9ysHdz68LS
Checking run status: in_progress
Run is completed. Printing the entire thread now in sequential order 

Most run run_xSEQA72fmPU1hzfOVfjxLuhH response: YES 
Response: YES
File was correctly uploaded


## File was Properly uploaded! 

In [None]:
# Empty template for a document reference record (valid JSON: a comma was missing after "doi").
document_ref = f"""{{
              "title": "",
              "doi" : "",
              "uri": "",
              "authors": "[]",
              "year": "",
              "month": "",
              "volume": "",
              "issue": "",
              "description": ""
            }}"""

In [21]:
# Run instructions asking for the document's name and publication date as a small JSON object.
name_instructions = """
Please tell me the name of the document and the date it was published. Return it as a json structure 
that follows this format {"name": document name, "date":yyyy-mm}. Only return the json structure.
"""

In [22]:
# Start a run on the existing thread that asks for the document name and date.
print("Creating the thread")
run = client.beta.threads.runs.create(
    instructions=name_instructions,
    assistant_id=assistant_id,
    thread_id=thread_id,
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

Creating the thread
Current run id = run_KhUiaHCEON4VAJONJJDbvJu3 thread_id = thread_LW1JHqa7jYCLxX9ysHdz68LS


In [23]:
# Wait for the run to complete and keep the assistant's latest reply.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

Retrieving the response

Checking run status: queued
Run is completed. Printing the entire thread now in sequential order 

Most run run_KhUiaHCEON4VAJONJJDbvJu3 response: ```json
{
  "name": "Technical Report on the BYP Property Hunan Province China",
  "year": "2011-06"
}
``` 


In [24]:
# Parse the assistant's reply into the document name and its publication date.
doc_json = extract_json_strings(ans)
if doc_json is None:
    raise ValueError(f"No JSON object found in the assistant response: {ans!r}")
# doi_value = doc_json['doi']
# The prompt asks for a "date" key, but the recorded reply used "year" instead; accept either.
doc_date = doc_json.get('date') or doc_json['year']
# doc_date is expected in yyyy-mm form: the last two characters are the month, the first four the year.
doc_month = int(doc_date[-2:])
doc_year = int(doc_date[:4])
doc_name = doc_json['name']

In [25]:
print(f"Here are the doc name {doc_name} doc year {doc_date}")

Here are the doc name Technical Report on the BYP Property Hunan Province China doc year 2011-06


## Filling out Mineral Site

In [26]:
## json strings
# MineralSite template. It is shown to the assistant as the example format and is
# also parsed with json.loads as a fallback, so it has to be valid JSON: the site
# record is a proper object inside the list, every pair is comma separated, and
# the document name is JSON-escaped.
site_format = f"""
  {{ "MineralSite":[
    {{
      "source_id": "",
      "record_id": 1,
      "name": {json.dumps(doc_name, ensure_ascii=False)},
      "location_info": {{
        "location": "POINT()",
        "crs": "WGS84",
        "country": "",
        "state_or_province": ""
      }}
    }}
  ] }}
"""

In [27]:
# Run instructions asking the assistant to locate the site and fill in the MineralSite template.
loc_instructions = f"""Find the geographic location of the mining 
site in the document and put it in geographic coordinates using latitude and longitude that will then be converted to
geometry point structure. Fill out the JSON structure Mineral Site based on the geographic information found.
Here is an example format: Mineral Site: {site_format}
If there is no location information or if the correct conversions cannot be made replace the value as empty strings. 
Return only the filled in MineralSite json Structure with only the keys and values and no additional comments.
"""

In [28]:
# Start a run on the existing thread that asks for the site location.
print("Creating the thread")
run = client.beta.threads.runs.create(
    instructions=loc_instructions,
    assistant_id=assistant_id,
    thread_id=thread_id,
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

Creating the thread
Current run id = run_tbcXtWTkES5yuLelK9AjNXMD thread_id = thread_LW1JHqa7jYCLxX9ysHdz68LS


In [29]:
# Wait for the location run to complete and keep the assistant's latest reply.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

Retrieving the response

Checking run status: in_progress
Run is completed. Printing the entire thread now in sequential order 

Most run run_tbcXtWTkES5yuLelK9AjNXMD response: ```json
{
  "MineralSite":[
    {
      "source_id": "Technical Report on the BYP Property Hunan Province China",
      "record_id": "BYP Au 6-2011.pdf",
      "name": "Technical Report on the BYP Property Hunan Province China",
      "location_info": {
        "location": "POINT(111.306111 27.375833)",
        "crs": "WGS84",
        "country": "China",
        "state_or_province": "Hunan Province"
      }
    } 
  ]
}
``` 


In [30]:
# Parse the assistant's reply; when no JSON object is found, fall back to the template itself.
mineral_site_json = extract_json_strings(ans)
if mineral_site_json is None:
    mineral_site_json = json.loads(site_format)
    
# Drop top-level keys whose value is blank or the "POINT()" placeholder.
mineral_site_json = clean_mineral_site_json(mineral_site_json)
print(mineral_site_json)


{'MineralSite': [{'source_id': 'Technical Report on the BYP Property Hunan Province China', 'record_id': 'BYP Au 6-2011.pdf', 'name': 'Technical Report on the BYP Property Hunan Province China', 'location_info': {'location': 'POINT(111.306111 27.375833)', 'crs': 'WGS84', 'country': 'China', 'state_or_province': 'Hunan Province'}}]}


## Filling out Deposit Types

In [31]:
# Lookup table from deposit type name to its Minmod ID, built from the codes CSV.
minmod_deposit_types = read_csv_to_dict("./codes/minmod_deposit_types.csv")
deposit_id = {row['Deposit type']: row['Minmod ID'] for row in minmod_deposit_types}

In [32]:
# Output shape requested from the first pass: deposit types by free-text name.
deposit_format = """
{
  "deposit_type": [
    {
      "id":  "deposit type"
    }
  ]
}
"""
# Output shape requested from the second pass: deposit types as resource URLs.
deposit_format_correct = """
{
  "deposit_type": [
    {
      "id":  "https://minmod.isi.edu/resource/deposit_id"
    }
  ]
}
"""
# First pass: ask the assistant to name the deposit types found in the document.
deposit_instructions = f"""Identify the deposit types from the attached document. Note that the main
commodity in this paper is {os.environ.get('commodity')}.The output was to be formatted in the JSON structure Deposit_Type
{deposit_format}.  Please return the filled in Deposit_Type json Structure or 
leave the list empty if there are No matching deposit types. Return only the json structure.
"""
# Second pass: map the named deposit types onto IDs from deposit_id.
# __DEPOSIT_TYPES__ is a placeholder meant to be replaced with the first-pass result before use.
check_deposit_instructions = f"""Given this json structure with deposit types __DEPOSIT_TYPES__, check that each deposit is in
the acceptable list of deposits or there is a deposit type that appears to be close. Update the 
deposit type name with the correct ID from this given list {deposit_id}. The return format
should only be the JSON structure: {deposit_format_correct} where deposit_id is changed to the correct ID and the https url is still included.
If there is no match return an empty list for deposit_type. 
"""


In [33]:
# print(deposit_instructions)

In [34]:
# Start a run on the existing thread that asks for the deposit types.
print("Creating the run")
run = client.beta.threads.runs.create(
    instructions=deposit_instructions,
    assistant_id=assistant_id,
    thread_id=thread_id,
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

Creating the run
Current run id = run_IQJ4BgSj9OWsYkjatFTMqT6K thread_id = thread_LW1JHqa7jYCLxX9ysHdz68LS


In [35]:
# Wait for the deposit-type run to complete and keep the assistant's latest reply.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

Retrieving the response

Checking run status: in_progress
Run is completed. Printing the entire thread now in sequential order 

Most run run_IQJ4BgSj9OWsYkjatFTMqT6K response: ```json
{
  "deposit_type": [
    {
      "id": "stratabound mineralization"
    }
  ]
}
``` 


In [36]:
# Parse the first-pass deposit types (free-text names) from the reply.
deposit_types_json = extract_json_strings(ans)
print(f"deposit types: {deposit_types_json}")

deposit types: {'deposit_type': [{'id': 'stratabound mineralization'}]}


In [37]:
# Second pass: when the first pass found deposit types, ask the assistant to map
# them onto known Minmod IDs; otherwise keep an empty list.
if deposit_types_json is not None and deposit_types_json.get('deposit_type'):
    print("Creating the run")
    run = client.beta.threads.runs.create(
      thread_id=thread_id,
      assistant_id=assistant_id,
      # The placeholder in check_deposit_instructions is spelled __DEPOSIT_TYPES__ (plural).
      instructions=check_deposit_instructions.replace("__DEPOSIT_TYPES__", str(deposit_types_json))
    )
    print(f"Current run id = {run.id} thread_id = {thread_id}")
    
    ans = get_assistant_response(thread_id, run.id)
    # Fall back to an empty list when the reply contains no JSON object.
    deposit_types_json = extract_json_strings(ans) or {'deposit_type': []}
    
    print("Updated file: \n",deposit_types_json)
else:
    # Must be a dict, not a string: later cells index it with ['deposit_type'].
    deposit_types_json = {'deposit_type': []}

Creating the run
Current run id = run_QMUAhkFKkga5bccCS5nedfCu thread_id = thread_LW1JHqa7jYCLxX9ysHdz68LS
Checking run status: in_progress
Run is completed. Printing the entire thread now in sequential order 

Most run run_QMUAhkFKkga5bccCS5nedfCu response: ```json
{
  "deposit_type": [
    {
      "id": "https://minmod.isi.edu/resource/Q435"
    }
  ]
}
``` 
Updated file: 
 {'deposit_type': [{'id': 'https://minmod.isi.edu/resource/Q435'}]}


In [38]:
print(deposit_types_json)

{'deposit_type': [{'id': 'https://minmod.isi.edu/resource/Q435'}]}


## Filling out Mineral Inventory

In [39]:
# Lookup table from commodity name to its Minmod ID, built from the codes CSV.
minmod_commodities = read_csv_to_dict("./codes/minmod_commodities.csv")
commodities = {row['CommodityinGeoKb']: row['minmod_id'] for row in minmod_commodities}

In [40]:
# Lookup table from unit name, and from the alias cell, to the unit's Minmod ID.
# NOTE(review): the whole 'unit aliases' cell is used as one key; confirm it holds a single alias per row.
minmod_units = read_csv_to_dict("./codes/minmod_units.csv")
correct_units = {}
for unit_row in minmod_units:
    unit_id = unit_row['minmod_id']
    for name_column in ('unit name', 'unit aliases'):
        correct_units[unit_row[name_column]] = unit_id

In [41]:
# Document stub embedded in every inventory entry's reference.
# NOTE(review): `document_dict` was used below but not defined in any visible cell,
# so a fresh kernel raised NameError. The fields here are reconstructed from the
# "document" object in the recorded assistant output - confirm against the intended schema.
document_dict = json.dumps(
    {"id": "doc_id_0", "title": doc_name, "uri": "", "month": doc_month, "year": doc_year},
    ensure_ascii=False,
)

# MineralInventory template shown to the assistant as the required output format.
inventory_format = f"""
  {{ "MineralInventory": [
        {{
          "id": "0",
          "commodity":"{os.environ.get('commodity')}",
          "category": "classification category",
          "ore": {{
            "ore_unit": "unit",
            "ore_value": "value"
          }},
          "grade": {{
            "grade_unit": "unit",
            "grade_value": "value"
          }},
          "cutoff_grade": {{
            "cutoff_grade_unit": "unit",
            "cutoff_grade_value": "value"
          }},
          "contained_metal": "ore_value * grade_value",
          "reference": {{
            "id": "1",
            "document": {document_dict},
            "page_info": [
              {{
                "page": "page number",
                "bounding_box": {{
                  "x_min": "",
                  "x_max": "",
                  "y_min": "",
                  "y_max": ""
                }}
              }}
            ]
          }},
          "date": "{doc_date}"
        }}
   ] }}
"""

In [75]:
# Step 1 prompt: list the resource / reserve tables dated closest to the document year.
find_relevant_table_instructions = f"""
Can you go through the document, find any tables that discuss mineral resources or mineral reserves 
from the same year or closets to the year {doc_year}? 
Return the list of tables as a json structure: {{"Tables": ["Table 1 Name", "Table 2 Name" ]}}. Only 
return the json structure.
"""

# Step 2 prompt: extract inventory rows from those tables.
# __RELEVANT__ is replaced with the step 1 table list before the run is created.
find_info_instructions = f""" From this list of tables: __RELEVANT__, extract create a JSON structure that 
captures all {os.environ.get('commodity')} resource estimate data from these tables.
Each relevant row from the data should have an entry in the JSON should include the following fields:
category: The category of resource must be either INFERRED, INDICATED, MEASURED, PROBABLE, PROVEN, ORIGINAL_RESOURCE, EXTRACTED, CUMULATIVE_EXTRACTED. 
{os.environ.get('sign')} Cut-Off: The threshold grade used to determine the economic viability of mining the {os.environ.get('commodity')}
resource (this might not be provided in some tables).{os.environ.get('sign')} Tonnage: The calculated or
estimated tonnage for the resource. {os.environ.get('sign')} Grade %: The concentration of 
{os.environ.get('commodity')} in the resource.  Contained_metal is the tonnage value times grade value 
and then divided by 100 but should be reported as the final number. 
The page number should be found by starting from the first page of the document as page 1 and incrementing by 1 until the referenced 
table is found in the document.
Note: The term "cut-off" is sometimes not provided within the resource estimates.
This is the final format that the json structure should follow {inventory_format}, do not put any additional comments in the
format outside of the keys and values. 
Any unknown values should be left as ''. Unit values should either be tonnes, million tonnes, gram per tonne,
or percent.
"""

In [43]:
# Start a run on the existing thread that asks for the relevant resource tables.
print("Creating the thread")
run = client.beta.threads.runs.create(
    instructions=find_relevant_table_instructions,
    assistant_id=assistant_id,
    thread_id=thread_id,
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

Creating the thread
Current run id = run_pLfUP1We5Vymvk9nrhOcPdnN thread_id = thread_LW1JHqa7jYCLxX9ysHdz68LS


In [44]:
# Wait for the table-listing run to complete and keep the assistant's latest reply.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

Retrieving the response

Checking run status: queued
Run is completed. Printing the entire thread now in sequential order 

Most run run_pLfUP1We5Vymvk9nrhOcPdnN response: ```json
{
  "Tables": [
    "Table 17.1  Statistics for Raw Data",
    "Table 16.15  Mass Balances of Pb Zn Flotation Tests (Option 3)",
    "Table 16.16  PbS Conc Composition(%）(Option 3)",
    "Table 16.17  ZnS Conc Composition(%）(Option 3)",
    "Table 16.18  Tails Composition(%）(Option 3)"
  ]
}
``` 


In [45]:
# Parse the table list; it is later substituted into the extraction prompt.
relevant_tables = extract_json_strings(ans)
print(relevant_tables)

{'Tables': ['Table 17.1  Statistics for Raw Data', 'Table 16.15  Mass Balances of Pb Zn Flotation Tests (Option 3)', 'Table 16.16  PbS Conc Composition(%）(Option 3)', 'Table 16.17  ZnS Conc Composition(%）(Option 3)', 'Table 16.18  Tails Composition(%）(Option 3)']}


In [46]:
# Start a run that extracts inventory rows, with the table list filled into the prompt.
print("Creating the thread")
extraction_prompt = find_info_instructions.replace("__RELEVANT__", str(relevant_tables))
run = client.beta.threads.runs.create(
    instructions=extraction_prompt,
    assistant_id=assistant_id,
    thread_id=thread_id,
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

Creating the thread
Current run id = run_ZZlLCiw8ujdCRMdLpAnqViNi thread_id = thread_LW1JHqa7jYCLxX9ysHdz68LS


In [47]:
# Wait for the extraction run to complete and keep the assistant's latest reply.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

Retrieving the response

Checking run status: queued
Run is completed. Printing the entire thread now in sequential order 

Most run run_ZZlLCiw8ujdCRMdLpAnqViNi response: ```json
{
  "MineralInventory": [
    {
      "id": "0",
      "commodity": "zinc",
      "category": "inferred",
      "ore": {
        "ore_unit": "tonnes",
        "ore_value": "9590420"
      },
      "grade": {
        "grade_unit": "percent",
        "grade_value": "1.97"
      },
      "cutoff_grade": {
        "cutoff_grade_unit": "percent",
        "cutoff_grade_value": ""
      },
      "contained_metal": "189020.474",
      "reference": {
        "id": "0",
        "document": {
          "id": "doc_id_0",
          "title": "Technical Report on the BYP Property Hunan Province China",
          "uri": "",
          "month": "6",
          "year": "2011"
        },
        "page_info": [
          {
            "page": "78",
            "bounding_box": {
              "x_min": "",
              "x_max": "",

In [49]:
# Parse the MineralInventory JSON object from the assistant's reply.
mineral_inventory_json = extract_json_strings(ans)

In [50]:
print(mineral_inventory_json)

{'MineralInventory': [{'id': '0', 'commodity': 'zinc', 'category': 'inferred', 'ore': {'ore_unit': 'tonnes', 'ore_value': '9590420'}, 'grade': {'grade_unit': 'percent', 'grade_value': '1.97'}, 'cutoff_grade': {'cutoff_grade_unit': 'percent', 'cutoff_grade_value': ''}, 'contained_metal': '189020.474', 'reference': {'id': '0', 'document': {'id': 'doc_id_0', 'title': 'Technical Report on the BYP Property Hunan Province China', 'uri': '', 'month': '6', 'year': '2011'}, 'page_info': [{'page': '78', 'bounding_box': {'x_min': '', 'x_max': '', 'y_min': '', 'y_max': ''}}]}, 'date': '2011-06'}, {'id': '1', 'commodity': 'zinc', 'category': 'inferred', 'ore': {'ore_unit': 'tonnes', 'ore_value': '9007320'}, 'grade': {'grade_unit': 'percent', 'grade_value': '2.05'}, 'cutoff_grade': {'cutoff_grade_unit': 'percent', 'cutoff_grade_value': ''}, 'contained_metal': '184650.06', 'reference': {'id': '1', 'document': {'id': 'doc_id_0', 'title': 'Technical Report on the BYP Property Hunan Province China', 'ur

In [51]:
# Normalise the inventory in place: units, category and commodity become Minmod resource URLs.
mineral_inventory_json = clean_mineral_inventory_json(mineral_inventory_json, commodities, correct_units)

ore_unit tonnes
ore_value 9590420
grade_unit percent
grade_value 1.97
cutoff_grade_unit percent
cutoff_grade_value 
ore_unit tonnes
ore_value 9007320
grade_unit percent
grade_value 2.05
cutoff_grade_unit percent
cutoff_grade_value 
ore_unit tonnes
ore_value 5330220
grade_unit percent
grade_value 2.51
cutoff_grade_unit percent
cutoff_grade_value 
ore_unit tonnes
ore_value 2025380
grade_unit percent
grade_value 3.03
cutoff_grade_unit percent
cutoff_grade_value 


In [52]:
print(mineral_inventory_json)

{'MineralInventory': [{'id': '0', 'commodity': 'https://minmod.isi.edu/resource/Q589', 'category': 'https://minmod.isi.edu/resource/inferred', 'ore': {'ore_unit': 'https://minmod.isi.edu/resource/Q200', 'ore_value': '9590420'}, 'grade': {'grade_unit': 'https://minmod.isi.edu/resource/Q201', 'grade_value': '1.97'}, 'cutoff_grade': {'cutoff_grade_unit': 'https://minmod.isi.edu/resource/Q201', 'cutoff_grade_value': ''}, 'contained_metal': '189020.474', 'reference': {'id': '0', 'document': {'id': 'doc_id_0', 'title': 'Technical Report on the BYP Property Hunan Province China', 'uri': '', 'month': '6', 'year': '2011'}, 'page_info': [{'page': '78', 'bounding_box': {'x_min': '', 'x_max': '', 'y_min': '', 'y_max': ''}}]}, 'date': '2011-06'}, {'id': '1', 'commodity': 'https://minmod.isi.edu/resource/Q589', 'category': 'https://minmod.isi.edu/resource/inferred', 'ore': {'ore_unit': 'https://minmod.isi.edu/resource/Q200', 'ore_value': '9007320'}, 'grade': {'grade_unit': 'https://minmod.isi.edu/re

## Combine json structures into one and write 

In [54]:
additional = mineral_site_json

In [55]:
# Combine dictionaries into one
print(mineral_site_json,deposit_types_json, mineral_inventory_json)

{'MineralSite': [{'source_id': 'Technical Report on the BYP Property Hunan Province China', 'record_id': 'BYP Au 6-2011.pdf', 'name': 'Technical Report on the BYP Property Hunan Province China', 'location_info': {'location': 'POINT(111.306111 27.375833)', 'crs': 'WGS84', 'country': 'China', 'state_or_province': 'Hunan Province'}}]} {'deposit_type': [{'id': 'https://minmod.isi.edu/resource/Q435'}]} {'MineralInventory': [{'id': '0', 'commodity': 'https://minmod.isi.edu/resource/Q589', 'category': 'https://minmod.isi.edu/resource/inferred', 'ore': {'ore_unit': 'https://minmod.isi.edu/resource/Q200', 'ore_value': '9590420'}, 'grade': {'grade_unit': 'https://minmod.isi.edu/resource/Q201', 'grade_value': '1.97'}, 'cutoff_grade': {'cutoff_grade_unit': 'https://minmod.isi.edu/resource/Q201', 'cutoff_grade_value': ''}, 'contained_metal': '189020.474', 'reference': {'id': '0', 'document': {'id': 'doc_id_0', 'title': 'Technical Report on the BYP Property Hunan Province China', 'uri': '', 'month':

In [72]:
# Attach the inventory and deposit types to the first site record, then stamp the run time.
site_record = mineral_site_json["MineralSite"][0]
site_record['MineralInventory'] = mineral_inventory_json['MineralInventory']
site_record['deposit_type'] = deposit_types_json['deposit_type']
print(mineral_site_json)
current_datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")

{'MineralSite': [{'source_id': 'Technical Report on the BYP Property Hunan Province China', 'record_id': 'BYP Au 6-2011.pdf', 'name': 'Technical Report on the BYP Property Hunan Province China', 'location_info': {'location': 'POINT(111.306111 27.375833)', 'crs': 'WGS84', 'country': 'China', 'state_or_province': 'Hunan Province'}, 'MineralInventory': [{'id': '0', 'commodity': 'https://minmod.isi.edu/resource/Q589', 'category': 'https://minmod.isi.edu/resource/inferred', 'ore': {'ore_unit': 'https://minmod.isi.edu/resource/Q200', 'ore_value': '9590420'}, 'grade': {'grade_unit': 'https://minmod.isi.edu/resource/Q201', 'grade_value': '1.97'}, 'cutoff_grade': {'cutoff_grade_unit': 'https://minmod.isi.edu/resource/Q201', 'cutoff_grade_value': ''}, 'contained_metal': '189020.474', 'reference': {'id': '0', 'document': {'id': 'doc_id_0', 'title': 'Technical Report on the BYP Property Hunan Province China', 'uri': '', 'month': '6', 'year': '2011'}, 'page_info': [{'page': '78', 'bounding_box': {'

In [74]:
# Specify the path to the output JSON file
# file_name[:-4] drops the last four characters, i.e. the ".pdf" extension of the report name.
output_file_path = f'./extracted/{file_name[:-4]}_summary_{current_datetime_str}.json'

def convert_int_or_float(obj):
    """Recursively turn numeric-looking strings into numbers.

    Dicts and lists are rebuilt with every value or item converted. A string
    made only of decimal digits becomes an ``int``; a string of decimal digits
    with a single '.' becomes a ``float``. Everything else is returned as is.

    ``str.isdecimal`` is used rather than ``str.isdigit`` because ``isdigit``
    also accepts characters such as '²' that ``int()`` cannot parse.

    Args:
        obj: A dict, list, string, number or any other value.

    Returns:
        The converted structure; non-container inputs are returned unchanged
        unless they are numeric strings.
    """
    if isinstance(obj, dict):
        return {key: convert_int_or_float(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_int_or_float(item) for item in obj]
    if isinstance(obj, str):
        if obj.isdecimal():
            return int(obj)
        # Remove at most one '.' so "1.97" qualifies but "1.2.3" does not.
        if obj.replace('.', '', 1).isdecimal():
            return float(obj)
    return obj

# Writing to a file using json.dump with custom serialization function
# Create the output folder first so the write does not fail on a fresh checkout.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(output_file_path, "w") as json_file:
    json.dump(convert_int_or_float(mineral_site_json), json_file, indent=2)


print(f"Combined data written to {output_file_path}")

Combined data written to ./extracted/BYP Au 6-2011_summary_20240108_130752.json


## Deletion
At the end, delete the assistant so that no outstanding files or assistants remain that could incur further charges.

In [None]:
# Delete the assistant and report whether the API answered with HTTP 200.
resp_code = delete_assistant(assistant_id)
deletion_message = f"Deleted assistant {assistant_id}" if resp_code == 200 else "Deletion FAILED"
print(deletion_message)
    

# Notes
1. Should try function calling