In [None]:
import pandas as pd
import openai
import requests
from openai import OpenAI
import json
import os
import time
import ast
import warnings
import re
import csv
import requests
from pyzotero import zotero
import copy
from datetime import datetime
from fuzzywuzzy import process
from settings import API_KEY, LIBRARY_ID, LIBRARY_TYPE, ZOLTERO_KEY 
import numpy as np
# Ignore the specific UserWarning from openpyxl
warnings.filterwarnings(action='ignore', category=UserWarning, module='openpyxl')

In [None]:
# !pip install --upgrade openai --quiet

In [None]:
# !pip show openai

## References:
Links: 
- https://platform.openai.com/docs/assistants/tools/supported-files
- https://github.com/davideuler/awesome-assistant-api/blob/main/GPT-PPT-Slides-Generator.ipynb

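Goal: This notebook uses OpenAI assistants to read a mining report and fill out the structured records built below: the document reference, the mineral site, the deposit types, and the mineral inventory.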

In [None]:
# Shared OpenAI client used by every cell below; API_KEY is imported from settings.
client = openai.OpenAI(api_key = API_KEY)

## All Functions
Create the assistants and the needed functions and prompts.

In [None]:
def delete_assistant(assistant_id):
    """Delete an assistant through the OpenAI REST endpoint.

    Issues an HTTP DELETE against the assistants URL, authenticating with the
    module-level ``API_KEY``.

    Args:
        assistant_id: Identifier of the assistant to delete.

    Returns:
        The HTTP status code of the DELETE response.
    """
    request_headers = dict([
        ("Content-Type", "application/json"),
        ("Authorization", f"Bearer {API_KEY}"),
        ("OpenAI-Beta", "assistants=v1"),
    ])
    endpoint = f"https://api.openai.com/v1/assistants/{assistant_id}"

    # Callers only inspect the status code, so the response body is not kept.
    return requests.delete(endpoint, headers=request_headers).status_code


In [None]:
def cancel_assistant_run(thread_id,run_id):
    """Request cancellation of a run through the OpenAI REST endpoint.

    Args:
        thread_id: Identifier of the thread that owns the run.
        run_id: Identifier of the run to cancel.

    Returns:
        The decoded JSON body of the cancel response.
    """
    request_headers = dict([
        ("Authorization", f"Bearer {API_KEY}"),
        ("Content-Type", "application/json"),
        ("OpenAI-Beta", "assistants=v1"),
    ])
    endpoint = f"https://api.openai.com/v1/threads/{thread_id}/runs/{run_id}/cancel"

    return requests.post(endpoint, headers=request_headers).json()

In [None]:
def get_assistant_response(thread_id, run_id):
    """Wait for a run to complete and return the newest message text of its thread.

    The run is polled every 15 seconds until its status is "completed".

    Args:
        thread_id: Identifier of the thread the run belongs to.
        run_id: Identifier of the run to wait for.

    Returns:
        The text value of the first content item of the first message returned
        by the thread's message listing.

    Raises:
        RuntimeError: If the run ends in a status that can never become
            "completed". Previously the loop would poll forever in that case.
    """
    # Statuses after which the run will not progress to "completed" any more.
    dead_end_statuses = {"failed", "cancelled", "expired", "incomplete"}

    run = client.beta.threads.runs.retrieve(thread_id=thread_id,run_id=run_id)
    print(f"Checking run status: {run.status}")
    while run.status != "completed":
        if run.status in dead_end_statuses:
            raise RuntimeError(
                f"Run {run_id} on thread {thread_id} ended with status '{run.status}'"
            )
        time.sleep(15)
        run = client.beta.threads.runs.retrieve(thread_id=thread_id,run_id=run_id)

    print("Run is completed. Printing the entire thread now in sequential order \n")
    messages = client.beta.threads.messages.list(thread_id=thread_id)

    # The listing is read newest-first here: index 0 is treated as the latest reply.
    most_recent = messages.data[0].content[0].text.value
    print(f"Most run {run_id} response: {most_recent} ")
    return most_recent

In [None]:
def create_assistant(file_id):
    """Create a retrieval assistant and a thread that both reference an uploaded file.

    The assistant instructions come from the module-level ``instructions``
    template, with ``__COMMODITY__`` and ``__SIGN__`` replaced by the
    ``commodity`` and ``sign`` environment variables.

    Args:
        file_id: Identifier of a file already uploaded to OpenAI.

    Returns:
        A ``(thread_id, assistant_id)`` tuple.
    """
    commodity_name = os.environ.get('commodity')
    commodity_sign = os.environ.get('sign')
    filled_instructions = instructions.replace("__COMMODITY__", commodity_name).replace("__SIGN__", commodity_sign)

    assistant = client.beta.assistants.create(
        name="Get Extraction",
        instructions=filled_instructions,
        tools=[{"type": "retrieval"}],
        model="gpt-4-1106-preview",
        file_ids=[file_id],
    )

    # Seed the thread with one user message that carries the report file.
    opening_message = {
        "role": "user",
        "content": "You are a geology expert and you are very good in understanding mining reports, which is attached.",
        "file_ids": [file_id],
    }
    thread = client.beta.threads.create(messages=[opening_message])

    print("Created an Assistant")
    return thread.id, assistant.id

In [None]:
def check_file(thread_id, assistant_id, max_retries=3):
    """Ask the assistant whether it can read the uploaded file, re-uploading on "no".

    When the assistant answers "no", the assistant is deleted, the report named
    by the ``file_path`` environment variable is uploaded again, a fresh
    assistant and thread are created, and the check is repeated.

    Args:
        thread_id: Thread to run the check on.
        assistant_id: Assistant to run the check with.
        max_retries: How many re-uploads may still be attempted before giving up.

    Returns:
        A ``(thread_id, assistant_id)`` tuple for the assistant that passed the check.

    Raises:
        RuntimeError: If the assistant still answers "no" after ``max_retries``
            re-uploads (the original recursion had no bound).
    """
    file_instructions = """If the file was correctly uploaded and can be read return YES otherwise return NO. 
                        Only return the Yes or No answer.
                        """
    run = client.beta.threads.runs.create(
      thread_id=thread_id,
      assistant_id=assistant_id,
      instructions= file_instructions
    )
    print(f"Current run id = {run.id} thread_id = {thread_id}")

    ans = get_assistant_response(thread_id, run.id)
    print(f"Response: {ans}")

    # Normalise the reply so "No.", "NO\n" or '"no"' are recognised as well;
    # an exact comparison against "no" silently treated them as success.
    verdict = ans.strip().strip(".!\"'").lower()
    if verdict != "no":
        print("File was correctly uploaded")
        return thread_id, assistant_id

    if max_retries <= 0:
        raise RuntimeError("The assistant could not read the uploaded file after several re-uploads.")

    print("We need to reload file.")
    response_code = delete_assistant(assistant_id)
    if response_code == 200:
        print(f"Deleted assistant {assistant_id}")

    # The context manager closes the report once the upload call has returned.
    with open(f"./reports/{os.environ.get('file_path')}", "rb") as report_file:
        file = client.files.create(
              file=report_file,
              purpose='assistants'
            )
    new_thread_id, new_assistant_id =  create_assistant(file.id)
    return check_file(new_thread_id, new_assistant_id, max_retries - 1)

In [None]:
def extract_json_strings(input_string, remove_comments = False):
    """Parse the first brace-balanced JSON object found in a block of text.

    Args:
        input_string: Text that may contain a JSON object surrounded by other text.
        remove_comments: When True, ``//``, ``/* */`` and ``#`` comments are
            stripped with a regular expression before the object is located.
            NOTE(review): the pattern is not string-aware, so a value that
            contains ``//`` (for example a URL) is truncated as well.

    Returns:
        The decoded object, or None when the text has no ``{`` or the braces
        never balance.

    Raises:
        json.JSONDecodeError: If the balanced span is not valid JSON.
    """
    if remove_comments:
        input_string = re.sub(r'(?<!["\'])//.*?\n|/\*.*?\*/|(#.*?\n)', '', input_string)

    # Locate the opening brace only AFTER comments are removed; an index taken
    # from the unstripped text no longer points at the brace.
    start = input_string.find('{')
    if start == -1:
        return None

    depth = 0
    in_string = False
    escaped = False
    for i in range(start, len(input_string)):
        ch = input_string[i]
        if in_string:
            # Braces inside a string value must not affect the depth count.
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return json.loads(input_string[start:i+1])
    return None

In [None]:
def read_csv_to_dict(file_path):
    """Read a CSV file with a header row into a list of dictionaries.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        One dict per data row, keyed by the header names.
    """
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(file_path, mode='r', newline='') as csv_file:
        return [dict(row) for row in csv.DictReader(csv_file)]

In [None]:
def is_array(s):
    return s.startswith('[') and s.endswith(']')

def clean_document_dict(document_dict_temp):
    """Normalise the document-reference dict returned by the assistant (in place).

    * Keys whose value is a blank string are removed, except ``doi`` and
      ``title``, which are always overwritten with known values.
    * ``title`` is replaced by the ``title`` environment variable.
    * ``doi`` is replaced by the ``url`` environment variable.
    * ``authors`` given as a string (``'["A", "B"]'`` or ``'A, B'``) is split
      into a list of names.

    Args:
        document_dict_temp: Dict decoded from the assistant's JSON reply.

    Returns:
        The same dict object after cleaning.
    """
    key_to_remove = []

    for key, value in document_dict_temp.items():
        if isinstance(value, str) and value.strip() == "" and key not in ("doi", "title"):
            key_to_remove.append(key)

        if key == 'title':
            document_dict_temp[key] = os.environ.get("title")
        elif key == 'doi':
            if value != os.environ.get("url"):
                document_dict_temp[key] = os.environ.get("url")
        elif key == 'authors' and isinstance(value, str):
            stripped = value.strip()
            if not stripped:
                # Blank authors is already queued for removal; indexing the
                # first character here used to raise IndexError.
                continue
            if stripped.startswith("["):
                # Drop the surrounding brackets of the stripped text, then the quotes.
                raw_names = stripped[1:-1].split(',')
                names = [item.strip().replace('"', "").strip() for item in raw_names]
            else:
                names = [item.strip() for item in stripped.split(',')]
            # "[]" would otherwise produce a single empty author name.
            document_dict_temp[key] = [name for name in names if name]

    for key in key_to_remove:
        del document_dict_temp[key]

    return document_dict_temp

In [None]:
def clean_mineral_site_json(json_str):
    """Normalise the first MineralSite record returned by the assistant (in place).

    * Top-level keys whose value is a blank string are removed, except
      ``source_id`` and ``name``, which are always overwritten.
    * ``name`` is replaced by the ``title`` environment variable and
      ``source_id`` by the ``url`` environment variable.
    * Inside ``location_info``, blank values and the empty ``POINT()``
      placeholder are removed. ``crs`` is removed together with ``location``
      only, because it describes that geometry; it is no longer dropped just
      because ``country`` or ``state_or_province`` is blank.

    Args:
        json_str: Dict of the form ``{"MineralSite": [ {...} ]}``.

    Returns:
        The same dict object after cleaning.
    """
    site = json_str["MineralSite"][0]
    outer_blank = []
    inner_blank = []

    for key, value in site.items():
        if isinstance(value, str) and value.strip() == "" and key not in ("source_id", "name"):
            outer_blank.append(key)

        if key == 'name':
            site[key] = os.environ.get("title")
        elif key == 'source_id':
            if value != os.environ.get("url"):
                site[key] = os.environ.get("url")
        elif key == 'location_info' and isinstance(value, dict):
            for new_key, new_value in value.items():
                if isinstance(new_value, str) and new_value.strip() in ("", "POINT()"):
                    inner_blank.append(new_key)
                    if new_key == 'location':
                        inner_blank.append('crs')

    for key in outer_blank:
        site.pop(key, None)

    # pop() with a default tolerates a key that is queued twice or already gone;
    # the previous `del` raised KeyError on the second blank inner field.
    location_info = site.get('location_info')
    if isinstance(location_info, dict):
        for inner_key in inner_blank:
            location_info.pop(inner_key, None)

    return json_str
    

In [None]:
def find_best_match(input_str, list_to_match, threshold=75):
    """Return the fuzzy best match for ``input_str`` among ``list_to_match``.

    Args:
        input_str: Text to look up.
        list_to_match: Candidate strings.
        threshold: Minimum score a match must reach to be accepted.

    Returns:
        The best-matching candidate, or None when there are no candidates or
        the best score is below ``threshold``.
    """
    result = process.extractOne(input_str, list_to_match)

    # extractOne yields None (not a tuple) when nothing can be scored, e.g. an
    # empty candidate list; unpacking it directly raised TypeError.
    if result is None:
        return None

    best_match, score = result[0], result[1]
    return best_match if score >= threshold else None


def _parse_number(text):
    """Convert text such as '1,234.5' to float; return None when it is not numeric."""
    try:
        return float(str(text).replace(",", "").strip())
    except ValueError:
        return None


def create_mineral_inventory(extraction_dict, inventory_format, relevant_tables, unit_dict):
    """Turn extracted table rows into MineralInventory records.

    Each row of ``extraction_dict['extractions']`` is mapped onto a deep copy
    of ``inventory_format``. Keys are matched case-insensitively by the
    fragments ``category``, ``zone``, ``cut``, ``tonnage``, ``contained``,
    ``grade`` and ``table``.

    Fixes over the previous version:
      * contained metal was multiplied by 1000 a second time when the tonnage
        was given in thousands, although it is computed from the tonnage that
        was already converted to tonnes;
      * an unmatched tonnage unit raised AttributeError (``None.lower()``);
      * non-string values (numbers, None) crashed on ``.lower()``;
      * tonnage scaling and contained metal are resolved after the whole row
        is read, so they no longer depend on the key order of the row;
      * unparsable numbers yield empty fields instead of ValueError.

    Args:
        extraction_dict: ``{"extractions": [row, ...]}`` where each row is a dict.
        inventory_format: Template record; it is never mutated.
        relevant_tables: ``{"Tables": {table_name: page_number}}``.
        unit_dict: Maps unit names to ids; must contain ``percent`` and ``tonnes``.

    Returns:
        ``{"MineralInventory": [record, ...]}``.
    """
    kt_values = ["k","kt", "000s tonnes", "thousand tonnes", "thousands", "000s" , "000 tonnes"]
    acceptable_values = ["inferred", "indicated","measured", "probable",
                         "proven", "proven+probable", "inferred+indicated", "inferred+measured",
                         "measured+indicated"]
    url_str = "https://minmod.isi.edu/resource/"
    output_str = {"MineralInventory":[]}
    grade_unit_list = list(unit_dict.keys())

    def unit_uri(raw_unit, candidates):
        """Resolve a free-text unit to its resource URL, or '' when nothing matches."""
        if not raw_unit or not candidates:
            return ''
        found_value = find_best_match(raw_unit, candidates)
        if found_value is None:
            return ''
        return url_str + unit_dict[found_value]

    for inner_dict in extraction_dict['extractions']:
        current_inventory_format = copy.deepcopy(inventory_format)
        tonnage_unit = None       # raw tonnage unit text, resolved after the row is read
        wants_contained = False   # the row carried a contained-metal key

        for key, value in inner_dict.items():
            key_lc = key.lower()
            value = "" if value is None else str(value).strip()
            value_lc = value.lower()

            if 'category' in key_lc:
                current_inventory_format['category'] = []
                if value_lc in acceptable_values:
                    # "a+b" expands to one URL per part; a single category has one part.
                    for val in value_lc.split("+"):
                        current_inventory_format['category'].append(url_str + val)

            elif 'zone' in key_lc:
                current_inventory_format['zone'] = value_lc

            elif 'cut' in key_lc and 'unit' not in key_lc:
                current_inventory_format['cutoff_grade']['grade_value'] = value_lc

            elif 'cut' in key_lc:
                if value == '%':
                    current_inventory_format['cutoff_grade']['grade_unit'] = url_str + unit_dict['percent']
                else:
                    # NOTE(review): the [5:] slice skips the first five unit keys and
                    # so depends on the order of the units file; confirm the intent.
                    current_inventory_format['cutoff_grade']['grade_unit'] = unit_uri(value, grade_unit_list[5:])

            elif 'tonnage' in key_lc and 'unit' not in key_lc:
                current_inventory_format['ore']['ore_value'] = value_lc

            elif 'tonnage' in key_lc:
                tonnage_unit = value

            elif 'contained' in key_lc:
                wants_contained = True

            elif 'grade' in key_lc:
                current_inventory_format['grade']['grade_unit'] = url_str + unit_dict['percent']
                current_inventory_format['grade']['grade_value'] = value_lc

            elif 'table' in key_lc:
                table_names = list(relevant_tables['Tables'].keys())
                table_match = find_best_match(value_lc, table_names, threshold = 70) if table_names else None

                if table_match is not None:
                    current_inventory_format['reference']['page_info'][0]['page'] = relevant_tables['Tables'][table_match]
                else:
                    print("Need to find correct Page number for current table: ", value)

        ore = current_inventory_format['ore']
        if tonnage_unit is not None:
            if tonnage_unit.lower() in kt_values:
                # Thousands of tonnes are stored as plain tonnes.
                tonnes = _parse_number(ore['ore_value'])
                if tonnes is None:
                    print("Could not convert tonnage to tonnes: ", ore['ore_value'])
                    ore['ore_unit'] = ''
                else:
                    ore['ore_value'] = str(tonnes * 1000)
                    ore['ore_unit'] = url_str + unit_dict["tonnes"]
            else:
                ore['ore_unit'] = unit_uri(tonnage_unit, grade_unit_list)

        if wants_contained:
            # Uses the tonnage as stored above (already in tonnes when it was in
            # thousands), so no further scaling is applied.
            tonnes = _parse_number(ore['ore_value'])
            grade = _parse_number(current_inventory_format['grade']['grade_value'])
            if tonnes is None or grade is None:
                current_inventory_format['contained_metal'] = ''
            else:
                current_inventory_format['contained_metal'] = str(tonnes * grade / 100)

        if current_inventory_format['cutoff_grade']['grade_unit'] == '' and current_inventory_format['cutoff_grade']['grade_value'] == '':
            current_inventory_format.pop('cutoff_grade')

        output_str["MineralInventory"].append(current_inventory_format)

    return output_str

## Set Up
Goal: attach a file and ask it a series of questions

In [None]:
# System prompt template for every assistant; create_assistant substitutes
# __COMMODITY__ and __SIGN__. The category list now reads "measured" (was
# "measure") and the stray trailing double quote after __SIGN__ is removed.
instructions = """You are a geology expert and you are very good in understanding mining reports. You will be given 
a text from a mining report and a table name. You have to find out what are the different combinations of
classification (which is either indicated, inferred, measured, proven, probable, or total ), cut-off (represented as a decimal), tonnage (in Tonnes) and 
grade (given in %) from the given table in the text. Please extract the name of the element and place it in the output below without any additional text
Note we only care about the mineral __COMMODITY__ represented by __SIGN__
"""

In [None]:
# Sub-directory prefixes and the report file name; the upload cells open
# ./reports/<file_path>, where file_path is one of these prefixes + file_name.
au_path = "au_papers/"
mvt_path = "mvt_zinc/"
zinc_path = "zinc/"
nickel_path = "nickel/"
file_name = "HN4 & N100_Ni_10-2021.pdf"

In [None]:
# Run configuration, shared with the helper functions through environment variables.
# The last path segment of 'url' is later used as the Zotero item key.
os.environ['url'] = 'https://w3id.org/usgs/z/4530692/53GPCC8R'
os.environ['commodity'] = 'nickel'
os.environ['sign'] = 'Ni'
os.environ['file_path'] = nickel_path + file_name

In [None]:
# Look up the report in Zotero by the key at the end of the url and store its
# title in the 'title' environment variable for the cleaning helpers.
zot = zotero.Zotero(LIBRARY_ID, LIBRARY_TYPE, ZOLTERO_KEY)
file_list = os.environ.get('url').split("/")
file_key = file_list[-1]
file_item = zot.item(file_key)
os.environ['title'] = file_item['data']['title']
print(f"file_key {file_key} Title: {os.environ.get('title')}")

In [None]:
# Upload the report for use by assistants. The context manager closes the
# local file handle once the upload call has returned.
with open(f"./reports/{os.environ.get('file_path')}", "rb") as report_file:
    file = client.files.create(
      file=report_file,
      purpose='assistants'
    )

In [None]:
# Create the assistant and thread for the document-reference and mineral-site questions.
thread_id, assistant_id =create_assistant(file.id)

In [None]:
# check_file may re-upload and recreate the assistant, so take the ids it returns.
thread_id, assistant_id = check_file(thread_id, assistant_id)

## Extract Document Reference

In [None]:
# JSON template shown to the assistant for the document reference.
# The comma after the "doi" entry was missing, which made the template invalid JSON.
document_ref = f"""{{
              "title": "{os.environ.get('title')}",
              "doi" : "{os.environ.get("url")}",
              "authors": "[]",
              "year": "",
              "month": "",
              "volume": "",
              "issue": "",
              "description": ""
            }}"""

In [None]:
# Prompt asking for bibliographic details, answered in the shape of document_ref.
name_instructions = f"""Please tell me description information about the attached document such as the title, 
list of author names (ignore professional titles), year and month it was published as integers, volume, issue, and a one sentence description. 
Return the response as a json structure that follows this format {document_ref}. Only return the json structure.
Any unknown values should be returned as ""
"""

In [None]:
# Start a run on the existing thread with the document-reference prompt
# (despite the message, this creates a run, not a thread).
print("Creating the thread")
run = client.beta.threads.runs.create(
  thread_id=thread_id,
  assistant_id=assistant_id,
  instructions= name_instructions
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

In [None]:
# Blocks until the run completes, then keeps the newest message text in `ans`.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

In [None]:
# Decode and clean the document reference. clean_document_dict removes keys
# whose value is blank, so every field is read with a default instead of
# indexing, which raised KeyError whenever month or year was unknown.
document_dict_temp = extract_json_strings(ans)
if document_dict_temp is None:
    # No JSON in the reply: continue with the fields that are known locally.
    print("No JSON document reference found in the response; using title and doi only.")
    document_dict_temp = {"title": "", "doi": ""}
document_dict = clean_document_dict(document_dict_temp)
doc_month = document_dict.get('month', '')
doc_year = document_dict.get('year', '')
doc_name = document_dict.get('title', os.environ.get('title'))
# "year-month" when both are known, otherwise whichever part exists.
doc_date = "-".join(str(part) for part in (doc_year, doc_month) if part not in ("", None))

In [None]:
# Show the cleaned reference for a manual check.
print(f"Here is the reference material for the document: \n {document_dict}")

## Filling out Mineral Site

In [None]:
## json strings
# Template for the MineralSite answer. The record is now wrapped in its own
# braces inside the list and the comma after "crs" is present; the previous
# text was not valid JSON.
site_format = f"""
  {{ "MineralSite":[
    {{
      "source_id": "{os.environ.get("url")}",
      "record_id": "1",
      "name": "{doc_name}",
      "location_info": {{
        "location": "POINT()",
        "crs": "WGS84",
        "country": "",
        "state_or_province": ""
        }}
    }}
    ] }}
"""

In [None]:
# Prompt asking for the site location as a WKT point, answered in the shape of site_format.
loc_instructions = f"""Find the geographic location of the mining 
site in the document and put it in geographic coordinates using latitude and longitude that will then be converted to
geometry point structure using WGS84 standard. If there are multiple points the format will look like: 
"MULTIPOINT(long1 lat1,long2 lat2, ..)". If there is no location information or if the correct conversions cannot be made replace the value as empty strings. 
Fill out the JSON structure Mineral Site based on the geographic information found.
Here is an example format: Mineral Site: {site_format}.
Return only the filled in MineralSite Json Structure with the given keys and found values. Do not 
add any additional comments and do not use // within the JSON structure. Only return one Json structure.
"""

In [None]:
# Start a run on the existing thread with the location prompt.
print("Creating the thread")
run = client.beta.threads.runs.create(
  thread_id=thread_id,
  assistant_id=assistant_id,
  instructions=loc_instructions
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

In [None]:
# Blocks until the run completes, then keeps the newest message text in `ans`.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

In [None]:
# Decode the MineralSite answer. When the reply has no usable record, fall back
# to an empty record built directly as a dict rather than by parsing the
# prompt template text.
mineral_site_json = extract_json_strings(ans)
if mineral_site_json is None or not mineral_site_json.get("MineralSite"):
    mineral_site_json = {
        "MineralSite": [
            {
                "source_id": os.environ.get("url"),
                "record_id": "1",
                "name": doc_name,
                "location_info": {
                    "location": "POINT()",
                    "crs": "WGS84",
                    "country": "",
                    "state_or_province": ""
                }
            }
        ]
    }

mineral_site_json = clean_mineral_site_json(mineral_site_json)

print(mineral_site_json)



## Filling out Deposit Types

In [None]:
# Remove the assistant used for the previous section before creating a new one.
resp_code = delete_assistant(assistant_id)

if resp_code == 200:
    print(f"Deleted assistant {assistant_id}")
else:
    print(f"Deletion FAILED")
    

In [None]:
# Upload the report again and create a fresh assistant for the deposit-type
# questions. The context manager closes the local file handle after the upload.
with open(f"./reports/{os.environ.get('file_path')}", "rb") as report_file:
    file = client.files.create(
      file=report_file,
      purpose='assistants'
    )
thread_id, assistant_id =create_assistant(file.id)

In [None]:
# check_file may re-upload and recreate the assistant, so take the ids it returns.
thread_id, assistant_id = check_file(thread_id, assistant_id)

In [None]:
# Map each deposit type name to its Minmod ID (a later duplicate name wins).
minmod_deposit_types = read_csv_to_dict("./codes/minmod_deposit_types.csv")
deposit_id = {
    row['Deposit type']: row['Minmod ID']
    for row in minmod_deposit_types
}

In [None]:
def format_deposit_candidates(deposit_list):
    """Convert an ``observed name -> URI`` mapping into deposit type candidates.

    Args:
        deposit_list: Dict with a ``deposit_type`` mapping of observed text to URI.

    Returns:
        ``{"deposit_type_candidate": [...]}`` with one entry per observed name.
        Every entry gets the same confidence, 1 divided by the number of names.
    """
    observed = deposit_list['deposit_type']
    candidates = [
        {
            "observed_name": name,
            "normalized_uri": observed[name],
            "source": "report",
            "confidence": 1/len(observed),
        }
        for name in observed.keys()
    ]
    return {"deposit_type_candidate": candidates}
        

In [None]:

# Answer shape for the first pass: a plain list of observed deposit type names.
deposit_format = """
{"deposit_type": []
}

"""

# Answer shape for the second pass: observed text mapped to a resource URL (or "").
deposit_format_correct = """
{
  "deposit_type": {
       "observed text": "https://minmod.isi.edu/resource/deposit_id",  
       "observed text" : "https://minmod.isi.edu/resource/deposit_id",
       "observed text": ""
  }
}
"""
# First-pass prompt: list the deposit types named in the report.
deposit_instructions = f"""Identify the deposit types from the attached document. Note that the main
commodity in this paper is {os.environ.get('commodity')}.The output was to be formatted in the JSON structure Deposit_Type
{deposit_format}.  Please return the filled in Deposit_Type json Structure or 
leave the list empty if there are No matching deposit types. Return only the json structure.
"""
# Second-pass prompt: the placeholder spelled __DEPOSIT_TYPES_LIST__ below must be
# replaced with the first-pass result before this prompt is sent.
check_deposit_instructions = f"""Given this list with deposit type observed texts __DEPOSIT_TYPES_LIST__ and with the main commodity being {os.environ.get('commodity')}, 
check that each deposit is in the acceptable list of deposits or there is a deposit type that appears to be close. Update the 
deposit type with the correct ID from this given list {deposit_id} or return an empty string if a match can't be made. The return format
should only be the JSON structure: {deposit_format_correct} where in the value deposit_id is changed to the correct ID and the https url is still included.
The keys of the return should be all the values given in deposit types json structure given.
Do not return any additional comments and do not use // in the json structure.
"""


In [None]:
# Start a run on the existing thread with the first-pass deposit prompt.
print("Creating the run")
run = client.beta.threads.runs.create(
  thread_id=thread_id,
    
  assistant_id=assistant_id,
  instructions=deposit_instructions
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

In [None]:
# Blocks until the run completes, then keeps the newest message text in `ans`.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

In [None]:
# First-pass result; None when the reply contained no JSON object.
deposit_types_initial = extract_json_strings(ans)
print(f"deposit types: {deposit_types_initial}")

In [None]:
# Second pass: have the assistant map each observed deposit type to a Minmod ID.
# The placeholder name must match the one in check_deposit_instructions
# (__DEPOSIT_TYPES_LIST__); the previous spelling never matched, so the list was
# never inserted into the prompt.
if deposit_types_initial is not None and len(deposit_types_initial['deposit_type']) > 0:
    print("Creating the run")
    run = client.beta.threads.runs.create(
      thread_id=thread_id,
      assistant_id=assistant_id,
      instructions=check_deposit_instructions.replace("__DEPOSIT_TYPES_LIST__", str(deposit_types_initial['deposit_type']))
    )
    print(f"Current run id = {run.id} thread_id = {thread_id}")
    
    ans = get_assistant_response(thread_id, run.id)
    deposit_types_output = extract_json_strings(ans)
    # A reply without usable JSON is treated like "no deposit types found".
    if deposit_types_output is None or 'deposit_type' not in deposit_types_output:
        deposit_types_output = {'deposit_type':[]}
    
else:
    deposit_types_output = {'deposit_type':[]}

In [None]:
# Build the deposit_type_candidate list; an empty result gives an empty list.
if len(deposit_types_output['deposit_type']) == 0:
    deposit_types_json = {'deposit_type_candidate':[]}
else:
    deposit_types_json = format_deposit_candidates(deposit_types_output) 
print(deposit_types_json)

## Filling out Mineral Inventory

In [None]:
# Remove the assistant used for the deposit-type section before creating a new one.
resp_code = delete_assistant(assistant_id)

if resp_code == 200:
    print(f"Deleted assistant {assistant_id}")
else:
    print(f"Deletion FAILED")

In [None]:
# Upload the report again and create a fresh assistant for the inventory
# questions. The context manager closes the local file handle after the upload.
with open(f"./reports/{os.environ.get('file_path')}", "rb") as report_file:
    file = client.files.create(
      file=report_file,
      purpose='assistants'
    )
thread_id, assistant_id =create_assistant(file.id)

In [None]:
# check_file may re-upload and recreate the assistant, so take the ids it returns.
thread_id, assistant_id = check_file(thread_id, assistant_id)

In [None]:
# Map each commodity name to its Minmod ID (a later duplicate name wins).
minmod_commodities = read_csv_to_dict("./codes/minmod_commodities.csv")
commodities = {
    row['CommodityinGeoKb']: row['minmod_id']
    for row in minmod_commodities
}

In [None]:
# Map both the unit name and the alias column of each row to the unit's Minmod ID.
# NOTE(review): the whole 'unit aliases' cell becomes one key; if a cell can hold
# several aliases or be blank, confirm that is intended. The insertion order of
# this dict matters, because create_mineral_inventory slices its key list.
minmod_units = read_csv_to_dict("./codes/minmod_units.csv")
correct_units = {}
for key in minmod_units:
    correct_units[key['unit name']] = key['minmod_id']
    correct_units[key['unit aliases']] = key['minmod_id']

In [None]:
# Answer shape requested from the assistant for each extracted table row.
# create_mineral_inventory recognises these keys by fragments of their names.
dictionary_format = f"""
        {{ "extractions":[
        {{
        "category": "",
        "zone": "",
        "{os.environ.get("commodity")} Cut-Off": "",
        "{os.environ.get("commodity")} Cut-Off Unit": "",
        "{os.environ.get("commodity")} Tonnage": "",
        "{os.environ.get("commodity")} Tonnage Unit": "",
        "{os.environ.get("commodity")} Grade Percent": "",
        "Contained_metal": "",
        "Table": ""
        }}
        ]
    }}

"""

# Template for one MineralInventory record; create_mineral_inventory deep-copies
# it per row and overwrites the placeholder values. The commodity lookup raises
# KeyError if the configured commodity is not in the commodities file.
inventory_format = {
    "commodity": "https://minmod.isi.edu/resource/" + commodities[os.environ.get('commodity')],
    "category": "",
    "ore": {
        "ore_unit": "unit",
        "ore_value": "value"
    },
    "grade": {
        "grade_unit": "unit",
        "grade_value": "value"
    },
    "cutoff_grade": {
        "grade_unit": "unit",
        "grade_value": "value"
    },
    "contained_metal": "ore_value * grade_value",  # Note: This won't be evaluated here
    "reference": {
        "document": document_dict,
        "page_info": [
            {
                "page": 0,
                "bounding_box": {
                    "x_min": "",
                    "x_max": "",
                    "y_min": "",
                    "y_max": ""
                }
            }
        ]
    },
    "date": doc_date,
    "zone": "",
}

In [None]:
# Prompt asking for the resource/reserve tables and their page numbers. The
# document date is now interpolated; the f-string previously contained the
# literal text "doc_date", which carries no date for the assistant.
find_relevant_table_instructions = f"""
Can you go through the document, find all tables that give mineral resource estimates or mineral reserve estimates. If there are multiple resource
or reserve tables pull the tables that are closest to the document date ({doc_date}). Avoid any resource sensitivities tables. Include the page number from the document that you got the table from. The page number can be calculated by 
counting from the first page up to the page that the table was found.
Return the list of tables as a json structure: {{"Tables": {{"Table 1 Name": page_number, 
"Table 2 Name": page_number}}}}. Only return the json structure. Note that these tables are typically found in the
early sections of the document.
"""

# Prompt asking which resource categories appear in the tables; __RELEVANT__ is
# replaced with the table names before sending.
find_relevant_categories = f""" From this list of tables: __RELEVANT__, return the json structure that
contains the list of categories found in the tables. The allotted categories are ["inferred", "indicated","measured", 
"probable", "proven", "proven+probable", "inferred+indicated", "inferred+measured", "measured+indicated"]. The Return value should be {{"categories": [value1, value2, ...]}} and each value should be all
lower case.
"""

# Prompt for the first category extraction; __RELEVANT__ and __CATEGORY__ are
# replaced in extract_by_category. The headers use the commodity sign, while the
# requested answer shape (dictionary_format) uses the commodity name.
find_category_rows = f""" From this list of tables: __RELEVANT__, create a python dictionary that
captures all rows that describe {os.environ.get('commodity')} resource estimate data. Each 
relevant row should have the category __CATEGORY__. The rows should also include the following headers.
Zone: the named area where the resources were extracted from (Note: Include Total values).
{os.environ.get('sign')} Cut-Off: The threshold grade used to determine the economic viability of 
mining the {os.environ.get('commodity')} resource (this might not be provided in some tables). 
{os.environ.get('sign')} Cut-Off Unit: The unit that is labeled cut off and always start from the smallest cut-off value. Note if it is a NSR value. 
{os.environ.get('sign')} Tonnage: The calculated or estimated tonnage for the resource. 
{os.environ.get('sign')} Tonnage Unit: The unit that the tonnage was presented in, which should be in tonnes, thousand tonnes, 
million tonnes, or gram per tonne,. 
{os.environ.get('sign')} Grade %: The concentration of {os.environ.get('commodity')} in the resource, which should 
be converted into a percentage. 
Unit values should either be converted into tonnes, million tonnes, gram per tonne, or percent. Also return what tables the rows were extracted from.
If any values are unknown return it as an empty string ''

Return the information as dictionary with an internal list of keys and values, wrapped in "", that follows this
format: {dictionary_format}. Do not add any additional comments using // in the returned dictionary format.
"""

# Shorter follow-up prompt for every category after the first one; it relies on
# the earlier extraction instructions already being in the thread.
find_additional_categories = f""" Follow the same instructions as the previous extraction for tables __RELEVANT__
but extract for rows that relate to the category, __CATEGORY__. Return the information as dictionary with an internal list of keys and values, wrapped in "", that follows this
format: {dictionary_format}. Do not add any additional comments using // in the returned dictionary format. If any values are unknown make sure to
return them as empty strings.

Note if no rows are found for __CATEGORY__ do not return any json.
"""

In [None]:
def extract_by_category(curr_cat, relevant_tables, thread_id, assistant_id, done_first):
    """Ask the assistant for the table rows of one resource category.

    Args:
        curr_cat: Category name substituted for ``__CATEGORY__`` in the prompt.
        relevant_tables: ``{"Tables": {...}}`` as returned by the table search, or None.
        thread_id: Thread to run on.
        assistant_id: Assistant to run with.
        done_first: False for the first category (full prompt), True afterwards
            (short follow-up prompt).

    Returns:
        The decoded extraction dict, or None when there are no tables or the
        reply contained no JSON object.
    """
    has_tables = relevant_tables is not None and len(relevant_tables['Tables']) > 0
    if not has_tables:
        return None

    print("Creating the thread")
    prompt_template = find_additional_categories if done_first else find_category_rows
    use_instructions = prompt_template.replace("__RELEVANT__", str(relevant_tables)).replace("__CATEGORY__", curr_cat)

    run = client.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        instructions=use_instructions,
    )
    print(f"Current run id = {run.id} thread_id = {thread_id}")

    print("Retrieving the response\n")
    reply_text = get_assistant_response(thread_id, run.id)

    # Replies to this prompt often carry inline comments, so strip them before parsing.
    return extract_json_strings(reply_text, remove_comments = True)

In [None]:
# Start a run on the existing thread with the table-search prompt.
print("Creating the thread")
run = client.beta.threads.runs.create(
  thread_id=thread_id,
  assistant_id=assistant_id,
  instructions=find_relevant_table_instructions
)
print(f"Current run id = {run.id} thread_id = {thread_id}")

In [None]:
# Blocks until the run completes, then keeps the newest message text in `ans`.
print("Retrieving the response\n")
ans = get_assistant_response(thread_id, run.id)

In [None]:
# Requested shape: {"Tables": {table name: page number}}; None when the reply had no JSON.
relevant_tables = extract_json_strings(ans)

In [None]:
# Show the tables found for a manual check.
print(relevant_tables)

In [None]:
## return list of categories to extract then can decide which ones to run
# The table names are passed as a plain list; str() of a dict keys view put the
# text "dict_keys([...])" into the prompt.
if relevant_tables is not None:
    print("Creating the thread")
    run = client.beta.threads.runs.create(
      thread_id=thread_id,
      assistant_id=assistant_id,
      instructions=find_relevant_categories.replace("__RELEVANT__", str(list(relevant_tables['Tables'].keys())))
    )
    print(f"Current run id = {run.id} thread_id = {thread_id}")

    print("Retrieving the response\n")
    ans = get_assistant_response(thread_id, run.id)

In [None]:
# Categories present in the tables. relevant_cats is always defined (it is
# printed in the next cell), and a reply without JSON yields an empty list
# instead of a TypeError.
relevant_cats = None
if relevant_tables is not None:
    relevant_cats = extract_json_strings(ans)

if relevant_cats is not None:
    # Lower-cased because the extraction cells test `cat.lower() in cat_list`.
    cat_list = [str(found_cat).lower() for found_cat in relevant_cats.get("categories", [])]
else:
    cat_list = []


In [None]:
# Show the categories found for a manual check.
print(relevant_cats)

In [None]:
# Accumulator for all category extractions. done_first switches
# extract_by_category from the full prompt to the short follow-up prompt.
mineral_inventory_json = {"MineralInventory":[]}
done_first = False

In [None]:
# Extract INFERRED rows, only if that category was reported for the tables.
cat = "INFERRED"
extract_inferred = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_inferred = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    done_first = True
    print(f'Extracted: {extract_inferred}')

In [None]:
# Convert only when rows were actually returned: a category can be listed in
# cat_list and still produce no parsable JSON, and create_mineral_inventory
# cannot handle None (the previous `or` clause let that case through).
if extract_inferred is not None and 'extractions' in extract_inferred:
    cleaned_inferred = create_mineral_inventory(extract_inferred,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_inferred['MineralInventory']

In [None]:
# Extract INDICATED rows, only if that category was reported for the tables.
cat = "INDICATED"
extract_indicated = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_indicated = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    print(f'Extracted: {extract_indicated}')
    done_first = True

In [None]:
# Convert only when rows were actually returned. The removed clause compared the
# bound method `cat.lower` (missing call parentheses) against cat_list.
if extract_indicated is not None and 'extractions' in extract_indicated:
    cleaned_indicated = create_mineral_inventory(extract_indicated,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_indicated['MineralInventory']


In [None]:
# Combined inferred + indicated rows. The category is spelled "INFERRED+INDICATED"
# to match the allowed value "inferred+indicated" used in the category prompt and
# in create_mineral_inventory; "indicated+inferred" could never match.
cat = "INFERRED+INDICATED"
extract_indicated_inf = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_indicated_inf = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    print(f'Extracted: {extract_indicated_inf}')
    done_first = True
    

In [None]:
# Convert only when rows were actually returned; create_mineral_inventory
# cannot handle None.
if extract_indicated_inf is not None and 'extractions' in extract_indicated_inf:
    cleaned_indicated_inf = create_mineral_inventory(extract_indicated_inf,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_indicated_inf['MineralInventory']


In [None]:
# Extract MEASURED rows, only if that category was reported for the tables.
cat = "MEASURED"
extract_measured = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_measured = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    print(f'Extracted: {extract_measured}')
    done_first = True

In [None]:
# Convert only when rows were actually returned; create_mineral_inventory
# cannot handle None.
if extract_measured is not None and 'extractions' in extract_measured:
    cleaned_measured = create_mineral_inventory(extract_measured,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_measured['MineralInventory']


In [None]:
# Extract MEASURED+INDICATED rows, only if that category was reported for the tables.
cat = "MEASURED+INDICATED"
extract_measured_ind = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_measured_ind = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    print(f'Extracted: {extract_measured_ind}')
    done_first = True
    

In [None]:
# Convert only when rows were actually returned; create_mineral_inventory
# cannot handle None.
if extract_measured_ind is not None and 'extractions' in extract_measured_ind:
    cleaned_measured_ind = create_mineral_inventory(extract_measured_ind,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_measured_ind['MineralInventory']

In [None]:
# Combined inferred + measured rows. The category is spelled "INFERRED+MEASURED"
# to match the allowed value "inferred+measured" used in the category prompt and
# in create_mineral_inventory; "measured+inferred" could never match.
cat = "INFERRED+MEASURED"
extract_measured_inf = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_measured_inf = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    print(f'Extracted: {extract_measured_inf}')
    done_first = True

In [None]:
# Convert only when rows were actually returned; create_mineral_inventory
# cannot handle None.
if extract_measured_inf is not None and 'extractions' in extract_measured_inf:
    cleaned_measured_inf = create_mineral_inventory(extract_measured_inf,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_measured_inf['MineralInventory']

In [None]:
# Extract PROBABLE rows, only if that category was reported for the tables.
cat = "PROBABLE"
extract_probable = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_probable = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    done_first = True


In [None]:
# Convert only when rows were actually returned; create_mineral_inventory
# cannot handle None.
if extract_probable is not None and 'extractions' in extract_probable:
    cleaned_probable = create_mineral_inventory(extract_probable,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_probable['MineralInventory']


In [None]:
# Extract PROVEN rows, only if that category was reported for the tables.
cat = "PROVEN"
extract_proven = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_proven = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    print(f'Extracted: {extract_proven}')
    done_first = True

In [None]:
# Convert only when rows were actually returned; create_mineral_inventory
# cannot handle None.
if extract_proven is not None and 'extractions' in extract_proven:
    cleaned_proven = create_mineral_inventory(extract_proven,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_proven['MineralInventory']


In [None]:
# Extract PROVEN+PROBABLE rows, only if that category was reported for the tables.
cat = "PROVEN+PROBABLE"
extract_proven_prob = None
if cat.lower() in cat_list:
    print(f"Extracting category: {cat}")
    extract_proven_prob = extract_by_category(cat, relevant_tables, thread_id, assistant_id, done_first)
    print(f'Extracted: {extract_proven_prob}')
    done_first = True

In [None]:
# Convert only when rows were actually returned; create_mineral_inventory
# cannot handle None.
if extract_proven_prob is not None and 'extractions' in extract_proven_prob:
    cleaned_proven_prob = create_mineral_inventory(extract_proven_prob,inventory_format, relevant_tables, correct_units)
    mineral_inventory_json["MineralInventory"] += cleaned_proven_prob['MineralInventory']


## Combine json structures into one and write 

In [None]:
# Attach the inventory and the deposit type candidates to the single site record,
# and take the timestamp used in the output file name.
mineral_site_json["MineralSite"][0]['MineralInventory'] = mineral_inventory_json['MineralInventory']
mineral_site_json["MineralSite"][0]['deposit_type_candidate'] = deposit_types_json['deposit_type_candidate']
print(mineral_site_json)
current_datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")

In [None]:
# Specify the path to the output JSON file
new_name = file_name[:-4].replace(" ", "_")

output_file_path = f'./extracted/{new_name}_summary_{current_datetime_str}.json'

def convert_int_or_float(obj):
    """Recursively turn numeric-looking strings into numbers.

    Dicts and lists are rebuilt with converted members. A string made only of
    digits becomes an int; a string that is digits with a single '.' becomes a
    float. Everything else, including signed numbers, is returned unchanged.

    Args:
        obj: Any JSON-like value.

    Returns:
        The converted value.
    """
    if isinstance(obj, str):
        if obj.isdigit():
            return int(obj)
        # Drop at most one decimal point and check that the rest is all digits.
        if obj.replace('.', '', 1).isdigit():
            return float(obj)
        return obj
    if isinstance(obj, dict):
        return {name: convert_int_or_float(member) for name, member in obj.items()}
    if isinstance(obj, list):
        return list(map(convert_int_or_float, obj))
    return obj

# Writing to a file using json.dump with custom serialization function
# Make sure the output directory exists, so a long extraction run is not lost to
# a missing ./extracted folder.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the combined record, converting numeric-looking strings to numbers first.
with open(output_file_path, "w") as json_file:
    json.dump(convert_int_or_float(mineral_site_json), json_file, indent=2)
    

print(f"Combined data written to {output_file_path}")

## Deletion
Finally, delete the assistant so that no outstanding assistants or files are left behind, since these could incur further charges.

In [None]:
# Delete the last assistant. NOTE(review): only the assistant is deleted here;
# the uploaded files are not removed by any cell in this notebook.
resp_code = delete_assistant(assistant_id)

if resp_code == 200:
    print(f"Deleted assistant {assistant_id}")
else:
    print(f"Deletion FAILED")
    

# Notes
1. Should try function calling