In [85]:
import os
import openai
import csv
from extraction_package import extraction_functions
from settings import API_KEY, LIBRARY_ID, LIBRARY_TYPE, ZOLTERO_KEY 


# Shared OpenAI client used by the cells below; API_KEY is imported from the
# local `settings` module rather than being hard-coded in the notebook.
client = openai.OpenAI(api_key = API_KEY)

## Commodities Extraction
This is the first pass over the reports: for each file we identify every commodity it mentions. We use ChatGPT to do the extraction and then compile the results into a master list.

Notes From Goran
- Idea: produce two commodity lists --> one with the commodities mentioned directly for the mineral inventory, and a second, broader "grand" list.


In [86]:
# System-level instructions given to the assistant when it is created.
first_pass_instructions = """You are a geology expert and you are very good in understanding mining reports. 
You will be given a text from a mining report and a table name. From this mining report, you will need to 
extract all of the commodities that are mentioned and are relevant.
"""

# Per-run prompt template. The __COMMODITIES_LIST__ placeholder is filled in with
# str.replace (not str.format), so braces must NOT be doubled: "{{" would be sent
# to the model literally and show it a malformed example of the expected answer.
get_commodities_prompt = """Given this document, can you return all commodities that are referenced in the 
Mineral Reserve Estimates or Mineral Resource Estimates. The commodities MUST be in this list 
__COMMODITIES_LIST__. Return the found commodities in this JSON format 
{'commodities': [commodity_1, commodity_2,....]}. If there are no commodities found, return 
{'commodities': []}.
"""

In [87]:
# Load the MinMod commodity reference table and collect the 'CommodityinGeoKb'
# value of every row; this list is later substituted into the extraction prompt.
# NOTE(review): assumes read_csv_to_dict yields one dict-like row per CSV line - confirm.
minmod_commodities = extraction_functions.read_csv_to_dict("./codes/minmod_commodities.csv")
commodities_list = [row['CommodityinGeoKb'] for row in minmod_commodities]

In [88]:
def create_assistant_commodities(file_id):
    """Create an assistant and a conversation thread tied to one uploaded report.

    The assistant is created first (with the retrieval tool and the file
    attached), then a thread whose opening user message attaches the same file.

    Args:
        file_id: Identifier of a file that was already uploaded via ``client.files``.

    Returns:
        A ``(thread_id, assistant_id)`` tuple.
    """
    assistant_settings = {
        "name": "Get Extraction",
        "instructions": first_pass_instructions,
        "tools": [{"type": "retrieval"}],
        "model": "gpt-4-1106-preview",
        "file_ids": [file_id],
    }
    assistant = client.beta.assistants.create(**assistant_settings)

    opening_message = {
        "role": "user",
        "content": "You are a geology expert and you are very good in understanding mining reports, which is attached.",
        "file_ids": [file_id],
    }
    thread = client.beta.threads.create(messages=[opening_message])

    return thread.id, assistant.id


def check_file_commodities(thread_id, assistant_id, file_path):
    """Ask the assistant whether it can read the attached file; re-upload if not.

    A run is started with a YES/NO instruction. If the answer is "no", the
    current assistant is deleted, the file at ``file_path`` is uploaded again,
    a fresh assistant/thread pair is created and the check is repeated on it.

    Args:
        thread_id: Thread that has the report attached.
        assistant_id: Assistant that has the report attached.
        file_path: Local path of the report, used when a re-upload is needed.

    Returns:
        A ``(thread_id, assistant_id)`` tuple for the pair that passed the
        check. It may differ from the pair passed in.
    """
    file_instructions = """If the file was correctly uploaded and can be read return YES otherwise return NO. 
                        Only return the Yes or No answer.
                        """
    run = client.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        instructions=file_instructions,
    )

    ans = extraction_functions.get_assistant_response(thread_id, run.id)
    print(f"Response: {ans}")

    # Models often pad the answer ("NO ", "No.", "no\n"). Normalise before
    # comparing so that a negative answer is not mistaken for success.
    verdict = ans.strip().rstrip(".!").strip().lower()

    if verdict != "no":
        print("File was correctly uploaded \n")
        return thread_id, assistant_id

    print("We need to reload file.")
    response_code = extraction_functions.delete_assistant(assistant_id)
    if response_code == 200:
        print(f"Deleted assistant {assistant_id}")

    # Open the report in a context manager so the handle is closed once the
    # upload call returns (the original left it open).
    with open(file_path, "rb") as report_handle:
        file = client.files.create(file=report_handle, purpose='assistants')

    new_thread_id, new_assistant_id = create_assistant_commodities(file.id)
    # NOTE(review): there is no retry limit - if the model keeps answering "no"
    # this keeps uploading and recursing. The earlier upload is also not
    # deleted here. Consider bounding the retries.
    return check_file_commodities(new_thread_id, new_assistant_id, file_path)
    
def add_to_metadata(file_name, commodities_dict, csv_file_path='commodities_metadata.csv'):
    """Append one row of extraction results to the commodities metadata CSV.

    The header row is written only when the CSV does not exist yet. The
    commodities are stored in a single cell, joined with ``", "``.

    Args:
        file_name: Name of the report the commodities were extracted from.
        commodities_dict: Mapping expected to hold a list under the
            ``'commodities'`` key. A missing key, ``None`` or an empty list
            produces an empty cell.
        csv_file_path: Destination CSV. Defaults to ``'commodities_metadata.csv'``
            in the current working directory, as before.
    """
    # `or []` covers a missing key, an explicit None and an empty list alike.
    commodities = commodities_dict.get('commodities') or []
    # str() guards against non-string entries coming back from the model.
    joined_commodities = ', '.join(str(item) for item in commodities)

    # Decide about the header before opening: append mode creates the file.
    needs_header = not os.path.exists(csv_file_path)

    with open(csv_file_path, mode='a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['File Name', 'Identified Commodities'])
        writer.writerow([file_name, joined_commodities])
    print(f"Finished writing row for {file_name} \n")

In [89]:
# Folder containing the PDF reports processed in this pass.
directory_path = './reports/nickel/review/'


file_list = [file_name for file_name in os.listdir(directory_path) if file_name.endswith('.pdf')]

for idx, file_name in enumerate(file_list):
    print(f"Working on File: {file_name} file num: {idx+1} out of {len(file_list)}")
    file_path = os.path.join(directory_path, file_name)

    # Upload inside a context manager so the PDF handle is closed after the
    # request, instead of leaking one open file per report.
    with open(file_path, "rb") as report_handle:
        file = client.files.create(file=report_handle, purpose='assistants')

    thread_id, assistant_id = create_assistant_commodities(file.id)

    # The check may replace the assistant/thread pair if the file had to be re-uploaded.
    thread_id, assistant_id = check_file_commodities(thread_id, assistant_id, file_path)

    try:
        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistant_id,
            instructions=get_commodities_prompt.replace("__COMMODITIES_LIST__", str(commodities_list)),
        )

        ans = extraction_functions.get_assistant_response(thread_id, run.id)
        correct_format = {'commodities': []}
        commodities_json = extraction_functions.extract_json_strings(ans, str(correct_format), remove_comments = False)
        # Fall back to the empty result when nothing parseable came back.
        if commodities_json is None:
            commodities_json = correct_format

        print(f"Here are the extracted commodities: {commodities_json}")
    finally:
        # Always remove the per-file assistant, even when the extraction fails,
        # so failed runs do not leave orphaned assistants behind.
        extraction_functions.delete_assistant(assistant_id)

    add_to_metadata(file_name, commodities_json)

Working on File: Trout Bay Ni Cu PGM 7-2006.pdf file num: 1 out of 39
Run: run_Ezv2v4MwrXSKl2KksN38K8xt Thread: thread_zzSIwczRfdiGNCmZChVeWCB7 
 response: YES 

Response: YES
File was correctly uploaded 

Run: run_IwRt3YWOdMkLjwxc9KOe9dUn Thread: thread_zzSIwczRfdiGNCmZChVeWCB7 
 response: The commodities referenced in the Mineral Reserve Estimates or Mineral Resource Estimates within the document provided are nickel (Ni), copper (Cu), platinum-group elements (PGE), zinc (Zn), silver (Ag), lead (Pb), and gold (Au)【10†source】【12†source】.

Here is the requested JSON format for the identified commodities:
```json
{'commodities': ['nickel', 'copper', 'platinum-group elements', 'zinc', 'silver', 'lead', 'gold']}
``` 

Need to reformat the JSON extraction 

Here are the extracted commodities: {'commodities': ['nickel', 'copper', 'platinum-group elements', 'zinc', 'silver', 'lead', 'gold']}
Finished writing row for Trout Bay Ni Cu PGM 7-2006.pdf 

Working on File: Northmet Ni Cu PGM 9-2007.p