In [1]:
import json
import pandas as pd
import numpy as np
import os
import ast
import shutil
from tabulate import tabulate

# Steps for creating GT

1. Move extraction to automatic_evalution/data/extracted
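2. Fill in `metadata.csv` and the mineral inventory CSV (`mineral_inventory_minimal.csv` or `mineral_inventory_complete.csv`) in the report's folder under `data/gt_csv_format/<commodity>/<report_name>/`. Use the prompt below plus a screenshot to get the mineral inventory rows.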

#### Prompt
Given the screenshot, please fill in a CSV with the headers below. The commodity and unit values must be fully spelled out, all ore_value entries should be floats, and category_observed_name must be one of the following values: ["inferred", "indicated", "measured", "probable", "proven", "proven+probable", "inferred+indicated", "inferred+measured", "measured+indicated"]. Leave contained_metal empty. Extract all rows.

csv headers = commodity_observed_name,category_observed_name,ore_unit_observed_name,ore_value,grade_unit_observed_name,grade_value,cutoff_grade_unit_observed_name,cutoff_grade_value,contained_metal,zone


# Functions

In [5]:


class CustomEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to native Python types.

    The standard encoder rejects NumPy integer, boolean, and array values
    (and floating types other than ``np.float64``). This encoder converts
    them to ``int``, ``float``, ``bool``, and ``list`` respectively.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Args:
            obj: A value the base encoder could not serialize.

        Returns:
            A native Python ``int``, ``float``, ``bool``, or ``list``.

        Raises:
            TypeError: If ``obj`` is not a supported NumPy type (raised by
                the base class).
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            # tolist() converts every element to a native Python scalar.
            return obj.tolist()
        return super().default(obj)

def get_mineral_inventory_gt(report_name, commodity_name, minimal_or_complete='minimal'):
    """Build the ground-truth record for one report from its CSV files.

    Reads ``metadata.csv`` and ``mineral_inventory_<minimal_or_complete>.csv``
    from ``../data/gt_csv_format/<commodity_name>/<report_name>/`` and
    assembles them into the nested dictionary layout that is later written
    out as JSON.

    Side effect: prints the metadata table to stdout.

    Args:
        report_name: Name of the report folder inside the commodity folder.
        commodity_name: Name of the commodity folder (e.g. ``'nickel'``).
        minimal_or_complete: Suffix selecting which mineral inventory CSV to
            read (``'minimal'`` or ``'complete'``).

    Returns:
        A list containing a single dict with the keys ``source_id``,
        ``record_id``, ``name``, ``location_info``, ``mineral_inventory``,
        ``deposit_type_candidate`` and ``reference``.

    Raises:
        ValueError: If ``metadata.csv`` contains no rows.
        KeyError: If a required column is missing from either CSV.
    """
    source = 'Inferlink Extraction v3'

    def observed(name):
        """Wrap an observed name in the entity layout shared by all fields."""
        return {
            'normalized_uri': '',
            'observed_name': name,
            'confidence': 1,
            'source': source,
        }

    def quantity(row, prefix):
        """Build the unit/value pair for the ``<prefix>_*`` columns of a row."""
        return {
            'unit': observed(row[f'{prefix}_unit_observed_name']),
            'value': row[f'{prefix}_value'],
        }

    report_dir = os.path.join('..', 'data', 'gt_csv_format', commodity_name, report_name)

    m = pd.read_csv(os.path.join(report_dir, 'metadata.csv'))
    m = m.fillna("")
    if m.empty:
        raise ValueError(f"metadata.csv for report {report_name!r} has no rows")
    print(tabulate(m, headers='keys', tablefmt='pretty'))

    # Only the first metadata row is used.
    meta = m.loc[0]
    mining_site_name = meta.get('mining_name', '')

    location_info = {
        'location': '',
        'crs': '',
        'country': [observed(meta['country_observed_name'])],
        'state_or_province': [observed(meta['state_or_province_observed_name'])],
    }

    deposit_type_candidate = [
        {
            'observed_name': '',
            'normalized_uri': '',
            'source': source,
            'confidence': 1.0,
        }
    ]

    # Strip whitespace around each name and drop empty entries so that
    # "A, B" yields ['A', 'B'] and an empty cell yields [] rather than [''].
    authors = [
        name.strip()
        for name in str(meta['authors']).split(',')
        if name.strip()
    ]

    reference = {
        'document': {
            'title': '',
            'authors': authors,
            'year': meta['year'],
            'month': meta['month'],
            'description': '',
            'uri': '',
        }
    }
    # Identical for every inventory entry, so computed once.
    date = str(meta['year']) + "-" + str(meta['month'])

    res_gt = pd.read_csv(
        os.path.join(report_dir, 'mineral_inventory_' + minimal_or_complete + '.csv'),
        index_col=False,
    )
    res_gt = res_gt.fillna("")
    for col in res_gt.columns:
        if col.endswith('_value'):
            # Blank or non-numeric cells are coerced to NaN and then set to 0.
            res_gt[col] = pd.to_numeric(res_gt[col], errors='coerce').fillna(0)

    # NOTE(review): the 0.01 factor treats grade_value as a percentage; rows
    # whose grade unit is not percent would get a wrong contained_metal --
    # confirm against the units used in the CSV files.
    grade_to_fraction = 0.01
    res_gt['contained_metal'] = (
        res_gt['ore_value'] * res_gt['grade_value'] * grade_to_fraction
    ).round(4)

    mineral_inventory = []
    for _, row in res_gt.iterrows():
        mineral_inventory.append({
            'commodity': observed(row['commodity_observed_name']),
            'category': [observed(row['category_observed_name'])],
            'ore': quantity(row, 'ore'),
            'grade': quantity(row, 'grade'),
            'cutoff_grade': quantity(row, 'cutoff_grade'),
            'contained_metal': row['contained_metal'],
            'reference': reference,
            'date': date,
            'zone': row['zone'],
        })

    if len(res_gt) == 0:
        # No inventory rows: still emit one entry carrying the reference and date.
        mineral_inventory.append({'reference': reference, 'date': date})

    return [
        {
            'source_id': '',
            'record_id': '',
            'name': mining_site_name,
            'location_info': location_info,
            'mineral_inventory': mineral_inventory,
            'deposit_type_candidate': deposit_type_candidate,
            'reference': [reference],
        }
    ]

# Run

In [7]:
# report_name = '02771a5d21ae0aca3c5bfe28f1b0c73eebe1790745adcf42cc10105946c31add6e_NI_43-101_Technical_Report_for_the_Nunavik_Project_in_North_America_dated_April_2010_summary_20241021'
# report_name = report_name.split('_')[0]



In [27]:
## Making all the files
# Create the JSON output folder for every report and, for any report that has
# no CSV folder yet, seed one from the template directory.
commodity_name = 'nickel'
folder_path = f"../data/gt_csv_format/{commodity_name}/"
# Sorted so the slice below selects the same reports on every run;
# os.scandir yields entries in arbitrary order.
report_names = sorted(
    entry.name
    for entry in os.scandir(folder_path)
    if entry.is_dir()
)

# Alternative: derive the report names from the PDF reports instead of the
# existing CSV folders (this is the case in which the template copy below
# actually creates new folders).
# folder_path = f"../../reports/{commodity_name}/"
# report_names = [
#     file.split('_')[0]
#     for file in os.listdir(folder_path)
#     if file.endswith('.pdf')
# ]

minimal_or_complete = 'minimal'
source_dir = '../data/gt_csv_format/format'

# Only the first 15 reports are processed; keep this limit in sync with the
# slice used when writing the output.
for report_name in report_names[:15]:
    os.makedirs(os.path.join('..', 'data', 'gt', commodity_name, report_name), exist_ok=True)

    csv_dir = os.path.join('..', 'data', 'gt_csv_format', commodity_name, report_name)
    if not os.path.exists(csv_dir):
        # copytree creates csv_dir (and any missing parents) itself.
        shutil.copytree(source_dir, csv_dir, dirs_exist_ok=True)
    


In [29]:
## Writing output
# Build the ground-truth record for each report and write it to
# ../data/gt/<commodity_name>/<report_name>/<minimal_or_complete>.json.
minimal_or_complete = 'minimal'
for report_name in report_names[:15]:
    output_dir = os.path.join('..', 'data', 'gt', commodity_name, report_name)
    # Create the folder here too, so this cell does not fail with
    # FileNotFoundError when the directory-setup cell was not run.
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, minimal_or_complete + '.json')

    d = get_mineral_inventory_gt(report_name, commodity_name, minimal_or_complete)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(d, f, cls=CustomEncoder, indent=4)
    # Report completion only after the file has actually been written.
    print(f"completed {report_name}\n")




['02566050697784428cb9bc4d6cf5546d1e07f671eefc62f79aa3163701fb17fcdb', '02f973c7f9d847a305032fa4ec221182d8aa7e4e44677bc78d114c9c6d47cdfb08_NI_43-101_Technical_Report_for_the_Marathon_Project_in_North_America_dated_October_2006_summary', '02fd98e067010f8d853d2c018cb002be462ac7fcd895af4125beb89f42b659f89d', '02aab13c251f7d11f5bf4a0d752331726ada09d10eac1b32c0b45ae39b0737da38', '023a9e77a67e3689e17aa48d8b22913b9216a6895b9d84039cf00ad91c660ee548', '02a432b7e0c3f5432c7606759a6851e66a8f45094d522fa7c4c460084d9a5ddbc9', '02d40055ac23d9034c815a941763045780b709289537ed42a56b7390db074d11cd', '02771a5d21ae0aca3c5bfe28f1b0c73eebe1790745adcf42cc10105946c31add6e', '02226facf3218bc3ac5600ea372a971656b7ea477d3a1144aaa3f94ac476b0a10e', '025616a24d90f78dc37e943df3495bce368e062297a2e33a6f519524496c09c443', '0200a1c6d2cfafeb485d815d95966961d4c119e8662b8babec74e05b59ba4759d2', '0275b5be425dad03ece21eaa64ebc388d765b47ed001e4956f494978c11214f37d']
+---+-----------------------+---------------------------------+