In [21]:
import os
import json
import yaml
import argparse
from collections import OrderedDict

import pandas as pd
import numpy as np

In [28]:
cfg_file = 'configs/base_config.yaml'

In [29]:
with open(cfg_file) as f:
        cfg = yaml.load(f, Loader=yaml.FullLoader)

doc_metadata = pd.read_csv(os.path.join(cfg['data_dir'], 'metadata.csv'))

# paths
doc_dir = os.path.join(
    cfg['log_dir'], cfg['text_extraction']['doc_dir']
)

min_sys_dir = os.path.join(
    cfg['log_dir'], cfg['mineral_system']['response_dir']
)

map_cri_dir = os.path.join(
    cfg['log_dir'], cfg['mappable_criteria']['response_dir']
)

map_layer_ppl_dir = os.path.join(
    cfg['log_dir'], cfg['map_layers']['response_ppl_dir']
)

map_layer_df = pd.read_csv(cfg["map_layers"]['map_layer_file'])
map_layer_df = map_layer_df[['Method', 'Method sub-type', 'Dataset description', 'Dataset name']]
map_layer_df = map_layer_df[map_layer_df['Method sub-type'] != 'Training data']
map_layer_list = map_layer_df.apply(':'.join, axis=1).to_list()

In [30]:
components = list(cfg['mappable_criteria']['llm_config']['definition'].keys())
mineral_system = {k:[] for k in components}
mineral_system

{'source': [], 'pathway': [], 'trap': [], 'preservation': []}

In [31]:
doc_metadata.id.to_list()

['SIR10-5070A']

In [32]:
def load_responses_map_layer(fname):
    # import ipdb; ipdb.set_trace()
    responses = {}
    with open(fname, 'r') as f:
        for line in f.readlines():
            item_list = json.loads(line)
            if len(item_list) == 2:
                request, response = item_list
            elif len(item_list) == 3:
                request, response, metadata = item_list
            else:
                print("error!")
            # responses.append({
            #     'response': response['choices'][0]['message']['content'], 'metadata': metadata
            #     })
            node_id = metadata['node_id']
            map_layer_id = metadata['map_layer_id']
            if node_id not in responses:
                responses[node_id] = {map_layer_id: response['choices'][0]['logprobs']['content']}
            else:
                responses[node_id][map_layer_id] = response['choices'][0]['logprobs']['content']
            # responses.append({
            #     'response': response['choices'][0]['logprobs']['content'], 'metadata': metadata
            #     })
            
    return responses

In [33]:
def load_response_map_cri(fname):
    map_cri_response = {}
    with open(fname, 'r') as f:
        for line in f.readlines():
            items = json.loads(line)
            if len(items) == 2:
                request, response = items
            elif len(items) == 3:
                request, response, metadata = items
            else:
                print("error!")
            map_cri_response[metadata['node_id']] = {
                'component': metadata['component'],
                'response': response['choices'][0]['message']['content'].replace('\n\n', '\n')
            }
    return map_cri_response

In [36]:
fnames = [fname+'.jsonl' for fname in doc_metadata.id.to_list()]
for fname in fnames:
    
    file_id = fname.split('.')[0]
    doc_fname = os.path.join(doc_dir, file_id + '.json')
    with open(doc_fname, 'r') as f:
        doc = json.load(f, object_pairs_hook=OrderedDict)
    
    doc_meta = doc_metadata[doc_metadata["id"] == file_id].iloc[0].to_dict()

    map_cri_file = os.path.join(map_cri_dir, fname)
    map_cri_response = load_response_map_cri(map_cri_file)
    
    map_layer_file = os.path.join(map_layer_ppl_dir, fname)
    map_layer_response = load_responses_map_layer(map_layer_file)

    for node_id in map_cri_response:
        comp = map_cri_response[node_id]['component']
        map_cri = map_cri_response[node_id]['response']

        map_layer_dict = []
        for i in range(len(map_layer_list)):
            top_logprobs = map_layer_response[node_id][i][0]['top_logprobs']
            for l in top_logprobs:
                if l['token'].lower() == 'yes':
                    map_layer_dict.append({
                        "name":map_layer_list[i],
                        "relevance_score": np.exp(l['logprob']),
                    })
        map_layers_sorted = sorted(map_layer_dict, key=lambda d: d['relevance_score'], reverse=True)  # descending order

        mineral_system[comp].append({
            "criteria": map_cri,
            "theoretical": "N/A",
            "potential_dataset": map_layers_sorted[:5],
            "supporting_references": [{
                # "id": file_id,
                "document": doc_meta,
                "page_info": [{"text": doc[node_id]["text"], "page": doc[node_id]["page"], "bounding_box": doc[node_id]["coords"]}]
            }]
        })


In [37]:
mineral_system

{'source': [{'criteria': 'The mappable proxy that represents the geological features related to the source of MVT Lead-Zinc deposits could be fracture density and permeability.',
   'theoretical': 'N/A',
   'potential_dataset': [{'name': 'Geology:Fault:Proximity to fault (pixels; calculated from raster with 0.01 degree grid):Geology_Fault_Proximity',
     'relevance_score': 0.998837718273238},
    {'name': 'Geology:Geological properties:Precense or absence of geological property:Geology_Dictionary_Cherty',
     'relevance_score': 0.9732007737651269},
    {'name': 'Geology:Geological properties:Precense or absence of geological property:Geology_Dictionary_Evaporitic',
     'relevance_score': 0.9639554713346702},
    {'name': 'Geophysics:Gravity:Bouguer gravity anomaly (mGal):Gravity_Bouguer',
     'relevance_score': 0.958305782047392},
    {'name': 'Geology:Geological properties:Precense or absence of geological property:Geology_Dictionary_Intermediate',
     'relevance_score': 0.958181

In [38]:
doc_meta

{'id': 'SIR10-5070A',
 'title': 'A Deposit Model for Mississippi Valley-Type Lead-Zinc Ores',
 'doi': '10.3133/sir20105070A',
 'uri': 'https://pubs.usgs.gov/sir/2010/5070/a/pdf/SIR10-5070A.pdf',
 'authors': 'David L. Leach, Ryan D. Taylor, David L. Fey, Sharon F. Diehl, and Richard W. Saltus',
 'journal': 'Scientific Investigations Report 2010-5070-A',
 'year': 2010,
 'month': nan,
 'volume': nan,
 'issue': nan,
 'description': nan}

In [41]:
print({k: doc_meta[k] for k in doc_meta if doc_meta[k] is np.nan})

{}


In [42]:
import math
math.isnan(doc_meta['month'])

True