In [135]:
import pandas as pd
import json
import re
import csv
import os
import copy
from datetime import datetime
import glob
import numpy as np
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Spacer, Paragraph
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

In [136]:
def collect_json_files(directory):
    json_files = []
    for file_path in glob.glob(os.path.join(directory, '*.json')):
         json_files.append(os.path.basename(file_path))
    return json_files


In [137]:
def read_csv_to_dict(file_path):
    data_dict_list = []
    
    with open(file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        
        for row in csv_reader:
            data_dict_list.append(dict(row))
    
    return data_dict_list

In [138]:
def select_keys(d, keys):
    selected = {}
    for key in keys:
        value = d
        for subkey in key.split('.'):
            if isinstance(value, dict) and subkey in value:
                value = value[subkey]
            else:
                value = None
                break
        selected[key] = value
    return selected

In [139]:
 def get_deposit_types(json_data):   
    minmod_deposit_types = read_csv_to_dict("./codes/minmod_deposit_types.csv")
    deposit_id = {}
    for key in minmod_deposit_types:
        deposit_id[key['Minmod ID']] = key['Deposit type']

    data = []
    for item in json_data['deposit_type']:
        row = {
            "deposit type": deposit_id[item.split('/')[-1]],
        }
        data.append(row)

    deposit_types = pd.DataFrame(data)
    return deposit_types

In [190]:
def get_document_ref(json_data):
    if 'MineralInventory' in json_data and json_data['MineralInventory']:
        if 'reference' in json_data['MineralInventory'][0] and 'document' in json_data['MineralInventory'][0]['reference']:
            document_ref = json_data['MineralInventory'][0]['reference']['document']
            document_df = pd.DataFrame.from_dict(document_ref, orient='index').T
            return document_df
    # If the 'document' key does not exist, or if any of the necessary keys are missing, return None or raise an error
    return pd.DataFrame() 

In [185]:
def get_mineral_site(json_data):
    selected_keys = ["source_id", "name", "location_info.location","location_info.country", 
                     "location_info.crs", "location_info.state_or_province"]

    mineral_site = pd.DataFrame(select_keys(json_data, selected_keys), index=[0])
    return mineral_site

In [186]:
def get_mineral_inventory(json_data):
    minmod_commodities = read_csv_to_dict("./codes/minmod_commodities.csv")
    commodities = {}
    for key in minmod_commodities:
        commodities[key['minmod_id']] = key['CommodityinGeoKb']

    minmod_units = read_csv_to_dict("./codes/minmod_units.csv")
    correct_units = {}
    for key in minmod_units:
        correct_units[key['minmod_id']] =  key['unit name']

    # Extracting required fields with cutoff_unit and cutoff_value
    data = []
    for item in json_data['MineralInventory']:
        row = {
            "zone": item.get("zone"),
            "page_number": item["reference"]["page_info"][0]["page"],
            "commodity": commodities[item.get("commodity").split('/')[-1]],
            "category": item["category"][0].split('/')[-1],
            "ore_unit": correct_units[item["ore"]["ore_unit"].split('/')[-1]],
            "ore_value": item["ore"]["ore_value"],
            "grade_unit": correct_units[item["grade"]["grade_unit"].split('/')[-1]],
            "grade_value": item["grade"]["grade_value"],
            "cutoff_unit": correct_units[item["cutoff_grade"].get("grade_unit").split('/')[-1]] if "cutoff_grade" in item else None,
            "cutoff_value": item["cutoff_grade"].get("grade_value") if "cutoff_grade" in item else None
        }
        data.append(row)

    # Creating DataFrame
    mineral_inventory = pd.DataFrame(data)
    return mineral_inventory

In [187]:
def create_pdf_with_tables(dataframes, output_file):
    doc = SimpleDocTemplate(output_file, pagesize=letter)
    elements = []
    styles = getSampleStyleSheet()

    # Define a style for wrapped text inside table cells
    cell_style = styles['Normal']
    cell_style.wordWrap = 'CJK'  # Use 'CJK' word wrapping which wraps on any character

    for name, dataframe in dataframes.items():
        # Add title above each table
        title_style = ParagraphStyle('TitleStyle', parent=styles['Normal'], alignment=0, fontName='Helvetica-Bold')  # Left alignment and bold font
        title = Paragraph(name, title_style)
        elements.append(title)
        
        # Convert DataFrame to list of lists
        # Ensure each cell in the dataframe is wrapped with a Paragraph object
    
        if len(dataframe.columns)>0:
            table_data = [[Paragraph(str(cell), cell_style) for cell in dataframe.columns]]  # Header row
            for row in dataframe.values.tolist():
                table_data.append([Paragraph(str(cell), cell_style) for cell in row])  # Data rows

            # Calculate the width of the table to fit within the page width
            table_width = doc.width * 0.9  

            # Create table with column width calculated to fit the page

            table = Table(table_data, colWidths=[table_width / len(dataframe.columns)] * len(dataframe.columns))

            # Add style to the table
            style = TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),  # Set font to Helvetica for table data
                ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                ('GRID', (0, 0), (-1, -1), 1, colors.black),
                ('TEXTCOLOR', (0, 0), (-1, -1), colors.black),  # Set text color to black
                ('INNERGRID', (0, 0), (-1, -1), 0.25, colors.black),  # Set inner grid color
                ('BOX', (0, 0), (-1, -1), 0.25, colors.black),  # Draw a border around each cell
                ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),  # Center text vertically
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # Center text horizontally
                ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),  # Set font to Helvetica for all cells
                ('SIZE', (0, 0), (-1, -1), 9),  # Set initial font size to 9
                ('TEXTFONT', (0, 0), (-1, -1), 'Helvetica'),  # Set font to Helvetica for all cells
                ('LEADING', (0, 0), (-1, -1), 9),  # Set initial leading (line spacing) to 9
                ('BACKGROUND', (0, 0), (-1, 0), colors.gray)  # Set header row background color
            ])

            table.setStyle(style)
            elements.append(table)

            # Add space between tables
            elements.append(Spacer(1, 24))

        # Build PDF with all tables
    doc.build(elements)


In [188]:
folder_path = './extracted/extractions/'
files = collect_json_files(folder_path)
print(files)

['Hakkira_Zn_4-2011_summary_20240110_111100.json', 'Mehdiabad_Zn_3-2005_summary_20231222_093735.json', 'Horne_5_Cu_Zn_Au_Ag_10-2017_FS_summary_20240118_133338.json', 'Macmillan_Pass_Zn_Pb_Ag_7-2018_PEA_summary_20240119_130630.json', 'La_Negra_Ag_Cu_Pb_Zn_1-2015_OM_summary_20240124_115513.json', 'Pine_Point_Zn_Pb_8-2007_v_1_summary_20240118_135326.json', 'McIlvenna_Bay_Cu_Zn_Ag_Au_1-2015_PEA_summary_20240119_132335.json', 'Scotia_Pb_Zn_7-2020_summary_20240112_110458.json', 'La_Negra_Ag_Cu_Pb_Zn_5-2013_OM_summary_20240124_151004.json', 'Jubilee_Zn_Pb_11-2007_summary_20240111_154103.json', 'Hakkari_Zn_7-2013_summary_20240116_133821.json', 'Tufanbeyli_Zn_7-2013_summary_20240112_115329.json', 'Hakkari_Zn_3-2010_summary_20240109_104541.json', 'Scarlet_Zn_Pb_10-2011_summary_20240111_161945.json', 'Cozamin_Zn_Cu_Ag_8-2014_OM_summary_20240117_130906.json', 'Scotia_Pb_Zn_7-2006_summary_20240111_164607.json', 'Prairie2017.json', 'Bleiberg_Pb_Zn_5-2017_summary_20231221_101020.json', 'Penasquito_Zn

In [189]:
for file in files:
    print(f"working on file: {file}")
    file_path = folder_path + file
    filename = file[:-3]
    with open(file_path, 'r') as f:
        # Load the JSON data into a Python dictionary
        data = json.load(f)

    # Now 'data' contains the contents of the JSON file
    json_data = data['MineralSite'][0]
    document_df = get_document_ref(json_data)
    mineral_site = get_mineral_site(json_data)
    mineral_inventory = get_mineral_inventory(json_data)
    deposit_types = get_deposit_types(json_data)
    data_dict = {'Document Reference':document_df, 'Mineral Site ':mineral_site, 
                 'Mineral Inventory':mineral_inventory, 'Deposit Types': deposit_types}
    create_pdf_with_tables(data_dict, f'./created_pdf/{filename}.pdf')
    print(f"Finished file {file}")

working on file: Hakkira_Zn_4-2011_summary_20240110_111100.json
                                               title  \
0  NI 43-101 Technical Report on the Hakkari Zinc...   

                             uri  \
0  file-YzC1Hy8G0xIipW8IcbgH0fuj   

                                          authors  year month  \
0  [Mike Robertson, Dr Brendan Clarke, Mike Hall]  2011     4   

                                         description  
0  The mineral properties that form the HZP repre...  
Finished file Hakkira_Zn_4-2011_summary_20240110_111100.json
working on file: Mehdiabad_Zn_3-2005_summary_20231222_093735.json
                                        doi  \
0  https://w3id.org/usgs/z/4530692/N958IUC2   

                                             title uri month  year  
0  mehdiabad zinc project – information memorandum         3  2005  
Finished file Mehdiabad_Zn_3-2005_summary_20231222_093735.json
working on file: Horne_5_Cu_Zn_Au_Ag_10-2017_FS_summary_20240118_133338.json
         