In [12]:
import pandas as pd
import json
import re
import csv
import os
import copy
from datetime import datetime
import glob
import numpy as np
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Spacer, Paragraph
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

In [13]:
def collect_json_files(directory):
    json_files = []
    for file_path in glob.glob(os.path.join(directory, '*.json')):
         json_files.append(os.path.basename(file_path))
    return json_files


In [14]:
def read_csv_to_dict(file_path):
    data_dict_list = []
    
    with open(file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        
        for row in csv_reader:
            data_dict_list.append(dict(row))
    
    return data_dict_list

In [15]:
def select_keys(d, keys):
    selected = {}
    for key in keys:
        value = d
        for subkey in key.split('.'):
            if isinstance(value, dict) and subkey in value:
                value = value[subkey]
            else:
                value = None
                break
        selected[key] = value
    return selected

In [16]:
 def get_deposit_types(json_data):   
    minmod_deposit_types = read_csv_to_dict("../codes/minmod_deposit_types.csv")
    deposit_id = {}
    for key in minmod_deposit_types:
        deposit_id[key['Minmod ID']] = key['Deposit type']

    data = []
    for item in json_data['deposit_type_candidate']:
        row = {
            "observed deposit type": item.get('observed_name') if 'observed_name' in item else None,
            "normalized id": deposit_id.get(item.get('normalized_uri').split('/')[-1]) if 'normalized_uri' in item and item.get('normalized_uri').split('/')[-1] in deposit_id else None
        }
        data.append(row)

    deposit_types = pd.DataFrame(data)
    return deposit_types

In [17]:
def get_document_ref(json_data):
    if 'mineral_inventory' in json_data and json_data['mineral_inventory']:
        if 'reference' in json_data['mineral_inventory'][0] and 'document' in json_data['mineral_inventory'][0]['reference']:
            document_ref = json_data['mineral_inventory'][0]['reference']['document']
            document_df = pd.DataFrame.from_dict(document_ref, orient='index').T
            return document_df
    # If the 'document' key does not exist, or if any of the necessary keys are missing, return None or raise an error
    return pd.DataFrame() 

In [18]:
def get_mineral_site(json_data):
    selected_keys = ["source_id", "name", "location_info.location","location_info.country", 
                     "location_info.crs", "location_info.state_or_province"]

    mineral_site = pd.DataFrame(select_keys(json_data, selected_keys), index=[0])
    return mineral_site

In [43]:
def get_mineral_inventory(json_data):
    minmod_commodities = read_csv_to_dict("../codes/minmod_commodities.csv")
    commodities = {}
    for key in minmod_commodities:
        commodities[key['minmod_id']] = key['CommodityinGeoKb']

    minmod_units = read_csv_to_dict("../codes/minmod_units.csv")
    correct_units = {}
    for key in minmod_units:
        correct_units[key['minmod_id']] =  key['unit name']

    # Extracting required fields with cutoff_unit and cutoff_value
    data = []
    for item in json_data['mineral_inventory']:
        # print(item)
        row = {
            "zone": item.get("zone") if "zone" in item else None,
            "page_number": item["reference"].get("page_info", [{}])[0].get("page", ""),
            "commodity": commodities[item["commodity"].split('/')[-1]] if "commodity" in item else None,
            "category": item.get("category", ""),
            "ore_unit": correct_units[item["ore"]["ore_unit"].split('/')[-1]] if "ore" in item and "ore_unit" in item["ore"] else None,
            "ore_value": item["ore"]["ore_value"] if "ore" in item and "ore_value" in item["ore"] else None,
            "grade_unit": correct_units[item["grade"]["grade_unit"].split('/')[-1]] if "grade" in item and "grade_unit" in item["grade"] else None,
            "grade_value": item["grade"]["grade_value"] if "grade" in item and "grade_value" in item["grade"] else None,
            "cutoff_unit": correct_units[item["cutoff_grade"]["grade_unit"].split('/')[-1]] if "cutoff_grade" in item and "grade_unit" in item["cutoff_grade"] else None,
            "cutoff_value": item["cutoff_grade"]["grade_value"] if "cutoff_grade" in item and "grade_value" in item["cutoff_grade"] else None
        }
        data.append(row)

    # Creating DataFrame
    mineral_inventory = pd.DataFrame(data)
    return mineral_inventory

In [44]:
def create_pdf_with_tables(dataframes, output_file):
    doc = SimpleDocTemplate(output_file, pagesize=letter)
    elements = []
    styles = getSampleStyleSheet()

    # Define a style for wrapped text inside table cells
    cell_style = styles['Normal']
    cell_style.wordWrap = 'CJK'  # Use 'CJK' word wrapping which wraps on any character

    for name, dataframe in dataframes.items():
        # Add title above each table
        title_style = ParagraphStyle('TitleStyle', parent=styles['Normal'], alignment=0, fontName='Helvetica-Bold')  # Left alignment and bold font
        title = Paragraph(name, title_style)
        elements.append(title)
        
        # Convert DataFrame to list of lists
        # Ensure each cell in the dataframe is wrapped with a Paragraph object
    
        if len(dataframe.columns)>0:
            table_data = [[Paragraph(str(cell), cell_style) for cell in dataframe.columns]]  # Header row
            for row in dataframe.values.tolist():
                table_data.append([Paragraph(str(cell), cell_style) for cell in row])  # Data rows

            # Calculate the width of the table to fit within the page width
            table_width = doc.width * 0.9  

            # Create table with column width calculated to fit the page

            table = Table(table_data, colWidths=[table_width / len(dataframe.columns)] * len(dataframe.columns))

            # Add style to the table
            style = TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),  # Set font to Helvetica for table data
                ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                ('GRID', (0, 0), (-1, -1), 1, colors.black),
                ('TEXTCOLOR', (0, 0), (-1, -1), colors.black),  # Set text color to black
                ('INNERGRID', (0, 0), (-1, -1), 0.25, colors.black),  # Set inner grid color
                ('BOX', (0, 0), (-1, -1), 0.25, colors.black),  # Draw a border around each cell
                ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),  # Center text vertically
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # Center text horizontally
                ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),  # Set font to Helvetica for all cells
                ('SIZE', (0, 0), (-1, -1), 9),  # Set initial font size to 9
                ('TEXTFONT', (0, 0), (-1, -1), 'Helvetica'),  # Set font to Helvetica for all cells
                ('LEADING', (0, 0), (-1, -1), 9),  # Set initial leading (line spacing) to 9
                ('BACKGROUND', (0, 0), (-1, 0), colors.gray)  # Set header row background color
            ])

            table.setStyle(style)
            elements.append(table)

            # Add space between tables
            elements.append(Spacer(1, 24))

        # Build PDF with all tables
    doc.build(elements)


In [45]:
folder_path = '../extracted/nine_month/tungsten/'
files = collect_json_files(folder_path)
print(files)

['Mount_Pleasant_W_Mo_Sn_4-2012_summary_20240502_205827.json', 'Mactung_W_4-2009_summary_20240502_204536.json', 'Grey_River_W_6-2007_summary_20240502_133105.json', 'Sisson_Brook_W_Mo_12-2009_summary_20240502_211058.json', 'Burnt_Hill_W_Mo_Sn_7-2002_summary_20240502_124457.json', 'Cantung_W_2-2003_summary_20240502_124838.json', 'Kalzas_W_10-2016_summary_20240502_171600.json', 'Sisson_Brook_W_Mo_3-2007_summary_20240502_210504.json', 'Burnt_Hill_W_Mo_Sn_8-2013_summary_20240501_203647.json', 'Cantung_W_9-2014_summary_20240501_212933.json', 'Sisson_Brook_W_Mo_6-2008_summary_20240502_211650.json', 'Cantung_W_8-2009_summary_20240501_215808.json', 'Burnt_Hill_W_Mo_Sn_2-2008_summary_20240429_153352.json', 'Burnt_Hill_W_Mo_Sn_9-2006_summary_20240501_203513.json', 'Burnt_Hill_W_Mo_Sn_9-2009_summary_20240501_203527.json', 'Kalzas_W_4-2008_summary_20240502_171920.json', 'Lucky_Mike_Cu_W_9-2012_summary_20240502_192009.json', 'Mount_Pleasant_W_Mo_Bi_9-2012_summary_20240502_204423.json', 'Cantung_W_12

In [46]:
for file in files:
    print(f"working on file: {file}")
    file_path = folder_path + file
    filename = file[:-5]
    
    with open(file_path, 'r') as f:
        # Load the JSON data into a Python dictionary
        data = json.load(f)

    # Now 'data' contains the contents of the JSON file
    json_data = data['MineralSite'][0]
    document_df = get_document_ref(json_data)
    mineral_site = get_mineral_site(json_data)
    mineral_inventory = get_mineral_inventory(json_data)
    deposit_types = get_deposit_types(json_data)
    data_dict = {'Document Reference':document_df, 'Mineral Site ':mineral_site, 
                 'Mineral Inventory':mineral_inventory, 'Deposit Types': deposit_types}
    create_pdf_with_tables(data_dict, f'../created_pdf/tungsten_pdfs/{filename}.pdf')
    print(f"Finished file {file}")

working on file: Mount_Pleasant_W_Mo_Sn_4-2012_summary_20240502_205827.json
Finished file Mount_Pleasant_W_Mo_Sn_4-2012_summary_20240502_205827.json
working on file: Mactung_W_4-2009_summary_20240502_204536.json
Finished file Mactung_W_4-2009_summary_20240502_204536.json
working on file: Grey_River_W_6-2007_summary_20240502_133105.json
Finished file Grey_River_W_6-2007_summary_20240502_133105.json
working on file: Sisson_Brook_W_Mo_12-2009_summary_20240502_211058.json
Finished file Sisson_Brook_W_Mo_12-2009_summary_20240502_211058.json
working on file: Burnt_Hill_W_Mo_Sn_7-2002_summary_20240502_124457.json
Finished file Burnt_Hill_W_Mo_Sn_7-2002_summary_20240502_124457.json
working on file: Cantung_W_2-2003_summary_20240502_124838.json
Finished file Cantung_W_2-2003_summary_20240502_124838.json
working on file: Kalzas_W_10-2016_summary_20240502_171600.json
Finished file Kalzas_W_10-2016_summary_20240502_171600.json
working on file: Sisson_Brook_W_Mo_3-2007_summary_20240502_210504.json
