In [19]:
import PyPDF2
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    wait_incrementing,
    wait_fixed,
    wait_exponential
)  # for exponential backoff
import openai
import time
import json
import os
from collections import defaultdict
import warnings
import re
import csv
from datetime import date
from settings import API_KEY
import numpy as np
# Ignore the specific UserWarning from openpyxl
warnings.filterwarnings(action='ignore', category=UserWarning, module='openpyxl')

openai.api_key = API_KEY

@retry(wait=wait_random_exponential(min=1, max=30), stop=stop_after_attempt(16))
def chat_completion_with_backoff(**kwargs):
    """Call ``openai.ChatCompletion.create`` with the given keyword arguments.

    The ``tenacity.retry`` decorator re-runs the call when it raises, using a
    randomized exponential wait (configured with min=1, max=30) between
    attempts and stopping after 16 attempts.

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns.
    """
    return openai.ChatCompletion.create(**kwargs)

## Update File Name For creating Json File

In [20]:
# Directory that holds the report and the file name of the report to process.
pdf_p = "./reports/mvt_zinc/"
pdf_name = "Prairie Creek Zn Pb Ag 5-2016 PFS.pdf"
# Full path of the report; built by plain concatenation, so pdf_p must end with "/".
curr_path = pdf_p + pdf_name
# Commodity name and element symbol; both are interpolated into the table
# extraction prompt, and primary_commodity is written to the mineral inventory.
primary_commodity = "Zinc"
element_sign = "Zn"

In [21]:
def search_text_in_pdf(pdf_path, target_string):
    """Return the 0-based indices of the pages whose text contains ``target_string``.

    The comparison is case-insensitive. Before comparing, the text of each page
    has its tabs turned into spaces and every whitespace run collapsed into a
    single space.
    """
    hits = []

    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)

        for index, current in enumerate(reader.pages):
            raw = current.extract_text()
            # Flatten the page so line breaks and tabs cannot split a match
            flattened = ' '.join(raw.replace("\t", " ").split()).lower()
            if target_string.lower() in flattened:
                hits.append(index)
    return hits

def get_answ(pdf_path,target_strings,model, content, pr, replace_t = False):
    """Ask the chat model a question about the pages that mention any target string.

    Args:
        pdf_path: path of the PDF report.
        target_strings: strings to look for; every page that contains at least
            one of them (as decided by ``search_text_in_pdf``) is sent to the model.
        model: chat model name passed to the completion call.
        content: text of the system message.
        pr: prompt prefix; the extracted page text is appended to it.
        replace_t: when True, tabs in the extracted text are replaced by spaces.

    Returns:
        The model reply parsed with ``json.loads``, or ``{}`` when no page
        contains any of the target strings.

    Raises:
        json.JSONDecodeError: when the model reply is not valid JSON.
    """
    # Gather the pages for ALL target strings. A set removes pages matched by
    # several targets so their text is not sent to the model more than once.
    unique_pages = set()
    for target_string in target_strings:
        unique_pages.update(search_text_in_pdf(pdf_path, target_string))
    all_matching_pages = sorted(unique_pages)
    if len(all_matching_pages)==0:
        return({})

    with open(pdf_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)
        all_text = ''
        for matching_page in all_matching_pages:
            text = pdf.pages[matching_page].extract_text()
            # Separate pages with a real newline
            all_text = all_text + '\n' + text
    if replace_t:
        all_text = all_text.replace("\t", " ")

    response = chat_completion_with_backoff(model=model, temperature=0, max_tokens=100, stop='.', messages=[
    {"role": "system", "content": content},
    {"role": "user", "content": pr+all_text},
    ])
    res = json.loads(response['choices'][0]['message']['content'])
    time.sleep(0.1)
    print(f"Here are all matching_pages to search the answer for: {all_matching_pages}")
    return(res)

#### Prompt templates

In [22]:
# System message used for the commodity, deposit-type and table-extraction requests.
content = """You are a mining assistant, knowledgable in geology and skilled in 
understanding mining reports. You can extract information about mines, ore and minerals."""

# User prompt for commodity extraction; the report text is appended after it.
# The reply is parsed with json.loads, so the format example has to show a
# complete JSON object, including its closing brace.
commodity_pr = """You are a geology expert and you are very good in understanding mining reports. You will be given 
a text from a mining report and you have to find out what are the primary commodities and secondary commodities. 
The output should be in the following format: 
{"primary commodities": [primary commodity 1, primary commodity 2], "secondary commodities": [secondary commodity 1, secondary commodity 2]}

Note that there could be no primary and secondary commodities mentioned, and in that case you should return None where appropriate.
Here is the text: 
"""
## Deposits
# Workbook with the deposit classification scheme. The unique values of its
# 'Deposit type' column are joined into one comma-separated string that is
# embedded in the prompt below as the list of allowed answers.
deposit_types_p = "./Deposit classification Scheme.xlsx"
deposit_types = ', '.join(pd.read_excel(deposit_types_p, sheet_name='Deposit classification scheme',engine='openpyxl')['Deposit type'].unique())

# User prompt for deposit-type classification. It is assembled when this cell
# runs, so it reflects the value of deposit_types at that moment; the report
# text is appended after it by the caller.
deposit_pr = """You are a geology expert and you are very good in understanding mining reports. You will be given 
a text from a mining report and you have to find out what are the deposit types in this mine. You can chose only from 
the provided list of the deposit types. You can chose one or more deposit types. If it is unknown, answer None.
The output should be in the following format: {"deposit types": [deposit type 1, deposit type 2]}

Note that there could be no deposit types mentioned, and in that case you should return None where appropriate.
Here is the list of the provided deposit types: """ + deposit_types + ", None." + """ Here is the text from the report: 
"""
#### TOC
# System message (content_toc) and user prompt (content_pr) for table-of-contents
# extraction. The prompt asks for one JSON object that maps each entry title to
# a two-item list: [section number, page]. The page text is appended after it.
content_toc = "You are a mining assistant, knowledgable in geology and skilled in understanding mining reports. You can extract information about tables of contents from reports."
content_pr = """You are a documentation expert and you can understand very well the table of contents of the mining reports.
You will be provided a table of contents and you need to understand it and return the number and page for each item in a given table of content.
The output should be in the following format: 
{"text":["number", "page"],  "text":["number", "page"]}

For example: 
{"Information Sources and References":["2.5", "7"],
"Reliance on Experts":["3.0", "7"]}

If there are no pages visible or you think there is no table of content in a text, return None.
Here is the text: 
"""
### get header
# System message (content_header) and user prompt (content_find) for checking
# whether a term is used as a section header on a page. The prompt requests a
# bare "Yes" or "No"; the caller appends the term and the page text after the
# trailing "Here is the".
content_header = """You are a mining assistant, knowledgable in geology and 
skilled in understanding mining reports. You can extract the header of the section from the
given text."""
content_find = """You are a documentation expert and you can understand very well the 
contents of the mining reports. You will be provided a section of a paper and 
you need to understand it and see if in the text the term given is used as a header 
on the given page.
The output should only given as "Yes" or "No". Here is the  
"""
## returning from the tables
# Example of the reply shape requested from the model: one nested dictionary
# per table line. It is written with single quotes, so it is not valid JSON as
# it stands; search_for_tables swaps single quotes for double quotes before
# parsing the reply.
response_example = """{'Line1': {'Zone': 'zone', 'Classification': 'classification', 'element Cut-Off': 'cut-off', 'element Tonnage': 
'tonnage', 'element Grade %': 'element % number' }, 'Line2': {'Zone': 'zone', 'Classification': 'classification', 'element Cut-Off': 
'cut-off', 'element Tonnage': 'tonnage', 'element Grade %': 'element % number'},...}"""

# User prompt for table extraction. This is an f-string: primary_commodity,
# element_sign and response_example are substituted when this cell runs, so the
# cell must be re-run after any of them changes.
table_pr = f"""You are a geology expert and you are very good in understanding mining reports. You will be given 
a text from a mining report and you have to find out what are the different combinations of Zones which is the name of a location,
classification which is either indicated or inferred, cut-off represented as a decimal, tonnage in Tonnes and 
grade given in % from the tables in the text, which will have most of the given headers that include the words
zone, classification or indicated or inferred, cut-off, tonnage, and element %. Please extract the name of the element and place it in the output
below without any additional text. Note we only care about the mineral {primary_commodity} represented by {element_sign}.

For each line in the table create a nested dictionary that follows this json file format as the response:  
{response_example}. 
"""

### Get Deposit Types

In [23]:
deposit_types_p = "./Deposit classification Scheme.xlsx"
# Read the classification sheet once and derive both comma-separated lists
# from the same frame instead of parsing the workbook twice.
deposit_scheme = pd.read_excel(deposit_types_p, sheet_name='Deposit classification scheme',engine='openpyxl')
deposit_groups = ', '.join(deposit_scheme['Deposit group'].unique())
deposit_types = ', '.join(deposit_scheme['Deposit type'].unique())

In [28]:
# print(f"Here are the deposit groups: {deposit_groups} \n\n")
# print(f"Here are the deposit types: {deposit_types}")

## Get commodities

In [25]:
# "commodit" is a stem, so pages mentioning "commodity" or "commodities" both match.
model = 'gpt-3.5-turbo'
target_strings = ["commodit"]

all_comodities = {}
res = get_answ(
    pdf_path=pdf_p + pdf_name,
    target_strings=target_strings,
    model=model,
    content=content,
    pr=commodity_pr,
)
print(res)

Here are all matching_pages to search the answer for: [216, 220, 255, 279]
{'primary commodities': ['zinc', 'lead', 'silver'], 'secondary commodities': None}


## Get deposit types

In [35]:
# Turn the comma-separated deposit-type string back into a list of names.
dtype_list = deposit_types.split(", ")

# Ad-hoc membership check: list the names to look up in test_strings.
test_strings = [""]
for test in test_strings:
    if test not in dtype_list:
        continue
    print(f"{test} is in list")

In [38]:
# Report every deposit type whose name appears in the report, together with the
# 0-based indices of the pages that mention it. Each call to search_text_in_pdf
# re-reads the whole PDF, so this loop is slow for a long list of types.
for dtype in dtype_list:
    pages = search_text_in_pdf(pdf_p+pdf_name, dtype)
    if len(pages) > 0:
        # pages is a list: format it, because str + list raises TypeError
        print(f"{dtype}: {pages}")
        print()

Looking at deposit type Residual placer tin 

Looking at deposit type Residual placer lead 

Looking at deposit type Fluvial placer gold 

Looking at deposit type Fluvial placer PGE 

Looking at deposit type Fluvial placer tin 

Looking at deposit type Fluvial placer niobium- tantalum 

Looking at deposit type Fluvial placer tungsten 

Looking at deposit type Fluvial placer REE 

Looking at deposit type Fluvial placer diamond 

Looking at deposit type Fluvial placer gemstones 

Looking at deposit type Fluvial placer garnet 

Looking at deposit type Heavy mineral sands 

Looking at deposit type Shoreline placer gold 

Looking at deposit type Paleoplacer heavy mineral sands 

Looking at deposit type Paleoplacer tin 

Looking at deposit type Paleoplacer gold ± uranium 

Looking at deposit type Bauxite 

Looking at deposit type Karst bauxite 

Looking at deposit type Laterite nickel 

Looking at deposit type Carbonatite laterite REE 

Looking at deposit type Laterite magnesite 

Looking at

In [None]:
# Send the pages that mention "Deposit type" to the model; replace_t=True turns
# tabs in the extracted text into spaces first.
target_strings = ["Deposit type"]
res = get_answ(
    pdf_path=pdf_p + pdf_name,
    target_strings=target_strings,
    model=model,
    content=content,
    pr=deposit_pr,
    replace_t=True,
)

KeyboardInterrupt: 

In [None]:
# Convert the model answer into the output shape
# {"deposit types": [{"id": 0, "name": ...}, ...]}.
# NOTE: this rebinds `deposit_types`, which earlier cells use as a
# comma-separated string, to a dict; the final JSON cell writes this dict.
# Re-run the earlier cells before using the string form again.
found_types = res.get('deposit types') if isinstance(res, dict) else None
if found_types is None:
    # get_answ returns {} when no page matches, and the prompt lets the model
    # answer None: both mean "no deposit type found".
    found_types = []
elif isinstance(found_types, str):
    # A single type returned as a bare string instead of a list
    found_types = [found_types]

deposit_types = {
    'deposit types': [{"id": idx, "name": dep} for idx, dep in enumerate(found_types)]
}
print(deposit_types)

## Get TOC

old content_pr: 
content_pr = """You are a documentation expert and you can understand very well the table of contents of the mining reports.
You will be provided a table of contents and you need to understand it and return the number and page for each item in a given table of content.
The output should be in the following format: 
{["number", "text", "page"],
["number", "text", "page"]}

For example: 
{["2.5", "Information Sources and References", "7"],
["3.0", "Reliance on Experts", "7"]}

If there are no pages visible or you think there is no table of content in a text, return None. 
Here is the text

#### Overall Steps
1. Get the table of contents
2. From the table of contents dictionary, look for Mineral Resource or whichever term you need. 
3. Create the term header and from the given start page look for where we have the term header to get the offset
4. With the offset, find where the tables are starting from that page to maybe the next section(?) or last page we see that table

Notes: we still need to determine how to stop duplicates from being added, the best way to build a search term
for use within the document, and the best extraction method before adding the results to the CSV.

- We should also think about the best way to search for the header: I think we can take the section number and the term from the TOC and use them to search for the start page, so ChatGPT is not needed for this step. Problems with the current approach: the page text is not always identical to the TOC text, and there can be spacing differences. We should try a cosine-similarity comparison, or clean up the extracted text so that spacing issues are ignored.

In [None]:
def is_json_compatible(string):
    """Return True when ``string`` can be parsed by ``json.loads``, False otherwise.

    Input that is not text at all (for example ``None``) also yields False
    instead of propagating the ``TypeError`` raised by ``json.loads``.
    """
    try:
        json.loads(string)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers non-text input
        return False
    return True

def get_toc(file_path, max_pages=8):
    """Extract the table of contents from the first pages of a PDF report.

    Each of the first ``max_pages`` pages (fewer when the report is shorter) is
    sent to the chat model together with the ``content_pr`` prompt. Every reply
    that parses as a JSON object is merged into the result; later pages
    overwrite entries with the same title.

    Args:
        file_path: path of the PDF report.
        max_pages: number of leading pages to scan (default 8).

    Returns:
        dict with the merged replies; per the prompt, each title is expected to
        map to ``[section number, page]``. Empty when no page yields an object.
    """
    all_res = {}
    model = 'gpt-4'
    with open(file_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)

        # Never index past the last page of a short report
        for i in range(min(max_pages, len(pdf.pages))):
            print(f"on page {i}")
            text = pdf.pages[i].extract_text()
            response = chat_completion_with_backoff(model=model, temperature=0, max_tokens=2000, stop='', messages=[
                {"role": "system", "content": content_toc},
                {"role": "user", "content": content_pr + text},
                ])
            res = response['choices'][0]['message']['content']
            try:
                ans = json.loads(res)
            except (TypeError, ValueError):
                # Reply is not JSON (e.g. a plain "None"): no TOC on this page
                continue
            # Valid JSON that is not an object (null, a list, ...) has no entries to merge
            if isinstance(ans, dict):
                all_res.update(ans)
    return all_res

In [None]:
# Build the table-of-contents dictionary for the report and show it.
all_res = get_toc(curr_path)
print(all_res)

## Section to Search or Filter the document for the Mineral Resource Header
Note: We are assuming that the report uses the NUMBER SECTION + TITLE in the report to refer to the start of a new section

In [None]:
## Need to get offset
def get_offset(term, start_page, curr_file):
    """Find the PDF page on which ``term`` is used as a section header.

    The pages with index ``start_page - 1`` through ``start_page + 9`` (clipped
    to the pages that exist) are shown to the chat model one at a time, after
    whitespace normalization and lower-casing, until it answers "Yes".

    Args:
        term: header text to look for.
        start_page: page number of the section as an integer.
        curr_file: path of the PDF report.

    Returns:
        ``(start_page, page_index, page_index - start_page)`` for the first page
        the model accepts, or ``(start_page, None, None)`` when none is accepted.
    """
    model = 'gpt-4'
    with open(curr_file, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)

        # Clip the window: a negative index would wrap around to the end of the
        # document and an index past the last page would raise IndexError.
        first_index = max(start_page - 1, 0)
        last_index = min(start_page + 10, len(pdf.pages))
        for i in range(first_index, last_index):
            text = pdf.pages[i].extract_text()
            text_new = ' '.join(text.replace("\t", " ").split()).lower()
            response = chat_completion_with_backoff(model=model, temperature=0, max_tokens=2000, stop='', messages=[
                {"role": "system", "content": content_header},
                {"role": "user", "content": content_find+f"Term: {term} and here is the text" + text_new},
                ])
            res = response['choices'][0]['message']['content']
            # Accept harmless variations of the answer such as 'Yes.' or '"yes"'
            if (res or '').strip().strip('."\'').lower() == "yes":
                return (start_page, i, i-start_page)
    return (start_page,None, None)
    
def is_int(value):
    """Return True when ``int(value)`` succeeds, False otherwise.

    Values that ``int`` rejects with ``TypeError`` (such as ``None``) or
    ``OverflowError`` (an infinite float) count as "not an int" too.
    """
    try:
        int(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
    
def find_header_pg(pdf_path, target_string):
    """Return the 0-based index of the first page containing ``target_string``.

    The search starts at page index 7. It is case-insensitive and runs on page
    text whose tabs and whitespace runs were collapsed into single spaces.
    Returns 0 when no page matches.
    """
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        total_pages = len(reader.pages)

        index = 7
        while index < total_pages:
            raw = reader.pages[index].extract_text()
            flattened = ' '.join(raw.replace("\t", " ").split()).lower()
            if target_string.lower() in flattened:
                return index
            index += 1
        # 0 is the "not found" marker checked by the calling cell
        return 0
            
def get_correct_pages(term_list, curr_path, use_section=True):
    """Map every title in ``term_list`` to the page found by ``find_header_pg``.

    Args:
        term_list: list of dicts; each maps a title to a sequence whose first
            item is the section number.
        curr_path: path of the PDF report.
        use_section: when True the search string is "<section number> <title>",
            otherwise it is the title alone.

    Returns:
        dict mapping each title to the value returned by ``find_header_pg``.
    """
    resolved = {}

    for entry in term_list:
        for title, toc_fields in entry.items():
            search_term = toc_fields[0] + " " + title if use_section else title
            resolved[title] = find_header_pg(curr_path, search_term)
    return resolved

In [None]:
# Keep only the TOC entries whose title mentions "mineral resource"; each one
# becomes a single-item dict keyed by the lower-cased title.
mineral_res = [
    {title.lower(): toc_fields}
    for title, toc_fields in all_res.items()
    if "mineral resource" in title.lower()
]

print(mineral_res)

In [None]:
# Resolve every "mineral resource" TOC entry to a page index in the PDF.
# Strategy: use the first entry to measure the offset between the page number
# in the TOC and the PDF page index (via the chat model); when that is not
# possible, fall back to searching the PDF for the header text.
if not mineral_res:
    raise ValueError("No 'mineral resource' entry was found in the table of contents")

term = list(mineral_res[0].keys())[0]
section = mineral_res[0][term][0]
header_term = section + " " + term
og_pg =  mineral_res[0][term][1]

correct_pages = {}
offset = None
if is_int(og_pg):
    print("Using Chat GPT to Find OFFSET")
    og_pg = int(og_pg)
    ## Note need to use header_term to find correct section
    og_pg, new_pg, offset = get_offset(header_term, og_pg, curr_path)
    print(f"Found the offset: {offset}")

if offset is not None:
    for inner_dict in mineral_res:
        for title in inner_dict:
            old_pg = inner_dict[title][1]
            # Entries without an integer page get the "not found" marker 0 and
            # are handled by the fallback below.
            correct_pages[title] = int(old_pg) + offset if is_int(old_pg) else 0
else:
    print("Using String Matching to Find OFFSET")
    correct_pages = get_correct_pages(mineral_res, curr_path)

## trying to see if there were missing values
# Retry ONLY the titles that were not found, searching for the title without
# the section number; pages that were already resolved are left untouched.
missing_titles = [title for title, page in correct_pages.items() if page == 0]
if missing_titles:
    addn_dict = get_correct_pages(mineral_res, curr_path, use_section = False)
    for title in missing_titles:
        correct_pages[title] = addn_dict[title]

print(correct_pages)

## Searching within the table:
- Classification (Indicated, Inferred...)
- Cut-off
- Tonnage
- Grade

### always look in mineral resource
- look at the page number offsets
- compare to the TOC
- Find the summary and then do the offset

## Use Chat GPT to search for the table

In [None]:
def search_for_tables(pdf_path, start_page, num_pages=4):
    """Extract resource-table rows from consecutive pages with the chat model.

    Each page from index ``start_page`` up to ``start_page + num_pages - 1``
    (clipped to the pages that exist) is sent with the ``table_pr`` prompt. The
    first ``{...}`` span of the reply has its single quotes replaced by double
    quotes and is parsed as JSON; every row (inner dictionary) gets a
    ``page_num`` entry holding the page index + 1.

    Args:
        pdf_path: path of the PDF report.
        start_page: 0-based index of the first page to scan.
        num_pages: number of consecutive pages to scan (default 4).

    Returns:
        dict whose keys are tuples of the row values (in the order the model
        returned them, followed by ``page_num``) and whose values are "seen".
        Identical rows are therefore kept once.
    """
    uniq_dict = {}
    model = 'gpt-4'
    #model = 'gpt-3.5-turbo'
    # Open and parse the PDF once for the whole page window
    with open(pdf_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)
        first_index = max(start_page, 0)
        # Stop at the last page instead of raising IndexError
        last_index = min(start_page + num_pages, len(pdf.pages))
        for page_num in range(first_index, last_index):
            text = pdf.pages[page_num].extract_text()
            response = chat_completion_with_backoff(model=model, temperature=0, max_tokens=2000, stop='', messages=[
                {"role": "system", "content": content},
                {"role": "user", "content": table_pr + text},
                ])
            res = response['choices'][0]['message']['content']
            match = re.search(r'\{.*\}', res, re.DOTALL)
            if not match:
                continue
            # The prompt example uses single quotes, which JSON does not accept
            extracted_content = match.group(0).replace("'", '"')
            try:
                ans = json.loads(extracted_content)
            except ValueError:
                continue
            if not isinstance(ans, dict):
                continue
            for inner_dict in ans.values():
                if not isinstance(inner_dict, dict):
                    # Not a table row in the requested shape
                    continue
                inner_dict['page_num']= page_num + 1
                uniq_dict.setdefault(tuple(inner_dict.values()), "seen")
    return uniq_dict

In [None]:
# Merge the rows found under every header; rows already collected are kept once.
overall_dict = {}
for header, first_page in correct_pages.items():
    temp = search_for_tables(curr_path, first_page)
    print(f"\n Here is the dictionary from header {header}: \n {temp}\n")
    for key in temp:
        overall_dict.setdefault(key, 'seen')

In [None]:
# Display the merged table rows (keys are row tuples, values are 'seen').
overall_dict

In [None]:
def _to_float(value):
    """Return ``value`` as a float, or None when it is not a plain number.

    Thousands separators, a percent sign and surrounding blanks are ignored.
    """
    try:
        return float(str(value).replace(',', '').replace('%', '').strip())
    except ValueError:
        return None


# Build the "MineralInventory" section of the output from the extracted rows.
# Each key of overall_dict is a tuple laid out as
# (zone, classification, cut-off, tonnage, grade, ..., page_num).
mineral_inventory = {}
mineral_inventory['MineralInventory'] = []
idx = 0
today = date.today().strftime("%Y-%m-%d")

for inner_sec in overall_dict:
    if len(inner_sec) < 6:
        # The model left out a field, so the positions cannot be trusted
        print(f"Skipping incomplete row: {inner_sec}")
        continue
    # The zone is read for positional alignment only; it is not written out
    _zone, category, cut_off, tonnage, grade = inner_sec[:5]
    # page_num is always the last item appended by search_for_tables
    page_num = inner_sec[-1]
    tonnage = str(tonnage).replace(',', '')

    tonnage_value = _to_float(tonnage)
    grade_value = _to_float(grade)
    if tonnage_value is None or grade_value is None:
        # Non-numeric tonnage or grade: keep the row, leave the product empty
        contained_metal = None
    else:
        # The grade is a percentage, so divide by 100 to get the metal tonnage
        contained_metal = tonnage_value * grade_value / 100

    inner_dict = {
        "id": idx,
        'commodity': primary_commodity,
        'category': category,
        'ore': tonnage,
        'grade': {'grade_unit': 'percent', 'grade_value': grade},
        'cutoff_grade': cut_off,
        'contained_metal': contained_metal,
        'reference': {'page': page_num},
        'date': today,
    }
    mineral_inventory['MineralInventory'].append(inner_dict)
    idx += 1

In [None]:
# mineral_inventory

In [None]:
## create json file
# Write the deposit types and the mineral inventory as a two-item JSON list in
# the working directory, named after the report. splitext removes the
# extension whatever its length (".pdf" gives the same name as before).
combined = [deposit_types, mineral_inventory]
json_name = os.path.splitext(pdf_name)[0] + ".json"
with open(json_name, "w") as outfile:
    json.dump(combined, outfile)
    