# 04 Get Metadata

> Get metadata from files.

In [None]:
#|default_exp core.04_metadata

In [None]:
#|hide
import nbdev

# Run nbdev's notebook export (hidden helper cell, not part of the module).
nbdev.nbdev_export()

In [None]:
#|hide
from nbdev.showdoc import show_doc

In [None]:
#|export
import pubcrawler as proj
from pubcrawler import const, log, utils, tools
import adu_proj.utils as adutils

In [None]:
#|export
import json
import PyPDF2
from openai import OpenAI
from dotenv import load_dotenv
import os
from tqdm import tqdm
import pandas as pd
import functools
import time

In [None]:
#|export
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# OpenAI credential read from the environment; None when the variable is unset.
API_KEY = os.environ.get("OPENAI_API_KEY")

# Module-level client used by get_pdf_metadata for chat-completion requests.
# Passing api_key explicitly mirrors the client's default lookup and could be omitted.
client = OpenAI(api_key=API_KEY)

We will attempt to get metadata from each file. Currently this script assumes the file is a PDF, although in the future it's possible to add new methods here for extracting data from different file types.

Passing entire documents to an LLM can be expensive, so we instead pass only a subsection (the first 5 pages).

In [None]:
#|export
# Load the file index produced earlier in the pipeline. Each entry is keyed
# by the file's identifier and is expected to carry at least a 'file_path'.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open(f'{const.pre_output_path}/data_files.json', 'r', encoding='utf-8') as f:
    file_links = json.load(f)

In [None]:
#|export
def extract_text_from_first_n_pages(filepath, n):
    """Return the concatenated text of the first `n` pages of a PDF.

    At most `n` pages are read (fewer when the document is shorter). The
    text extracted from each page is joined in page order with no separator.
    """
    with open(filepath, 'rb') as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        page_count = min(n, len(pdf.pages))
        # Extraction must happen while the file handle is still open.
        return "".join(pdf.pages[idx].extract_text() for idx in range(page_count))

In [None]:
# Preview the text extracted from the first four pages of the first listed file.
sample_record = file_links[next(iter(file_links))]
extract_text_from_first_n_pages(sample_record['file_path'], 4)

' \nTHE STATE OF THE CLIMATE 2021\nOle Humlum\nThe Global Warming Policy Foundation\nReport 51The State of the Climate 2021\nOle Humlum\nReport 51, The Global Warming Policy Foundation\n© Copyright 2022, The Global Warming Policy Foundation\nAbout the author\nOle Humlum is former Professor of Physical Geography at the University Centre in Svalbard, Nor -\nway, and Emeritus Professor of Physical Geography, University of Oslo, Norway.Contents\nAbout the author  ii\nGeneral overview 2021  2\n1. Air temperatures  4\nSurface: spatial pattern  4\nLower Troposphere: monthly  6\nLower Troposphere: annual means  7\nSurface: monthly  8\nSurface: annual means  10\nError, consistency and quality  11\nSurface versus lower Troposphere  14\nLower Troposphere: land versus ocean  15\nBy altitude  16\nZonal air temperatures  17\nPolar air temperatures  18\n2. Atmospheric greenhouse gases  19\nWater vapour  19\nCarbon dioxide  20\n3. Ocean temperatures  22\nRecent surface temperature anomalies  22\nBy la

Let's now get metadata from this section of the PDF via function calling:

In [None]:
#|export
import signal
from functools import wraps


class TimeoutException(Exception):
    """Raised when a call wrapped by `retry_on_timeout` runs out of time."""


def timeout_handler(signum, frame):
    """SIGALRM handler: abort the running call by raising `TimeoutException`."""
    raise TimeoutException


def retry_on_timeout(retries=3, timeout=30):
    """Build a decorator that aborts and retries a call that runs too long.

    Each attempt is limited to `timeout` seconds using `signal.alarm`. An
    attempt that times out is retried until `retries` attempts have been
    made. This relies on SIGALRM, so it only works on Unix and only when
    the wrapped function is called from the main thread.

    Args:
        retries: Maximum number of attempts.
        timeout: Per-attempt time limit in whole seconds.

    Returns:
        A decorator. The decorated function returns whatever the wrapped
        function returns on its first attempt that finishes in time.

    Raises:
        TimeoutException: From the decorated function, when every attempt
            timed out. Any other exception raised by the wrapped function
            propagates immediately without a retry.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Remember whatever handler was installed so it can be put back;
            # SIGALRM handling is process-wide state.
            previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
            try:
                for _ in range(retries):
                    signal.alarm(timeout)
                    try:
                        return func(*args, **kwargs)
                    except TimeoutException:
                        print(f"Function {func.__name__} timed out, retrying...")
                    finally:
                        # Always cancel the pending alarm, on success or failure.
                        signal.alarm(0)
            finally:
                # signal.signal() returns None for handlers that were not
                # installed from Python; those cannot be re-installed.
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)
            raise TimeoutException(f"Function {func.__name__} failed after {retries} retries")
        return wrapper
    return decorator

In [None]:
#|export
@retry_on_timeout(retries=2, timeout=30)
def get_pdf_metadata(report_text):
    """Ask the chat model for citation metadata describing a report excerpt.

    The excerpt is sent with a single function schema (`generate_citation`)
    and the model is forced to answer by calling it, so the reply arrives as
    JSON arguments rather than free text.

    Args:
        report_text: Plain-text excerpt of the report.

    Returns:
        The dict parsed from the function-call arguments. The schema asks
        for the keys 'title', 'authors', 'organisation', 'date', 'keywords'
        and 'funders'; the model's output is not validated against it here.

    Raises:
        ValueError: If the response contains no function call.
        json.JSONDecodeError: If the function-call arguments are not valid JSON.
        TimeoutException: Raised by the decorator when both attempts exceed
            30 seconds.
    """
    function_name = "generate_citation"
    functions = [{
            "name": function_name,
            "description": "Generate a citation for report",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Report title. If unknown leave empty",
                    },
                    "authors": {
                        "type": "array",
                        "description": "Array of the author's full names. If unknown leave empty",
                        "items": {
                                "description": "Author's full name",
                                "type": "string"
                            }
                    },
                    "organisation": {
                        "type": "array",
                        "description": "Array containing the names of the research organisations that produced the report. If unknown leave empty",
                        "items": {
                                "description": "Research organisation's name",
                                "type": "string"
                            }
                    },
                    "date": {
                        "type": "string",
                        "description": "Date the report was published on. Try to find the day, month and year. If unknown leave empty",
                    },
                    "keywords": {
                        "type": "array",
                        "description": "Array of keywords that indicate the content of the report. If unknown leave empty",
                        "items": {
                                "description": "Keyword title ie 'feminism'",
                                "type": "string"
                            }
                    },
                    "funders": {
                        "type": "array",
                        "description": "Array of organisations that provided funding or financial support. If unknown leave empty",
                        "items": {
                                "description": "Name of funding organisation",
                                "type": "string"
                            }
                    }
                },
                "required": ["title", "authors", "organisation", "date", "keywords", "funders"]
            },
        }]
    messages = [
        {"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."},
        {"role": "user", "content": f"Generate a citation for the report based on the following excerpt: {report_text}"},
    ]
    chat_response = client.chat.completions.create(
        messages=messages,
        functions=functions,
        # Force the model to call the function; without this it may reply in
        # plain text and leave `function_call` unset.
        function_call={"name": function_name},
        model=const.model,
    )
    function_call = chat_response.choices[0].message.function_call
    if function_call is None:
        raise ValueError("Model response did not include a function call")
    return json.loads(function_call.arguments)

In [None]:
# Run the metadata extraction end to end on the first listed file.
sample_text = extract_text_from_first_n_pages(file_links[next(iter(file_links))]['file_path'], 5)
get_pdf_metadata(sample_text)

{'title': 'The State of the Climate 2021',
 'authors': ['Ole Humlum'],
 'organisation': ['The Global Warming Policy Foundation'],
 'date': '2022',
 'keywords': [],
 'funders': ['The Global Warming Policy Foundation']}

Now we can loop through all the files, getting metadata for each one:

In [None]:
#|export
# Metadata fields copied from the model's answer onto each file record.
_METADATA_FIELDS = ('title', 'authors', 'organisation', 'date', 'keywords', 'funders')

# Enrich every file record in place with the metadata extracted from its
# first 5 pages. A file that fails is reported and skipped so one bad file
# does not abort the whole run.
for key, record in tqdm(file_links.items()):
    try:
        m = get_pdf_metadata(extract_text_from_first_n_pages(record['file_path'], 5))
        # Collect all fields before updating so that a missing field cannot
        # leave the record half-filled.
        record.update({field: m[field] for field in _METADATA_FIELDS})
    except Exception as exc:
        # `Exception` (not a bare except) keeps Ctrl-C able to stop the loop.
        print(f"failed on {key}: {exc!r}")

  3%|█▍                                                     | 8/297 [00:21<12:49,  2.66s/it]

failed on https://www.thegwpf.org/content/uploads/2023/02/Ridd-State-of-Coral-Reefs.pdf


  5%|██▉                                                   | 16/297 [00:41<13:05,  2.80s/it]

failed on https://www.thegwpf.org/content/uploads/2019/07/wind-impact.pdf


  7%|███▋                                                  | 20/297 [00:52<12:35,  2.73s/it]

failed on https://www.thegwpf.org/content/uploads/2023/07/TGoC-CN-Archive.pdf


 17%|████████▊                                           | 50/297 [12:27<2:16:15, 33.10s/it]

failed on https://www.thegwpf.org/content/uploads/2022/01/Onshore-Wind-LCOE-1.pdf


 40%|█████████████████████                                | 118/297 [16:03<09:20,  3.13s/it]

failed on https://www.ipcc.ch/site/assets/uploads/sites/2/2018/11/SR15_Chapter1_Low_Res.pdf


 71%|█████████████████████████████████████▍               | 210/297 [30:36<03:49,  2.64s/it]

failed on http://www.thegwpf.org/content/uploads/2013/02/Ridley-Lukewarmer%20Ten%20Tests.pdf


 82%|███████████████████████████████████████████▋         | 245/297 [32:11<02:24,  2.77s/it]

failed on https://www.ipsos.com/sites/default/files/ct/news/documents/2022-11/Net%20Zero%20Policies_October_2022.pdf


 84%|████████████████████████████████████████████▎        | 248/297 [32:21<02:48,  3.44s/it]

failed on https://nicholaslewis.org/wp-content/uploads/2022/09/Lewis_Objectively-combining-climate-sensitivity-evidence_2022-Clim-Dyn-Detailed-Summary.pdf


 86%|█████████████████████████████████████████████▌       | 255/297 [32:43<02:09,  3.08s/it]

failed on https://www.thegwpf.org/content/uploads/2019/12/RuthLea-Carbon-Futility.pdf


 87%|██████████████████████████████████████████████       | 258/297 [32:49<01:40,  2.58s/it]

failed on https://www.thegwpf.org/content/uploads/2019/10/IPCC-letter-Oct2019.pdf


 88%|██████████████████████████████████████████████▍      | 260/297 [32:56<01:54,  3.09s/it]

failed on https://www.thegwpf.org/content/uploads/2019/02/PNAS-Complaint_010219.pdf


 91%|████████████████████████████████████████████████▏    | 270/297 [33:23<01:15,  2.81s/it]

failed on http://www.indiaenvironmentportal.org.in/files/file/global%20temperature%20trends.pdf


 92%|████████████████████████████████████████████████▌    | 272/297 [33:28<01:03,  2.54s/it]

failed on http://www2.ametsoc.org/ams/assets/File/publications/BAMS_EEE_2013_Full_Report_high_res.pdf
failed on http://www.gwpf.net/content/uploads/2013/11/Khandekar-Extreme-Weather.pdf


 94%|█████████████████████████████████████████████████▊   | 279/297 [33:40<00:36,  2.04s/it]

failed on http://www.gwpf.net/content/uploads/2013/03/Nurse.pdf
failed on http://www.gwpf.net/content/uploads/2013/03/Lawson-Nurse.pdf


 95%|██████████████████████████████████████████████████▎  | 282/297 [33:43<00:22,  1.51s/it]

failed on http://www.gwpf.net/content/uploads/2013/05/Sir-Paul-Nurse-to-NL_30Apr13.pdf
failed on http://www.gwpf.net/content/uploads/2013/05/RS-Invitations.pdf
failed on http://www.gwpf.net/content/uploads/2013/05/Sir-Paul-Nurse-March2013.pdf
failed on http://www.gwpf.net/content/uploads/2013/05/GWPF-Background-Paper.pdf
failed on http://www.gwpf.net/content/uploads/2012/12/GWPF-letter-to-Lord-Hall.pdf
failed on http://www.gwpf.net/content/uploads/2012/10/Letter-to-Lord-Patten-29-October2012.pdf
failed on http://www.metoffice.gov.uk/media/pdf/j/j/global_temperatures_09.pdf


100%|████████████████████████████████████████████████████▊| 296/297 [34:01<00:01,  1.88s/it]

failed on https://www.thegwpf.org/content/uploads/2023/09/gwpf-open-peer-review-Kelly-et-al-Review-Draft.pdf


100%|█████████████████████████████████████████████████████| 297/297 [34:04<00:00,  6.88s/it]


In [None]:
#|export
# One row per file record. Rows with any missing value are dropped, which
# removes records that never received their metadata fields.
pub_metadata = pd.DataFrame(list(file_links.values())).dropna()

In [None]:
#|export
# Persist the metadata table next to the other pre-processing outputs,
# without writing the DataFrame index as a column.
pub_metadata.to_csv(
    path_or_buf=f'{const.pre_output_path}/pub_metadata.csv',
    index=False,
)