# 04 Get Metadata

> Get metadata from files.

In [None]:
#|default_exp core.04_metadata

In [None]:
#|hide
# Run nbdev's export step from inside the notebook.
import nbdev
nbdev.nbdev_export()

In [None]:
#|hide
from nbdev.showdoc import show_doc

In [None]:
#|export
import pubcrawler as proj
from pubcrawler import const, log, utils, tools
import adu_proj.utils as adutils

In [None]:
#|export
import json
import PyPDF2
from openai import OpenAI
from dotenv import load_dotenv
import os
from tqdm import tqdm
import pandas as pd
import functools
import time
import signal
from functools import wraps

In [None]:
#|export
# Load settings via python-dotenv, then build the shared OpenAI client from
# the OPENAI_API_KEY environment variable.
load_dotenv()
API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)

We will attempt to get metadata from each file. Currently this script assumes the file is a PDF, although in the future it's possible to add new methods here for extracting data from different file types.

Passing entire documents to an LLM can be expensive so we instead only pass a subsection (first 5 pages). 

In [None]:
#|export
# Read data_files.json from const.pre_output_path and parse it into
# `file_links`.
with open(f'{const.pre_output_path}/data_files.json', 'r') as f:
    file_links = json.loads(f.read())

In [None]:
#|export
def extract_text_from_first_n_pages(filepath, n):
    """Return the concatenated text of the first `n` pages of a PDF.

    Args:
        filepath: Path of the PDF file to read.
        n: Maximum number of leading pages to extract. If the document has
            fewer pages, all of them are used.

    Returns:
        The extracted page texts joined in page order with no separator.
        A page that yields no text contributes an empty string.
    """
    with open(filepath, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_count = min(n, len(reader.pages))
        # `or ""` guards against a page whose extraction returns None, which
        # would otherwise make the concatenation raise TypeError. The join is
        # evaluated before the file is closed.
        return "".join(
            reader.pages[i].extract_text() or "" for i in range(page_count)
        )

Let's now get metadata from this section of the PDF via function calling:

In [None]:
#|export
class TimeoutException(Exception):
    """Raised when a call exceeds its time budget or all retries time out."""


def timeout_handler(signum, frame):
    """SIGALRM handler: abort the running call by raising TimeoutException."""
    raise TimeoutException


def retry_on_timeout(retries=3, timeout=30):
    """Build a decorator that time-limits a function and retries on timeout.

    Each attempt is interrupted with SIGALRM once `timeout` seconds have
    elapsed. The mechanism relies on `signal.SIGALRM`, so it needs a platform
    that provides that signal and must be invoked from the main thread
    (a restriction of `signal.signal`).

    Args:
        retries: Number of attempts made before giving up.
        timeout: Seconds allowed per attempt; may be fractional. A value of 0
            disables the timer, so the attempt is not time-limited.

    Returns:
        A decorator. The decorated function returns whatever the wrapped
        function returns on its first attempt that does not time out.

    Raises:
        TimeoutException: From the decorated function, when every attempt
            timed out. Any other exception raised by the wrapped function
            propagates immediately without a retry.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Remember whatever handler was active so it can be put back once
            # this call is finished, instead of leaving ours installed.
            previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
            try:
                for _ in range(retries):
                    # setitimer (unlike alarm) accepts fractional seconds.
                    signal.setitimer(signal.ITIMER_REAL, timeout)
                    try:
                        return func(*args, **kwargs)
                    except TimeoutException:
                        print(f"Function {func.__name__} timed out, retrying...")
                    finally:
                        # Always disarm the timer so it cannot fire later.
                        signal.setitimer(signal.ITIMER_REAL, 0)
            finally:
                # signal.signal() returns None for handlers that were not
                # installed from Python; those cannot be re-installed.
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)
            raise TimeoutException(f"Function {func.__name__} failed after {retries} retries")
        return wrapper
    return decorator

In [None]:
#|export
@retry_on_timeout(retries=2, timeout=30)
def get_pdf_metadata(report_text):
    """Ask the chat model for citation metadata describing a report excerpt.

    The excerpt is sent to `gpt-4` together with a `generate_citation`
    function schema, and the arguments of the resulting function call are
    parsed from JSON. The call is wrapped by `retry_on_timeout`, so each
    attempt is limited to 30 seconds and up to 2 attempts are made.

    Args:
        report_text: Text excerpt of the report to describe.

    Returns:
        The decoded function-call arguments. The schema asks the model for
        the keys `title`, `authors`, `organisation`, `date`, `keywords` and
        `funders`; the reply is not validated against it here.

    Raises:
        ValueError: If the response carries no function call, or if its
            arguments are not valid JSON (`json.JSONDecodeError`).
        TimeoutException: If every attempt timed out.
    """
    functions = [{
            "name": "generate_citation",
            "description": "Generate a citation for report",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Report title. If unknown leave empty",
                    },
                    "authors": {
                        "type": "array",
                        "description": "Array of the author's full names. If unknown leave empty",
                        "items": {
                                "description": "Author's full name",
                                "type": "string"
                            }
                    },
                    "organisation": {
                        "type": "array",
                        "description": "Array containing the names of the research organisations that produced the report. If unknown leave empty",
                        "items": {
                                "description": "Research organisation's name",
                                "type": "string"
                            }
                    },
                    "date": {
                        "type": "string",
                        "description": "Date the report was published on. Try to find the day, month and year. If unknown leave empty",
                    },
                    "keywords": {
                        "type": "array",
                        "description": "Array of keywords that indicate the content of the report. If unknown leave empty",
                        "items": {
                                "description": "Keyword title ie 'feminism'",
                                "type": "string"
                            }
                    },
                    "funders": {
                        "type": "array",
                        "description": "Array of organisations that provided funding or financial support. If unknown leave empty",
                        "items": {
                                "description": "Name of funding organisation",
                                "type": "string"
                            }
                    }
                },
                "required": ["title", "authors", "organisation", "date", "keywords", "funders"]
            },
        }]
    messages = [
        {"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."},
        {"role": "user", "content": f"Generate a citation for the report based on the following excerpt: {report_text}"},
    ]
    chat_response = client.chat.completions.create(
        messages=messages,
        functions=functions,
        # Force the reply to come through the function. Left on "auto", the
        # model may answer in plain text (the system prompt even invites it
        # to ask for clarification), which leaves nothing to parse.
        function_call={"name": "generate_citation"},
        model='gpt-4',
    )
    function_call = chat_response.choices[0].message.function_call
    if function_call is None:
        # Fail with a readable message rather than an AttributeError on None.
        raise ValueError("Model response did not include a generate_citation function call")
    return json.loads(function_call.arguments)

Now we can loop through all the files getting metadata:

In [None]:
#|export
# Enrich every file record with the metadata the model extracts from the
# first 5 pages of its PDF. This is best effort: a failure on one file is
# reported and the loop moves on to the next.
for key, record in tqdm(file_links.items()):
    try:
        m = get_pdf_metadata(extract_text_from_first_n_pages(record['file_path'], 5))
        # Build the whole update first so a missing field cannot leave the
        # record half-populated.
        record.update({
            field: m[field]
            for field in ('title', 'authors', 'organisation', 'date', 'keywords', 'funders')
        })
    except Exception as e:
        # `Exception` rather than a bare `except`, so Ctrl-C can still stop
        # the run; the cause is printed so failures can be diagnosed.
        print(f"failed on {key}: {e!r}")

 12%|██████▌                                               | 23/190 [04:36<41:02, 14.74s/it]

failed on https://www.homeless.org.uk/sites/default/files/site-attachments/Homeless%20Link%20and%20St%20Mungo%20briefing%20on%20Universal%20Credit%20-%20money%20management%20debate.pdf
failed on https://hummedia.manchester.ac.uk/institutes/cresc/research/WDTMG%20FINAL%20-01-3-2016.pdf


 17%|█████████▍                                            | 33/190 [05:58<31:13, 11.93s/it]

failed on https://orca.cf.ac.uk/18439/1/Quality%20%26%20Independence%20of%20British%20Journalism.pdf


 28%|███████████████                                       | 53/190 [09:06<25:49, 11.31s/it]

failed on https://www.climatechangecommunication.org/wp-content/uploads/2019/10/America_Misled.pdf


 32%|█████████████████                                     | 60/190 [10:12<20:19,  9.38s/it]

failed on http://www.fatherhoodinstitute.org/wp-content/uploads/2021/05/Lockdown-Fathers-Full-Report.pdf


 33%|█████████████████▉                                    | 63/190 [10:40<20:11,  9.54s/it]

failed on https://static1.squarespace.com/static/6281708e8ff18c23842b1d0b/t/6283204b3556a5125ce13b37/1652760661661/The+Carbon+Bankroll+Report+%285-17-2022%29.pdf


 34%|██████████████████▍                                   | 65/190 [10:50<16:01,  7.69s/it]

failed on http://www.bath.ac.uk/cds/publications/bdp55.pdf


 44%|███████████████████████▊                              | 84/190 [14:11<16:14,  9.19s/it]

failed on https://economicsecurityproject.org/wp-content/uploads/2303-Cash-Policy-Talking-Points.pdf


 47%|█████████████████████████▎                            | 89/190 [15:54<31:38, 18.80s/it]

failed on https://autonomy.work/wp-content/uploads/2023/02/4DW-UK-pilot.pdf


 55%|█████████████████████████████▎                       | 105/190 [18:51<28:12, 19.91s/it]

failed on https://www.feps-europe.eu/attachments/publications/platform%20work%20in%20the%20uk%202016-2019%20v3-converted.pdf


 57%|██████████████████████████████▏                      | 108/190 [19:07<15:26, 11.30s/it]

failed on https://publications.parliament.uk/pa/ld201213/ldselect/ldpublic/140/140.pdf
failed on https://www.hse.gov.uk/statistics/causdis/stress.pdf


 58%|██████████████████████████████▉                      | 111/190 [19:08<07:29,  5.69s/it]

failed on https://autonomy.work/wp-content/uploads/2020/10/FINAL-Platforms-We-Need-Platforms-We-Want.pdf


 59%|███████████████████████████████▏                     | 112/190 [19:19<08:38,  6.65s/it]

failed on https://www.compassonline.org.uk/wp-content/uploads/2019/03/Compass_BasicIncomeForAll_2019.pdf


 62%|████████████████████████████████▋                    | 117/190 [20:15<13:23, 11.01s/it]

failed on https://progressiveeconomyforum.com/wp-content/uploads/2019/08/PEF_Skidelsky_How_to_achieve_shorter_working_hours.pdf
failed on http://www.carersuk.org/images/News__campaigns/CUK_State_of_Caring_2019_Report.pdf
failed on https://www.skillsforcare.org.uk/adult-social-care-workforce-data/Workforce-intelligence/documents/State-of-the-adult-social-care-sector/State-of-Report-2019.pdf


 72%|██████████████████████████████████████▏              | 137/190 [24:13<11:45, 13.30s/it]

failed on http://www.hse.gov.uk/statistics/causdis/stress.pdf


 74%|███████████████████████████████████████              | 140/190 [24:37<08:46, 10.53s/it]

failed on https://www2.deloitte.com/content/dam/Deloitte/uk/Documents/public-sector/deloitte-uk-mental-health-employers-monitor-deloitte-oct-2017.pdf
failed on http://eprints.whiterose.ac.uk/125589/3/VERSION%20FOR%20ARCHIVING.pdf
failed on https://www.greenparty.org.uk/assets/files/Elections/Green%20Party%20Manifesto%202019.pdf
failed on https://assets-global.website-files.com/5da42e2cae7ebd3f8bde353c/5dda924905da587992a064ba_Conservative%202019%20Manifesto.pdf


 77%|████████████████████████████████████████▋            | 146/190 [24:57<04:14,  5.79s/it]

failed on https://www.nfer.ac.uk/publications/NUFS05/NUFS05.pdf


 79%|█████████████████████████████████████████▊           | 150/190 [25:47<07:47, 11.69s/it]

failed on http://www.labour.org.uk/page/-/PDFs/9472_Alternative%20Models%20of%20Ownership%20all_v4.pdf


 83%|████████████████████████████████████████████         | 158/190 [26:58<05:40, 10.65s/it]

failed on https://www.release.org.uk/sites/default/files/pdf/publications/SEX_WORKERS_%26_THE_LAW_2017.pdf


 86%|█████████████████████████████████████████████▍       | 163/190 [27:38<04:10,  9.28s/it]

failed on http://libcom.org/files/Queering%20Anarchism,%20Essays%20on%20Gender,%20Power%20and%20Desire%20-%20Deric%20Shannon,%20Abbey%20Volcano%20et%20al.pdf
failed on https://www.cambridge.org/core/services/aop-cambridge-core/content/view/7B5E10EEC0F410D3A52371CC099D6445/9781108471930c5_69-82.pdf/sex_work_in_a_postwork_imaginary_on_abolitionism_careerism_and_respectability.pdf


 89%|███████████████████████████████████████████████▏     | 169/190 [28:15<02:37,  7.52s/it]

failed on https://www.livingwage.org.uk/sites/default/files/Living%20Hours%20Final%20Report%20110619.pdf


 91%|████████████████████████████████████████████████▎    | 173/190 [28:37<01:49,  6.43s/it]

failed on http://www.hse.gov.uk/statistics/overall/hssh1718.pdf
failed on http://www.compassonline.org.uk/wp-content/uploads/2019/03/Compass_BasicIncomeForAll_2019.pdf


 95%|██████████████████████████████████████████████████▏  | 180/190 [29:28<01:18,  7.88s/it]

failed on http://www.autonomyinstitute.org/wp-content/uploads/2017/07/Work-and-gender-V6.pdf


 97%|███████████████████████████████████████████████████▌ | 185/190 [30:03<00:40,  8.10s/it]

failed on http://www.datascienceassn.org/sites/default/files/The%20Scored%20Society%20-%20Due%20Process%20for%20Automated%20Predictions.pdf


100%|█████████████████████████████████████████████████████| 190/190 [30:31<00:00,  9.64s/it]

failed on http://www.autonomyinstitute.org/wp-content/uploads/2017/07/Precarious-work-V8.pdf
failed on http://www.autonomyinstitute.org/wp-content/uploads/2017/07/Automation-V5.pdf





In [None]:
#|export
# One row per file record; rows with any missing value are discarded.
pub_metadata = pd.DataFrame(list(file_links.values())).dropna()

In [None]:
#|export
# Write the metadata table to CSV without the DataFrame index column.
pub_metadata.to_csv(
    f'{const.pre_output_path}/pub_metadata.csv',
    index=False,
)