# 04 Get Metadata

> Get metadata from files.

In [None]:
#|default_exp core.04_metadata

In [None]:
#|hide
# Export the cells marked for export in this project's notebooks to modules.
import nbdev
nbdev.nbdev_export()

In [None]:
#|hide
from nbdev.showdoc import show_doc

In [None]:
#|export
import pubcrawler as proj
from pubcrawler import const, log, utils, tools
import adu_proj.utils as adutils

In [None]:
#|export
import json
import os

import pandas as pd
import PyPDF2
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

In [None]:
#|export
# Load variables from a .env file (if present) into the environment, then
# read the OpenAI key from there rather than hard-coding it.
load_dotenv()
API_KEY = os.environ.get("OPENAI_API_KEY")

# Client used for the chat-completion requests made later in this notebook.
# Passing api_key explicitly matches the SDK default and could be omitted.
client = OpenAI(api_key=API_KEY)

We will attempt to get metadata from each file. Currently this script assumes the file is a PDF, although in the future it's possible to add new methods here for extracting data from different file types.

Passing entire documents to an LLM can be expensive, so we instead pass only a subsection (the first 5 pages).

In [None]:
#|export
# Load the JSON index of files; later cells read each entry's 'file_path'
# and attach the extracted metadata to the same entry.
with open(f'{const.pre_output_path}/data_files.json', 'r') as f:
    file_links = json.loads(f.read())

In [None]:
#|export
def extract_text_from_first_n_pages(filepath, n):
    """Return the concatenated text of the first `n` pages of a PDF.

    Args:
        filepath: Path to the PDF file; opened in binary mode.
        n: Maximum number of pages to read. If the document has fewer
            pages, all of them are read.

    Returns:
        A single string made of each page's extracted text joined with no
        separator (an empty string when no pages are read).
    """
    with open(filepath, 'rb') as pdf_handle:
        pdf = PyPDF2.PdfReader(pdf_handle)
        page_count = min(n, len(pdf.pages))
        # Extraction must happen while the file is still open.
        page_texts = [pdf.pages[idx].extract_text() for idx in range(page_count)]
    return "".join(page_texts)

In [None]:
# Preview the text extracted from the first 4 pages of the first listed file.
first_key = next(iter(file_links))
extract_text_from_first_n_pages(file_links[first_key]['file_path'], 4)

'The Shorter \nWorking Week:\nA Radical And  \nPrag matic Proposal\nEdited by \nWill Stronge and \nAidan Harper\nContributors include:\nDanielle Guizzo\nKyle Lewis\nMadeleine Ellis-Petersen\nAidan Harper\nNic Murray\nWill Stronge\nWith other  \ncontributions from:\nHelen Hester \nMatt Cole\nPublished 2019 by: \n© Autonomy\nAutonomy Research Ltd\nCranbourne\nPilcot Road\nCrookham Village\nHampshire\nGU51 1DY\nCover photograph by Ryan Tang  \nReport design by Jack Haslehurst  \nfor Autonomy\nIn collaboration with members  \nof the 4 Day Week Campaign\nThe Shorter Working Week: Jan 20192\nAutonomyTransition to a Shorter \nWorking Week\nCurrent Model of Work\nFuture Model of Work-TimeTransition \nto a  \nShorter \nWorking \nWeekTransition \nto a  \nShorter \nWorking \nWeek\nHappy \nProductive \nworkforce \nwith a healthy \nwork-life \nbalanceLow \nproductivity \ncaused by \npoor mental \nhealth and \nwellbeing\nMore rest, \nsleep, free \ntime and \nautonomyGender \ninequality \nin paid and

Let's now get metadata from this section of the PDF via function calling:

In [None]:
def get_pdf_metadata(report_text):
    """Ask the chat model for citation metadata describing a report excerpt.

    The excerpt is sent together with a single function schema
    (`generate_citation`); the arguments of the function call in the model's
    reply are parsed from JSON and returned.

    Args:
        report_text: Text excerpt of the report, interpolated into the user
            message.

    Returns:
        The decoded function-call arguments. The schema marks `title`,
        `authors`, `organisation`, `date`, `keywords` and `funders` as
        required, but the reply is not validated here, so callers should not
        assume every key is present.

    Raises:
        ValueError: If the reply contains no function call (for example when
            the model answers in plain text instead).
        json.JSONDecodeError: If the function-call arguments are not valid
            JSON.
    """
    functions = [{
        "name": "generate_citation",
        "description": "Generate a citation for report",
        "parameters": {
            "type": "object",
            "properties": {
                "title": {
                    "type": "string",
                    "description": "Report title. If unknown leave empty",
                },
                "authors": {
                    "type": "array",
                    "description": "Array of the author's full names. If unknown leave empty",
                    "items": {
                        "description": "Author's full name",
                        "type": "string"
                    }
                },
                "organisation": {
                    "type": "array",
                    "description": "Array containing the names of the research organisations that produced the report. If unknown leave empty",
                    "items": {
                        "description": "Research organisation's name",
                        "type": "string"
                    }
                },
                "date": {
                    "type": "string",
                    "description": "Date the report was published on. Try to find the day, month and year. If unknown leave empty",
                },
                "keywords": {
                    "type": "array",
                    "description": "Array of keywords that indicate the content of the report. If unknown leave empty",
                    "items": {
                        "description": "Keyword title ie 'feminism'",
                        "type": "string"
                    }
                },
                "funders": {
                    "type": "array",
                    "description": "Array of organisations that provided funding or financial support. If unknown leave empty",
                    "items": {
                        "description": "Name of funding organisation",
                        "type": "string"
                    }
                }
            },
            "required": ["title", "authors", "organisation", "date", "keywords", "funders"]
        },
    }]
    messages = [
        {"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."},
        {"role": "user", "content": f"Generate a citation for the report based on the following excerpt: {report_text}"},
    ]
    chat_response = client.chat.completions.create(
        messages=messages, functions=functions, model=const.model
    )
    # Read from `chat_response` (the name bound above); the previous code used
    # an undefined `c`, which only worked if a stale variable was in the kernel.
    function_call = chat_response.choices[0].message.function_call
    if function_call is None:
        # The model replied without calling the function, so there is nothing to parse.
        raise ValueError("Model response did not include a function call; no metadata to parse")
    return json.loads(function_call.arguments)

In [None]:
# Try the metadata extraction on the first 5 pages of the first listed file.
sample_path = file_links[next(iter(file_links))]['file_path']
get_pdf_metadata(extract_text_from_first_n_pages(sample_path, 5))

{'title': 'The Shorter Working Week: A Radical And Pragmatic Proposal',
 'authors': ['Will Stronge', 'Aidan Harper'],
 'organisation': ['Autonomy Research Ltd',
  'In collaboration with members of the 4 Day Week Campaign'],
 'date': '2019',
 'keywords': ['shorter working week',
  'work-life balance',
  'gender equality',
  'automation',
  'sustainable leisure',
  'low carbon alternatives',
  'high-skilled workforce',
  'sustainable economy'],
 'funders': ['Autonomy']}

Now we can loop through all the files getting metadata:

In [None]:
# Fields copied from the model's answer onto each file entry.
metadata_fields = ('title', 'authors', 'organisation', 'date', 'keywords', 'funders')

for key in tqdm(file_links):
    try:
        excerpt = extract_text_from_first_n_pages(file_links[key]['file_path'], 5)
        m = get_pdf_metadata(excerpt)
        # Build the complete update first: a missing field raises before
        # anything is written, so an entry is never left half-filled.
        file_links[key].update({field: m[field] for field in metadata_fields})
    except Exception as err:
        # Best effort: report the failure with its cause and move on to the
        # next file. Catching Exception (not a bare except) keeps Ctrl-C and
        # kernel interrupts able to stop the run.
        print(f"failed on {key}: {err!r}")

100%|█████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.85s/it]


In [None]:
# One row per file entry; dropna() removes any row with a missing value,
# which includes entries that never received metadata fields.
pub_metadata = pd.DataFrame(list(file_links.values())).dropna()

In [None]:
# Write the metadata table to CSV without the DataFrame index column.
pub_metadata.to_csv(
    path_or_buf=f'{const.pre_output_path}/pub_metadata.csv',
    index=False,
)