# Function definitions

In [1]:
import fitz
import re
import pandas as pd

In [2]:
def get_convocatory_page(doc, letters_size, letters_font):
    """
    Build a dictionary telling on which page each convocatory section starts.

    doc -> pdf document; iterating it yields pages that expose get_text("dict")
    letters_size -> Size of the letters of the convocatory titles
    letters_font -> Font of the letters of the convocatory titles

    Returns a dictionary mapping the convocatory ID (the title text found before the
    first ":", without surrounding whitespace) to the 1-based page number on which a
    matching title was found. When the same ID matches more than once, the last page wins.
    """
    def iter_spans(page):
        # Flatten the blocks -> lines -> spans hierarchy; blocks without "lines" contribute nothing
        for block in page.get_text("dict")["blocks"]:
            for line in block.get("lines", []):
                yield from line["spans"]

    start_pages = {}
    page_number = 0

    for page in doc:
        page_number += 1  # pages are counted from 1

        for span in iter_spans(page):
            span_size = span["size"]
            span_font = span["font"]
            span_text = span["text"]

            # A title has the expected size and font and mentions "HORIZON"
            if span_size != letters_size or span_font != letters_font:
                continue
            if "HORIZON" not in span_text:
                continue

            # The convocatory ID is whatever precedes the first ":" of the title
            conv_id = span_text.partition(":")[0].strip()
            start_pages[conv_id] = page_number

    return start_pages

In [3]:
def get_convocatory_text(doc, convocatory_pages, body_size):
    """
    Function for obtaining a dictionary which stores the text for each of the convocatories

    doc -> pdf document (must support len(doc) and doc.load_page(page_index))
    convocatory_pages -> dictionary which stores at which (1-based) page each convocatory starts,
                         in the order in which the convocatories appear in the document
    body_size -> Size of the letters of the body text; spans with any other size are discarded

    Returns a dictionary mapping each convocatory ID to its text, one span per line
    (joined with "\\n"). An empty `convocatory_pages` yields an empty dictionary.
    """
    conv_ids = list(convocatory_pages)
    if not conv_ids:
        # Nothing was detected in the document, so there is no text to collect
        return {}

    convocatory_texts = {}

    # Pair every convocatory with the one that follows it; the last one has no successor (None)
    for init_conv, final_conv in zip(conv_ids, conv_ids[1:] + [None]):
        init_page = convocatory_pages[init_conv]
        # The section ends on the page where the next one starts, or at the end of the document
        final_page = convocatory_pages[final_conv] if final_conv is not None else len(doc)

        text = []
        inside_section = False  # becomes True once the heading of init_conv has been seen

        # Stored page numbers are 1-based while load_page expects 0-based indexes
        for page_num in range(init_page - 1, final_page):
            page = doc.load_page(page_num)

            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", []):
                    for fragment in line["spans"]:
                        text_content = fragment["text"]

                        # The section starts at its own heading (also for the last convocatory,
                        # so text located before the heading on the same page is left out) ...
                        if init_conv in text_content:
                            inside_section = True
                        # ... and stops at the heading of the following convocatory, if there is one
                        elif final_conv is not None and final_conv in text_content:
                            inside_section = False

                        # Keep only body-sized text that belongs to the section
                        if inside_section and fragment["size"] == body_size:
                            text.append(text_content)

        convocatory_texts[init_conv] = "\n".join(text)

    return convocatory_texts

In [5]:
def _collect_until(lines, start, stop_markers):
    """Return the lines from index `start` up to (excluding) the first one containing any stop marker."""
    collected = []
    for candidate in lines[start:]:
        if any(marker in candidate for marker in stop_markers):
            break
        collected.append(candidate)
    return collected


def get_df_convocatory(convocatory_texts):
    """
    Function for creating the dataset containing all the necessary data for each of the convocatories

    convocatory_texts -> dictionary containing the text for each of the convocatories
                         (lines separated by "\\n")

    Returns a DataFrame with the columns "Convocatory", "Budget", "Type of Action",
    "Expected Outcome" and "Scope", one row per convocatory. Convocatories for which
    none of the four sections could be extracted are left out.
    """
    columns = ["Convocatory", "Budget", "Type of Action", "Expected Outcome", "Scope"]

    # Budget section starts at "The Commission" and ends at "Indicative"
    init_budget = "The Commission"
    budget_stops = ("Indicative",)

    # Type of action section starts right after "Type of Action" and ends at any of these markers
    init_action = "Type of Action"
    action_stops = (
        "Eligibility ", "Admissibility ", "Expected Outcome:", "Procedure", "Technology",
        "Legal", "Award", "Security", "Expected Outcomes:",
    )

    # Expected outcome section starts at "Expected Outcome(s):" and ends at "Scope:"
    outcome_starts = ("Expected Outcome:", "Expected Outcomes:")
    outcome_stops = ("Scope:",)

    # Scope section starts at "Scope:" and ends at any of these markers
    init_scope = "Scope:"
    scope_stops = ("HORIZON", "Call", "Destination", "Other")

    # Budget expressed as "EUR x.xx [and y.yy] million" or "around EUR x.xx million"
    budget_pattern = re.compile(
        r"EUR(?: contribution per project)? (\d+\.\d{2}) (?:and (\d+\.\d{2}) )?million|\baround EUR (\d+\.\d{2}) million"
    )

    records = []  # one dictionary per kept convocatory; the DataFrame is built once at the end

    for conv, text in convocatory_texts.items():  # iterate through all convocatories texts
        budget, action, outcome, scope = [], [], [], []
        lines = text.split("\n")

        for i, line in enumerate(lines):  # iterate through all the lines of the text
            if init_budget in line:
                budget.extend(_collect_until(lines, i, budget_stops))

            if init_action in line:
                # the marker line itself is not part of the type of action
                action.extend(_collect_until(lines, i + 1, action_stops))

            if any(marker in line for marker in outcome_starts):
                outcome.extend(_collect_until(lines, i, outcome_stops))

            if init_scope in line:
                scope.extend(_collect_until(lines, i, scope_stops))
                # the scope is the last section of interest, so stop scanning this text
                break

        # reduce the budget text to the matched amount, keeping the raw text when nothing matches
        budget_text = "".join(budget)
        match = budget_pattern.search(budget_text)
        budget_value = match.group() if match else budget_text

        action_text = "".join(action)
        # strip the section label in both its plural and singular form
        outcome_text = (
            "".join(outcome)
            .replace("Expected Outcomes: ", "")
            .replace("Expected Outcome: ", "")
        )
        scope_text = "".join(scope).replace("Scope: ", "")

        # skip convocatories for which no section at all could be extracted
        if not (budget_value or action_text or outcome_text or scope_text):
            continue

        records.append({
            "Convocatory": conv,
            "Budget": budget_value,
            "Type of Action": action_text,
            "Expected Outcome": outcome_text,
            "Scope": scope_text,
        })

    return pd.DataFrame(records, columns=columns)

# Obtaining the database

To obtain the database, the first step is to open the PDF document and extract the different types of text (destinations, calls and convocatories).

In [17]:
# Programs for which a "<program>.pdf" document is available
programs = ["health", "culture", "security", "digital", "climate", "food"]

# Single program of interest; valid values are the entries of `programs`
program = "culture"

# True -> process every entry of `programs`; set to False when just interested in one program
LOOP = True

# Prefix placed in front of "<program>.pdf"; empty means the current working directory
path = ""

# Hyperparameters describing each kind of text in the document:
# convocatory titles (size and font) and body text (size)
subtitle_size, subtitle_font = 12.0, 'TimesNewRomanPS-BoldMT'
body_size = 12.0

In [21]:
# Process every program when LOOP is set, otherwise only the one selected in `program`
selected_programs = programs if LOOP else [program]

for program in selected_programs:
    # Open the document; the context manager closes the file once its text has been read
    with fitz.open(path + program + ".pdf") as doc:
        # Obtain a dictionary which stores at which page each section starts
        convocatory_pages = get_convocatory_page(doc, subtitle_size, subtitle_font)

        # Obtain a dictionary which stores the text for each convocatory
        convocatory_texts = get_convocatory_text(doc, convocatory_pages, body_size)

    # Obtain the convocatory dataframe containing all information about each of the convocatories
    df_convocatory = get_df_convocatory(convocatory_texts)

    # Store the dataframe as "<program>.parquet" (the index carries no information, so it is not written)
    df_convocatory.to_parquet(program + ".parquet", index=False)
    