# Function definitions

In [1]:
import fitz
import re
import pandas as pd

In [2]:
def get_call_page(doc, letters_size, letters_font):
    """
    Function for obtaining a dictionary which stores at which page each section starts
    
    doc -> pdf document
    letter_size -> Size of the letters of the titles of the calls
    letters_font -> Font of the letters of the titles of the calls
    """
    call_pages = {}
    
    # iterate through all the lines of the text checking if the properties match with the ones of the subtitles indicating the begining of a section
    for page_num, page in enumerate(doc, start=1): 
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for fragment in line["spans"]:
                    size = fragment["size"]
                    font = fragment["font"]
                    
                    # if we find one of this titles, we store the ID of the call and store the page at which it starts
                    if size==letters_size and  font==letters_font and "HORIZON" in fragment["text"]: 
                        call_pages[fragment["text"].split(":", 1)[0].strip()] = page_num

    return call_pages

In [89]:
def get_call_text(doc, call_pages, body_size):
    """
    Function for obtaining a dictionary which stores the text for each of the calls

    doc -> pdf document
    call_pages -> dictionary whoch stores at which page each call starts
    header_size -> Size of the letters of the headers
    header_font -> Font of the letters of the headers
    foot_size -> Size of the letters of the foot
    foot_font -> Font of the letters of the foot
    super_idx_size -> Size of the letters of the super indexes
    super_idx_font -> Font of the letters of the foot
    """
    call_pairs = list(zip(call_pages.keys(), list(call_pages.keys())[1:])) + [(list(call_pages.keys())[-1], None)]
    call_texts = {}

    for init_call, final_call in call_pairs: # iterate through all the pages between to sections 
        init_page = call_pages[init_call]
        final_page = call_pages[final_call] if final_call is not None else len(doc)

        text = []
        inside_section = False # Variable for detecting if we are inside a call section 

        for page_num in range(init_page-1, final_page):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                for line in block.get("lines", []):
                    for fragment in line["spans"]:
                        size = fragment["size"]
                        font = fragment["font"]
                        text_content = fragment["text"]

                        # Check if we are inside a call section 
                        if final_call is None or init_call in text_content:
                            inside_section = True 

                        elif final_call in text_content  :
                            inside_section = False

                        # store all the text at those sections in case it is not a header of a foot page
                        if size==body_size and inside_section:  # check that the text is part of the body
                            text.append(fragment["text"])

        complete_text = "\n".join(text)
        call_texts[init_call] = complete_text # store the text at the dictionary
   
    return call_texts

In [126]:
def get_df_call(call_texts, programme):
    """
    Function for creating the dataset containing all the necessary data for each of the calls

    call_texts -> dictionary containing the text for each of the calls
    """
    df_call = pd.DataFrame(columns=["Work Programme", "Call", "Budget", "Type of Action", "Expected Outcome", "Scope"])

    # Budget section starts at The commission" and ends at "Indicative Budget"
    init_budget = "The Commission"
    final_budget = "Indicative"
    

    # Type of action section starts at "Type of Action" and ends at "Eligibility "
    init_action = "Type of Action"
    final_action_1 = "Eligibility "
    final_action_2 = "Admissibility "
    final_action_3 = "Expected Outcome:"
    final_action_4 = "Procedure"
    final_action_5 = "Technology"
    final_action_6 = "Legal"
    final_action_7 = "Award"
    final_action_8 = "Security"
    final_action_9 = "Expected Outcomes:"


    # Expected outcome section starts at "Expected Outcome" and ends at "Scope "
    init_outcome_1 = "Expected Outcome:"
    init_outcome_2 = "Expected Outcomes:"
    final_outcome = "Scope:"

    # Scope section starts at "Scope and ends at "HORIZON" or "Call"
    init_scope = "Scope:"
    final_scope_1 = "HORIZON"
    final_scope_2 = "Call"
    final_scope_3 = "Destination"
    final_scope_4 = "Other"

    for call, text in call_texts.items(): # iterate through all calls texts
        budget = []
        action = []
        outcome = []            
        scope = []

        sections = {}
        lines = text.split("\n")
        

        # get the title
        try:
            title_index = lines.index("Specific conditions")
            title_list = lines[:title_index]
            title = ''.join(title_list)
            title = title.split(': ')[1]
        except:
            title = ''
            
        for i, line in enumerate(lines): # iterate through all the lines of the texts
          # Get budget text
          if (init_budget in line):
            for j in range(i, len(lines)):
                if final_budget not in lines[j]:
                    budget.append(lines[j])

                else:
                    break


          # Get type of action text
          if init_action in line:
            for j in range(i+1, len(lines)):
                if final_action_1 not in lines[j] and final_action_2 not in lines[j] and final_action_3 not in lines[j] and final_action_4 not in lines[j] and final_action_5 not in lines[j] and final_action_6 not in lines[j]  and final_action_7 not in lines[j] and final_action_8 not in lines[j]and final_action_9 not in lines[j]:
                    line = lines[j].strip()
                    line = ' '.join(line.split())
                    action.append(line)
                else:
                    break

          # Get exepcted outcome text
          if init_outcome_1 in line or init_outcome_2 in line:
            for j in range(i, len(lines)):
                if final_outcome not in lines[j]:
                    line = lines[j].strip()
                    line = ' '.join(line.split())
                    outcome.append(line)
                else:
                    break

          # Get Scope text
          if init_scope in line:
            for j in range(i, len(lines)):
                if final_scope_1 not in lines[j] and final_scope_2 not in lines[j] and final_scope_3 not in lines[j] and final_scope_4 not in lines[j]:
                    line = lines[j].strip()
                    line = ' '.join(line.split())
                    scope.append(line)
                else:
                    break
            
            break
        
        # transform the budget text into a numerical range
        pattern =r"EUR(?: contribution per project)? (\d+\.\d{2}) (?:and (\d+\.\d{2}) )?million|\baround EUR (\d+\.\d{2}) million"
        if re.search(pattern, "".join(budget)):
            budget = re.search(pattern, "".join(budget)).group()
        else:
            budget = "".join(budget)

        # store all the information in a  dictionary and create a dataset
        sections["Work Programme"] = programme
        sections["Call"] = call
        sections["Budget"] =  budget
        sections["Type of Action"] = "".join(action)
        sections["Expected Outcome"] = "".join(outcome).replace("Expected Outcome: ", "")
        sections["Scope"] = "".join(scope).replace("Scope: ", "")
        sections['Title'] = title
        sections = pd.DataFrame([sections])

        df_call = pd.concat([df_call, sections], ignore_index=True)
    
    # remove the empty lines 
    index_to_delete = []
    for i in range(df_call.shape[0]):
        if df_call["Budget"][i] == "" and df_call["Type of Action"][i] == "" and df_call["Expected Outcome"][i] == ""  and df_call["Scope"][i] == "":
            index_to_delete.append(i)

    df_call = df_call.drop(index=index_to_delete, axis=0).reset_index(drop=True)

    return df_call

# Obtaining the database

For obtainig the database, the first step is to obtain the pdf document and extract the different types of text (Destinations, calls and projects).

In [124]:
LOOP = True # Set to false in case just interested in one program

# path = "/Users/mbalairon/Desktop/Horizon/"
path = "pdf documents/"

program = "health"
# program = "culture"
# program = "security"
# program = "digital"
# program = "climate"
# program = "food"

programs = ["health", "culture", "security", "digital", "climate", "food"]

# Get hyperparameters related with each of the kinds of texts at the doc
subtitle_size = 12.0
subtitle_font = 'TimesNewRomanPS-BoldMT'

body_size = 12.0

In [127]:
if LOOP:
    df = pd.DataFrame()
    prgrams = ["health"]
    for programme in programs:
        # Open the document
        doc = fitz.open(path+programme+".pdf")

        # Obtain a dictionary which stores at which page each section starts
        call_pages = get_call_page(doc, subtitle_size, subtitle_font)

        # Obtain a dictionary which stores the text for each call
        call_texts = get_call_text(doc, call_pages, body_size)

        # Obtain the call dataframe containing all information about each of the calls
        df_call = get_df_call(call_texts, programme)
        
        df = pd.concat([df, df_call], ignore_index=True)
        

        # Store the df as a csv document
        df.to_parquet("horizon_work_programmes.parquet", index=False)
        
else:
    programme = programs[1]
    # Open the document
    doc = fitz.open(path+programme+".pdf")

    # Obtain a dictionary which stores at which page each section starts
    call_pages = get_call_page(doc, subtitle_size, subtitle_font)

    # Obtain a dictionary which stores the text for each call
    call_texts = get_call_text(doc, call_pages, body_size)

    # Obtain the call dataframe containing all information about each of the calls
    df_call = get_df_call(call_texts, programme)
    
    # Store the df as a csv document
    # df_call.to_parquet(programme+".parquet", index=False)

df_call  

Unnamed: 0,Work Programme,Call,Budget,Type of Action,Expected Outcome,Scope,Title
0,food,HORIZON-CL6-2023-BIODIV-01-1,around EUR 5.50 million,Research and Innovation Actions,In line with the European Green Deal and in pa...,"According to IPBES global assessment report, p...",Better understanding of routes of exposure and...
1,food,HORIZON-CL6-2023-BIODIV-01-2,around EUR 3.50 million,Research and Innovation Actions,In line with the European Green Deal and in pa...,Scope:Light pollutionis the alteration of natu...,Impact of light and noise pollution on biodive...
2,food,HORIZON-CL6-2023-BIODIV-01-3,around EUR 6.00 million,Research and Innovation Actions,The expected outcomes should feed in the imple...,Scope:These activities will foster a collecti...,Interdisciplinary assessment of changes affect...
3,food,HORIZON-CL6-2023-BIODIV-01-4,around EUR 4.00 million,Research and Innovation Actions,In line with the objectives of the European Gr...,Proposals should address Area A or Area B as f...,Nature protection
4,food,HORIZON-CL6-2023-BIODIV-01-5,around EUR 9.00 million,Research and Innovation Actions,In line with the European Green Deal and in pa...,Proposals should work in one or more European ...,Understanding and reducing bycatch of protecte...
...,...,...,...,...,...,...,...
171,food,HORIZON-CL6-2024-GOVERNANCE-01-9,around EUR 3.00 million,Coordination and Support Actions,"In support of the European Green Deal, the EU ...",Proposals should address the following activit...,Thematic networks to compile and share knowled...
172,food,HORIZON-CL6-2024-GOVERNANCE-01-10,around EUR 4.00 million,Coordination and Support Actions,Successful proposals will support the objectiv...,"Transformative changes, such as the ones calle...",Organic farming thematic networks to compile a...
173,food,HORIZON-CL6-2024-GOVERNANCE-01-11,around EUR 3.00 million,Coordination and Support Actions,"In support of the European Green Deal, the EU ...",Proposals should address the following activit...,Biodiversity thematic networks to compile and ...
174,food,HORIZON-CL6-2024-GOVERNANCE-01-12,around EUR 4.00 million,Coordination and Support Actions,"In support of the European Green Deal, the EU ...",Proposals should address the following activit...,Developing EU advisory networks on forestry
