# Function definitions

In [33]:
import fitz
import re
import pandas as pd

In [34]:
def get_convocatory_page(doc, letters_size, letters_font, letters_color, section):
    """
    Function for obtaining a dictionary which stores at which page each section starts
    
    doc -> pdf document
    letter_size -> Size of the letters of the titles of the convocatories
    letters_font -> Font of the letters of the titles of the convocatories
    letters_color -> Color of the letters of the titles of the convocatories
    section -> Name of the section we are interested in
    """
    # iterate through all the lines of the text checking if the properties match with the ones of the subtitles indicating the begining of a section
    
    convocatory_pages = {}
    for page_num, page in enumerate(doc, start=1): 
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            for line in block.get("lines", []):
                for fragment in line["spans"]:
                    size = fragment["size"]
                    font = fragment["font"]
                    color = fragment["color"]

                    # if we find one of this titles, we store the ID of the convocatory and store the page at which it starts
                    if size==letters_size and  font==letters_font and color==letters_color and section in fragment["text"]: 

                        # Check that the string is clean. If not remove the indicative number before the convocatory
                        if fragment["text"].split()[0] == "EIC":   
                            convocatory_pages[fragment["text"]] = page_num
                        else:
                            splited_text = fragment["text"].split()  
                            convocatory_pages[' '.join(splited_text[1:])] = page_num                            
            
    return convocatory_pages

In [35]:
def get_convocatory_text(doc, convocatory_pages, body_size, body_font, subtitle_size, subtitle_font):
    """
    Function for obtaining a dictionary which stores the text for each of the convocatories

    convocatory_pages -> dictionary whoch stores at which page each convocatory starts
    numpage_size -> Size of the letters of the page numbers
    numpage_font -> Font of the letters of the page numbers
    foot_size -> Size of the letters of the foot
    foot_font -> Font of the letters of the foot
    """
    convocatory_pairs = list(zip(convocatory_pages.keys(), list(convocatory_pages.keys())[1:])) + [(list(convocatory_pages.keys())[-1], None)]
    convocatory_texts = {}

    for init_conv, final_conv in convocatory_pairs: # iterate through all the pages between to sections 
        init_page = convocatory_pages[init_conv]
        final_page = convocatory_pages[final_conv] if final_conv is not None else len(doc)

        text = []
        inside_section = False # Variable for detecting if we are inside a convocatory section 

        for page_num in range(init_page-1, final_page):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                for line in block.get("lines", []):
                    for fragment in line["spans"]:
                        size = fragment["size"]
                        font = fragment["font"]
                        text_content = fragment["text"]

                        # Check if we are inside a convocatory section 
                        if final_conv is None or init_conv in text_content:
                            inside_section = True 

                        elif final_conv in text_content:
                            inside_section = False

                        # store all the text at thos sections in case it is not a header of a foot page
                        if ((size==body_size and font==body_font) or (size==subtitle_size and font==subtitle_font)) and inside_section:  # check that the text is part of the body
                            text.append(text_content)

        complete_text = "\n".join(text)
        convocatory_texts[init_conv] = complete_text # store the text at the dictionary


   
    return convocatory_texts

In [36]:
def get_df_convocatory(convocatory_texts):
    """
    Function for creating the dataset containing all the necessary data for each of the convocatories

    convocatory_texts -> dictionary containing the text for each of the convocatories
    """
    df_conv = pd.DataFrame(columns=["Challenge", "Background and Scope", "Overall Goal & Specific Objectives", "Expected Outcomes & Impacts"])

    # Scope section starts at "Background and scope" and ends at "Overall goal and specific objectives"
    init_scope = "Background and scope"
    final_scope = "Overall goal and specific objectives"
    

    # Goal section starts at "Overall goal and specific objectives" and ends at "Expected outcomes and impacts"
    init_goal = "Overall goal and specific objectives"
    final_goal = "Expected outcomes and impacts"

    # Outcome section starts at "Expected outcomes and impacts" and ends at "EIC"
    init_outcome = "Expected outcomes and impacts"
    final_outcome_1 = "EIC"
    final_outcome_2 = "Specific Conditions"
    
    for conv, text in convocatory_texts.items(): # iterate through all convocatories texts
        scope = []
        goal = []
        outcome = []            

        sections = {}
        lines = text.split("\n")

        for i, line in enumerate(lines): # iterate through all the lines of the texts
          
            # Get scope text
            if (init_scope in line):
                for j in range(i, len(lines)): # skip two lines
                    if final_scope not in lines[j]:
                        scope.append(lines[j])
                    else:
                        break


            # Get goal text
            if init_goal in line:
                for j in range(i, len(lines)):
                    if final_goal not in lines[j]:
                        goal.append(lines[j])
                    else:
                        break

            # Get outcome text
            if init_outcome in line :
                for j in range(i, len(lines)):
                    if final_outcome_1 not in lines[j] and final_outcome_2 not in lines[j]:
                        outcome.append(lines[j])
                    else:
                        break
            
    
        # store all the information in a dictionary and create a dataset
        sections["Challenge"] = conv
        sections["Background and Scope"] = "".join(scope).replace("Background and scope", "")
        sections["Overall Goal & Specific Objectives"] = "".join(goal).replace("Overall goal and specific objectives ", "")
        sections["Expected Outcomes & Impacts"] = "".join(outcome).replace("Expected outcomes and impacts ", "")

        sections = pd.DataFrame([sections])

        df_conv = pd.concat([df_conv, sections], ignore_index=True)
    
    return df_conv

# Obtaining the database

For obtainig the database, the first step is to obtain the pdf document and extract the different types of text (Destinations, calls and convocatories).

In [47]:
LOOP = False # Set to false in case just interested in one program

# path = "/Users/mbalairon/Desktop/Horizon/"
path = ""
doc_name = "eic"

section = "EIC Pathfinder Challenge"
# section = "EIC Transition Challenge"

sections = ["EIC Pathfinder Challenge", "EIC Transition Challenge"]

# Get hyperparameters related with each of the kinds of texts at the doc
title_size =  12.0
title_font = 'SegoeUI-Bold'
title_color = 5454240

body_size = 12.0
body_font = "SegoeUI"

subtitle_size = 12.0
subtitle_font = "SegoeUI-Bold"


In [48]:
if LOOP:
    for section in sections:
        # Open the document
        doc = fitz.open(path+doc_name+".pdf")

        # Obtain a dictionary which stores at which page each section starts
        convocatory_pages = get_convocatory_page(doc, title_size, title_font, title_color, section)

        # Obtain a dictionary which stores the text for each convocatory
        convocatory_texts = get_convocatory_text(doc, convocatory_pages, body_size, body_font, subtitle_size, subtitle_font)

        # Obtain the convocatory dataframe containing all information about each of the convocatories
        df_convocatory = get_df_convocatory(convocatory_texts)

        # save as csv
        df_convocatory.to_parquet(section+".parquet")
else:
    # Open the document
    doc = fitz.open(path+doc_name+".pdf")

    # Obtain a dictionary which stores at which page each section starts
    convocatory_pages = get_convocatory_page(doc, title_size, title_font, title_color, section)

    # Obtain a dictionary which stores the text for each convocatory
    convocatory_texts = get_convocatory_text(doc, convocatory_pages, body_size, body_font, subtitle_size, subtitle_font)

    # Obtain the convocatory dataframe containing all information about each of the convocatories
    df_convocatory = get_df_convocatory(convocatory_texts)

    # save as csv
    df_convocatory.to_parquet(section+".parquet")

In [49]:
df_convocatory

Unnamed: 0,Challenge,Background and Scope,Overall Goal & Specific Objectives,Expected Outcomes & Impacts
0,EIC Pathfinder Challenge: Clean and efficient ...,Cooling is an essential process across many a...,This EIC Pathfinder Challenge aims at advancin...,Expected outcomes and impactsThe supported pro...
1,"EIC Pathfinder Challenge: Architecture, Engine...",Life cycle greenhouse gas (GHG) emissions of...,"The potential of the digitalised, mutually int...",Expected outcomes and impacts Projects must c...
2,EIC Pathfinder Challenge: Precision nutrition,Dietary guidelines provide recommendations on...,Overall goal and specific objectivesThe goal o...,Expected outcomes and impactsUnderstanding and...
3,EIC Pathfinder Challenge: Responsible electron...,Responsible electronics represents a unique o...,The overall goal of this Challenge is to creat...,This Challenge is expected to contribute to th...
4,EIC Pathfinder Challenge: In-space solar energ...,Thermonuclear reactions in the Sun are practi...,Overall goal and specific objectivesThe overal...,Expected outcomes and impactsThis Challenge ai...
