# Function definitions

In [1]:
import fitz
import re
import pandas as pd

In [2]:
def get_challenge_page(doc, letters_size, letters_font, letters_color, section):
    """
    Build a dictionary mapping each challenge title to the page where it is found

    doc -> pdf document (iterated page by page)
    letters_size -> Size of the letters of the titles of the challenges
    letters_font -> Font of the letters of the titles of the challenges
    letters_color -> Color of the letters of the titles of the challenges
    section -> Name of the section we are interested in

    returns -> dictionary {challenge title: 1-based page number}. If the same
               title is matched on several pages, the last page wins.
    """
    wanted_style = (letters_size, letters_font, letters_color)
    title_pages = {}
    page_number = 0

    for page in doc:
        page_number += 1  # pages are reported 1-based

        for block in page.get_text("dict")["blocks"]:
            # blocks without a "lines" entry contribute nothing
            for text_line in block.get("lines", []):
                for span in text_line["spans"]:
                    span_style = (span["size"], span["font"], span["color"])

                    # only spans formatted like a challenge title are candidates
                    if span_style != wanted_style:
                        continue

                    title = span["text"]
                    if section not in title:
                        continue

                    # A clean title starts with "EIC"; otherwise the first token
                    # (the indicative number before the challenge) is dropped.
                    words = title.split()
                    if words[0] == "EIC":
                        key = title
                    else:
                        key = ' '.join(words[1:])

                    title_pages[key] = page_number

    return title_pages

In [3]:
def get_challenge_text(doc, challenge_pages, body_size, body_font, subtitle_size, subtitle_font):
    """
    Function for obtaining a dictionary which stores the text for each of the challenges

    doc -> pdf document (must support len(doc) and doc.load_page(index))
    challenge_pages -> dictionary which stores at which page (1-based) each challenge starts,
                       in document order
    body_size -> Size of the letters of the body text
    body_font -> Font of the letters of the body text
    subtitle_size -> Size of the letters of the subtitles
    subtitle_font -> Font of the letters of the subtitles

    returns -> dictionary {challenge title: text}, where the text is every body or
               subtitle fragment of the challenge joined with newlines. An empty
               challenge_pages gives an empty dictionary.
    """
    titles = list(challenge_pages)
    challenge_texts = {}

    # Pair every challenge with the one that follows it; the last challenge has no
    # successor (None) and runs until the end of the document.
    for init_challenge, final_challenge in zip(titles, titles[1:] + [None]):
        init_page = challenge_pages[init_challenge]
        final_page = challenge_pages[final_challenge] if final_challenge is not None else len(doc)

        text = []
        inside_section = False  # becomes True once the title of this challenge has been seen

        # challenge_pages is 1-based while load_page is 0-based, so this range covers
        # the start page up to and including the page where the next challenge starts
        for page_num in range(init_page - 1, final_page):
            blocks = doc.load_page(page_num).get_text("dict")["blocks"]

            for block in blocks:
                for line in block.get("lines", []):
                    for fragment in line["spans"]:
                        text_content = fragment["text"]

                        # The section opens at its own title and closes at the title of
                        # the next challenge. The last challenge also waits for its own
                        # title, so text that precedes it on the start page is not taken.
                        if init_challenge in text_content:
                            inside_section = True
                        elif final_challenge is not None and final_challenge in text_content:
                            inside_section = False

                        if not inside_section:
                            continue

                        # keep only body and subtitle fragments; anything with another
                        # size/font (e.g. headers, footers, page numbers) is dropped
                        is_body = fragment["size"] == body_size and fragment["font"] == body_font
                        is_subtitle = fragment["size"] == subtitle_size and fragment["font"] == subtitle_font
                        if is_body or is_subtitle:
                            text.append(text_content)

        challenge_texts[init_challenge] = "\n".join(text)  # store the text at the dictionary

    return challenge_texts

In [50]:
def _collect_section(lines, start_marker, stop_markers):
    """
    Collect the text of one sub-section of a challenge

    lines -> list with the lines of the text of a challenge
    start_marker -> heading at which the sub-section starts
    stop_markers -> headings/keywords at which the sub-section ends

    returns -> text of the sub-section as a single string, without the heading and
               with every run of whitespace collapsed to one space
    """
    collected = []

    for start, raw_line in enumerate(lines):
        if start_marker not in raw_line:
            continue

        # take every line from the heading up to (not including) the first stop marker
        for candidate in lines[start:]:
            if any(marker in candidate for marker in stop_markers):
                break
            collected.append(candidate)

    # Join with a space so that words of consecutive fragments are not glued together,
    # then drop the heading and normalise the whitespace.
    joined = " ".join(collected).replace(start_marker, "")
    return " ".join(joined.split())


def get_df_challenge(challenge_texts, work_programme):
    """
    Function for creating the dataset containing all the necessary data for each of the challenges

    challenge_texts -> dictionary containing the text for each of the challenges
    work_programme -> value stored in the "Work Programme" column of every row

    returns -> DataFrame with one row per challenge and the columns "Work Programme",
               "Challenge", "Background and Scope", "Overall Goal & Specific Objectives"
               and "Expected Outcomes & Impacts"
    """
    columns = ["Work Programme", "Challenge", "Background and Scope", "Overall Goal & Specific Objectives", "Expected Outcomes & Impacts"]

    # Scope section starts at "Background and scope" and ends at "Overall goal and specific objectives"
    init_scope = "Background and scope"
    final_scope = "Overall goal and specific objectives"

    # Goal section starts at "Overall goal and specific objectives" and ends at "Expected outcomes and impacts"
    init_goal = "Overall goal and specific objectives"
    final_goal = "Expected outcomes and impacts"

    # Outcome section starts at "Expected outcomes and impacts" and ends at "EIC" or "Specific Conditions"
    init_outcome = "Expected outcomes and impacts"
    final_outcome_1 = "EIC"
    final_outcome_2 = "Specific Conditions"

    rows = []
    for challenge, text in challenge_texts.items():  # iterate through all challenges texts
        lines = text.split("\n")

        # Each sub-section is searched on the untouched lines, so collecting one
        # sub-section can never influence where another one is detected.
        rows.append({
            "Work Programme": work_programme,
            "Challenge": challenge,
            "Background and Scope": _collect_section(lines, init_scope, (final_scope,)),
            "Overall Goal & Specific Objectives": _collect_section(lines, init_goal, (final_goal,)),
            "Expected Outcomes & Impacts": _collect_section(lines, init_outcome, (final_outcome_1, final_outcome_2)),
        })

    # Build the frame once instead of concatenating row by row; passing the columns
    # keeps the expected layout even when there are no challenges.
    return pd.DataFrame(rows, columns=columns)

# Obtaining the database

To obtain the database, the first step is to open the PDF document and extract the different types of text (destinations, calls and challenges).

In [51]:
# Run mode: True processes every entry of `sections`;
# set it to False in case just interested in the single `section` below.
LOOP = True

# Input document: folder and file name (without the ".pdf" extension)
path = "pdf documents/"
doc_name = "eic"

# Section used when LOOP is False (swap the comment to select the other one)
section = "EIC Pathfinder Challenge"
# section = "EIC Transition Challenge"

# Sections used when LOOP is True
sections = [
    "EIC Pathfinder Challenge",
    "EIC Transition Challenge",
]

# Hyperparameters describing how each kind of text is formatted in the document
# challenge titles
title_size, title_font, title_color = 12.0, 'SegoeUI-Bold', 5454240

# body text
body_size, body_font = 12.0, "SegoeUI"

# subtitles inside a challenge
subtitle_size, subtitle_font = 12.0, "SegoeUI-Bold"


In [52]:
pdf_path = path + doc_name + ".pdf"

# Open the document a single time and let the context manager close it when the
# extraction is done (before, it was reopened for every section and never closed).
with fitz.open(pdf_path) as doc:
    if LOOP:
        section_frames = []  # one data frame per processed section

        for section in sections:
            # Obtain a dictionary which stores at which page each section starts
            challenge_pages = get_challenge_page(doc, title_size, title_font, title_color, section)

            # Obtain a dictionary which stores the text for each challenge
            challenge_texts = get_challenge_text(doc, challenge_pages, body_size, body_font, subtitle_size, subtitle_font)

            # Obtain the challenge dataframe containing all information about each of the challenges
            df_challenge = get_df_challenge(challenge_texts, section)

            section_frames.append(df_challenge)

        # data frame storing the information of all the sections
        df = pd.concat(section_frames, ignore_index=True) if section_frames else pd.DataFrame()

        # save as parquet once every section has been processed, instead of
        # rewriting the same file after each section
        if section_frames:
            df.to_parquet("EIC_work_programmes.parquet")
    else:
        # Obtain a dictionary which stores at which page each section starts
        challenge_pages = get_challenge_page(doc, title_size, title_font, title_color, section)

        # Obtain a dictionary which stores the text for each challenge
        challenge_texts = get_challenge_text(doc, challenge_pages, body_size, body_font, subtitle_size, subtitle_font)

        # Obtain the challenge dataframe containing all information about each of the challenges
        df_challenge = get_df_challenge(challenge_texts, section)

        # save as parquet (disabled)
        # df_challenge.to_parquet(section+".parquet")