In [None]:
import pandas as pd
import pdfplumber

import os

# Step from the notebook's folder up to its parent (presumably the project
# root -- confirm), but only the first time this cell runs in a kernel session.
# "../" is relative, so every additional execution of a bare os.chdir("../")
# would climb one more level and silently break the relative PDF_DIR below.
if "PROJECT_ROOT" not in globals():
    os.chdir("../")
    PROJECT_ROOT = os.getcwd()

# Left-edge ("x0") positions that the helper functions compare against the
# word["x0"] values of the words returned by pdfplumber's extract_words().
# The comparisons are exact float equality, so these literals have to match the
# extracted coordinates bit for bit (they look copied from extraction output).
X_VALUE_CO_FOUNDERS = 257.44449036  # x0 of the "CO-FOUNDERS" heading word
X_VALUE_WHATS_DIFFERENT = 245.53287961199996  # x0 of the "WHAT'S" heading word
# NOTE(review): 0 looks like a placeholder rather than a measured position --
# confirm that an "EMPLOYEES" word can really be reported at x0 == 0.
X_VALUE_EMPLOYEES = 0
X_VALUE_COVER = 29.971149623999995  # x0 kept by filter(..., where="COVER")

# x0 of the "RESPONSES" heading word; also kept by filter(..., where="RESPONSES").
X_VALUE_RESPONSES = 119.88459849599998
# Folder holding the factsheet PDFs, relative to the working directory set above.
PDF_DIR = "./data/Startups/Factsheets"
       

In [None]:
def get_texts(pdf):
    """Extract the description and the "what's different" text of one PDF.

    Args:
        pdf: File name of the PDF, resolved relative to ``PDF_DIR``.

    Returns:
        A ``(description, difference)`` tuple of strings as produced by
        ``get_description`` and ``get_difference``.
    """
    pdf_path = os.path.join(PDF_DIR, pdf)

    lines = []
    words = []
    # The open document gets its own name so the ``pdf`` file-name argument is
    # not shadowed; the ``with`` block closes the document on exit, so no
    # explicit close() call is needed.
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            # Some pdfplumber releases return None instead of "" for a page
            # without extractable text; normalise so .split() cannot fail.
            page_text = page.extract_text() or ""
            lines += page_text.split("\n")
            words += page.extract_words()

    # The same section split is applied to the text lines and to the
    # positioned words, so both views cover the same parts of the document.
    description_lines, difference_lines = get_lines(lines)
    description_words, difference_words = get_words(words)

    description = get_description(description_lines, description_words)
    difference = get_difference(difference_lines, difference_words)

    return description, difference

def get_lines(lines):
    """Split plain text lines into description and difference sections.

    Delegates to ``filter_word_or_lines`` and recognises section headings by
    exact string equality (``get_lines_equal``).

    Returns:
        The ``(description, difference)`` pair from ``filter_word_or_lines``.
    """
    sections = filter_word_or_lines(lines, get_lines_equal)
    return sections

def get_lines_equal(line, text):
    """Tell whether a text line is exactly the heading ``text``.

    Args:
        line: One line of extracted text.
        text: Heading to compare against.

    Returns:
        The result of ``line == text``.
    """
    is_heading = line == text
    return is_heading

def get_words(words):
    """Split positioned words into sections and keep the cover-column words.

    The words are first divided into description and difference sections with
    ``filter_word_or_lines`` (headings recognised by ``get_words_equal``); each
    section is then reduced with ``filter`` using its default ``where``.

    Returns:
        A ``(description_words, difference_words)`` tuple of word lists.
    """
    description_section, difference_section = filter_word_or_lines(words, get_words_equal)
    description_kept = filter(description_section)
    difference_kept = filter(difference_section)
    return description_kept, difference_kept

def get_words_equal(word, text):
    """Tell whether a positioned word is the section heading ``text``.

    A word counts as a heading only if both its text and its left edge
    (``word["x0"]``) match the values expected for that heading. The position
    check is exact float equality against the ``X_VALUE_*`` constants.

    Args:
        word: Mapping with at least the keys ``"x0"`` and ``"text"``.
        text: One of ``"CO-FOUNDERS"``, ``"WHAT'S DIFFERENT"``, ``"EMPLOYEES"``
            or ``"RESPONSES"``.

    Returns:
        ``True`` if the word is the requested heading, otherwise ``False``.
        Headings this function does not know always yield ``False`` (the
        previous implementation fell off the end and returned ``None``).
    """
    if text == "CO-FOUNDERS":
        expected_x, expected_text = X_VALUE_CO_FOUNDERS, text
    elif text == "WHAT'S DIFFERENT":
        # The heading is matched on the single word "WHAT'S", not on the
        # full two-word heading text.
        expected_x, expected_text = X_VALUE_WHATS_DIFFERENT, "WHAT'S"
    elif text == "EMPLOYEES":
        expected_x, expected_text = X_VALUE_EMPLOYEES, text
    elif text == "RESPONSES":
        expected_x, expected_text = X_VALUE_RESPONSES, text
    else:
        return False

    return word["x0"] == expected_x and word["text"] == expected_text
    
def filter(words, where="COVER"):
    """Keep only the words whose left edge sits at the column for ``where``.

    Note: this function shadows Python's built-in ``filter`` for the rest of
    the notebook; the name is kept because other cells call it.

    Args:
        words: Iterable of mappings that each have an ``"x0"`` key.
        where: ``"COVER"`` (default) selects ``X_VALUE_COVER``;
            ``"RESPONSES"`` selects ``X_VALUE_RESPONSES``.

    Returns:
        List of the words whose ``"x0"`` equals the selected value exactly.

    Raises:
        ValueError: If ``where`` is neither ``"COVER"`` nor ``"RESPONSES"``.
            Previously this left the target position unbound, which surfaced
            as an obscure name error (or went unnoticed for empty input).
    """
    if where == "COVER":
        target_x = X_VALUE_COVER
    elif where == "RESPONSES":
        target_x = X_VALUE_RESPONSES
    else:
        raise ValueError(
            f"Unknown section {where!r}; expected 'COVER' or 'RESPONSES'"
        )
    return [word for word in words if word["x0"] == target_x]

def filter_word_or_lines(lines_words, equal_function):
    """Split a sequence of lines or words into two sections by heading markers.

    Items are collected for the description from the start until the
    "CO-FOUNDERS" marker, and for the difference from a "WHAT'S DIFFERENT"
    marker until an "EMPLOYEES" or "RESPONSES" marker. Marker items themselves
    are never collected.

    Args:
        lines_words: Items to split (text lines or word mappings).
        equal_function: Callable ``(item, heading) -> truthy`` that decides
            whether ``item`` is the given heading.

    Returns:
        ``(description, difference)``: the description items in REVERSE input
        order (last item first) and the difference items in input order.
    """
    description_forward = []
    difference_items = []

    collecting_description, collecting_difference = True, False
    for item in lines_words:
        if equal_function(item, "CO-FOUNDERS"):
            collecting_description = False
        elif equal_function(item, "WHAT'S DIFFERENT"):
            collecting_difference = True
        elif equal_function(item, "EMPLOYEES") or equal_function(item, "RESPONSES"):
            collecting_difference = False
        else:
            # Both flags can be active at once, so an item may land in both lists.
            if collecting_description:
                description_forward.append(item)
            if collecting_difference:
                difference_items.append(item)

    # Callers expect the description with the most recently seen item first.
    description_forward.reverse()
    return description_forward, difference_items

def get_description(lines, words):
    """Join the lines that start with the given words into one description.

    ``lines`` and ``words`` are expected in reverse document order (last line
    first), as returned by ``filter_word_or_lines``; the matched lines are put
    back into forward order in the result.

    For each word, the next not-yet-used line starting with ``word["text"]`` is
    taken. A taken line is consumed, so two consecutive words with the same text
    pick two different lines instead of repeating the first one. A word for
    which no remaining line matches is skipped instead of raising IndexError.

    Args:
        lines: Text lines to search.
        words: Mappings with a ``"text"`` key, one per wanted line.

    Returns:
        The matched lines, each followed by a single space (so a non-empty
        result ends with a trailing space), or ``""`` if nothing matched.
    """
    picked = []
    cursor = 0  # index of the first line that has not been consumed yet
    for word in words:
        prefix = word["text"]
        match_index = next(
            (i for i in range(cursor, len(lines)) if lines[i].startswith(prefix)),
            None,
        )
        if match_index is None:
            # No remaining line starts with this word: leave the cursor alone
            # so later words can still be matched.
            continue
        picked.append(lines[match_index])
        cursor = match_index + 1

    # Lines were picked last-to-first; emit them first-to-last.
    return "".join(line + " " for line in reversed(picked))

def get_difference(lines, words):
    """Join the lines that start with the given words into one text.

    For each word, the next not-yet-used line starting with ``word["text"]`` is
    taken, in the order given. A taken line is consumed, so two consecutive
    words with the same text pick two different lines instead of repeating the
    first one. A word for which no remaining line matches is skipped instead of
    raising IndexError.

    Args:
        lines: Text lines to search, in document order.
        words: Mappings with a ``"text"`` key, one per wanted line.

    Returns:
        The matched lines, each preceded by a single space (so a non-empty
        result starts with a leading space), or ``""`` if nothing matched.
    """
    picked = []
    cursor = 0  # index of the first line that has not been consumed yet
    for word in words:
        prefix = word["text"]
        match_index = next(
            (i for i in range(cursor, len(lines)) if lines[i].startswith(prefix)),
            None,
        )
        if match_index is None:
            # No remaining line starts with this word: leave the cursor alone
            # so later words can still be matched.
            continue
        picked.append(lines[match_index])
        cursor = match_index + 1

    return "".join(" " + line for line in picked)

In [None]:
# os.listdir() returns names in arbitrary order; sort them so the table (and
# its index) is the same on every run.
pdf_filenames = sorted(
    name for name in os.listdir(PDF_DIR) if name.lower().endswith(".pdf")
)
pdf_files = pd.DataFrame(pdf_filenames, columns=["Filename"])
# Restrict the run to a single factsheet. The explicit .copy() makes the result
# an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
pdf_files = pdf_files[pdf_files["Filename"] == "bitsCrunch.pdf"].copy()
pdf_files.head()

In [None]:
# Run the extraction once per file and spread the returned
# (description, difference) pairs over two named columns. Building the frame
# explicitly (instead of .apply(pd.Series)) keeps both columns even when no
# file is selected, and avoids creating one Series object per row.
extracted_texts = pd.DataFrame(
    pdf_files["Filename"].map(get_texts).tolist(),
    columns=["Description", "Difference"],
    index=pdf_files.index,
)
pdf_files[["Description", "Difference"]] = extracted_texts
pdf_files
