In [138]:
import pdfplumber
import re
import db_engine

In [134]:
pdf_path = "data/pdf/2024/20024580.pdf"
# Define the regex patterns
column_names = re.compile(r'ID Owner Asset Transaction Date Notification Amount Cap.\nType Date Gains >\n\$200\?')
separator = re.compile(r'\nF S: New\n(?:D: .*\n)?')    
stock_pattern = r'^[SP]+\s+(.+?)\s+(S \(partial\)|S|P)\s+'
date_pattern = r'(\d{2}/\d{2}/\d{4})'
amount_pattern = r'\$\d{1,3}(?:,\d{3})*(?:\s*-\s*\$\d{1,3}(?:,\d{3})*|(?: -)?(?:[\d,]+)?)'

def printls(array):
    print('[')
    for item in array:
        print(item)
    print(']')
    print()

def has_table(page):
    text = page.extract_text()
    return column_names.search(text)
    
def get_extract(text):
    text = page.extract_text()
    result = column_names.search(text)
    if result: 
        # Get the end position of the matched text
        start_position = result.end()
        # Slice the text from the end position of the matched text to the end of the string
        remaining_text = text[start_position:].strip()    
        return remaining_text

def pages_with_tables(pdf):
    return [index for index, page in enumerate(pdf.pages) if has_table(page)]

def get_filings_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page for page in pdf.pages if has_table(page)]
        all_text = ""
        for page in pages:
            important_text = get_extract(page)
            all_text += important_text
        cleaned_text = all_text.replace('\x00', '')
        return cleaned_text

In [139]:
def parse_string(s):
    # Extract stock and type
    stock_type_match = re.search(stock_pattern, s)
    if not stock_type_match:
        return None
    
    stock = stock_type_match.group(1).strip()
    type_ = stock_type_match.group(2).strip()
    
    # Extract dates
    date_matches = re.findall(date_pattern, s)
    if len(date_matches) < 2:
        return None
    
    date = date_matches[0]
    notification_date = date_matches[1]
    
    # Extract amount
    amount_match = re.search(amount_pattern, s)
    if not amount_match:
        return None
    
    amount = amount_match.group(0).strip()
    
    return (stock, type_, date, notification_date, amount)

def parse_list1(array):
    array = array[:-1]
    processed_list = []
    for item in array:
        lines = item.split('\n')
        if '[ST]' in lines[-1] and len(lines) > 1:
            last_line = lines.pop()
            # Find the position of the first date (assumed to be in the format MM/DD/YYYY)
            match = re.search(r' S \(partial\) | P | S ', lines[0])
            if match:
                first_date_pos = match.start()
                lines[0] = lines[0][:first_date_pos] + last_line + ' ' + lines[0][first_date_pos:]
        processed_list.append('\n'.join(lines))
    return processed_list

def parse_list2(array):
    for item in array:
        print(parse_string(item))
    

def fillings_text_to_df(text):
    result = re.split(separator, text)
    parsed1 = parse_list1(result)
    parsed =parse_list2(parsed1)
    return parsed1            

In [136]:
text = get_filings_from_pdf(pdf_path)
lines = fillings_text_to_df(text)

('Alphabet Inc. - Class A CommonStock (GOOGL) [ST]', 'S (partial)', '02/01/2024', '03/01/2024', '$1,001 - $15,000')
('Amazon.com, Inc. - Common Stock(AMZN) [ST] $50,000', 'P', '02/12/2024', '03/01/2024', '$50,000')
('Koninklijke Philips N.V. NY RegistryShares (PHG) [ST]', 'P', '02/12/2024', '03/01/2024', '$1,001 - $15,000')
('Pfizer, Inc. Common Stock (PFE) [ST]', 'P', '02/12/2024', '03/01/2024', '$1,001 - $15,000')
('QUALCOMM Incorporated -Common Stock (QCOM) [ST]', 'P', '02/12/2024', '03/01/2024', '$1,001 - $15,000')
('Unilever PLC Common Stock (UL)[ST]', 'S', '02/12/2024', '03/01/2024', '$1,001 - $15,000')
[
SP Alphabet Inc. - Class A CommonStock (GOOGL) [ST]  S (partial) 02/01/2024 03/01/2024 $1,001 - $15,000
SP Amazon.com, Inc. - Common Stock(AMZN) [ST] $50,000  P 02/12/2024 03/01/2024 $15,001 -
SP Koninklijke Philips N.V. NY RegistryShares (PHG) [ST]  P 02/12/2024 03/01/2024 $1,001 - $15,000
SP Pfizer, Inc. Common Stock (PFE) [ST] P 02/12/2024 03/01/2024 $1,001 - $15,000
SP QUALC