In [435]:
import json
import re
from bs4 import BeautifulSoup
import requests
from io import BytesIO
from zipfile import ZipFile
import xml.etree.ElementTree as ET
import pandas as pd

In [436]:
#460
#id='208364016'
#700
id='208573987'
get_filings='https://netfile.com/Connect2/api/public/list/filing?AID=coak'
base_url='https://netfile.com/Connect2/api'
lobbyist_list='https://netfile.com/Connect2/api/public/lobbyist/list/filer?AID=coak'


In [437]:

BASE_URL = 'https://netfile.com/Connect2/api/'

def fetch_calfile(filing_id):
    url = BASE_URL + "public/efile/" + str(filing_id)
    headers = {'Accept': 'application/zip'}
    print(url)
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        raise Exception("Request Failed: " + str(response.status_code))
    
    with ZipFile(BytesIO(response.content)) as zip_file:
        for file in zip_file.namelist():
            with zip_file.open(file) as f:
                return f.read()


In [438]:
calfile=fetch_calfile(id)
calfile

https://netfile.com/Connect2/api/public/efile/208573987


b'<disclosure agency="NetFile" type="netfile.render.fppc._2023.RenderableFppc700" version="1.0">\r\n  <current_page>0</current_page>\r\n  <hide_signature_names>false</hide_signature_names>\r\n  <show_blank_pages>false</show_blank_pages>\r\n  <show_draft>false</show_draft>\r\n  <show_redacted>true</show_redacted>\r\n  <report_year>2022</report_year>\r\n  <total_pages>3</total_pages>\r\n  <x_position />\r\n  <y_position />\r\n  <cover>\r\n    <first_name>Anya</first_name>\r\n    <is_cover_offices_approved>true</is_cover_offices_approved>\r\n    <is_cover_info_approved>false</is_cover_info_approved>\r\n    <jurisdiction>\r\n      <description_city>Oakland</description_city>\r\n      <is_city>true</is_city>\r\n      <is_county>false</is_county>\r\n      <is_judge_or_court>false</is_judge_or_court>\r\n      <is_multi_county>false</is_multi_county>\r\n      <is_other>false</is_other>\r\n      <is_state>false</is_state>\r\n      <id>147a76ee-49b2-44c1-bf40-23d500b49165</id>\r\n      <version_

In [439]:
LEGEND = {
    'fair_market_value': [
        '$2,000-$10,000', '$10,001-$100,000', '$100,001-$1,000,000', 'Over $1,000,000'
    ],
    'fair_market_value_schedule_a_2': [
        '$0-$1,999', '$2,000-$10,000', '$10,001-$100,000', '$100,001-$1,000,000', 'Over $1,000,000'
    ],
    'gross_income_received': [
        '$0-$499', '$500-$1,000', '$1,001-$10,000', '$10,001-$100,000', 'Over $100,000'
    ],
    'nature_of_interest': [
        'Ownership/Deed of Trust', 'Easement', 'Leasehold', 'Other'
    ],
    'nature_of_investment': [
        'Partnership', 'Sole Proprietorship', 'Other'
    ],
    'gross_income_received_schedule_c_1': [
        'No Income - Business Position Only', '$500-$1,000', '$1,001-$10,000', '$10,001-$100,000', 'Over $100,000'
    ],
    'reason_for_income': [
        'Salary', "Spouse's or registered domestic partner's income", 'Partnership',
        'Sale', 'Loan Repayment', 'Commission', 'Rental income', 'Other'
    ],
    'business_type': [
        'Trust', 'Business Entity'
    ],
    'type_of_payment': [
        'Gift', 'Income'
    ]
}


def file_name(xml=None):
    if xml is not None:
        first_name = xml.xpath('//disclosure/cover/first_name')
        last_name = xml.xpath('//disclosure/cover/last_name')
        return ' '.join([first_name, last_name])
    else:
        return super()

def filer_title(xml=None, offices=None):
    if xml is not None and offices:
        return Forms.Form700.title_from_office(offices[0])
    else:
        return super()

def offices(xml=None):
    if xml is not None:
        offices = xml.xpath('//disclosure/cover/offices/office')
        return [{el.tag: el.text for el in office} for office in offices]
    else:
        return []

def schedule_a1(xml=None):
    if xml is not None:
        schedules = xml.xpath('//disclosure/schedule_a_1s/schedule_a_1')
        return [{el.tag: LEGEND[el.tag][int(el.text) - 1] if el.tag in LEGEND else el.text for el in schedule} for schedule in schedules]
    else:
        return []

def schedule_a2(xml=None):
    if xml is not None:
        schedules = xml.xpath('//disclosure/schedule_a_2s/schedule_a_2')
        return [{el.tag: LEGEND[el.tag][int(el.text) - 1] if el.tag in LEGEND else el.text for el in schedule} for schedule in schedules]
    else:
        return []

def schedule_b(xml=None):
    if xml is not None:
        schedules = xml.xpath('//disclosure/schedule_bs/schedule_b')
        return [{el.tag: LEGEND[el.tag][int(el.text) - 1] if el.tag in LEGEND else el.text for el in schedule} for schedule in schedules]
    else:
        return []

def schedule_c1(xml=None):
    if xml is not None:
        schedules = xml.xpath('//disclosure/schedule_c_1s/schedule_c_1')
        return [{el.tag: LEGEND[el.tag][int(el.text) - 1] if el.tag in LEGEND else el.text for el in schedule} for schedule in schedules]
    else:
        return []

def schedule_d(xml=None):
    if xml is not None:
        schedules = xml.xpath('//disclosure/schedule_ds/schedule_d')
        return [{el.tag: LEGEND[el.tag][int(el.text) - 1] if el.tag in LEGEND else el.text if el.tag != 'gifts' else [{'amount': float(g.xpath('amount/text()')[0]), 'description': g.xpath('description/text()')[0], 'gift_date': g.xpath('gift_date/text()')[0]} for g in el.xpath('gift')] for el in schedule} for schedule in schedules]
    else:
        return []

def schedule_e(xml=None):
    if xml is not None:
        schedules = xml.xpath('//disclosure/schedule_es/schedule_e')
        return [{el.tag: LEGEND[el.tag][int(el.text) - 1] if el.tag in LEGEND else el.text for el in schedule} for schedule in schedules]
    else:
        return []

def no_reportable_interests(xml=None):
    if xml is not None:
        counts = [
            xml.xpath('//schedule_a_1s/@count'),
            xml.xpath('//schedule_a_2s/@count'),
            xml.xpath('//schedule_bs/@count'),
            xml.xpath('//schedule_c_1s/@count'),
            xml.xpath('//schedule_c_2s/@count'),
            xml.xpath('//schedule_ds/@count'),
            xml.xpath('//schedule_es/@count'),
        ]
        return sum(int(count[0]) if count else 0 for count in counts) == 0
    else:
        return None


In [440]:
tree = ET.ElementTree(ET.fromstring(calfile))
root = tree.getroot()

In [441]:
def fetch_elements(x,element,results=None):
    if results is None:
        results = []

    if element.tag == x:    
            for child in element:
                if child.text and child.text.strip():
                    child_data={}
                    child_data[child.tag]=child.text.strip()
                    if child_data:
                        results.append(child_data)
    for child in element:
        fetch_elements(x,child,results)
    return results

In [442]:
def fetch_all(x,element,results=None):
    if results is None:
        results = []
    if element.tag == x:
        for child in element:
            fetch_all(child.tag,child,results)
        for child in element:
            fetch_elements(child.tag,child,results)
    for child in element:
        fetch_all(x,child,results)
    return results

In [449]:
def format(x):
    data=fetch_all(x,root)
    combined_dict = {}
    for d in data:
        for key, value in d.items():
            if key in combined_dict:
                combined_dict[key].append(value)
            else:
                combined_dict[key] = value
    combined_dict['Schedule']=x
    combined_dict['Filing Id']=id
    return combined_dict


In [450]:
def combine(list):
    combined=[]
    for item in list:
        row=format(item)
        combined.append(row)
    return combined

In [451]:
list=['schedule_a_1s','schedule_a_2s','schedule_bs','schedule_c_1s','schedule_ds','schedule_es']
pd.DataFrame(combine(list))

Unnamed: 0,Schedule,Filing Id,city,state,zip,business_position,business_type,description,entity_name,fair_market_value_schedule_a_2,...,nature_of_investment,id,version_for_add,version_for_delete,version_for_edit,business_activity,gross_income_received_schedule_c_1,name_of_income_source,reason_for_income,reason_for_income_other
0,schedule_a_1s,208573987,,,,,,,,,...,,,,,,,,,,
1,schedule_a_2s,208573987,Oakland,CA,94619.0,"Owner, photographer",2.0,Small portrait photography business,Anya Ku d/b/a Kutography,2.0,...,1.0,c7e037eb-39c8-4c5f-a39c-28886506e889,0.0,0.0,0.0,,,,,
2,schedule_bs,208573987,,,,,,,,,...,,,,,,,,,,
3,schedule_c_1s,208573987,Oakland,CA,94619.0,"Owner, photographer",,,,,...,,83c2a876-64f4-4ffe-809f-b8e97a84e760,0.0,0.0,0.0,Portrait photography services,3.0,Anya Ku d/b/a/ Kutography,6.0,Payment from clients for photography services
4,schedule_ds,208573987,,,,,,,,,...,,,,,,,,,,
5,schedule_es,208573987,,,,,,,,,...,,,,,,,,,,
