In [1]:
import pandas as pd
from datetime import datetime
import re

In [2]:
""" Get stuff out of Netfile v2 API
"""
from pprint import PrettyPrinter
from pathlib import Path
import os
import requests

BASE_URL = 'https://netfile.com/api/campaign'
CONTRIBUTION_FORM = 'F460A'
EXPENDITURE_FORM = 'F460E'

PARAMS = { 'aid': 'COAK' }

def get_auth_from_env_file(filename: str='.env'):
    """ Split .env file on newline and look for API_KEY and API_SECRET
        Return their values as a tuple
    """
    env_file=Path(filename)
    auth_keys = [ 'API_KEY', 'API_SECRET' ]
    if env_file.exists():
        auth = tuple( v for _, v in sorted([
            ln.split('=') for ln in
            env_file.read_text(encoding='utf8').strip().split('\n')
            if ln.startswith(auth_keys[0]) or ln.startswith(auth_keys[1])
        ], key=lambda ln: auth_keys.index(ln[0])))
    else:
        auth=tuple(os.environ[key] for key in auth_keys)
            
    return auth

AUTH=get_auth_from_env_file()

pp = PrettyPrinter()

def get_filing(offset=0):
    """ Get a filing
    """
    url = f'{BASE_URL}/filing/v101/filings?Limit=100000'

    params = { **PARAMS }
    if offset > 0:
        params['offset'] = offset

    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        print('ping')
        return get_filing(offset=0)
    else:
        print(res)
        body = res.json()
        results = body.pop('results')
        return results, body
    
def get_activities(offset=0):
    """ Get a filing
    """
    url = f'{BASE_URL}/filing/v101/filing-activities?Limit=100000'

    params = { **PARAMS }
    if offset > 0:
        params['offset'] = offset

    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        print('ping')
        return get_filing(offset=0)
    else:
        print(res)
        body = res.json()
        results = body.pop('results')
        return results, body
    
def get_element_activities(*querys, offset=0):
    """ Get a filing
    """
    url = f'{BASE_URL}/filing/v101/element-activities?Limit=100001'

    params = { **PARAMS }
    for q in querys:
        params.update(q)
    if offset > 0:
        params['offset'] = offset

    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        print('ping')
        return get_filing(offset=0)
    else:
        print(res)
        body = res.json()
        results = body.pop('results')
        return results, body
    
def get_filing_elements(*querys, offset=0):
    """ Get a filing
    """
    url = f'{BASE_URL}/filing/v101/filing-elements?Limit=100001'

    params = { **PARAMS }
    for q in querys:
        params.update(q)
    if offset > 0:
        params['offset'] = offset

    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        print('ping')
        return get_filing(offset=0)
    else:
        print(res)
        body = res.json()
        results = body.pop('results')
        return results, body

def get_form(form,offset=0):
    """ Get filings with matching form type
    """
    url = f'{BASE_URL}/filing/v101/filings?Limit=100000&SpecificationForm={form}'

    params = { **PARAMS }
    if offset > 0:
        params['offset'] = offset

    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        return get_form(form,offset=0)
    else:
        body = res.json()
        results = body.pop('results')

        return results, body
def get_filer(filer_nid):
    """ Get one filer
    """
    url = f'{BASE_URL}/filer/v101/filers?'

    res = requests.get(url, params={ **PARAMS, 'filerNid': filer_nid }, auth=AUTH)
    if res.status_code == 500:
        return get_filer(filer_nid)
    else:
        body = res.json()

        return body['results']
def list_filers():
    """ Get all the elections
    """
    url = f'{BASE_URL}/filer/v101/filers?Limit=100000'

    res = requests.get(url, params={ **PARAMS}, auth=AUTH)
    if res.status_code == 500:
        print('ping')
        return list_filers()
    else:
        body = res.json()
        return body['results']
def list_elections_influences_for_election(id):
    """ Get all the elections
    """
    url = f'{BASE_URL}/election/v101/election-influences?Limit=100000&ElectionNid={id}'

    res = requests.get(url, params={ **PARAMS}, auth=AUTH)
    if res.status_code == 500:
        return list_elections_influences(id)
    else:
        body = res.json()
        return body['results']
    
def list_elections_influences():
    """ Get all the elections
    """
    url = f'{BASE_URL}/election/v101/election-influences?Limit=100000'

    res = requests.get(url, params={ **PARAMS}, auth=AUTH)
    if res.status_code == 500:
        return list_elections_influences()
    else:
        body = res.json()
        return body
    
def list_filer_elections_influences(id):
    """ Get all the elections
    """
    url = f'{BASE_URL}/election/v101/election-influences?Limit=100000&FilerNid={id}'

    res = requests.get(url, params={ **PARAMS}, auth=AUTH)
    if res.status_code == 500:
        return list_elections_influences(id)
    else:
        body = res.json()
        return body['results']

def list_elections():
    """ Get all the elections
    """
    url = f'{BASE_URL}/election/v101/elections?Limit=100000'

    res = requests.get(url, params={ **PARAMS}, auth=AUTH)
    if res.status_code == 500:
        return list_elections()
    else:
        body = res.json()

        return body['results']
def export_transactions(id,offset=0):
    """ Get a filing
    """
    url = f'{BASE_URL}/filing/v101/filings/{id}'

    params = { **PARAMS }
    if offset > 0:
        params['offset'] = offset

    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        return export_transactions(id,offset=0)
    else:
        body = res.json()

        return body
def public_api_candiate_id(id):
    """ Get all the elections
    """
    url = f'https://public.netfile.com/pub2/AllFilingsByCandidate.aspx?id={id}'

    res = requests.get(url)
    if res.status_code == 500:
        return public_api_candiate_id(id)
    else:
        body = res

        return body
    
mapping = {
    'City Council Member At-Large - City of Oakland':'City Council Member At-Large',
    'Mayor - City of Oakland':'Mayor',
    'City Council - City of Oakland - {x}':'City Council District {x}',
    'Director - Oakland Unified School District - {x}':'OUSD District {x}',
    'City Attorney - City of Oakland': 'City Attorney',
    'City Auditor - City of Oakland':'City Auditor'
    }

def transform_position(position):
    for pattern, replacement in mapping.items():
        if '{x}' in pattern:
            match = re.match(pattern.replace('{x}', '(\\d+)'), position)
            if match:
                return replacement.replace('{x}', match.group(1))
        elif position == pattern:
            return replacement
    return position

def transform_name(name):
    if name:
        parts = name.split(', ')
        if len(parts) == 2:
            return f'{parts[1]} {parts[0]}'
        return name
    return name


# initial data requests

In [3]:
filers_response=list_filers()

In [4]:
filings = get_filing()[0]

<Response [200]>


In [5]:
elections=list_elections()
seats = [e['seats'] for e in elections]
seatOfficeNid = [{s['seatOfficeNid']:s['officeName']} for seat in seats for s in seat]
seatOfficeNid_to_name = [dict(t) for t in {frozenset(d.items()) for d in seatOfficeNid}]
seats_in_election = [{e['electionNid']:set([s['seatOfficeNid'] for s in e['seats']])} for e in elections]
name_to_id = {transform_position(item[1]): item[0] 
              for dic in seatOfficeNid_to_name 
              for item in dic.items()}

In [6]:
candidates=[item for item in filers_response if item['committeeTypes'] in [['Candidate or Officeholder']]]
no_committee=[item for item in filers_response if item['committeeTypes'] in [['Person']]]

In [7]:
form470=get_form('FPPC470')
form410=get_form('FPPC410')
form501 = get_form('FPPC501')

In [8]:
form460 = get_form('FPPC460')

# form data

In [9]:
comparison_date = datetime.fromisoformat('2011-01-01T00:00:00-08:00') # start of 2014 pst

# Filter and create the data
form410data = [
    {
        'filerNid': item['filerMeta']['filerId'],
        'issuedFilingId': item['filingMeta']['issuedFilingId'],
        'commonName': item['filerMeta']['commonName'],
        'RegType1': item['filerMeta']['strings']['RegType1'],
        'SOS ID': item['filerMeta']['strings'].get('Registration_CA SOS', None),
        'status': item['filerMeta']['status'],
        # 'IsTermination':item['filingMeta']['booleans']['IsTermination'],
        # 'amendmentType': item['filingMeta']['amendmentType'],
        # 'amendmentSequence': item['filingMeta']['amendmentSequence'],
        'datetime': datetime.fromisoformat(item['filingMeta']['legalFilingDateTime'])
    } 
    for item in form410[0]
    if datetime.fromisoformat(item['filingMeta']['legalFilingDateTime']) > comparison_date
]
df410=pd.DataFrame(form410data)

df410 = df410[df410['RegType1'].isin(['Candidate or Officeholder', 'Person'])]


# df410=df410.sort_values('datetime',ascending=False).drop_duplicates(subset=['filerNid', 'status', 'IsTermination'])

# df410[df410.duplicated(subset='filerNid',keep=False)].sort_values('filerNid')
df410.head()

Unnamed: 0,filerNid,issuedFilingId,commonName,RegType1,SOS ID,status,datetime
1,211901697,211902005,Re-Elect Noel Gallo for Oakland City Council 2024,Candidate or Officeholder,1472382,Active,2024-08-09 17:27:56.161244+00:00
3,211868217,211868294,Harbin-Forte for City Attorney 2024,Candidate or Officeholder,Pending,Active,2024-08-05 07:00:00+00:00
4,211901697,211901733,Re-Elect Noel Gallo for Oakland City Council 2024,Candidate or Officeholder,1472382,Active,2024-08-02 07:00:00+00:00
6,210526254,211824323,Wang for Oakland City Council 2024,Candidate or Officeholder,1467592,Active,2024-07-31 07:00:00+00:00
12,151524257,211843775,Treva Reid for Oakland City Council 2020 Offic...,Candidate or Officeholder,1366759,Active,2024-08-01 04:38:34.731720+00:00


In [10]:
form470data = [
    {
        'filerNid': item['filerMeta']['filerId'],
        'issuedFilingId': item['filingMeta']['issuedFilingId'],
        'commonName':item['filerMeta']['commonName'],
        'RegType1': item['filerMeta']['strings']['RegType1'],
        'SOS ID': item['filerMeta']['strings'].get('Registration_CA SOS', None),
        'status': item['filerMeta']['status'],
        'datetime': datetime.fromisoformat(item['filingMeta']['legalFilingDateTime'])
    } 
    for item in form470[0]
    if datetime.fromisoformat(item['filingMeta']['legalFilingDateTime']) > comparison_date
]
df470=pd.DataFrame(form470data)
# df470=df470.sort_values('datetime',ascending=False).drop_duplicates(subset=['filerNid','commonName','SOS ID'])
df470.head()

Unnamed: 0,filerNid,issuedFilingId,commonName,RegType1,SOS ID,status,datetime
0,208303480,211722063,Selika Thomas for City Council 2024,Candidate or Officeholder,1461262.0,Active,2024-07-10 07:00:00+00:00
1,208303480,210314391,Selika Thomas for City Council 2024,Candidate or Officeholder,1461262.0,Active,2024-02-05 08:00:00+00:00
2,209211407,210009458,"Sherman, Debra B.",Person,,Active,2024-01-03 08:00:00+00:00
3,202005464,206555545,Committee to Elect John Mimosa Marks for Oakla...,Candidate or Officeholder,1446906.0,Active,2023-02-21 23:47:06.291382+00:00
4,200831481,205292848,Jordan for Mayor 2022,Candidate or Officeholder,1439246.0,Active,2022-10-26 00:39:26.932273+00:00


In [11]:
form501data = [
    {
        'filerNid': item['filerMeta']['filerId'],
        'issuedFilingId': item['filingMeta']['issuedFilingId'],
        'commonName':item['filerMeta']['commonName'],
        'RegType1': item['filerMeta']['strings']['RegType1'],
        'SOS ID': item['filerMeta']['strings'].get('Registration_CA SOS', None),
        'status': item['filerMeta']['status'],
        'datetime': datetime.fromisoformat(item['filingMeta']['legalFilingDateTime'])
    } 
    for item in form501[0]
    if datetime.fromisoformat(item['filingMeta']['legalFilingDateTime']) > comparison_date
]
df501=pd.DataFrame(form501data)
# df501=df501.sort_values('datetime',ascending=False).drop_duplicates(subset=['filerNid','commonName','SOS ID'])
df501.head()

Unnamed: 0,filerNid,issuedFilingId,commonName,RegType1,SOS ID,status,datetime
0,211693079,211774134,Benjamin Salop for Oakland School Board Distri...,Candidate or Officeholder,Pending,Active,2024-07-19 07:00:00+00:00
1,200879031,211755729,Re-Elect Treva Reid for Oakland City Council 2024,Candidate or Officeholder,1439424,Active,2024-07-18 07:00:00+00:00
2,168581762,211755681,"Sidebotham, Nancy",Person,,Active,2024-07-18 07:00:00+00:00
3,121710662,211755668,"Gallo, Noel",Person,,Active,2024-07-16 07:00:00+00:00
4,211755624,211755637,"Danino, Shawn",Person,,Active,2024-07-16 07:00:00+00:00


In [12]:
combined_df = pd.concat([df410,df470,df501])

In [13]:
# get city, state, and zip from Disclosure addresses
addresses=[{'addressList':item.get('addressList',{}),'filerNid':item['filerNid']} for item in filers_response]
address_dic = {'city':[],'state':[],'zip':[],'filerNid':[]}
for item in addresses:
    for address in item['addressList']:
        if 'Disclosure' in address['addressTypes']:
            address_dic['city'].append(address['city'])
            address_dic['state'].append(address['state'])
            address_dic['zip'].append(address['zip'])
            address_dic['filerNid'].append(item['filerNid'])
address_df=pd.DataFrame(address_dic)

In [14]:
# associate a filer id to a fppc id
regs=[{'fppc_id':item.get('registrations',{}).get('CA SOS',None),'filerNid':item['filerNid']} for item in filers_response]
# get all filers with officers
officers=[[item['officers'], item['filerNid']] for item in filers_response if item['officers']]
# set up dictionary
treasurer_dic={}
# loop through filers with officers and add offcier names if officer position is treasurer the key will be the filler id
for officer in officers:
    for item in officer[0]:
        if item['position']=='Treasurer':
            treasurer_dic[officer[1]]=item['officerName']
# match the filer id key in treasurer dic with the filer ids associated with an fppc id
for key, value in treasurer_dic.items():
    for item in regs:
        if key==item['filerNid']:
            # if a key matches a filer id then add treasure name to the dictionaries
            item['Treasurer']=value
# get only the cases with a treasurer
fppc_with_treasurer=[reg for reg in regs if reg.get('Treasurer', None)]
treasurer_df=pd.DataFrame(fppc_with_treasurer)
treasurer_df

Unnamed: 0,fppc_id,filerNid,Treasurer
0,1471118,211693079,"Salop, Benjamin"
1,1472483,211868217,"Ivery, Cine"
2,1453436,204429424,"Lowe, Andrea"
3,1446325,202521854,"Guzman, Ben"
4,1471304,211755512,"Owens, Stacy"
...,...,...,...
455,1304061,121706546,"Raphael, Leonard"
456,1245211,121706516,"Lee, Mari"
457,931297,121706486,"Owens, Stacy"
458,1303541,121706456,"Linney, Kenneth"


# elections endpoint

In [15]:
# from the list_elections response we ... 
election_list=[]
previous_df=pd.DataFrame()
for election in elections:
    # collect 
    candidates=election['candidates']
    seats=election['seats']
    election_name=election['electionCaption']
    electionNid=election['electionNid']
    election_year=election['electionDate'][:4] # get the year from the four first character, the format is yyyy-mm-dd
    election_key={'election_name':election_name, 'electionNid':electionNid, 'election year':election_year}
    election_list.append(election_key)
    if candidates and seats:
        seat_df=pd.DataFrame(seats)
        candidate_df=pd.DataFrame(candidates)
        merge_df=candidate_df.merge(seat_df, on='seatNid')
        current_df=merge_df[['candidateNid','candidateName','seatNid', 'seatOfficeNid', 'officeName','electionNid','isIncumbent','isWinner']]
        previous_df=pd.concat([previous_df,current_df],ignore_index=True)
election_df=pd.DataFrame(election_list)
final_df=previous_df.merge(election_df, on='electionNid')
# final_df.drop_duplicates(subset=['candidateNid','seatNid','electionNid'])

# Convert 'election year' to numeric, coercing errors to NaN (optional)
final_df['election year'] = pd.to_numeric(final_df['election year'], errors='coerce')

# Filter rows where 'election year' is greater than 2011
result_df = final_df[final_df['election year'] > 2011]

In [16]:
result_df

Unnamed: 0,candidateNid,candidateName,seatNid,seatOfficeNid,officeName,electionNid,isIncumbent,isWinner,election_name,election year
0,211486273,"Brown, Latanya",210326315,121709611,Mayor - City of Oakland,210326313,False,False,11/3/2026 - General,2026
1,201330961,"Ramachandran, Janani",210326366,121709717,City Council - City of Oakland - 4,210326313,False,False,11/3/2026 - General,2026
2,210517020,"Afolabi, Baba",207972552,121709414,City Council - City of Oakland - 3,200879019,False,False,11/5/2024 - General,2024
3,211599725,"Aikens, Jr., Dwayne",207972567,121709556,Director - Oakland Unified School District - 3,200879019,False,False,11/5/2024 - General,2024
4,211570258,"Armstrong, Erin",207972554,121709440,City Council - City of Oakland - 5,200879019,False,False,11/5/2024 - General,2024
...,...,...,...,...,...,...,...,...,...,...
262,134932992,"Tolbert, Carol Lee",126388868,121709365,City Council Member At-Large - City of Oakland,125088328,False,False,11/6/2012 - General,2012
263,134379082,"Torres, Roseann",131466380,121710529,Director - Oakland Unified School District - 5,125088328,False,False,11/6/2012 - General,2012
264,121707384,"Tuman, Joseph",126388868,121709365,City Council Member At-Large - City of Oakland,125088328,False,False,11/6/2012 - General,2012
265,132415332,"Walton, Sheryl",129471500,121709476,City Council - City of Oakland - 7,125088328,False,False,11/6/2012 - General,2012


In [17]:
result_df.loc[:, 'officeName'] = result_df['officeName'].apply(transform_position)
result_df.loc[:, 'candidateName'] = result_df['candidateName'].apply(transform_name)
result_df.loc[:,'category'] = result_df.loc[:,'officeName'].apply(lambda x: re.search(r'\w+\s\w+', x).group() if re.search(r'\w+\s\w+', x) else x)
result_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df.loc[:,'category'] = result_df.loc[:,'officeName'].apply(lambda x: re.search(r'\w+\s\w+', x).group() if re.search(r'\w+\s\w+', x) else x)


Unnamed: 0,candidateNid,candidateName,seatNid,seatOfficeNid,officeName,electionNid,isIncumbent,isWinner,election_name,election year,category
0,211486273,Latanya Brown,210326315,121709611,Mayor,210326313,False,False,11/3/2026 - General,2026,Mayor
1,201330961,Janani Ramachandran,210326366,121709717,City Council District 4,210326313,False,False,11/3/2026 - General,2026,City Council
2,210517020,Baba Afolabi,207972552,121709414,City Council District 3,200879019,False,False,11/5/2024 - General,2024,City Council
3,211599725,"Aikens, Jr., Dwayne",207972567,121709556,OUSD District 3,200879019,False,False,11/5/2024 - General,2024,OUSD District
4,211570258,Erin Armstrong,207972554,121709440,City Council District 5,200879019,False,False,11/5/2024 - General,2024,City Council


election_ids=list(set(final_df['electionNid'].to_list()))
previous_df=pd.DataFrame()
for id in election_ids:
    influences=list_elections_influences(id)
    influences_dic={'filerNid': [],'electionNid': [],'seatNid': [],'candidateNid': [],'committeeName':[],'election_name': []}
    for candidate in influences:   
        influences_dic['filerNid'].append(candidate.get('filerNid', 'None'))
        influences_dic['election_name'].append(candidate.get('electionCaption', 'None'))
        influences_dic['committeeName'].append(candidate.get('committeeName', 'None'))
        influences_dic['electionNid'].append(candidate.get('electionNid', 'None'))
        influences_dic['seatNid'].append(candidate.get('seatNid', 'None'))
        influences_dic['candidateNid'].append(candidate.get('candidateNid', 'None'))
        current_df=pd.DataFrame(influences_dic)
        current_df=current_df
        previous_df=pd.concat([previous_df,current_df],ignore_index=True)

previous_df=previous_df.dropna(subset=['seatNid','candidateNid'])
previous_df.drop_duplicates(subset=['seatNid','candidateNid'])
previous_df

no_influence=[item for item in filers_response if not item['electionInfluences']]
no_influence=[item for item in no_influence if item['committeeTypes'] in [['Candidate or Officeholder'],['Person']]]
no_influence

merge_df=final_df.merge(previous_df[['electionNid', 'seatNid', 'candidateNid', 'filerNid']], how='left', on=['electionNid', 'seatNid', 'candidateNid'])

# PUBLIC NET FILE PORTAL CANDIDATE ID TO FILLING ID TO FILER ID

In [18]:
from lxml import html
from bs4 import BeautifulSoup as bs

In [19]:
def get_pub2_html_from_candidateNid(id):
    """ Get the html string from https://public.netfile.com/pub2/AllFilingsByCandidate.aspx?id=id
    """
    url = f'https://public.netfile.com/pub2/AllFilingsByCandidate.aspx?id={id}'

    res = requests.get(url, params={ **PARAMS})
    if res.status_code == 500:
        return public_api_candiate_id(id)
    
    elif res.status_code != 200:
        raise Exception("Initial request failed")
    
    else:
        soup = bs(res.content, 'html.parser')
        return res, soup
    
def get_page_data(id, initial_res, soup, page_number):
    url = f'https://public.netfile.com/pub2/AllFilingsByCandidate.aspx?id={id}'
    # Extract necessary fields
    viewstate = soup.find('input', {'name': '__VIEWSTATE'}).get('value')
    viewstate_generator = soup.find('input', {'name': '__VIEWSTATEGENERATOR'}).get('value')
    script_manager = soup.find('input', {'name': 'ctl00_MainScriptManager_TSM'}).get('value')
    event_target_suffix = 7 + (page_number - 2) * 2
    # Make sure single digit numbers have a leading 0
    if event_target_suffix < 10:
        event_target = f'ctl00$phBody$FilingGrid$ctl00$ctl03$ctl01$ctl0{event_target_suffix}'
    else:
        event_target = f'ctl00$phBody$FilingGrid$ctl00$ctl03$ctl01$ctl{event_target_suffix}'
    # Construct the POST data to request the desired page
    post_data = {
        '__VIEWSTATE': viewstate,
        '__VIEWSTATEGENERATOR': viewstate_generator,
        '__EVENTTARGET': event_target,
        '__EVENTARGUMENT': '',
        'ctl00_MainScriptManager_TSM': script_manager,
        'ctl00$phBody$FilingGrid$ctl00$ctl03$ctl01$PageSizeComboBox': '20',
        'ctl00_phBody_FilingGrid_ctl00_ctl03_ctl01_PageSizeComboBox_ClientState': '',
        'ctl00_phBody_FilingGrid_ClientState': ''
    }
    
    # Make the POST request to fetch the data for the desired page
    res = requests.post(url, data=post_data, cookies=initial_res.cookies)
    
    if res.status_code != 200:
        raise Exception("Request for page data failed")
    page_soup = bs(res.content, 'html.parser')
    return page_soup

def create_df_from_pub2_table(current_page_soup, candidateNid, multi_page=True):
    tables = current_page_soup.find_all('table')
    df = pd.read_html(str(tables))[0]
    headers = ['issuedFilingId', 'filer', 'filing_date', 'form', 'Seq#', 'Rpt#', 'period_covered', 'view_filing', 'candidateNid']
    df['candidateNid'] = candidateNid
    df.columns = headers
    if multi_page:
        clean_df = df.dropna(subset=['filing_date'])[:-1].reset_index(drop=True)
        return clean_df
    return df

def get_filing_by_id(id,offset=0):
    """ Get a filing
    """
    url = f'{BASE_URL}/filing/v101/filings?IssuedFilingId={id}'

    params = { **PARAMS }
    if offset > 0:
        params['offset'] = offset

    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        print('ping')
        return get_filing_by_id(offset=0)
    else:
        print(res)
        body = res.json()
        results = body.pop('results')
        return results, body
    
def get_table_data_from_html(document, x, y):
    soup = document[1]
    # Locate the table tow by id
    row = soup.find('tr', id=f"ctl00_phBody_FilingGrid_ctl00__{y}")
    columns = row.find_all('td')
    data = columns[x].get_text(strip=True)
    return data

def update_candidate_filing_filer_table(table, candidateNid, filerNid, most_recent_filing_id, filed_by):
    table['candidateNid'].append(candidateNid)
    table['filerNid'].append(filerNid)
    table['recent_filing_id'].append(most_recent_filing_id)
    table['filed_by'].append(filed_by)
    return table

def get_filerNid_from_filingNid(filingNid, filings):
    # first check filings to save time
    file=[filing for filing in filings if filing['originalFilingId'] == filingNid]
    if len(file) != 1: # anything but 1 is an error
        file=get_filing_by_id(filingNid)[0]
    filerNid = file[0].get('filerMeta', {}).get('filerId', None)
    return filerNid

def do_the_damn_thing(i, candidate_id, table, filings, error_list, attempt_limit=3):
    try:
        html_body=get_pub2_html_from_candidateNid(candidate_id)
        filingNid = get_table_data_from_html(html_body, 0, i)
        filed_by = get_table_data_from_html(html_body, 1, i)
        filerNid = get_filerNid_from_filingNid(filingNid, filings)
        table = update_candidate_filing_filer_table(table, candidate_id, filerNid, filingNid, filed_by)
    except:
        i+=1
        if i > attempt_limit:
            print(filingNid, candidate_id)
            error_list.append({candidate_id:filingNid})
            return table
        else:
            do_the_damn_thing(i, candidate_id, table, filings, error_list, attempt_limit)
    return table

def create_candidateNid_to_filerNid_table(df, attempt=3, filings=None):
    if not filings:
        # if user doesn't provide filings
        filings = get_filing()[0]
    # get unique candidateNids
    candiate_ids=list(set(df['candidateNid'].to_list()))
    # create table schema
    table = {'candidateNid':[], 'filerNid':[],'recent_filing_id':[], 'filed_by':[]}
    # create list for errors
    failed_candidateNids = []
    # loop through ids
    for candidate_id in candiate_ids:
        # attempt number
        i=0
        table = do_the_damn_thing(i, candidate_id, table, filings, failed_candidateNids, attempt)
    return table, failed_candidateNids

def get_all_pages(id, res=None, soup= None, page_number=1, last_page=99, previous_table_df=None):
    if page_number == 1:
        res, soup = get_pub2_html_from_candidateNid(id)
        items = soup.find_all('strong')
        if items[-1].text:
            last_page = int(items[-1].text)
        else:
            last_page = 1
            print(f'page {page_number} of {last_page}')
            previous_table_df = create_df_from_pub2_table(soup, id, False)
            return previous_table_df
        previous_table_df = create_df_from_pub2_table(soup, id, True)
        print(f'page {page_number} of {last_page}')
    page_number += 1
    print(f'page {page_number} of {last_page}')
    current_page_soup = get_page_data(id, res, soup, page_number)
    current_table_df = create_df_from_pub2_table(current_page_soup, id)
    ##
    previous_table_df=pd.concat([previous_table_df,current_table_df],ignore_index=True)
    if page_number < last_page:
        previous_table_df = get_all_pages(id, res, current_page_soup, page_number, last_page, previous_table_df)
    return previous_table_df

In [20]:
previous_df = pd.DataFrame(columns=['issuedFilingId', 'filer', 'filing_date', 'form', 'Seq#', 'Rpt#', 'period_covered', 'view_filing', 'candidateNid'])
candidateNids = list(set(result_df['candidateNid'].to_list()))
errors = []
for candidateNid in candidateNids:
    try:
        current_df = get_all_pages(candidateNid)
        previous_df=pd.concat([previous_df,current_df],ignore_index=True)
    except:
        errors.append(candidateNid)
previous_df

page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  previous_df=pd.concat([previous_df,current_df],ignore_index=True)


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]
  previous_df=pd.concat([previous_df,current_df],ignore_index=True)


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 4
page 2 of 4


  df = pd.read_html(str(tables))[0]


page 3 of 4


  df = pd.read_html(str(tables))[0]


page 4 of 4


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 5
page 2 of 5


  df = pd.read_html(str(tables))[0]


page 3 of 5


  df = pd.read_html(str(tables))[0]


page 4 of 5


  df = pd.read_html(str(tables))[0]


page 5 of 5


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 3
page 2 of 3


  df = pd.read_html(str(tables))[0]


page 3 of 3


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 5
page 2 of 5


  df = pd.read_html(str(tables))[0]


page 3 of 5


  df = pd.read_html(str(tables))[0]


page 4 of 5


  df = pd.read_html(str(tables))[0]


page 5 of 5


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 3
page 2 of 3


  df = pd.read_html(str(tables))[0]


page 3 of 3


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 4
page 2 of 4


  df = pd.read_html(str(tables))[0]


page 3 of 4


  df = pd.read_html(str(tables))[0]


page 4 of 4


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 4
page 2 of 4


  df = pd.read_html(str(tables))[0]


page 3 of 4


  df = pd.read_html(str(tables))[0]


page 4 of 4


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 3
page 2 of 3


  df = pd.read_html(str(tables))[0]


page 3 of 3


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 7
page 2 of 7


  df = pd.read_html(str(tables))[0]


page 3 of 7


  df = pd.read_html(str(tables))[0]


page 4 of 7


  df = pd.read_html(str(tables))[0]


page 5 of 7


  df = pd.read_html(str(tables))[0]


page 6 of 7


  df = pd.read_html(str(tables))[0]


page 7 of 7


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 3
page 2 of 3


  df = pd.read_html(str(tables))[0]


page 3 of 3


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 5
page 2 of 5


  df = pd.read_html(str(tables))[0]


page 3 of 5


  df = pd.read_html(str(tables))[0]


page 4 of 5


  df = pd.read_html(str(tables))[0]


page 5 of 5


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 4
page 2 of 4


  df = pd.read_html(str(tables))[0]


page 3 of 4


  df = pd.read_html(str(tables))[0]


page 4 of 4


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 13
page 2 of 13


  df = pd.read_html(str(tables))[0]


page 3 of 13


  df = pd.read_html(str(tables))[0]


page 4 of 13


  df = pd.read_html(str(tables))[0]


page 5 of 13


  df = pd.read_html(str(tables))[0]


page 6 of 13


  df = pd.read_html(str(tables))[0]


page 7 of 13


  df = pd.read_html(str(tables))[0]


page 8 of 13


  df = pd.read_html(str(tables))[0]


page 9 of 13


  df = pd.read_html(str(tables))[0]


page 10 of 13


  df = pd.read_html(str(tables))[0]


page 11 of 13


  df = pd.read_html(str(tables))[0]


page 12 of 13


  df = pd.read_html(str(tables))[0]


page 13 of 13


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 3
page 2 of 3


  df = pd.read_html(str(tables))[0]


page 3 of 3


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 4
page 2 of 4


  df = pd.read_html(str(tables))[0]


page 3 of 4


  df = pd.read_html(str(tables))[0]


page 4 of 4


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 4
page 2 of 4


  df = pd.read_html(str(tables))[0]


page 3 of 4


  df = pd.read_html(str(tables))[0]


page 4 of 4


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 4
page 2 of 4


  df = pd.read_html(str(tables))[0]


page 3 of 4


  df = pd.read_html(str(tables))[0]


page 4 of 4


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]
  df = pd.read_html(str(tables))[0]


page 1 of 2
page 2 of 2


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


page 1 of 1


  df = pd.read_html(str(tables))[0]


Unnamed: 0,issuedFilingId,filer,filing_date,form,Seq#,Rpt#,period_covered,view_filing,candidateNid
0,211873698,LeRonne Armstrong for City Council 2024,08/06/2024,FPPC 460 Amendment,1,,( 01/01/2024 to 06/30/2024 ),View,211584796
1,211820712,LeRonne Armstrong for City Council 2024,07/31/2024,FPPC 460,Original,,( 01/01/2024 to 06/30/2024 ),View,211584796
2,211794055,LeRonne Armstrong for City Council 2024,07/26/2024,FPPC 410 Amendment,2,,( 06/23/2024 ),View,211584796
3,211585963,LeRonne Armstrong for City Council 2024,05/24/2024,FPPC 410 Amendment,1,,( 05/23/2024 ),View,211584796
4,211585937,LeRonne Armstrong for City Council 2024,05/24/2024,FPPC 410,Original,,( 05/23/2024 ),View,211584796
...,...,...,...,...,...,...,...,...,...
3416,152209927,Andrew Park for City Council 2014,08/08/2014,FPPC 700,Original,,,Paper,149845769
3417,152076519,Andrew Park for City Council 2014,07/30/2014,FPPC 460,Original,,( 01/01/2014 to 06/30/2014 ),View,149845769
3418,149845964,Andrew Park for City Council 2014,02/20/2014,FPPC 410,Original,,,Paper,149845769
3419,149846499,Andrew Park for City Council 2014,02/20/2014,OCRA 301,Original,,,Paper,149845769


In [21]:
errors

['149330975', '128811848']

In [22]:
def original_to_0(seq):
    if seq == 'Original':
        return 0
    return seq

previous_df['Seq#'] = previous_df['Seq#'].apply(original_to_0)

previous_df = previous_df.astype({'issuedFilingId': 'str', 'filer': 'str', 'filing_date': 'str', 'form': 'str', 'Seq#': 'int', 'Rpt#': 'str', 'period_covered': 'str', 'view_filing': 'str', 'candidateNid': 'str'})

In [23]:
previous_df = previous_df.sort_values('Seq#',ascending=False)
filtered_df = previous_df[previous_df['form'].isin(['FPPC 410 Amendment', 'FPPC 501', 'FPPC 410', 'FPPC 501 Amendment'])]
filing_dict = {'issuedFilingId':[], 'filerNid':[]}
for row in filtered_df.iterrows():
    form = row[1]['form']
    filingNid = row[1]['issuedFilingId']
    filing_dict['issuedFilingId'].append(filingNid)
    try:
        if form in ['FPPC 410 Amendment', 'FPPC 410']:
            filerNid = get_filerNid_from_filingNid(filingNid, form410)
        else:
            filerNid = get_filerNid_from_filingNid(filingNid, form501)
        filing_dict['filerNid'].append(filerNid)
    except:
        try:
            filerNid = get_filerNid_from_filingNid(filingNid, filings)
            filing_dict['filerNid'].append(filerNid)
        except:
            filing_dict['filerNid'].append('ERROR')
filing_df = pd.DataFrame(filing_dict)
filer_df = filtered_df.merge(filing_df, on='issuedFilingId')
def forms_edit(seq):

    if seq == 'FPPC 501 Amendment':
        return 'FPPC 501'

    if seq == 'FPPC 410 Amendment':
        return 'FPPC 410'
    return seq

def time(x):
    y = datetime.strptime(x,'%d/%m/%Y')
    return y
filer_df['form'] = filer_df['form'].apply(forms_edit)
filer_df['filing_date'] = pd.to_datetime(filer_df['filing_date'], format='%m/%d/%Y')
no460df = filer_df[filer_df['form'].isin(['FPPC 501', 'FPPC 410'])]
no460df.sort_values('Seq#',ascending=False)

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [400]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [400]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [400]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200

Unnamed: 0,issuedFilingId,filer,filing_date,form,Seq#,Rpt#,period_covered,view_filing,candidateNid,filerNid
1,204050668,Sheng Thao for Oakland City Council 2018 Offic...,2022-06-20,FPPC 410,6,,,Paper,171131353,171131442
0,211843775,Treva Reid for Oakland City Council 2020 Offic...,2024-07-31,FPPC 410,6,,( 07/30/2024 ),View,151524242,151524257
6,209881410,Larry Reid for Council 2016 Officeholder Commi...,2024-01-30,FPPC 410,5,,( 01/16/2024 ),View,121706806,161120367
4,195655298,Treva Reid for Oakland City Council 2020,2021-01-15,FPPC 410,5,,( 01/13/2021 ),View,151524242,151524257
3,169977630,Shanthi Gonzales School Board 2014 Officeholde...,2018-03-20,FPPC 410,5,,,Paper,150090223,150090442
...,...,...,...,...,...,...,...,...,...,...
461,190537495,Marchon Tatmon for D7 Oakland City Council 2020,2020-06-10,FPPC 410,0,,,Paper,168644279,168644306
462,192414540,Meron Semedar for Oakland City Council District 1,2020-09-03,FPPC 410,0,,,Paper,190812862,190812878
463,190813345,"Semedar, Meron",2020-06-29,FPPC 501,0,,,Paper,190812862,190812878
464,204583696,Orozco Max for School District Director 2022,2022-09-02,FPPC 410,0,,,Paper,204232995,204233009


In [24]:
# a candidate is in sequential election, each one will have a corresponing 501 that must also be filed in sequence 
# similarly between each commitee there must be a termination and a new committee formation this happens sequentially, each filer id there for should have a temporal position that can be mapped to the elections.
error_df = filtered_df[filtered_df['issuedFilingId'].isin(['207709578', '202375717', '207973067', '211409087'])]

In [25]:
df = filer_df[filer_df['form']=='FPPC 501']
dfdf = df.sort_values('filing_date', ascending=False)
unique_filerid_501 = dfdf.drop_duplicates(subset='filerNid')

In [26]:
df = filer_df[filer_df['form']=='FPPC 410']
dfdf = df.sort_values('filing_date', ascending=False)
unique_filerid_410 = dfdf.drop_duplicates(subset='filerNid')

# Using Regex to Guess Office Sought

In [27]:
import re


In [28]:
def extract_year(x):
    match = re.search(r'\d{2,4}', x)
    if match:
        year = match.group()
        if len(year) == 2:
            return '20' + year
        return int(year)
    return 0
# logic
def guess(clue, cat):
    cat_code = clue[0]
    atlarge = clue[1]
    district = clue[2]
    district_num = str(clue[3])

    guess = None
    confidence = 0

    try:
        if cat_code == 0:
            if district:
                guess = name_to_id[cat[16] + ' District ' + district_num]
                confidence = 2
            return {'seatOfficeNid':guess, 'confidence':confidence}
        if district and district_num == '0':
            return {'seatOfficeNid':guess, 'confidence':confidence}
        category = cat[cat_code] # 0 is not in the index so I've place this here so it won't error

        if cat_code < 8:
            guess = name_to_id[category]
            confidence = 3
            if district:
                confidence = 1

        if cat_code >= 8:
            if district:
                guess = name_to_id[cat[16] + ' District ' + district_num]
                confidence = 3
                if cat_code != 16:
                    guess = name_to_id[category + ' ' + district_num]

            else:
                if cat_code == 16:
                    guess = name_to_id[category + ' Member At-Large']
                    confidence = 2
                    if atlarge:
                        guess = name_to_id[category + ' Member At-Large']
                        confidence = 3
    except:
        guess = None
        confidence = 0
        return {'seatOfficeNid':guess, 'confidence':confidence}
    return {'seatOfficeNid':guess, 'confidence':confidence}

def guesser(original_df, column):
    df = original_df.copy()
    # patterns
    name_pattern = r'^[A-Za-z]+(?:[-\s\'][A-Za-z]+)*, [A-Za-z]+(?:[-\s\'][A-Za-z]+)*(?: [A-Z]\.?(?:[A-Z]\.?)?)*$'
    officeholder_pattern = r'officeholder'
    #
    df[f'{column}'] = df[f'{column}'].apply(lambda x: str(x))
    df['name'] = df[f'{column}'].apply(lambda x: bool(re.search(name_pattern, x)))
    df['election year'] = df[f'{column}'].apply(lambda x: extract_year(x))
    df['district #'] = df[f'{column}'].apply(lambda x: re.search(r'\b[Dd]?\d{1}\b', x).group()[-1] if re.search(r'\b[Dd]?\d{1}\b', x) else False)
    df['officeholder'] = df[f'{column}'].apply(lambda x: bool(re.search(officeholder_pattern, x, re.IGNORECASE)))
    #
    add={}

    add['at-large'] = [r'at[-\s]?large', r'al']
    add['district'] = [r'district', r'\bd[-\s]?\d', r'dist', r'seat', r'area']
    add['officeholder'] = [r'officeholder']
    add['name'] = [r'^[A-Za-z"]+(?:[-\s\'][A-Za-z"]+)*[.\s]*, [A-Za-z"]+(?:[-\s\'][A-Za-z"]+)*(?: [A-Z"]\.?(?:[A-Z"]\.?)?)*$', r'^[A-Za-z]+\s[A-Za-z]+$']

    for key in add.keys():
        df[key] = df[f'{column}'].apply(lambda x: any(bool(re.search(pattern, x, re.IGNORECASE)) for pattern in add[key]))

    # filtered_df = df[df['name']==False]
    filtered_df = df[df['officeholder']==False]

    #
    pd.set_option('future.no_silent_downcasting', True)

    cat={r'.*mayor.*':1,r'.*auditor.*':2,r'.*attorney.*':4, r'.*school.*':8,r'.*director.*':8,r'.*board.*':8,r'.*ousd.*':8,r'.*unified.*':8,r'.*at[-\s]?large.*':16,r'.*councilmember.*':16,r'.*council.*':16, r'.*supervisor.*':32}
    filtered_df['cat_code'] = filtered_df[f'{column}'].str.lower().replace(cat,regex=True)
    filtered_df['cat_code'] = filtered_df['cat_code'].replace({r'.*':0},regex=True)
    cat={1:'Mayor',2:'City Auditor',4:'City Attorney', 8:'OUSD District',16:'City Council', 32:'Supervisor District'}
    filtered_df['cat'] = filtered_df['cat_code'].replace(cat)

    index_df = filtered_df.reset_index(drop=True)
    clues = index_df[['cat_code', 'at-large', 'district', 'district #']].to_numpy().astype(int)

    return pd.concat([index_df, pd.DataFrame([guess(clue, cat) for clue in clues])], axis=1)
guess_df = guesser(no460df, 'filer')
clean_guess_df = guess_df.sort_values('filing_date').drop_duplicates(subset=['filerNid'])[['confidence','filer','filerNid','election year','candidateNid','seatOfficeNid']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['cat_code'] = filtered_df[f'{column}'].str.lower().replace(cat,regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['cat_code'] = filtered_df['cat_code'].replace({r'.*':0},regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['cat'] = filtered_df['cat_

# election influences endpoint

In [29]:
# this end point misses some people (or does it may it's the filer endpoint that has problems?) but it's obviously way simpler and the regex and html bullshit so I should use this then only use the other method for the stagglers especially considering the time that the web scrape takes.
influences = list_elections_influences()

In [30]:
[i for i in influences['results'] if i['aid']=='COAK' and i['candidateNid']]

[{'influenceNid': '175812192',
  'filerNid': '175812180',
  'startDate': None,
  'endDate': None,
  'adsid': 163882,
  'electionNid': '128869701',
  'seatNid': '148866485',
  'seatOfficeNid': '121709698',
  'candidateNid': '151026012',
  'electionCandidateNid': '151026010',
  'measureNid': None,
  'electionItemReferenceId': None,
  'committeeEntityId': '175812182',
  'committeeName': 'Guillen, Abel',
  'doesSupport': True,
  'aid': 'COAK',
  'electionDate': '2010-11-02',
  'electionCaption': '11/2/2010 - General',
  'electionCodes': 'Special',
  'isPublic': False,
  'measure': None,
  'seat': {'seatNid': '148866485',
   'electionNid': '128869701',
   'seatOfficeNid': '121709698',
   'seatCaption': 'City Council - City of Oakland - 2',
   'officeName': 'City Council - City of Oakland - 2'},
  'candidate': {'candidateNid': '151026012',
   'candidateName': 'Guillen, Abel',
   'seatNid': '148866485',
   'isIncumbent': False,
   'isWinner': False,
   'partyAffiliationId': '0',
   'partyAffi

In [31]:
cutoff = datetime.strptime('2011-01-01', '%Y-%m-%d')
influences_df = pd.DataFrame(
    {
        'aid':influence['aid'],
        'filerNid': influence['filerNid'],
        'electionNid': influence['electionNid'],
        'seatNid': influence.get('candidate', {}).get('seatNid', None) if influence.get('candidate', {}) else influence['seatNid'],
        'seatOfficeNid': influence['seatOfficeNid'],
        'candidateNid': influence['candidateNid'],
        'electionCandidateNid': influence['electionCandidateNid'],
        'measureNid': influence['measureNid'],
        'electionItemReferenceId': influence['electionItemReferenceId'],
        'committeeEntityId': influence['committeeEntityId'],
        'committeeName': influence['committeeName'],
        'doesSupport': influence['doesSupport'],
        'electionDate': influence['electionDate'],
        'electionCaption': influence['electionCaption'],
        'electionCodes': influence['electionCodes'],
    }
    for influence in influences['results'] if influence['aid'] in ['COAK'] and influence['candidateNid'] and not influence['candidateNid'] == '0' and influence['doesSupport'] and datetime.strptime(influence['electionDate'], '%Y-%m-%d') > cutoff
)

In [32]:
influences_df

Unnamed: 0,aid,filerNid,electionNid,seatNid,seatOfficeNid,candidateNid,electionCandidateNid,measureNid,electionItemReferenceId,committeeEntityId,committeeName,doesSupport,electionDate,electionCaption,electionCodes
0,COAK,175789537,155034677,164704809,121709611,121708068,164704811,,,175789539,"Schaaf, Libby",True,2018-11-06,11/6/2018 - General,General
1,COAK,196534785,155034677,166492969,121709771,167774163,167774166,,,196534787,Loren Taylor for City Council 2018 Officeholde...,True,2018-11-06,11/6/2018 - General,General
2,COAK,182101261,165189423,180940134,121709393,129453919,181923234,,,182101263,Re-Elect Dan Kalb Oakand City Council 2020,True,2020-11-03,11/3/2020 - General,General
3,COAK,190794094,165189423,188163037,121709381,129467329,190794022,,,190794096,Re-Elect City Attorney Barbara Parker 2020,True,2020-11-03,11/3/2020 - General,General
4,COAK,190603336,165189423,188163037,121709381,190603323,190603326,,,190603338,Elias Ferran for City Attorney 2020,True,2020-11-03,11/3/2020 - General,General
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,COAK,208542745,208083731,208083739,121710529,189891611,208542730,,,208542747,Jorge Lerma for Oakland School Board 2023,True,2023-11-07,11/7/2023 - Special,Special
133,COAK,208539270,208083731,208083739,121710529,208539261,208539263,,,208539272,"Martinez, Christian M",True,2023-11-07,11/7/2023 - Special,Special
134,COAK,208728735,208728634,208728636,121709679,208728667,208728669,,,208728737,Michael Houston for Oakland City Auditor 2024,True,2024-03-05,3/5/2024 - Special,Special
135,COAK,211486282,210326313,210326315,121709611,211486273,211486275,,,211486284,"Brown, Latanya",True,2026-11-03,11/3/2026 - General,General


In [33]:
inf_comp_df = influences_df[['filerNid','electionNid','seatNid','seatOfficeNid','candidateNid','electionCandidateNid', 'committeeName']].copy()
inf_comp_df.rename(columns={'committeeName':'filer'},inplace=True)
res_inf_merge_df = result_df.merge(inf_comp_df,how='left',on=['candidateNid', 'seatNid', 'electionNid', 'seatOfficeNid'])
res_inf_merge_df[res_inf_merge_df.duplicated(subset=['candidateNid', 'seatNid', 'electionNid'],keep=False)].sort_values('candidateNid')
forms_inf_merge_df = combined_df.merge(res_inf_merge_df,how='left',on=['filerNid'])
guess_results_df = result_df.merge(clean_guess_df,how='left',on=['candidateNid', 'seatOfficeNid','election year'])

In [34]:
clean_guess_df[clean_guess_df['candidateNid']=='208555126']

Unnamed: 0,confidence,filer,filerNid,election year,candidateNid,seatOfficeNid
231,0,"Frank, Edward C",208555135,0,208555126,


In [35]:
def filer_to_election_for_single_occurances(candidate_to_election_df, candidate_to_filer_df):
    # create a dictionary of lists where the key is the number of races a candidate id is related to
    candidate_counts = candidate_to_election_df['candidateNid'].value_counts()
    candidate_counts_dict = {}
    for i in range(1, int(candidate_counts.max())+1):
        candidate_counts_dict[i] = candidate_counts[candidate_counts == i].index.tolist()

# get candidates in 1 race
    candidate_to_election_df_case_1 = candidate_to_election_df[candidate_to_election_df['candidateNid'].isin(candidate_counts_dict[1])]

# create dictionary
    candidateNid_to_election = dict(zip(candidate_to_election_df_case_1['candidateNid'], candidate_to_election_df_case_1['seatNid']))

# create a dictionary of lists where the key is the number of races a candidate id is related to
    filer_counts = candidate_to_filer_df['candidateNid'].value_counts()
    filer_counts_dict = {}
    for i in range(1, int(filer_counts.max())+1):
        filer_counts_dict[i] = filer_counts[filer_counts == i].index.tolist()

# get filers in 1 race
    guess_df_case_1 = candidate_to_filer_df[candidate_to_filer_df['candidateNid'].isin(filer_counts_dict[1])]

# create dictionary 
    filerNid_to_candidateNid = dict(zip(guess_df_case_1['candidateNid'], guess_df_case_1['filerNid']))

# get valid keys
    shared_1 = [candidateNid for candidateNid in filer_counts_dict[1] if candidateNid in candidate_counts_dict[1]]

# put it together
    return {filerNid_to_candidateNid[candidateNid]:candidateNid_to_election[candidateNid] for candidateNid in shared_1}

filer_to_election_from_guess = filer_to_election_for_single_occurances(guess_results_df, clean_guess_df)

In [36]:
combined_df.rename(columns={'commonName':'filer'},inplace=True)
combined_df

Unnamed: 0,filerNid,issuedFilingId,filer,RegType1,SOS ID,status,datetime
1,211901697,211902005,Re-Elect Noel Gallo for Oakland City Council 2024,Candidate or Officeholder,1472382,Active,2024-08-09 17:27:56.161244+00:00
3,211868217,211868294,Harbin-Forte for City Attorney 2024,Candidate or Officeholder,Pending,Active,2024-08-05 07:00:00+00:00
4,211901697,211901733,Re-Elect Noel Gallo for Oakland City Council 2024,Candidate or Officeholder,1472382,Active,2024-08-02 07:00:00+00:00
6,210526254,211824323,Wang for Oakland City Council 2024,Candidate or Officeholder,1467592,Active,2024-07-31 07:00:00+00:00
12,151524257,211843775,Treva Reid for Oakland City Council 2020 Offic...,Candidate or Officeholder,1366759,Active,2024-08-01 04:38:34.731720+00:00
...,...,...,...,...,...,...,...
337,128869446,130355256,Eaves for City Council 2012,Candidate or Officeholder,1341150,Terminated,2011-09-02 07:00:00+00:00
338,125690881,129452253,Jane Brunner for Oakland City Attorney 2012,Candidate or Officeholder,1338543,Terminated,2011-05-25 07:00:00+00:00
339,127738921,127739027,Celia Davis,Candidate or Officeholder,1327723,Terminated,2011-05-12 07:00:00+00:00
340,125694590,129453261,Parents for Richard Fuentes for Oakland School...,Candidate or Officeholder,1337648,Terminated,2011-03-25 07:00:00+00:00


In [37]:
guesser(combined_df)

TypeError: guesser() missing 1 required positional argument: 'column'

In [None]:
# candidate id + year find crossover between the valid guesses

In [None]:
filers_response

In [None]:
inf_misses = res_inf_merge_df[res_inf_merge_df['filerNid'].isna()][['candidateNid', 'candidateName', 'seatNid', 'seatOfficeNid',
       'officeName', 'electionNid', 'isIncumbent', 'isWinner', 'election_name',
       'election year', 'category']]

In [None]:
from collections import defaultdict
filerdf = pd.DataFrame({
        'candidateName':transform_name(filer['candidateName']) if filer['candidateName'] else filer['candidateName'],
        'filerNid':filer['filerNid'],
        'filerName':filer['filerName'],
        'nameHistory':filer['nameHistory']
    }
    for filer in filers_response if filer['committeeTypes'] in [['Candidate or Officeholder'], ['Person']])

def crazy(df, filterdf):
    mergedf = df.merge(filterdf[['candidateNid', 'filerNid', 'seatNid']], 'left', ['candidateNid', 'seatNid'])
#
    merge_filter = mergedf[mergedf['filerNid'].isna()==False]
#
    done = merge_filter[['candidateNid', 'candidateName', 'seatNid', 'seatOfficeNid',
       'officeName', 'electionNid', 'isIncumbent', 'isWinner', 'election_name',
       'election year', 'category']]
#
    return pd.concat([df, done]).drop_duplicates(keep=False), done

def year_based_guess(df, filerdf):
    filerdf, valid_seatNids_for_candidateNid = prep_filer_df(df, filerdf)


#

# Initialize a defaultdict with list as the default factory
    valid_guesses = defaultdict(list)

# Loop through the elections and populate the defaultdict
    for election in elections:
        year = election['electionDate'][:4]
        election_info = [item.get('seatNid', None) for item in election['seats']]
    # Append the election info to the list for the specific year
        valid_guesses[year].extend(election_info)

# Convert the defaultdict back to a regular dict if needed
    valid_guesses = dict(valid_guesses)
#
    filerdf['seatNid'] = filerdf['key'].apply(lambda x: [seat for seat in valid_seatNids_for_candidateNid[f'{x[0]}'] if seat in valid_guesses[f'{x[1]}']])
    filerdf['seatNid_count'] = filerdf['seatNid'].apply(lambda x: len(x))
#
    filterdf = filerdf[filerdf['seatNid_count']==1].copy()
    filterdf['seatNid'] = filterdf['seatNid'].apply(lambda x: x[0])
    return filterdf

def prep_filer_df(df, filerdf):
    valid_seatNids_for_candidateNid = df.groupby('candidateNid')['seatNid'].apply(list).to_dict()
    valid_candidateNid_for_names = df.groupby('candidateName')['candidateNid'].apply(set).to_dict()

    officeholder_pattern = r'officeholder'

    filerdf['officeholder'] = filerdf['filerName'].apply(lambda x: bool(re.search(officeholder_pattern, x, re.IGNORECASE)))
    filerdf = filerdf[filerdf['officeholder']==False].copy()
    filerdf['candidateNid'] = filerdf['candidateName'].apply(lambda x: valid_candidateNid_for_names[x] if x in valid_candidateNid_for_names.keys() else set())
    filerdf['candidateNid_count'] = filerdf['candidateNid'].apply(lambda x: len(x))
    filerdf = filerdf[filerdf['candidateNid_count']==1]
    filerdf['candidateNid'] = filerdf['candidateNid'].apply(lambda x: tuple(x)[0])
    filerdf['election year'] = filerdf['filerName'].apply(extract_year).apply(int)
    filerdf = filerdf[filerdf['election year']>2010]
    filerdf = filerdf[filerdf['election year'] % 2 == 0]
    filerdf['key'] = tuple(zip(filerdf['candidateNid'],filerdf['election year']))

    filerdf = filerdf[filerdf['election year']!=0]
    return filerdf, valid_seatNids_for_candidateNid
filterdf = year_based_guess(inf_misses, filerdf)
misses_df, hits_df = crazy(inf_misses, filterdf)

In [None]:
# understand this behavior where it loops between 280 and 163 rows the theoretical behavior is supposed to be that we return result_df 
# for everything we failled to connect then with the connected rows gone there should be more cases that are easy because they have no alternative
# when I get a hit I need to take it out of the pool


In [None]:
misses_df

In [None]:
hits_df

In [None]:
def two_func(df):
    misses, hits = crazy(df, year_based_guess(df, filerdf))
    if len(df) == len(misses):
        return df
    else:
        print(len(hits))
        return new_func(misses)
def new_func(df):
    filer_df = prep_filer_df(inf_misses, filerdf)[0]
    filer_to_election = filer_to_election_for_single_occurances(df, filer_df)
    filer_df['seatNid'] = filer_df['filerNid'].apply(lambda x: filer_to_election[x] if x in filer_to_election.keys() else None)
    hits = filer_df[filer_df['seatNid'].isna()==False].copy()
    misses, hits = crazy(df, hits)
    if len(df) == len(misses):
        return df
    else:
        print(len(hits))
        return two_func(misses)
misses = new_func(inf_misses)

In [None]:
misses

In [None]:
filerdf, valid_seatNids_for_candidateNid = prep_filer_df(misses, filerdf)
# Initialize a defaultdict with list as the default factory
valid_guesses = defaultdict(list)

# Loop through the elections and populate the defaultdict
for election in elections:
    year = election['electionDate'][:4]
    election_info = [item.get('seatNid', None) for item in election['seats']]
# Append the election info to the list for the specific year
    valid_guesses[year].extend(election_info)

# Convert the defaultdict back to a regular dict if needed
valid_guesses = dict(valid_guesses)
#
filerdf['seatNid'] = filerdf['key'].apply(lambda x: [seat for seat in valid_seatNids_for_candidateNid[f'{x[0]}'] if seat in valid_guesses[f'{x[1]}']])
filerdf['seatNid_count'] = filerdf['seatNid'].apply(lambda x: len(x))

In [None]:
filerdf

In [None]:
elections

In [None]:
found = influences_df['filerNid'].unique()

# Brainstorming

In [None]:
# let's say a candidate is running in five races and has x number of filerNids how can we infer the connection?
# if len(filerNid) / len(races) == 2 
# look through filerNids in sequence looking for matches investigate in betweens
# 

In [None]:
form460

In [None]:
filerdf = pd.DataFrame({
        'candidateName': transform_name(filer['filerName']) if filer['committeeTypes'] == ['Person'] else transform_name(filer['candidateName']),
        'filerNid':filer['filerNid'],
        'filerName':filer['filerName'],
        'nameHistory':filer['nameHistory'],
        'committeeTypes':filer['committeeTypes']
    }
    for filer in filers_response if filer['committeeTypes'] in [['Candidate or Officeholder'], ['Person']]
    )
filerdf

In [None]:
final_df[final_df['candidateNid']=='121710635']

In [None]:
filer_df

In [None]:
form_type_df = filing_df.merge(pd.DataFrame(
    {
        'specificationRef':(filing['specificationRef']['name'], filing['filingMeta']['amendmentSequence']),
        'issuedFilingId':filing['filingMeta']['issuedFilingId'], 
        'originalFilingId':filing['originalFilingId'],
    }
    for filing in filings),
    'left'
)

In [None]:
forms_for_filer = form_type_df.dropna().groupby('filerNid')['specificationRef'].apply(list).to_dict()

In [None]:
filings

In [None]:
inf_misses

In [None]:
def list_filers(*querys:dict):
    """ Get all the elections
    """
    url = f'{BASE_URL}/filer/v101/filers?Limit=100000'
    params={ **PARAMS }
    for q in querys:
        params.update(q)
    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        print('ping')
        return list_filers()
    else:
        body = res.json()
        return body['results']
    
def get_filing(*querys:dict, offset=0):
    """ Get a filing
    """
    url = f'{BASE_URL}/filing/v101/filings?Limit=100000'

    params = { **PARAMS }
    for q in querys:
        params.update(q)
    if offset > 0:
        params['offset'] = offset

    res = requests.get(url, params=params, auth=AUTH)
    if res.status_code == 500:
        print('ping')
        return get_filing(offset=0)
    else:
        print(res)
        body = res.json()
        results = body.pop('results')
        return results, body

In [None]:
t = list_filers({'CandidateNid': 121710635})
t2 = get_filing({'CandidateNid': 121710635})

In [None]:
t

In [None]:
filing_activities = get_activities()
filing_activities

In [None]:
element_activities = get_element_activities(offset=0)
element_activities2 = get_element_activities(offset=100000)
element_activities3 = get_element_activities(offset=200000)

In [None]:
element_activities_candidate = get_element_activities({'elementType': 'Candidate'})
element_activities_cover = get_element_activities({'elementType': 'Cover'})

In [None]:
candidateSeat = pd.DataFrame(
    {
        'filerNid':e['element']['filerNid'],
        'isEmpty':e.get('element', {}).get('elementModel', {}).get('candidateSeat', {}).get('isEmpty', None),
        'jurisdiction':e.get('element', {}).get('elementModel', {}).get('candidateSeat', {}).get('jurisdiction', None),
        'officeSought':e.get('element', {}).get('elementModel', {}).get('candidateSeat', {}).get('officeSought', None),
        'districtNumber':e.get('element', {}).get('elementModel', {}).get('candidateSeat', {}).get('districtNumber', None),
        'isCurrentlyHeld':e.get('element', {}).get('elementModel', {}).get('candidateSeat', {}).get('isCurrentlyHeld', None),
        'jurisdictionDescription':e.get('element', {}).get('elementModel', {}).get('candidateSeat', {}).get('jurisdictionDescription', None),
        'officeSoughtDescription':e.get('element', {}).get('elementModel', {}).get('candidateSeat', {}).get('officeSoughtDescription', None),
        'filerId':e.get('element', {}).get('elementModel', {}).get('filerId', None),
        'electionDate':e.get('element', {}).get('elementModel', {}).get('electionDate', None)[:10] if e.get('element', {}).get('elementModel', {}).get('electionDate', None) else None,
        'filingDate':e.get('element', {}).get('elementModel', {}).get('filingDate', None),
        'filer':e.get('element', {}).get('elementModel', {}).get('filer', {}).get('fullNameWithPrefixAndSuffix', None),
        'candidate':e.get('element', {}).get('elementModel', {}).get('candidate', {}).get('fullNameWithPrefixAndSuffix', None),
        'statementType':e.get('element', {}).get('elementModel', {}).get('statementType', None),

    }
    for e in element_activities_cover[0] if e.get('element', {}).get('elementModel', {}).get('committeeType', None) == 'ControlledCommittee'
)

In [None]:
candidateOffice = pd.DataFrame(
    {
        'filerNid':e['element']['filerNid'],
        'isEmpty':e.get('element', {}).get('elementModel', {}).get('candidateOffice', {}).get('isEmpty', None),
        'jurisdiction':e.get('element', {}).get('elementModel', {}).get('candidateOffice', {}).get('jurisdiction', None),
        'officeSought':e.get('element', {}).get('elementModel', {}).get('candidateOffice', {}).get('officeSought', None),
        'districtNumber':e.get('element', {}).get('elementModel', {}).get('candidateOffice', {}).get('districtNumber', None),
        'isCurrentlyHeld':e.get('element', {}).get('elementModel', {}).get('candidateOffice', {}).get('isCurrentlyHeld', None),
        'jurisdictionDescription':e.get('element', {}).get('elementModel', {}).get('candidateOffice', {}).get('jurisdictionDescription', None),
        'officeSoughtDescription':e.get('element', {}).get('elementModel', {}).get('candidateOffice', {}).get('officeSoughtDescription', None),
        'filerId':e.get('element', {}).get('elementModel', {}).get('filerId', None),
        'electionDate':e.get('element', {}).get('elementModel', {}).get('electionDate', None)[:10] if e.get('element', {}).get('elementModel', {}).get('electionDate', None) else None,
        'filingDate':e.get('element', {}).get('elementModel', {}).get('filingDate', None),
        'filer':e.get('element', {}).get('elementModel', {}).get('filer', {}).get('fullNameWithPrefixAndSuffix', None),
        'candidate':e.get('element', {}).get('elementModel', {}).get('candidate', {}).get('fullNameWithPrefixAndSuffix', None),
        'statementType':e.get('element', {}).get('elementModel', {}).get('statementType', None),


    }
    for e in element_activities_candidate[0] if e.get('element', {}).get('elementModel', {}).get('committeeType', None) == 'ControlledCommittee'
)

In [None]:
candidateOfficeClean = candidateOffice.drop_duplicates().dropna(thresh=3).copy()
candidateSeatClean = candidateSeat.drop_duplicates().dropna(thresh=3).copy()

In [None]:
candidateSeatClean

In [None]:
candidateSeatClean = candidateSeatClean.fillna('placeholder')
candidateSeatCleaner = candidateSeatClean.map(lambda x: None if x == 'Unset' else x).copy()
candidateSeatCleaner = candidateSeatCleaner.map(lambda x: None if x == 'placeholder' else x).dropna(thresh=3).copy()
candidateSeatCleaner

In [None]:
candidateSeatCleaner = candidateSeatCleaner.map(lambda x: None if x == 'placeholder' else x)
candidateSeatCleanerHeld = candidateSeatCleaner[candidateSeatCleaner['isCurrentlyHeld']!='Held'].copy()

In [None]:
candidateSeatCleanerHeld

In [None]:
filerdf = pd.DataFrame({
        'candidateName':transform_name(filer['candidateName']) if filer['candidateName'] else filer['candidateName'],
        'filerNid':filer['filerNid'],
        'filerName':filer['filerName'],
        'nameHistory':filer['nameHistory']
    }
    for filer in filers_response if filer['committeeTypes'] in [['Candidate or Officeholder'], ['Person']])


In [None]:
candidateSeatCleanerHeld.columns

In [None]:
ndf=filerdf.merge(candidateSeatCleanerHeld, 'left', ['filerNid']).drop_duplicates(subset=['candidateName', 'filerNid', 'filerName'])

In [None]:
gdf = ndf.dropna(subset=['officeSought','officeSoughtDescription', 'electionDate'], how='all')
gdf

In [None]:
bdf = pd.concat([ndf,gdf]).drop_duplicates(keep=False, subset=['candidateName', 'filerNid', 'filerName', 'isEmpty',
       'jurisdiction', 'officeSought', 'districtNumber', 'isCurrentlyHeld',
       'jurisdictionDescription', 'officeSoughtDescription', 'filerId',
       'electionDate', 'filingDate', 'filer', 'candidate', 'statementType'])[['candidateName', 'filerNid', 'filerName', 'nameHistory']]

In [None]:
element_activities_cover

In [None]:
electionNidToElectionDate = pd.DataFrame(
    {
        'electionNid':e['electionNid'],
        'electionDate':e['electionDate']
    }
    for e in elections
)
electionNidToElectionDate

In [None]:
filerNidToElectionNid = candidateSeatCleanerHeld[['filerNid', 'electionDate', 'filer', 'candidate', 'officeSought', 'districtNumber', 'officeSoughtDescription']].merge(electionNidToElectionDate, 'left', ['electionDate']).drop_duplicates()

In [None]:
filerNidToElectionNid['officeSought'].unique()

In [None]:
seatOfficeNid_to_name

In [None]:
valid_seatNids_for_seatOfficeNid = result_df.groupby('seatOfficeNid')['seatNid'].apply(set).to_dict()

In [None]:
filerNidToElectionNid['districtNumber'] = filerNidToElectionNid['districtNumber'].apply(lambda x: '' if x == 0 or x == '0' else x)

In [None]:
filerNidToElectionNid

In [None]:
filerNidToElectionNid['officeSoughtDescriptionPlusD'] = filerNidToElectionNid['officeSoughtDescription'] + ' d ' + filerNidToElectionNid['districtNumber'] + ' ' + filerNidToElectionNid['officeSought']

In [None]:
filerNidToElectionNidGuess = guesser(filerNidToElectionNid.fillna(''), 'officeSoughtDescriptionPlusD') # make work

In [None]:
filerNidToElectionNidGuess

In [None]:
filerNidToElectionNid['officeSought'] = filerNidToElectionNid['officeSought'].map({
    'CityCouncilMember':'City Council District ',
    'BoardOfEducation':'OUSD District ',
    'Mayor':'Mayor',
    'CountySupervisor':'Supervisor District ',
    'CityAttorney':'City Attorney'
})

In [None]:
filerNidToElectionNid['seatOfficeNid'] = filerNidToElectionNid['officeSought'] + filerNidToElectionNid['districtNumber']

In [None]:
filerNidToElectionNid['seatOfficeNid'] = filerNidToElectionNid['seatOfficeNid'].map(name_to_id)

In [None]:
filerNidToElectionNid