In [1]:
import pandas as pd
import numpy as np
from PyPDF2 import PdfReader
from tqdm.notebook import tqdm

# Helper functions

In [3]:
def get_date_accessed(reader):
    t = reader.pages[0].extract_text()
    return t.split("M ")[-2].split(")")[-1].split(",")[0]

In [4]:
def get_officers_start_end_pages(reader):
    start = None
    end = None
    for i, p in enumerate(reader.pages):
        t = p.extract_text().replace(" ", "").replace("\n", "")
        if not start and "ALLOFFICERSANDDISBURSEMENTSTOOFFICERS" in t:
            start = i
        if start and "TotalOfficerDisbursements" in t:
            end = i
        if start and end:
            break
    return start, end

def get_employees_start_end_pages(reader):
    start = None
    end = None
    for i, p in enumerate(reader.pages):
        t = p.extract_text().replace(" ", "").replace("\n", "")
        if not start and "DISBURSEMENTSTOEMPLOYEES" in t:
            start = i
        if start and "TOTALSRECEIVEDBYEMPLOYEESMAKING$10,000ORLESS" in t:
            end = i
        if start and end:
            break
    return start, end

In [5]:
def process_employee_or_officer(emp):
    lines = emp.strip().split("\n")
    last, first = [a.replace(" ", "") for a in lines[0].split(",", 1)]
    title = lines[1].strip()
    disbursements = [int(a.replace(",", "_").replace(" ", "")) for a in lines[2].split("$")[1:]]
    salary = disbursements[-1] if len(disbursements) > 0 else None
    activities = {}
    activity_start_line = min([i for i, l in enumerate(lines) if "Schedule" in l])
    for L in " ".join(lines[activity_start_line:]).split("Schedule ")[1:]:
        if "Admin" in L and "Lobbying" in L:
            L = L.replace("Lobbying", "")
        activity = "".join(a for a in L.split(" ", 1)[1] if (not a.isdigit() and a != "%")).strip()
        if activity == "Political Activities and":
            activity = "Political Activities and Lobbying"
        pct = "".join(a for a in L.replace(" ", "")[-4:] if a.isdigit())
        if len(pct) > 0:
            pct = int(pct)
        else:
            pct = 0
        activities[activity] = pct
    try:
        assert len(activities) == 5
    except:
        print(activities)
        print(emp)
        print("")
    return dict({'Last Name': last,
            'First Name': first,
            'Title': title,
            'Salary': salary,
                }, **activities)

In [6]:
def create_officers_list(reader):
    startpage, endpage = get_officers_start_end_pages(reader)
    date_acc = get_date_accessed(reader)
    officers = "\n".join([p.extract_text().split("TOTAL\n")[1].split(f"{date_acc}")[0] for p in reader.pages[startpage:endpage+1]])
    assert(officers.count("Total") == 1)
    officers = officers.split("Total")[0]
    officer_list = officers.split("A\nB\nC")[1:]
    clean_officer_list = []
    for o in officer_list:
        if "A\nB" in o:
            o1, o2 = o.split("A\nB")
            clean_officer_list.append(o1)
            o2 = o2.replace("CNONE", "NONE")
            o2_disbursements = o2.split("$",1)[1].split("\n")[0]
            o2 = o2.replace(o2_disbursements, "")
            o2 = o2.replace("NONE", f"NONE{o2_disbursements}", 1)
            clean_officer_list.append(o2)
        else:
            clean_officer_list.append(o)
    officers_processed = [process_employee_or_officer(e) for e in clean_officer_list]
    return officers_processed

def create_employees_list(reader):
    startpage, endpage = get_employees_start_end_pages(reader)
    date_acc = get_date_accessed(reader)
    employees = "\n".join([p.extract_text().split("TOTAL\n")[1].split(f"{date_acc}")[0] for p in reader.pages[startpage:endpage+1]])
    assert(employees.count("TOTALS") == 1)
    employees = employees.split("TOTALS")[0]
    employees_list = employees.split("A\nB\nC")[1:]
    clean_employees_list = []
    for e in employees_list:
        if "A\nB" in e:
            e1, e2 = e.split("A\nB")
            clean_employees_list.append(e1)
            e2 = e2.replace("CNONE", "NONE")
            e2_disbursements = e2.split("$",1)[1].split("\n")[0]
            e2 = e2.replace(e2_disbursements, "")
            e2 = e2.replace("NONE", f"NONE{e2_disbursements}", 1)
            clean_employees_list.append(e2)
        else:
            clean_employees_list.append(e)
    employees_processed = [process_employee_or_officer(e) for e in clean_employees_list]
    return employees_processed

# UAW

In [None]:
readers = [PdfReader(f"documents/UAW LM2s/{year}.pdf") for year in range(2000, 2024)]

In [7]:
# Generate CSVs for 2005 - 2023
# Previous years are in old format that doesn't read well
officer_dfs = []
employee_dfs = []
for i, R in enumerate(tqdm(readers[5:])):
    try:
        O = pd.DataFrame(create_officers_list(R))
        E = pd.DataFrame(create_employees_list(R))
        officer_dfs.append(O)
        employee_dfs.append(E)
        O.to_csv(f"documents/UAW LM2s/{2005+i}_officers.csv")
        E.to_csv(f"documents/UAW LM2s/{2005+i}_staff.csv")
    except:
        print(2005+i)

  0%|          | 0/19 [00:00<?, ?it/s]

In [20]:
# Quality control: Make sure columns are as expected
expected_columns = ['Last Name', 'First Name', 'Title', 'Salary', 'Representational Activities', 'Political Activities and Lobbying', 'Contributions', 'General Overhead', 'Administration']
for i in range(19):
    year = 2005+i
    if list(officer_dfs[i].columns) != expected_columns:
        print(year)
    if list(employee_dfs[i].columns) != expected_columns:
        print(year)

# Teamsters

In [21]:
readers = [PdfReader(f"documents/Teamsters LM2s/{year}.pdf") for year in range(2000, 2024)]

In [None]:
# Generate CSVs for 2005 - 2023
# Previous years are in old format that doesn't read well
officer_dfs = []
employee_dfs = []
for i, R in enumerate(tqdm(readers[5:])):
    try:
        O = pd.DataFrame(create_officers_list(R))
        E = pd.DataFrame(create_employees_list(R))
        officer_dfs.append(O)
        employee_dfs.append(E)
        O.to_csv(f"documents/Teamsters LM2s/{2005+i}_officers.csv")
        E.to_csv(f"documents/Teamsters LM2s/{2005+i}_staff.csv")
    except:
        print(2005+i)

  0%|          | 0/19 [00:00<?, ?it/s]

In [23]:
# Quality control: Make sure columns are as expected
expected_columns = ['Last Name', 'First Name', 'Title', 'Salary', 'Representational Activities', 'Political Activities and Lobbying', 'Contributions', 'General Overhead', 'Administration']
for i in range(19):
    year = 2005+i
    if list(officer_dfs[i].columns) != expected_columns:
        print(year)
    if list(employee_dfs[i].columns) != expected_columns:
        print(year)

IndexError: list index out of range