In [220]:
# PART A: DOWNLOAD ALL OF ELECTIONLINE WEEKLY

# import requests
# from bs4 import BeautifulSoup
# import os

# YEARS = reversed(range(2011, 2024))

# # Define user-agent to simulate a web browser request
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# }

# base_url = 'https://electionline.org'

# for year in YEARS:
#     dir_path = f"electionline-weekly/{year}"
#     os.makedirs(dir_path, exist_ok=True)
    
#     url = f"{base_url}/electionline-weekly/{year}"
    
#     # Send an HTTP GET request with headers
#     response = requests.get(url, headers=headers)

#     soup = BeautifulSoup(response.text, 'html.parser')
    
#     weeks = soup.find('ul', class_='weeks').find_all('li')
#     weeks = [f"{base_url}{week.find('a')['href']}" for week in weeks]

#     for week in weeks:
#         response = requests.get(week, headers=headers)
#         with open(f"{week.split('electionline.org/')[-1]}.html", 'w') as f:
#             f.write(response.text)


In [238]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
from urllib.parse import urlparse


pd.options.display.html.use_mathjax = False
pd.options.display.min_rows = 100
pd.options.display.max_rows = 100

YEARS = reversed(range(2011, 2024))

job_df = pd.DataFrame()

for year in YEARS:
    dir = f"electionline-weekly/{year}"
    weeks = os.listdir(dir)

    for week in weeks:
        date = week[:5]
        
        with open(os.path.join(dir, week)) as f:
            text = f.read()
        soup = BeautifulSoup(text, 'html.parser')
    
        # Find all divs with the class 'article-wrapper'
        divs_with_class = soup.find_all('div', class_='article-wrapper')
    
    
        for div in divs_with_class:
            h2_tags = div.find_all('h2', string=re.compile(r'^job', re.I))
            
            if h2_tags:
                for h2_tag in h2_tags:
                    # Find all p elements within the div containing the matched h2 tag
                    # Skip the first paragraph
                    job_paragraphs = div.find_all('p')[1:]  

                    # Skip intro and empty paragraphs
                    job_paragraphs = [para for para in job_paragraphs if (not para.text.startswith('electionlineWeekly')) and (len(para.text)>10)]    
                    
                    for paragraph in job_paragraphs:
                        # Extract job information from the paragraph
                        link = paragraph.find('a')
                        job_title = link.get_text() if link is not None else ""
                        
                        employer_match = re.search(r'\/a>[^,]*(?:,|-)\s*([^-–—]*)', str(paragraph))
                        employer = employer_match.group(1) if employer_match else ""
                        
                        # salary_match = re.search(r'Salary[^:]*:\s(.*?)(?=\.\s)', paragraph.get_text())
                        salary_match = re.search(r'Salary[^:]*:\s(.*?)(?=\.\s|Dead)', paragraph.get_text())
                        salary = salary_match.group(1) if salary_match else ""
                        
                        description = paragraph.get_text()
                        job_link = link['href'] if link is not None else ""
    
                        # Append job information to the list as a dictionary
                        new_row = pd.DataFrame({'Job Title': job_title,
                                                'Employer': employer,
                                                'Salary': salary,
                                                'Description': description,
                                                'Link': job_link,
                                                'Date': date,
                                                'Year': year}, index=[0])
                        job_df = pd.concat([job_df, new_row], ignore_index=True)

job_df = job_df.drop_duplicates(subset=['Job Title', 'Employer', 'Salary', 'Link'], keep='last')

# exclude listings from some of the top URLs belonging to private employers
excluded_domains = ['dominionvoting.com',
                   'clearballot.com',
                   'electioninnovation.org',
                   'runbeck.net',
                   'rockthevote.com',
                   'hartintercivic.com',
                   'fordfoundation.org',
                   'techandciviclife.org',
                   'bipartisanpolicy.org',
                   'cdt.org',
                   'ericstates.org',
                   'centerfortechandciviclife.recruitee.com',
                   'democracy.works',
                   'electionreformers.org',
                   'verifiedvoting.org']

def is_not_excluded_domain(url):
    netloc = urlparse(url).netloc.replace('www.', '')
    return netloc not in excluded_domains

job_df = job_df[job_df['Link'].apply(is_not_excluded_domain)]

def pay_basis(x):
    x = x.lower()
    if re.search('hr|hour', x):
        return 'hourly'
    elif re.search('month', x):
        return 'monthly'
    elif re.search('biweek', x):
        return 'biweekly'
    elif re.search('week', x):
        return 'weekly'
    else:
        return 'yearly'

job_df['Pay basis'] = job_df['Salary'].apply(pay_basis)

 # just a guess to correct the typo. It could be 110, or 101,... not sure.
job_df.loc[(job_df['Job Title']=='Elections Manager') & (job_df['Employer'] == 'Anoka County, Minnesota'), 'Salary']='$88,628-$101,878'

def clean_salary(x):
    cleaned_string = x.replace(' to ', '-').replace(' and ', '-')
    cleaned_string = cleaned_string.replace('Grade 14-', '')
    cleaned_string = re.sub(f'[^0-9\.\-–—]', '', cleaned_string) # get rid of things that aren't dashes and hyphens, periods, en, or em dashes.
    return cleaned_string.replace('–', '-').replace('—', '-') # replace en and em dashes with hyphens

job_df['Clean salary'] = job_df['Salary'].apply(clean_salary)

def split_salary(x, end='low'):
    # pattern = r'[-–—]'  # Matches hyphen, en dash, or em dash
    pattern = '-'
    ends = [part.strip().strip('$').strip('.') for part in re.split(pattern, x)]
    if end=='low':
        return ends[0]
    else:
        if len(ends) > 1:
            return ends[1]
        else:
            return None
        
job_df['Salary low end'] = pd.to_numeric(job_df['Clean salary'].apply(lambda x: split_salary(x, end='low')))
job_df['Salary high end'] = pd.to_numeric(job_df['Clean salary'].apply(lambda x: split_salary(x, end='high')))

# todo: replace K with 000. if it's under 100, assume hourly.


yearly = job_df[job_df['Pay basis']=='yearly']


job_df.sort_values('Salary high end', ascending=False)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [245]:
# check outliers
# yearly.sort_values('Salary high end', ascending=False)
yearly.sort_values('Salary low end', ascending=True)

Unnamed: 0,Job Title,Employer,Salary,Description,Link,Date,Year,Pay basis,Clean salary,Salary low end,Salary high end
3595,Senior Research Program Specialist,U.S. Election Assistance Commission,June 7,"Senior Research Program Specialist, U.S. Elect...",https://www.usajobs.gov/GetJob/ViewDetails/533...,06-06,2019,yearly,7,7.00,
4782,cblaney@monongaliacountyclerk.com,,August 10,"Voter Registration Clerk, Monongalia County, W...",mailto:cblaney@monongaliacountyclerk.com,08-09,2018,yearly,10,10.00,
2486,Senior Mapping Technician,"Wake County, North Carolina Board of Elections",Hiring Range: $17.49 – $23.60,"Senior Mapping Technician, Wake County, North ...",https://ewaketalent.csod.com/ux/ats/careersite...,08-05,2021,yearly,17.49-23.60,17.49,23.60
2637,Voter Registration Specialist,"Wake County, North Carolina Board of Elections",Hiring Range: $17.49 – $23.60,"Voter Registration Specialist, Wake County, No...",https://ewaketalent.csod.com/ux/ats/careersite...,08-26,2021,yearly,17.49-23.60,17.49,23.60
2828,Early Voting Specialist,"Wake County, North Carolina",Hiring Range: $17.49 – $23.60,"Early Voting Specialist, Wake County, North Ca...",https://ewaketalent.csod.com/ux/ats/careersite...,09-16,2021,yearly,17.49-23.60,17.49,23.60
2815,Early Voting Specialist,"Wake County, North Carolina",$17.49 – $23.60,"Early Voting Specialist, Wake County, North Ca...",https://ewaketalent.csod.com/ux/ats/careersite...,01-21,2021,yearly,17.49-23.60,17.49,23.60
2736,Staffing Specialist,"Wake County, North Carolina",Hiring Range: $17.49 – $23.60,"Staffing Specialist, Wake County, North Caroli...",https://ewaketalent.csod.com/ux/ats/careersite...,06-24,2021,yearly,17.49-23.60,17.49,23.60
2733,Inventory Control Specialist,"Wake County, North Carolina",Hiring Range: $17.49 – $23.60,"Inventory Control Specialist, Wake County, Nor...",https://ewaketalent.csod.com/ux/ats/careersite...,06-24,2021,yearly,17.49-23.60,17.49,23.60
1399,Staffing Specialist,"Wake County, North Carolina",$17.49 – $23.60,"Staffing Specialist, Wake County, North Caroli...",https://ewaketalent.csod.com/ux/ats/careersite...,02-03,2022,yearly,17.49-23.60,17.49,23.60
4272,Elections Technician I,"Larimer County, Colorado",Hiring range $17.67 – $24.74,"Elections Technician I, Larimer County, Colora...",https://careers-larimer.icims.com/jobs/3505/el...,05-23,2019,yearly,17.67-24.74,17.67,24.74


In [242]:
len(job_df)

1111

In [240]:
sum(job_df['Salary'] != '')

740

In [64]:
job_df.to_csv('jobs.csv')

In [None]:
job_df.sort_values('Salary high end', ascending=False)

In [219]:
yearly

Unnamed: 0,Job Title,Employer,Salary,Description,Link,Date,Year,Salary low end,Salary high end,Pay basis,Clean salary
22,Assistant Registrar of Voters,"Kern County, California","$120,886-$144,461","Assistant Registrar of Voters, Kern County, Ca...",https://www.kerncounty.com/Home/Components/New...,10-19,2023,120886.00,144461.00,yearly,120886-144461
25,City Secretary,"Denton, Texas","$85,260.- $136,416","City Secretary, Denton, Texas— Denton, Texas (...",https://www.governmentjobs.com/careers/bakerti...,10-19,2023,85260.00,136416.00,yearly,85260.-136416
29,Departmental Analyst,Michigan Department of State,"$57,553 – $84,115","Departmental Analyst (Data & Programs Unit), M...",https://www.governmentjobs.com/careers/michiga...,10-19,2023,57553.00,84115.00,yearly,57553-84115
31,Director of Purchasing,Chicago Board of Elections,"$100,000 – $105,000","Director of Purchasing, Chicago Board of Elect...",https://app.chicagoelections.com/Documents/gen...,10-19,2023,100000.00,105000.00,yearly,100000-105000
33,Election Outreach Administrator or Senior Outr...,"Arapahoe County, Colorado","$49,589 – $81,769",Election Outreach Administrator or Senior Outr...,https://www.governmentjobs.com/careers/arapaho...,10-19,2023,49589.00,81769.00,yearly,49589-81769
34,Elections Attorney,Iowa Secretary of State,"$68,764.- $105,872","Elections Attorney, Iowa Secretary of State– T...",https://www.governmentjobs.com/careers/iowa/jo...,10-19,2023,68764.00,105872.00,yearly,68764.-105872
148,Technology Division Leader,"Boulder County, Colorado","$61,680 – $88,836","Technology Division Leader, Boulder County, Co...",https://www.governmentjobs.com/careers/boulder...,04-20,2023,61680.00,88836.00,yearly,61680-88836
150,Assistant Director,"Utah County, Utah","$65,540.80 – $75,379.20","Assistant Director, Utah County, Utah— Under g...",https://utahcounty.wd1.myworkdayjobs.com/en-US...,08-03,2023,65540.80,75379.20,yearly,65540.80-75379.20
165,Voter Registration Specialist,Illinois State Board of Elections,"$3,750 – $5,834","Voter Registration Specialist, Illinois State ...",https://illinois.jobs2web.com/job/Springfield-...,08-03,2023,3750.00,5834.00,yearly,3750-5834
167,Voting System Specialist,Illinois State Board of Elections,"$3,750 – $5,834","Voting System Specialist, Illinois State Board...",https://illinois.jobs2web.com/job/Springfield-...,08-03,2023,3750.00,5834.00,yearly,3750-5834


In [198]:
job_df.iloc[552]['Description']

'Elections Specialist, Wisconsin Elections Commission — This position functions as part of the elections administration team and is a resource for the public on election-related laws and procedures. The Elections Specialists regularly conduct public outreach, education, training, technical assistance workshops, seminars, and certification classes. This position is also responsible for core election administration tasks, including, but not limited to review of state and federal candidate ballot access documents, ballot design and review, and canvass of election results. This positions also works in, and provides services regarding, Wisconsin’s statewide voter registration system, which is a database of voter and election information as well as a primary tool for administration of elections in the state. This position is a contact for county and municipal clerks to provide customer service, training, and guidance in the administration of elections using WisVote. Salary: $17.96 and $29.62

In [184]:
job_df

Unnamed: 0,Job Title,Employer,Salary,Description,Link,Date,Year,Salary low end,Salary high end,Pay basis,Clean salary
22,Assistant Registrar of Voters,"Kern County, California","$120,886-$144,461","Assistant Registrar of Voters, Kern County, Ca...",https://www.kerncounty.com/Home/Components/New...,10-19,2023,120886,144461,yearly,120886-144461
25,City Secretary,"Denton, Texas","$85,260.- $136,416","City Secretary, Denton, Texas— Denton, Texas (...",https://www.governmentjobs.com/careers/bakerti...,10-19,2023,85260,136416,yearly,85260.-136416
29,Departmental Analyst,Michigan Department of State,"$57,553 – $84,115","Departmental Analyst (Data & Programs Unit), M...",https://www.governmentjobs.com/careers/michiga...,10-19,2023,57553,84115,yearly,57553-84115
31,Director of Purchasing,Chicago Board of Elections,"$100,000 – $105,000","Director of Purchasing, Chicago Board of Elect...",https://app.chicagoelections.com/Documents/gen...,10-19,2023,100000,105000,yearly,100000-105000
32,Early Voting Specialist,"Ottawa County, Michigan",$27.82 – $36.18 Hourly,"Early Voting Specialist, Ottawa County, Michig...",https://www.governmentjobs.com/careers/miottaw...,10-19,2023,27.82,36.18,hourly,27.82-36.18
33,Election Outreach Administrator or Senior Outr...,"Arapahoe County, Colorado","$49,589 – $81,769",Election Outreach Administrator or Senior Outr...,https://www.governmentjobs.com/careers/arapaho...,10-19,2023,49589,81769,yearly,49589-81769
34,Elections Attorney,Iowa Secretary of State,"$68,764.- $105,872","Elections Attorney, Iowa Secretary of State– T...",https://www.governmentjobs.com/careers/iowa/jo...,10-19,2023,68764,105872,yearly,68764.-105872
114,Elections Technician or Specialist,"Larimer County, Colorado",$22.69 – $29.95/hr,"Elections Technician or Specialist, Larimer Co...",https://www.governmentjobs.com/careers/colarim...,01-19,2023,22.69,29.95,hourly,22.69-29.95
138,Election Program Supervisor,"King County, Washington",$36.67 – $46.48 Hourly,"Election Program Supervisor, King County, Wash...",https://electionline.org/wp-admin/post.php?pos...,04-20,2023,36.67,46.48,hourly,36.67-46.48
148,Technology Division Leader,"Boulder County, Colorado","$61,680 – $88,836","Technology Division Leader, Boulder County, Co...",https://www.governmentjobs.com/careers/boulder...,04-20,2023,61680,88836,yearly,61680-88836


In [138]:
pattern = r'[0-9\.\-–—\s]'
input_string = "	$27.82 – $36.18 Hourly"
cleaned_string = re.sub(f'[^{pattern}]', '', input_string)

cleaned_string

'\t$27.82 – $36.18 Hourly'

'27.82x36.18'

Unnamed: 0,Job Title,Employer,Salary,Description,Link,Date,Year,Salary low end,Salary high end
22,Assistant Registrar of Voters,"Kern County, California","$120,886-$144,461","Assistant Registrar of Voters, Kern County, Ca...",https://www.kerncounty.com/Home/Components/New...,10-19,2023,120886,144461
25,City Secretary,"Denton, Texas","$85,260.- $136,416","City Secretary, Denton, Texas— Denton, Texas (...",https://www.governmentjobs.com/careers/bakerti...,10-19,2023,85260,136416
29,Departmental Analyst,Michigan Department of State,"$57,553 – $84,115","Departmental Analyst (Data & Programs Unit), M...",https://www.governmentjobs.com/careers/michiga...,10-19,2023,57553,84115
31,Director of Purchasing,Chicago Board of Elections,"$100,000 – $105,000","Director of Purchasing, Chicago Board of Elect...",https://app.chicagoelections.com/Documents/gen...,10-19,2023,100000,105000
32,Early Voting Specialist,"Ottawa County, Michigan",$27.82 – $36.18 Hourly,"Early Voting Specialist, Ottawa County, Michig...",https://www.governmentjobs.com/careers/miottaw...,10-19,2023,27.82,36.18 Hourly
33,Election Outreach Administrator or Senior Outr...,"Arapahoe County, Colorado","$49,589 – $81,769",Election Outreach Administrator or Senior Outr...,https://www.governmentjobs.com/careers/arapaho...,10-19,2023,49589,81769
34,Elections Attorney,Iowa Secretary of State,"$68,764.- $105,872","Elections Attorney, Iowa Secretary of State– T...",https://www.governmentjobs.com/careers/iowa/jo...,10-19,2023,68764,105872
114,Elections Technician or Specialist,"Larimer County, Colorado",$22.69 – $29.95/hr,"Elections Technician or Specialist, Larimer Co...",https://www.governmentjobs.com/careers/colarim...,01-19,2023,22.69,29.95/hr
138,Election Program Supervisor,"King County, Washington",$36.67 – $46.48 Hourly,"Election Program Supervisor, King County, Wash...",https://electionline.org/wp-admin/post.php?pos...,04-20,2023,36.67,46.48 Hourly
148,Technology Division Leader,"Boulder County, Colorado","$61,680 – $88,836","Technology Division Leader, Boulder County, Co...",https://www.governmentjobs.com/careers/boulder...,04-20,2023,61680,88836
