In [220]:
# PART A: DOWNLOAD ALL OF ELECTIONLINE WEEKLY

# import requests
# from bs4 import BeautifulSoup
# import os

# YEARS = reversed(range(2011, 2024))

# # Define user-agent to simulate a web browser request
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# }

# base_url = 'https://electionline.org'

# for year in YEARS:
#     dir_path = f"electionline-weekly/{year}"
#     os.makedirs(dir_path, exist_ok=True)
    
#     url = f"{base_url}/electionline-weekly/{year}"
    
#     # Send an HTTP GET request with headers
#     response = requests.get(url, headers=headers)

#     soup = BeautifulSoup(response.text, 'html.parser')
    
#     weeks = soup.find('ul', class_='weeks').find_all('li')
#     weeks = [f"{base_url}{week.find('a')['href']}" for week in weeks]

#     for week in weeks:
#         response = requests.get(week, headers=headers)
#         with open(f"{week.split('electionline.org/')[-1]}.html", 'w') as f:
#             f.write(response.text)


In [45]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import os

pd.options.display.html.use_mathjax = False
pd.options.display.min_rows = 100
pd.options.display.max_rows = 100

YEARS = reversed(range(2011, 2024))

job_df = pd.DataFrame()

for year in YEARS:
    dir = f"electionline-weekly/{year}"
    weeks = os.listdir(dir)

    for week in weeks:
        date = week[:5]
        
        with open(os.path.join(dir, week)) as f:
            text = f.read()
        soup = BeautifulSoup(text, 'html.parser')
    
        # Find all divs with the class 'article-wrapper'
        divs_with_class = soup.find_all('div', class_='article-wrapper')
    
    
        for div in divs_with_class:
            h2_tags = div.find_all('h2', string=re.compile(r'^job', re.I))
            
            if h2_tags:
                for h2_tag in h2_tags:
                    # Find all p elements within the div containing the matched h2 tag
                    # Skip the first paragraph
                    job_paragraphs = div.find_all('p')[1:]  

                    # Skip intro and empty paragraphs
                    job_paragraphs = [para for para in job_paragraphs if (not para.text.startswith('electionlineWeekly')) and (len(para.text)>10)]    
                    
                    for paragraph in job_paragraphs:
                        # Extract job information from the paragraph
                        link = paragraph.find('a')
                        job_title = link.get_text() if link is not None else ""
                        employer_match = re.search(r'\/a>[^,]*(?:,|-)\s*([^-–—]*)', str(paragraph))
                        employer = employer_match.group(1) if employer_match else ""
                        salary_match = re.search(r'Salary[^:]*:\s(.*?)(?=\.\s)', paragraph.get_text())
                        salary = salary_match.group(1) if salary_match else ""
                        description = paragraph.get_text()
                        job_link = link['href'] if link is not None else ""
    
                        # Append job information to the list as a dictionary
                        new_row = pd.DataFrame({'Job Title': job_title,
                                                'Employer': employer,
                                                'Salary': salary,
                                                'Description': description,
                                                'Link': job_link,
                                                'Date': date,
                                                'Year': year}, index=[0])
                        job_df = pd.concat([job_df, new_row], ignore_index=True)

job_df = job_df.drop_duplicates(subset=['Job Title', 'Employer', 'Salary', 'Link'], keep='last')

In [67]:
job_df

Unnamed: 0,Job Title,Employer,Salary,Description,Link,Date,Year
22,Assistant Registrar of Voters,"Kern County, California","$120,886-$144,461","Assistant Registrar of Voters, Kern County, Ca...",https://www.kerncounty.com/Home/Components/New...,10-19,2023
25,City Secretary,"Denton, Texas","$85,260.- $136,416","City Secretary, Denton, Texas— Denton, Texas (...",https://www.governmentjobs.com/careers/bakerti...,10-19,2023
29,Departmental Analyst,Michigan Department of State,"$57,553 – $84,115","Departmental Analyst (Data & Programs Unit), M...",https://www.governmentjobs.com/careers/michiga...,10-19,2023
31,Director of Purchasing,Chicago Board of Elections,"$100,000 – $105,000","Director of Purchasing, Chicago Board of Elect...",https://app.chicagoelections.com/Documents/gen...,10-19,2023
32,Early Voting Specialist,"Ottawa County, Michigan",$27.82 – $36.18 Hourly,"Early Voting Specialist, Ottawa County, Michig...",https://www.governmentjobs.com/careers/miottaw...,10-19,2023
33,Election Outreach Administrator or Senior Outr...,"Arapahoe County, Colorado","$49,589 – $81,769",Election Outreach Administrator or Senior Outr...,https://www.governmentjobs.com/careers/arapaho...,10-19,2023
34,Elections Attorney,Iowa Secretary of State,"$68,764.- $105,872","Elections Attorney, Iowa Secretary of State– T...",https://www.governmentjobs.com/careers/iowa/jo...,10-19,2023
114,Elections Technician or Specialist,"Larimer County, Colorado",$22.69 – $29.95/hr,"Elections Technician or Specialist, Larimer Co...",https://www.governmentjobs.com/careers/colarim...,01-19,2023
138,Election Program Supervisor,"King County, Washington",$36.67 – $46.48 Hourly,"Election Program Supervisor, King County, Wash...",https://electionline.org/wp-admin/post.php?pos...,04-20,2023
148,Technology Division Leader,"Boulder County, Colorado","$61,680 – $88,836","Technology Division Leader, Boulder County, Co...",https://www.governmentjobs.com/careers/boulder...,04-20,2023


In [53]:
len(job_df)

1444

In [63]:
sum(job_df['Salary'] != '')

893

In [64]:
job_df.to_csv('jobs.csv')