In [72]:
import requests
import re
from bs4 import BeautifulSoup
import csv
import os
from enum import Enum

# Methodology (TL;DR: idempotent write operation with job_id as the primary key)
# 1. Use a CSV file as the database.
# 2. Load the CSV into an in-memory cache.
# 3. Scrape and parse LinkedIn job postings.
# 4. Look up each job_id: skip it if it is already in the cache; otherwise add it to the cache and to the jobs_to_add list.
# 5. Append jobs_to_add to the CSV.

In [73]:
#configurations
# Previous search URL, kept for reference (not used by the code below):
#base_url_prefix = "https://www.linkedin.com/jobs/search?keywords=software%20engineer%20OR%20engineering%20manager&location=United%20States&pageNum=0&start="
# Search URL without the trailing offset; the scrape loop appends the row
# offset directly after "start=" to request each page.
base_url_prefix = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=software%20engineer%20OR%20engineering%20manager&location=United%2BStates&geoId=103644278&trk=public_jobs_jobs-search-bar_search-submit&start="
# Step between successive "start" offsets.
# NOTE(review): presumably matches the number of cards returned per request - confirm.
row_increment_default = 10
# Exclusive upper bound for the "start" offset; with the step above this
# allows at most max_row_default / row_increment_default requests per run.
max_row_default = 2000
# Path of the CSV file used as the persistent job database.
csv_name="linkedin-job-scraper-database.csv"
# Column order of the CSV. 'job_id' (index 0) is the cache key; the remaining
# names are the keys of each cached job dict.
csv_columns=['job_id','company','job_type','title','location','link','date']

class DebugLevel(Enum):
    """Verbosity settings for the scraper's diagnostic prints.

    WARN (0)     - report a job id that shows up twice within one run.
    GENERAL (1)  - print a summary line for every parsed job card.
    GRANULAR (2) - print details only for cards whose company equals
                   ``debug_company``.
    """
    WARN, GENERAL, GRANULAR = range(3)


# Active verbosity, and the company name the GRANULAR level filters on
# (an empty string disables GRANULAR output).
debug_level, debug_company = DebugLevel.WARN, ""

In [74]:
#initializations
# jobs: in-memory cache keyed by job_id; each value is a dict holding the
# other columns of that job. It is filled from the CSV and extended while
# scraping, and serves as the "already known" lookup.
jobs={} # updated cache, won't be written back to csv
# jobs_to_add: rows (lists in csv column order) discovered during this run
# that still have to be appended to the CSV.
jobs_to_add=[] # write-back as additions, strictly as a write buffer

In [75]:
def load_csv_in_cache(reader, cache=None):
    """Populate the job cache from CSV rows.

    Each row is stored as ``cache[job_id] = {every column except job_id}``.
    A job_id that occurs more than once is reported on stdout; the later row
    replaces the earlier one in the cache.

    Args:
        reader: Iterable of row mappings (e.g. a ``csv.DictReader``). Every
            row must contain a ``'job_id'`` key.
        cache: Dict to fill. When omitted, the notebook-level ``jobs`` cache
            is used, so existing single-argument calls keep working.

    Raises:
        KeyError: If a row has no ``'job_id'`` key.
    """
    if cache is None:
        cache = jobs  # notebook-level cache

    # A set gives O(1) membership checks; a list here made loading O(n^2).
    seen_ids = set()
    for row in reader:  # a DictReader has already consumed the header row
        key = row['job_id']

        if key in seen_ids:
            print(f"duplicate detected: job_id={key}")
        seen_ids.add(key)

        cache[key] = {k: v for k, v in row.items() if k != 'job_id'}

    print(f"{len(cache)} rows read from csv")

In [76]:
# load csv into cache
if not os.path.exists(csv_name):
    # First run: create the database file containing only the header row.
    with open(csv_name, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csv_columns)
else:
    # Loading only reads, so open read-only; the previous 'r+' mode and the
    # unused csv.writer needlessly required write access to the database.
    with open(csv_name, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)  # first row is treated as the header
        load_csv_in_cache(reader)

555 rows read from csv


In [77]:
def _extract_job_id(link):
    """Return the numeric job id at the end of a posting link's path, or None.

    The query string is dropped first so that digits inside tracking
    parameters can never be mistaken for the id.
    """
    path = link.split('?', 1)[0]
    match = re.search(r"(\d+)/?$", path)
    return match.group(1) if match else None


def _parse_job_card(job):
    """Convert one job-card element into a row in csv column order.

    Returns ``[job_id, company, job_type, title, location, link, date]``, or
    ``None`` when any required element or attribute is missing, so a single
    malformed card cannot abort the whole run.
    """
    time_tag = job.find('time', {'class': 'job-search-card__listdate'})
    if time_tag is None:
        # Some cards use the "--new" variant of the date class instead.
        time_tag = job.find('time', {'class': 'job-search-card__listdate--new'})
    title_tag = job.find('h3', {'class': 'base-search-card__title'})
    company_tag = job.find('a', {'class': 'hidden-nested-link'})
    location_tag = job.find('span', {'class': 'job-search-card__location'})
    link_tag = job.find('a', {'class': 'base-card__full-link'})

    required_tags = (time_tag, title_tag, company_tag, location_tag, link_tag)
    if any(tag is None for tag in required_tags):
        return None

    date = time_tag.get('datetime')
    link = link_tag.get('href')
    if not date or not link:
        return None

    job_id = _extract_job_id(link)
    if job_id is None:
        return None

    title = title_tag.text.strip()
    # Any title mentioning "manager" is classified as engineering management.
    job_type = 'em' if 'manager' in title.lower() else 'eng'
    company = company_tag.text.strip()
    location = location_tag.text.strip()
    return [job_id, company, job_type, title, location, link, date]


# Without a timeout, requests.get can block forever on a stalled connection.
request_timeout_seconds = 30

# Ids already waiting in the write buffer. The previous check only looked
# inside the first buffered row, so repeats within a run went unnoticed.
buffered_job_ids = {row[0] for row in jobs_to_add}

for i in range(0, max_row_default, row_increment_default):
    base_url = f"{base_url_prefix}{i}"
    try:
        response = requests.get(base_url, timeout=request_timeout_seconds)
    except requests.RequestException as error:
        # Keep what has been collected so far; the next cell can still write it.
        print(f"Failed to fetch job listings: {error}")
        break

    if response.status_code != 200:
        print("Failed to fetch job listings.")
        break

    soup = BeautifulSoup(response.text, 'html.parser')
    job_listings = soup.find_all('div', {'class': 'job-search-card'})
    for job in job_listings:
        row = _parse_job_card(job)
        if row is None:
            print(f"skipping malformed job card (start={i})")
            continue
        job_id, company, job_type, title, location, link, date = row

        #debug
        if debug_level==DebugLevel.GRANULAR and debug_company != "" and company == debug_company:
            print(f"id:{job_id},title:{title},job_type:{job_type},link:{link}")
        elif debug_level==DebugLevel.GENERAL:
            print(f"id:{job_id},title:{title},job_type:{job_type},company:{company}")

        # Look up job_id: skip on a hit, otherwise add to cache and buffer.
        # The buffer must be checked first because every buffered job is
        # also present in the cache.
        if job_id in buffered_job_ids:  # already found earlier in this run
            if debug_level==DebugLevel.WARN:  # if item offset is set efficiently this should never show
                print(f"we've seen {company}-{job_id}")
            continue
        if job_id in jobs:  # cache hit: exists in database
            continue

        jobs[job_id] = dict(zip(csv_columns[1:], row[1:]))
        jobs_to_add.append(row)
        buffered_job_ids.add(job_id)

In [78]:
# Flush the write buffer: append every newly discovered job to the CSV
# database, report how many rows were written, then empty the buffer.
added_count = len(jobs_to_add)
with open(csv_name, mode='a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for new_row in jobs_to_add:
        writer.writerow(new_row)
    print(f"{added_count} jobs added")
jobs_to_add = []  # reset

41 jobs added


In [79]:
import pandas as pd

# One row per cached job, indexed by job_id. Passing the column names
# explicitly keeps the frame well-formed when the cache is empty; the former
# pd.DataFrame(jobs).T had no columns in that case, so the groupby below
# raised KeyError: 'company'.
df = pd.DataFrame.from_dict(jobs, orient='index', columns=csv_columns[1:])

# Per company: number of postings with a date, and the latest posting date.
df2 = df.groupby(['company'])['date'].agg(['count','max'])

# Show the 20 companies with the most postings.
df2.sort_values('count',ascending=False).head(20)

Unnamed: 0_level_0,count,max
company,Unnamed: 1_level_1,Unnamed: 2_level_1
SynergisticIT,15,2024-05-20
ICONMA,11,2024-05-15
Jobs Malaysia - Two95 HR HUB,8,2024-04-26
Varo Bank,8,2024-05-10
Lockheed Martin,8,2024-05-20
Airbnb,7,2024-05-16
Microsoft,7,2024-05-21
Team Remotely Inc,6,2024-05-21
Genesis10,6,2024-05-21
Marlee (Fingerprint For Success),5,2024-05-17


In [None]:
# TODO: connect to Google Drive via its API