In [None]:
import requests
import re
from bs4 import BeautifulSoup
import csv
import os
from enum import Enum
import time
from sympy import fibonacci

#methodology; TLDR: idempotent write op with job-id as primary key
# 1. use csv as database
# 2. load csv in cache
# 3. scrape & parse linkedin jobs
# 4. look up jobid, omit if exists in cache, add to cache and jobs_to_add array
# 5. write jobs_to_add back to csv

In [None]:
# configurations
# Guest job-search endpoint; the scraper appends the result offset after "start=".
# geoId / facetGeoRegion 103644278 is the US and facetCurrentFunction 8 is
# engineering (see the todo notes near the end of this notebook).
base_url_prefix = (
    "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
    "?keywords=software%20engineer%20OR%20engineering%20manager"
    "&location=United%2BStates"
    "&geoId=103644278"
    "&trk=public_jobs_jobs-search-bar_search-submit"
    "&start="
)
# Only referenced by the (commented-out) company staff-count scrape.
staff_url_postfix = "/people/?facetCurrentFunction=8&facetGeoRegion=103644278"
suffix_to_remove = "?trk=public_jobs_jserp-result_job-search-card-subtitle"

row_increment_default = 10  # offset step between successive result pages
max_row_default = 2000      # the scrape loop stops once the offset exceeds this
EM_AS_ENG_MULTIPLIER = 3    # factor applied to each company's 'em' posting count in the summary

csv_name = "linkedin-job-scraper-database.csv"
csv_columns = [
    'job_id',
    'company',
    'job_type',
    'title',
    'location',
    'link',
    'date',
]

class DebugLevel(Enum):
    """Console verbosity selector for the scraper.

    Levels are compared with ``==`` throughout the notebook, so they are
    mutually exclusive modes rather than cumulative thresholds.
    """
    # WARN: the scrape loop reports jobs it has already seen during this run.
    WARN = 0
    # GENERAL: debug_data prints id/title/job_type/company for every job.
    GENERAL = 1
    # GRANULAR: debug_data prints id/title/job_type/link, but only for rows
    # whose company equals `debug_company`.
    GRANULAR = 2
    
debug_level = DebugLevel.WARN
debug_company = ""  # with DebugLevel.GRANULAR, only rows for this company are traced

# Companies whose postings are skipped both when loading the CSV and when
# scraping (the scrape loop describes them as staffing companies).
ignore_list = [
    "SynergisticIT",
    "Jobs Malaysia - Two95 HR HUB",
    "Kforce Inc",
    "ICONMA",
    "Get It Recruit - Information Technology",
    "Team Remotely Inc",
    "Ampcus Inc",
    "Genesis10",
    "Intellectt Inc",
    "Stealth",
    "LanceSoft, Inc.",
    "EV.Careers",
    "Insight Global",
    "Griffin Global Systems, Inc.",
    "EVONA",
    "Steneral Consulting",
    "iHire",
    "TALENT Software Services",
    "Seasoned Recruitment",
]

# SECURITY: never paste a browser session cookie into the notebook. A LinkedIn
# cookie string carries live authentication tokens (li_at, JSESSIONID, ...), so
# anyone who can read the file can act as the logged-in user. Provide it through
# the LINKEDIN_COOKIE environment variable instead; any cookie that was ever
# saved in this file should be treated as leaked and the session signed out.
# `headers` is only needed by the authenticated (currently commented-out)
# staff-count scrape; the guest job search does not send it.
cookie_value = os.environ.get("LINKEDIN_COOKIE", "")
headers = {
    "Cookie": cookie_value
}

In [None]:
# #selenium configuration
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options

# options = Options()
# options.headless = True  # Run Chrome in headless mode
# # driver_path = "/Users/vincentchen/Documents/investment"
# driver = webdriver.Chrome(options=options)

In [None]:
# initializations: module-level state shared by the cells below
jobs = dict()                # job_id -> remaining fields; in-memory cache, never written back to csv
jobs_to_add = list()         # rows discovered this run; strictly a write buffer for the csv append
company_staff_urls = dict()  # company -> staff page url (filled only by commented-out code)

In [None]:
def load_csv_in_cache(reader):
    """Fill the module-level ``jobs`` cache from previously saved CSV rows.

    Args:
        reader: iterable of dict-like rows (e.g. a ``csv.DictReader``). Every
            row must provide a ``'job_id'`` and a ``'company'`` entry.

    Side effects:
        * ``jobs[job_id]`` is set to the row minus its ``'job_id'`` entry for
          every row whose company is not in ``ignore_list``; a later row with
          the same job_id overwrites the earlier one.
        * Prints one line per duplicated job_id and a final summary line.

    Returns:
        None
    """
    seen_ids = set()      # every job_id encountered so far (O(1) membership)
    dups_detection = []   # job_ids that appeared more than once, one entry per repeat
    for row in reader:  # reader knows first row is headers
        key = row['job_id']

        # Record the id on first sight; only a *repeat* counts as a duplicate.
        # (Previously ids were appended only when already present, so nothing
        # was ever recorded and duplicates could never be detected.)
        if key in seen_ids:
            print(f"duplicate detected: job_id={key}")
            dups_detection.append(key)
        else:
            seen_ids.add(key)

        value = {k: v for k, v in row.items() if k != 'job_id'}

        if value["company"] in ignore_list:
            continue
        jobs[key] = value

    print(f"{len(jobs)} rows read from csv; {len(dups_detection)} dups detected in csv")
    return

In [None]:
# Bootstrap the csv "database": create it with just the header row on first
# run, otherwise load the existing rows into the in-memory `jobs` cache.
if not os.path.exists(csv_name):
    with open(csv_name, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile) #this will create new file if not exist
        writer.writerow(csv_columns)
else:
    # Loading only reads, so open read-only: no write permission is required
    # and no writer is created on a handle that is never written to.
    with open(csv_name, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        load_csv_in_cache(reader)

In [None]:
def parse_job(job):
    """Extract the fields of one job-search result card.

    Args:
        job: element for a single result card; only ``job.find(name, attrs)``
            is used, and the returned elements must expose ``.text`` and
            ``.attrs`` (e.g. a BeautifulSoup ``Tag``).

    Returns:
        dict with keys ``date``, ``title``, ``job_type`` (``'em'`` when the
        title contains "manager", case-insensitively, else ``'eng'``),
        ``company``, ``location``, ``link``, ``pattern`` (the regex used for
        the id, kept for backward compatibility) and ``job_id``.

    Raises:
        AttributeError: an expected element is missing from the card.
        KeyError: an expected attribute (``datetime`` / ``href``) is missing.
        ValueError: the link's path does not end in ``-<digits>``.
    """
    # The listing date is carried by one of two alternative <time> classes.
    time_tag = job.find('time', {'class':'job-search-card__listdate'})
    if time_tag is None:
        time_tag = job.find('time', {'class':'job-search-card__listdate--new'})
    date = time_tag.attrs['datetime']

    title = job.find('h3', {'class': 'base-search-card__title'}).text.strip()
    job_type = 'em' if 'manager' in title.lower() else 'eng'
    company = job.find('a', {'class': 'hidden-nested-link'}).text.strip()
    location = job.find('span', {'class': 'job-search-card__location'}).text.strip()
    link = job.find('a', {'class': 'base-card__full-link'}).attrs['href']

    # The job id is the trailing "-<digits>" of the URL *path*. The pattern is
    # greedy, so it must not see the query string/fragment: a "-<digits>" inside
    # a tracking parameter would otherwise be picked up instead of the real id.
    pattern = r"(.*-)(\d+)"
    link_path = link.split('?', 1)[0].split('#', 1)[0]
    id_match = re.search(pattern, link_path)
    if id_match is None:
        raise ValueError(f"could not extract job id from link: {link!r}")
    job_id = id_match.group(2)
    #company_staff_link = job.find('a', {'class': 'hidden-nested-link'}).attrs['href']
    #sanitized_company_staff_link = company_staff_link[:-len(suffix_to_remove)] + staff_url_postfix
    #company_staff_urls[company] = sanitized_company_staff_link

    return {
        'date':date,
        'title':title,
        'job_type':job_type,
        'company':company,
        'location':location,
        'link':link,
        'pattern':pattern,
        'job_id':job_id,
    }

In [None]:
def debug_data(job_id,company,job_type,title,location,link,date):
    """Print a one-line trace of a job according to the global debug settings.

    GRANULAR prints id/title/job_type/link, but only when ``debug_company`` is
    non-empty and equals ``company``. GENERAL prints id/title/job_type/company
    for every job. Any other combination prints nothing. ``location`` and
    ``date`` are accepted but not printed.
    """
    company_trace_requested = (
        debug_level == DebugLevel.GRANULAR
        and debug_company != ""
        and company == debug_company
    )
    if company_trace_requested:
        print(f"id:{job_id},title:{title},job_type:{job_type},link:{link}")
        return

    if debug_level == DebugLevel.GENERAL:
        print(f"id:{job_id},title:{title},job_type:{job_type},company:{company}")

In [None]:
#look up job_id, omit if cache hit, add to cache and jobs_to_add array if cache miss
def upsert_jobs_to_add(job_id,company,job_type,title,location,link,date):
    """Register a job once: cache it and queue its row for the CSV append.

    Args:
        job_id: primary key of the job.
        company, job_type, title, location, link, date: remaining CSV fields,
            stored in the cache under the names ``csv_columns[1:7]``.

    Side effects:
        Always calls ``debug_data`` with the same arguments. On a cache miss,
        adds the job to the module-level ``jobs`` dict and appends
        ``[job_id, company, job_type, title, location, link, date]`` to
        ``jobs_to_add``. On a cache hit nothing is changed, so calling this
        twice with the same job_id cannot queue a duplicate CSV row.

    Returns:
        None
    """
    debug_data(job_id,company,job_type,title,location,link,date)

    # Cache hit: the job is already in the database or already queued this run.
    if job_id in jobs:
        return

    field_values = (company, job_type, title, location, link, date)
    jobs[job_id] = dict(zip(csv_columns[1:7], field_values))
    jobs_to_add.append([job_id,company,job_type,title,location,link,date])

In [None]:
# Page through the guest job-search endpoint, `row_increment_default` results
# at a time, and hand every unseen posting to upsert_jobs_to_add.
# Back-off: the wait before each request is fibonacci(n) seconds; n grows by
# one on HTTP 429 and shrinks by one (never below 1) after a successful page.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout requests.get can block indefinitely
n=1
i=0
# job_ids already sitting in the write buffer (e.g. from an earlier run of this
# cell); a set so that membership is checked against every buffered row.
seen_this_run = {row[0] for row in jobs_to_add}
while i<=max_row_default:
#for i in range(0,max_row_default,row_increment_default):
    base_url = f"{base_url_prefix}{i}"
    delay = int(fibonacci(n))  # plain int for time.sleep
    if debug_level is not None:
        print(base_url)
        print(f"sleep {delay} seconds")
    time.sleep(delay)
    try:
        response = requests.get(base_url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Connection error / timeout: stop, like any other non-retryable failure.
        # Everything collected so far stays in jobs_to_add.
        print(f"Failed to fetch job listings: {exc!r}")
        break

    if response.status_code == 200:
        i+=row_increment_default
        soup = BeautifulSoup(response.text, 'html.parser')
        job_listings = soup.find_all('div', {'class':'job-search-card'})
        if n > 1:
            n-=1
            print(f"decrementing n to {n}")
        for job in job_listings:
            try:
                job_fields = parse_job(job)
            except (AttributeError, KeyError, ValueError) as exc:
                # One malformed card should not abort the whole scrape.
                print(f"skipping unparseable job card: {exc!r}")
                continue
            if job_fields['company'] in ignore_list: #these are staffing companies
                continue

            job_id = job_fields['job_id']
            if job_id in seen_this_run: # exists in delta
                if debug_level==DebugLevel.WARN: #if item offset is set efficiently this printline should never show
                    print(f"we've seen {job_fields['company']}-{job_fields['job_id']}")
                continue
            if job_id in jobs: # cache hit: exists in database
                continue

            upsert_jobs_to_add(job_fields['job_id'],job_fields['company'],job_fields['job_type'],job_fields['title'],job_fields['location'],job_fields['link'],job_fields['date'])
            seen_this_run.add(job_id)
    else:
        print(f"Failed to fetch job listings: {response}")
        if response.status_code==429:
            # Rate limited: lengthen the back-off and retry the same offset.
            # Note the next iteration sleeps fibonacci(n) again before the request.
            n+=1
            print(f"retrying after {fibonacci(n)} seconds")
            time.sleep(int(fibonacci(n)))
            # i=i-row_increment_default*2
            continue
        else:
            break

In [None]:
# Flush the write buffer: append every newly discovered job row to the csv
# database, report how many rows were written, then empty the buffer.
with open(csv_name, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for new_row in jobs_to_add:
        writer.writerow(new_row)
    print(f"{len(jobs_to_add)} jobs added")

jobs_to_add = []  # reset so re-running this cell does not append the same rows again

In [None]:
import pandas as pd

# One row per cached job_id; columns are the csv fields other than job_id.
df = pd.DataFrame(jobs).T

# Per company: number of postings and the max of the date strings, by job type.
df_eng = df.loc[df['job_type'] == 'eng'].groupby(['company'])['date'].agg(['count', 'max'])
df_em = df.loc[df['job_type'] == 'em'].groupby(['company'])['date'].agg(['count', 'max'])

# Weight manager postings before combining them with engineer postings.
df_em['count'] = df_em['count'] * EM_AS_ENG_MULTIPLIER

# Outer join keeps companies that have only one job type; the missing side
# becomes 0 (columns get _x for eng and _y for em).
df2 = df_eng.merge(df_em, on='company', how='outer').fillna(0)
df2['count'] = df2['count_x'] + df2['count_y']
df2['max'] = df2.apply(lambda row: max(str(row['max_x']), str(row['max_y'])), axis=1)

# Show only the companies above the 95th percentile of the weighted count.
p95count = df2['count'].quantile(0.95)
print(f"p95count:{p95count}")
df2.loc[df2['count'] > p95count].sort_values(by='count', ascending=False)

In [None]:
#todo: scrape US engineering size, get % hiring. 
#https://www.linkedin.com/company/paypal/people/?facetCurrentFunction=8&facetGeoRegion=103644278
#https://www.linkedin.com/company/microsoft/people/?facetCurrentFunction=8&facetGeoRegion=103644278
#8 is eng 103644278 is US
#<h2 class="text-heading-xlarge">3,938 associated members</h2>
#parent is this div class="org-people__header-spacing-carousel"
#scrape this


In [None]:
# for company, url in company_staff_urls.items():
#     print(f"key:{company},value:{url}")
#     #parse number of staff
#     response = requests.get(url, headers=headers)
    
#     #PROBLEM: there is a session redirect, this URL requires authentication -> fixed with cookie
#     if response.status_code == 200:
#         try:
#             json_data = response.json()
#         except ValueError:
#             print("Invalid JSON response")
#         soup = BeautifulSoup(response.text, 'html.parser')
#         json_element = soup.find('script', {'type': 'application/json'})
#         if json_element:
#             json_data = json_element.string
#             print(json_data)
# #         test = soup.find_all('div',{'class':'org-people__header-spacing-carousel'})
# #         print(soup)
# #         for x in test:
# #             print(f"i'm here:{x}")
#     else:
#         print(response.text)
    