In [20]:
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd
import logging
from tqdm import tqdm
import time
import datetime

In [21]:
title = "Software Developer"  
location = "San Francisco"
NUMBER_OF_JOBS_TO_FETCH = 1000
SAVE_DATA = True

In [22]:
import os
os.makedirs('./results', exist_ok=True) # will dump results here

In [23]:
from collections import defaultdict
prg_skills = pd.read_csv('./skills.csv')['skills'].to_list()
prg_skills = set(prg_skills)

users_of_languages = defaultdict(int)
for l in prg_skills:
    users_of_languages[l] = 0

In [24]:
job_list = []

In [25]:
one_hot_skills = {s : [] for s in prg_skills}

In [26]:
for start in tqdm(range(0, NUMBER_OF_JOBS_TO_FETCH, 25)):
    list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={title}&location={location}&distance=25&f_TPR=&f_WT=1&start={start}"

    response = requests.get(list_url)

    list_data = response.text
    list_soup = BeautifulSoup(list_data, "html.parser")
    page_jobs = list_soup.find_all("li")
        
    id_list = []

    for job in page_jobs:
        base_card_div = job.find("div", {"class": "base-card"})
        job_id = base_card_div.get("data-entity-urn").split(":")[3]
        id_list.append(job_id)
    
    for job_id in id_list:
        job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
        
        job_response = requests.get(job_url)
        if job_response.status_code != 200:
            logging.warning(f"Failed to retrieve job posting {job_id}: Status code {job_response.status_code}")
            continue
        job_soup = BeautifulSoup(job_response.text, "html.parser")
        
        job_post = {}
        
        try:
            job_post["job_title"] = job_soup.find("h2", {"class":"top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
        except:
            job_post["job_title"] = None
            
        try:
            job_post["company_name"] = job_soup.find("a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
        except:
            job_post["company_name"] = "Unknown" 

        try: 
            job_post["location"] = job_soup.find("span", {"class": "topcard__flavor topcard__flavor--bullet"}).text.strip().split(",")[0].strip()
        except:
            job_post["location"] = None
            
        try:
            job_post["time_posted"] = job_soup.find("span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
        except:
            job_post["time_posted"] = None
            
        try:
            job_post["num_applicants"] = job_soup.find("span", {"class": "num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet"}).text.strip().replace(" applicants", "")
        except:
            job_post["num_applicants"] = 0 

        for lang in prg_skills:
            if f"{lang.lower()}" in job_soup.find("div", {"class": "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden"}).text.lower():  
                users_of_languages[lang] += 1 
                one_hot_skills[lang].append(job_post["company_name"])
        job_list.append(job_post)
        time.sleep(random.uniform(1, 3)) # Sleep between requests to avoid rate limiting 

 90%|█████████ | 36/40 [46:28<05:09, 77.45s/it]   


AttributeError: 'NoneType' object has no attribute 'get'

## Job postings 

In [27]:
jobs_df = pd.DataFrame(job_list)
print(f"Total job postings collected: {len(jobs_df)}")
jobs_df.head(15)

Total job postings collected: 359


Unnamed: 0,job_title,company_name,location,time_posted,num_applicants
0,"Software Engineer, New Grad",Stripe,San Francisco,2 weeks ago,0
1,"Software Engineer I, Frontend",Twitch,San Francisco,1 day ago,0
2,Full-Stack Engineer,Campfire,San Francisco,6 months ago,0
3,Software Engineer (New Grads),MLabs,San Francisco,1 week ago,100
4,"Software Engineering, Frontend (Slack - Multip...",Slack,San Francisco,2 days ago,172
5,Software Engineer I / II,Giga,San Francisco,1 week ago,0
6,Full Stack Software Engineer,Alex AI,San Francisco,3 days ago,0
7,Software Engineer,Nudge,San Francisco,4 weeks ago,0
8,"Software Engineer, New Grad",Eventual,San Francisco,4 weeks ago,0
9,"Software Engineering, New Grad",Eventual,San Francisco,2 months ago,0


In [28]:
if SAVE_DATA:
    jobs_df.to_csv(f'./results/jobs_{title}_{location}_{datetime.datetime.now().strftime("%Y-%m-%d")}.csv', index=False)

In [29]:
companies_with_jobs = jobs_df['company_name'].value_counts()
companies_with_jobs.head(5).to_clipboard()

## Skills usage in job postings 

In [36]:
skill_usage = pd.DataFrame.from_dict(users_of_languages, orient='index', columns=['Number of Job Postings']).reset_index().rename(columns={'index': 'Programming Skill'})
skill_usage.drop(skill_usage[skill_usage['Number of Job Postings'] == 0].index, inplace=True)
skill_usage = skill_usage.sort_values(by='Number of Job Postings', ascending=False)
skill_usage = skill_usage.reset_index(drop=True)
skill_usage.head(20)

Unnamed: 0,Programming Skill,Number of Job Postings
0,Python,161
1,AWS,151
2,React,144
3,TypeScript,126
4,Rust,125
5,Java,95
6,Lua,76
7,SQL,74
8,JavaScript,72
9,Git,66


In [37]:
skill_usage.head(10).to_clipboard()

In [30]:
if SAVE_DATA:
    skill_usage.to_csv(f'./results/skills_{title}_{location}_{datetime.datetime.now().strftime("%Y-%m-%d")}.csv', index=False)

## One-hot encoding of skills by company

In [31]:
one_hot_df = pd.DataFrame.from_dict(one_hot_skills, orient='index').transpose()
one_hot_df.head(10)

Unnamed: 0,Erlang,SpringBoot,MATLAB,React,Agda,Dart,Python,NoSQL,.NET,WebAssembly,...,Elm,C++,SQL,Swift,Django,PHP,JRuby,Razor,Java,OpenCV
0,,,,Twitch,,,Campfire,Emerald AI,,,...,Emerald AI,Stripe,MLabs,Twitch,Virtue AI,James Chase,,,Stripe,Boon
1,,,,Campfire,,,Alex AI,Clerkie,,,...,Simple AI,Google,Collate,Divine,Outset,Postman Corp,,,Slack,
2,,,,Slack,,,Nudge,Poshmark,,,...,Latent,Meta,Virtue AI,Weekday AI (YC W21),Weekday AI (YC W21),Meta,,,Alex AI,
3,,,,Alex AI,,,Eventual,Falconer,,,...,Freshworks,Kodiak,MLabs,IXL Learning,Until,Postman,,,GlossGenius,
4,,,,FurtherAI,,,Eventual,Cambly Inc.,,,...,Emerald AI,Snap Inc.,BetterBasket,Fluency,Catalyst Labs,Quizlet,,,"Observe, Inc.",
5,,,,GlossGenius,,,Uber,Snap Inc.,,,...,,OpenAI,Emerald AI,Verkada,The Mom Project,Slack,,,Uber,
6,,,,Delve,,,Collate,ExecutivePlacements.com,,,...,,ExecutivePlacements.com,HeyGen,ExecutivePlacements.com,Artos AI,,,,Replit,
7,,,,"Observe, Inc.",,,Replit,"Checkr, Inc.",,,...,,Impulse,Peppr AI (YC W25),Meta,,,,,Maxima,
8,,,,Replo,,,Virtue AI,Weekday AI (YC W21),,,...,,Meta,Clerkie,Meta,,,,,BetterBasket,
9,,,,Unwrap,,,OpenAI,Treinetic,,,...,,Verkada,Poshmark,Weekday AI (YC W21),,,,,Emerald AI,


In [32]:
if SAVE_DATA:
    one_hot_df.to_csv(f'./results/one_hot_skills_{title}_{location}_{datetime.datetime.now().strftime("%Y-%m-%d")}.csv', index=False)

In [33]:
companies_used_skill = {}
for skill, companies in one_hot_skills.items():
    companies_used_skill[skill] = [len(set([c for c in companies if c is not None]))]

companies_used_skill_df = pd.DataFrame.from_dict(companies_used_skill, orient='index', columns=['Number of Companies']).reset_index()
companies_used_skill_df = companies_used_skill_df.sort_values(by='Number of Companies', ascending=False).reset_index(drop=True)
companies_used_skill_df.head(20)


Unnamed: 0,index,Number of Companies
0,React,124
1,Python,122
2,TypeScript,110
3,AWS,101
4,Rust,74
5,Java,69
6,SQL,63
7,Lua,62
8,JavaScript,56
9,Ada,50


In [34]:
if SAVE_DATA:
    companies_used_skill_df.to_csv(f'./results/skill_usage_{title}_{location}_{datetime.datetime.now().strftime("%Y-%m-%d")}.csv', index=False)