In [1]:
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd
import logging
from tqdm import tqdm
import time
import datetime

In [2]:
title = "Software Developer"  
location = "Linköping"
NUMBER_OF_JOBS_TO_FETCH = 1000
SAVE_DATA = True

In [3]:
import os
os.makedirs('./results', exist_ok=True) # will dump results here

In [4]:
from collections import defaultdict
prg_skills = pd.read_csv('./skills.csv')['skills'].to_list()
prg_skills = set(prg_skills)

users_of_languages = defaultdict(int)
for l in prg_skills:
    users_of_languages[l] = 0

In [5]:
job_list = []

In [6]:
one_hot_skills = {s : [] for s in prg_skills}

In [7]:
for start in tqdm(range(0, NUMBER_OF_JOBS_TO_FETCH, 25)):
    list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={title}&location={location}&distance=25&f_TPR=&f_WT=1&start={start}"

    response = requests.get(list_url)

    list_data = response.text
    list_soup = BeautifulSoup(list_data, "html.parser")
    page_jobs = list_soup.find_all("li")
        
    id_list = []

    for job in page_jobs:
        base_card_div = job.find("div", {"class": "base-card"})
        job_id = base_card_div.get("data-entity-urn").split(":")[3]
        id_list.append(job_id)
    
    for job_id in id_list:
        job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
        
        job_response = requests.get(job_url)
        if job_response.status_code != 200:
            logging.warning(f"Failed to retrieve job posting {job_id}: Status code {job_response.status_code}")
            continue
        job_soup = BeautifulSoup(job_response.text, "html.parser")
        
        job_post = {}
        
        try:
            job_post["job_title"] = job_soup.find("h2", {"class":"top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
        except:
            job_post["job_title"] = None
            
        try:
            job_post["company_name"] = job_soup.find("a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
        except:
            job_post["company_name"] = "Unknown" 

        try: 
            job_post["location"] = job_soup.find("span", {"class": "topcard__flavor topcard__flavor--bullet"}).text.strip().split(",")[0].strip()
        except:
            job_post["location"] = None
            
        try:
            job_post["time_posted"] = job_soup.find("span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
        except:
            job_post["time_posted"] = None
            
        try:
            job_post["num_applicants"] = job_soup.find("span", {"class": "num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet"}).text.strip().replace(" applicants", "")
        except:
            job_post["num_applicants"] = 0 

        for lang in prg_skills:
            if f"{lang.lower()}" in job_soup.find("div", {"class": "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden"}).text.lower():  
                users_of_languages[lang] += 1 
                one_hot_skills[lang].append(job_post["company_name"])
        job_list.append(job_post)
        time.sleep(random.uniform(1, 3)) # Sleep between requests to avoid rate limiting 

100%|██████████| 40/40 [01:30<00:00,  2.27s/it]


## Job postings 

In [8]:
jobs_df = pd.DataFrame(job_list)
print(f"Total job postings collected: {len(jobs_df)}")
jobs_df.head(15)

Total job postings collected: 30


Unnamed: 0,job_title,company_name,location,time_posted,num_applicants
0,Developer: PC-plattformar & MCU-utveckling,Saab,Linköping,1 week ago,0
1,System Developer to Linköping!,TMC Sweden,Linköping,3 weeks ago,0
2,Testare,Saab,Linköping,6 days ago,33
3,Fullstack Javautvecklare (Hyrköp),Professional Galaxy AB,Linköping,3 weeks ago,0
4,"Software Developer, Backend - Sectra Medical",Sectra,Linköping,5 days ago,0
5,Software Engineer,Voyado,Norrköping,2 weeks ago,118
6,Utvecklare till Aircraft Simulation,Saab,Linköping,1 week ago,0
7,AI Software Developer,Knowit,Linköping,1 month ago,0
8,Algoritmutveckling / Reglerteknik / Modellbase...,Combine,Linköping,5 days ago,0
9,Frontendutvecklare,Deploja,Linköping,1 month ago,0


In [9]:
if SAVE_DATA:
    jobs_df.to_csv(f'./results/jobs_{title}_{location}_{datetime.datetime.now().strftime("%Y-%m-%d")}.csv', index=False)

In [10]:
companies_with_jobs = jobs_df['company_name'].value_counts()
companies_with_jobs.head(5).to_clipboard()

## Skills usage in job postings 

In [11]:
skill_usage = pd.DataFrame.from_dict(users_of_languages, orient='index', columns=['Number of Job Postings']).reset_index().rename(columns={'index': 'Programming Skill'})
skill_usage.drop(skill_usage[skill_usage['Number of Job Postings'] == 0].index, inplace=True)
skill_usage = skill_usage.sort_values(by='Number of Job Postings', ascending=False)
skill_usage = skill_usage.reset_index(drop=True)
skill_usage.head(20)

Unnamed: 0,Programming Skill,Number of Job Postings
0,Python,15
1,C++,14
2,Git,13
3,Java,8
4,C#,8
5,CI/CD,8
6,Azure,7
7,Ada,7
8,SQL,6
9,C/C++,5


In [12]:
skill_usage.head(10).to_clipboard()

In [13]:
if SAVE_DATA:
    skill_usage.to_csv(f'./results/skills_{title}_{location}_{datetime.datetime.now().strftime("%Y-%m-%d")}.csv', index=False)

## One-hot encoding of skills by company

In [14]:
one_hot_df = pd.DataFrame.from_dict(one_hot_skills, orient='index').transpose()
one_hot_df.head(10)

Unnamed: 0,Dart,C++,Scrum,debian,Ada,Smalltalk,Python,Linux,GraphQL,SpringBoot,...,PyTroch,JRuby,Blazor,Docker,SwiftUI,CSS,Java,C/C++,.NET,Flask
0,Syntronic - A Global Design House,Saab,Combitech Sverige,,Saab,,Saab,Saab,Knowit,,...,,,,Knowit,,,Professional Galaxy AB,Combine,Voyado,
1,FOI,Sectra,Sectra,,Voyado,,Saab,Sectra,Knowit,,...,,,,Syntronic - A Global Design House,,,Voyado,Professional Galaxy AB,Combitech Sverige,
2,,Saab,,,Saab,,Knowit,,,,...,,,,Knowit,,,Sweco,CO-WORKER TECHNOLOGY AB,,
3,,Combine,,,Saab,,Combine,,,,...,,,,,,,Syntronic - A Global Design House,FOI,,
4,,Saab,,,Professional Galaxy AB,,Saab,,,,...,,,,,,,FOI,Combitech Sverige,,
5,,Professional Galaxy AB,,,CO-WORKER TECHNOLOGY AB,,Professional Galaxy AB,,,,...,,,,,,,Combitech Sverige,,,
6,,Syntronic - A Global Design House,,,FOI,,Syntronic - A Global Design House,,,,...,,,,,,,Ictech,,,
7,,ALTEN Sweden,,,,,Knowit,,,,...,,,,,,,Linköping University,,,
8,,Sectra,,,,,ALTEN Sweden,,,,...,,,,,,,,,,
9,,CO-WORKER TECHNOLOGY AB,,,,,Qualcomm,,,,...,,,,,,,,,,


In [15]:
if SAVE_DATA:
    one_hot_df.to_csv(f'./results/one_hot_skills_{title}_{location}_{datetime.datetime.now().strftime("%Y-%m-%d")}.csv', index=False)

In [16]:
companies_used_skill = {}
for skill, companies in one_hot_skills.items():
    companies_used_skill[skill] = [len(set([c for c in companies if c is not None]))]

companies_used_skill_df = pd.DataFrame.from_dict(companies_used_skill, orient='index', columns=['Number of Companies']).reset_index()
companies_used_skill_df = companies_used_skill_df.sort_values(by='Number of Companies', ascending=False).reset_index(drop=True)
companies_used_skill_df.head(20)


Unnamed: 0,index,Number of Companies
0,Python,12
1,C++,10
2,Git,10
3,Java,8
4,C#,6
5,CI/CD,6
6,Ada,5
7,C/C++,5
8,Azure,4
9,SQL,4


In [17]:
if SAVE_DATA:
    companies_used_skill_df.to_csv(f'./results/skill_usage_{title}_{location}_{datetime.datetime.now().strftime("%Y-%m-%d")}.csv', index=False)