In [1]:
import os
import time
from typing import Any, Optional

import pandas as pd
import requests
from dotenv import load_dotenv
from requests.auth import HTTPBasicAuth

In [2]:
# Load variables from a .env file (if present) before reading the API key.
load_dotenv()
# None when the variable is not set; passed to ReedAPI as the API key below.
REED_USER = os.environ.get('REED_USER')

# Search phrases of interest.
JOB_TITLES = [
    "data science",
    "data engineer",
    "data analyst",
]

In [3]:
class ReedAPI:
    """Small client for the Reed job-search endpoint.

    Requests use HTTP Basic auth with the API key as the username and an
    empty password. Results are fetched in pages of ``page_size`` using the
    ``resultsToTake`` / ``resultsToSkip`` query parameters.
    """

    def __init__(self, api_key: str, request_delay: float = 1.0, timeout: float = 30.0):
        """Store credentials and request settings.

        Args:
            api_key: API key, sent as the Basic-auth username.
            request_delay: Seconds to pause between consecutive page requests.
            timeout: Seconds to wait for each HTTP response before giving up.

        Raises:
            ValueError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            # Fail early instead of sending an unauthenticated request later.
            raise ValueError("api_key must be a non-empty string")
        self.user = api_key
        self.password = ""
        self.base_url = "https://www.reed.co.uk/api/1.0/search/"
        self.page_size = 50
        self.request_delay = request_delay
        self.timeout = timeout

    def get_all_jobs(self, job_title: str, max_pages: Optional[int] = 3) -> list[dict]:
        """Fetch job listings for ``job_title`` page by page.

        Args:
            job_title: Search keywords.
            max_pages: Upper bound on the number of pages requested, counting
                the first one. The default of 3 keeps the cap that was
                previously hard-coded for testing; pass ``None`` to fetch
                every page. The first page is always requested.

        Returns:
            The concatenated ``results`` entries of every page fetched.

        Raises:
            Exception: Propagated from ``get_jobs_partial`` on a non-200 response.
        """
        jobs, total_results = self.get_jobs_partial(job_title, self.page_size, 0)

        # Ceiling division: a total that is an exact multiple of the page size
        # must not trigger an extra request for an empty trailing page.
        total_pages = -(-total_results // self.page_size)
        if max_pages is not None:
            total_pages = min(total_pages, max_pages)

        for page in range(1, total_pages):
            if self.request_delay > 0:
                time.sleep(self.request_delay)  # TODO Check the API rate limit
            page_jobs, _ = self.get_jobs_partial(job_title, self.page_size, page)
            if not page_jobs:
                # The reported total can be stale; stop once a page comes back empty.
                break
            jobs.extend(page_jobs)
        return jobs

    def get_jobs_partial(self, job_title: str, count: int, page: int) -> tuple[list[dict], int]:
        """Fetch a single page of results.

        Args:
            job_title: Search keywords.
            count: Number of results to request (``resultsToTake``).
            page: Zero-based page index; ``page * count`` results are skipped.

        Returns:
            ``(results, total_results)``. ``results`` is an empty list and
            ``total_results`` is 0 when the response omits those fields.

        Raises:
            Exception: If the response status code is not 200.
        """
        print(f"Getting jobs for {job_title} page {page}")
        params = {
            "keywords": job_title,
            "resultsToTake": count,
            "resultsToSkip": page * count,
        }
        auth = HTTPBasicAuth(self.user, self.password)
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(self.base_url, auth=auth, params=params, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to get jobs (HTTP {response.status_code})")
        response_json = response.json()
        # Guard against missing/null fields so len() and int() cannot raise TypeError.
        results = response_json.get("results") or []
        total_results = int(response_json.get("totalResults") or 0)
        print(f"Got {len(results)} jobs for {job_title} page {page}. Total results: {total_results}")
        return (results, total_results)
    
def load(dict_list: list[dict]) -> pd.DataFrame:
    """Build a DataFrame of selected job fields from raw listing dicts.

    Args:
        dict_list: Job records as dictionaries. Keys outside the selected
            columns are ignored.

    Returns:
        A DataFrame with the selected columns in a fixed order. ``date`` is
        parsed as day/month/year; values that do not match become ``NaT``.
        Columns absent from the input are filled with missing values, and an
        empty input yields an empty frame that still has every column.
    """
    columns = [
        'jobId', 'employerId', 'employerName', 'jobTitle', 'locationName',
        'minimumSalary', 'maximumSalary', 'currency', 'date', 'applications',
        'jobUrl',
    ]
    # Selecting via ``columns=`` (rather than ``df[[...]]``) avoids a KeyError
    # when the search returned nothing or a record lacks one of the fields.
    df_selected = pd.DataFrame(dict_list, columns=columns)
    df_selected['date'] = pd.to_datetime(df_selected['date'], format="%d/%m/%Y", errors='coerce')
    return df_selected
    

In [4]:
# Build the client from the configured key and pull the "data science" listings.
reed_api = ReedAPI(api_key=REED_USER)
datascience_jobs = reed_api.get_all_jobs(job_title="data science")
# Number of listings actually retrieved.
print(len(datascience_jobs))

Getting jobs for data science page 0
Got 50 jobs for data science page 0. Total results: 1431
Getting jobs for data science page 1
Got 50 jobs for data science page 1. Total results: 1431
Getting jobs for data science page 2
Got 50 jobs for data science page 2. Total results: 1431
150


In [5]:
# Convert the raw listings into a DataFrame, report its shape, and preview the first rows.
jobs_df = load(dict_list=datascience_jobs)
print(jobs_df.shape)
jobs_df.head(n=5)

(150, 11)


Unnamed: 0,jobId,employerId,employerName,jobTitle,locationName,minimumSalary,maximumSalary,currency,date,applications,jobUrl
0,50980134,389257,Morgan McKinley,Data Science Manager,London,80000.0,80000.0,GBP,2023-08-08,14,https://www.reed.co.uk/jobs/data-science-manag...
1,50938687,383872,Rutherford Briant,Data Science Lead,CO49YA,90000.0,110000.0,GBP,2023-08-01,25,https://www.reed.co.uk/jobs/data-science-lead/...
2,50986900,575264,Reed,Data Science Lead,M405BP,735.0,735.0,GBP,2023-08-08,13,https://www.reed.co.uk/jobs/data-science-lead/...
3,50963399,331522,Harnham - Data & Analytics Recruitment,Data Science Manager,London,75000.0,85000.0,GBP,2023-08-04,14,https://www.reed.co.uk/jobs/data-science-manag...
4,51055661,331522,Harnham - Data & Analytics Recruitment,Director of Data Science,London,100000.0,115000.0,GBP,2023-08-18,6,https://www.reed.co.uk/jobs/director-of-data-s...


In [6]:
# Per-location summary: listing count, mean salary bounds, and the most recent
# posting date. Named aggregation sets the output column names directly.
transformed_df = (
    jobs_df
    .groupby('locationName', as_index=False)
    .agg(
        job_count=('jobId', 'count'),
        avg_min_salary=('minimumSalary', 'mean'),
        avg_max_salary=('maximumSalary', 'mean'),
        last_job_post=('date', 'max'),
    )
    .rename(columns={'locationName': 'location'})
)
# Round the salary averages to whole numbers.
for salary_col in ("avg_max_salary", "avg_min_salary"):
    transformed_df[salary_col] = transformed_df[salary_col].round(0)
# Show the locations with the most listings first (display only; not stored).
transformed_df.sort_values(by='job_count', ascending=False)

Unnamed: 0,location,job_count,avg_min_salary,avg_max_salary,last_job_post
41,London,34,74123.0,90233.0,2023-08-30
44,Manchester,6,52000.0,60000.0,2023-08-29
10,Cambridge,5,45000.0,56667.0,2023-08-29
40,Liverpool,4,32000.0,40000.0,2023-08-29
45,Milton Keynes,4,31867.0,35933.0,2023-08-28
...,...,...,...,...,...
35,L55AF,1,135.0,190.0,2023-08-24
36,LU29TN,1,,,2023-05-22
37,Lancaster,1,160.0,220.0,2023-08-23
1,B301HZ,1,144.0,244.0,2023-08-23


In [7]:
# Number of distinct locations in the summary.
transformed_df.shape[0]

76