In [2]:
import requests
import pandas as pd

In [3]:
def get_ads(url_for_search, params, timeout=30):
    """Send a GET request and return the decoded JSON body of the response.

    Parameters
    ----------
    url_for_search : str
        URL to request.
    params : dict
        Query-string parameters passed on to ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (forwarded to
        ``requests.get``). Defaults to 30 so a stalled connection cannot
        block the notebook forever.

    Returns
    -------
    object
        Whatever ``response.json()`` decodes from the response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If no answer arrives within ``timeout``.
    """
    response = requests.get(url_for_search, params=params, timeout=timeout)
    # Fail loudly on an error status instead of handing an error payload
    # back to the caller as if it were search results.
    response.raise_for_status()
    return response.json()

##### define url

In [4]:
# Base address of the API; the search endpoint is that address plus "/search".
url = "https://jobsearch.api.jobtechdev.se"
url_for_search = url + "/search"

##### define common query parameters for all pages

In [5]:
# Query parameters that stay the same for every page request.
params = {
    # number of ads requested per call
    "limit": 100,
    # NOTE(review): opaque occupation-field id — confirm which field it denotes
    "occupation-field": "6Hq3_tKo_V57",
}

##### define specific query parameters for the first page

In [7]:
# First page: start at the very first result and keep the shared parameters.
offset = 0
page_params = {**params, "offset": offset}

##### extract data for the first page

In [8]:
# Fetch the first page; `data` holds the decoded JSON response.
data = get_ads(url_for_search, page_params)

In [11]:
# Check which container type holds the ads in the response.
type(data["hits"])

list

In [12]:
# How many ads came back for this page.
len(data["hits"])

100

In [13]:
# Build a table from the hits: each ad dict becomes one row.
data_page_one = pd.DataFrame(data["hits"])
# Preview the first rows.
data_page_one.head()

Unnamed: 0,relevance,id,external_id,original_id,label,webpage_url,logo_url,headline,application_deadline,number_of_vacancies,...,workplace_address,must_have,nice_to_have,application_contacts,publication_date,last_publication_date,removed,removed_date,source_type,timestamp
0,1.0,29672126,,,[],https://arbetsformedlingen.se/platsbanken/anno...,https://arbetsformedlingen.se/rest/agas/api/v1...,SSAB - Eltekniker/Planerare Elunderhåll,2025-05-31T23:59:59,1,...,"{'municipality': 'Luleå', 'municipality_code':...","{'skills': [], 'languages': [], 'work_experien...","{'skills': [], 'languages': [], 'work_experien...",[],2025-05-02T09:25:24,2025-05-31T23:59:59,False,,VIA_ANNONSERA,1746170724657
1,1.0,29672142,,,[],https://arbetsformedlingen.se/platsbanken/anno...,https://arbetsformedlingen.se/rest/agas/api/v1...,Continuous Improvement (CI) & Process Engineer,2025-05-13T23:59:59,1,...,"{'municipality': 'Upplands Väsby', 'municipali...","{'skills': [], 'languages': [], 'work_experien...","{'skills': [], 'languages': [], 'work_experien...",[],2025-05-02T09:20:14,2025-05-13T23:59:59,False,,VIA_ANNONSERA,1746170414637
2,1.0,29672143,46-559384-1058-5883826-1,,[],https://arbetsformedlingen.se/platsbanken/anno...,,Junior Konsult inom Processutveckling och Proj...,2025-06-19T23:59:59,1,...,"{'municipality': 'Uppsala', 'municipality_code...","{'skills': [], 'languages': [], 'work_experien...","{'skills': [], 'languages': [], 'work_experien...","[{'name': None, 'description': 'Oliver Rydberg...",2025-05-02T09:19:45,2025-06-19T23:59:59,False,,VIA_PLATSBANKEN_DXA,1746170385277
3,1.0,29672117,46-556902-6767-421812681471dfd5496,,[],https://arbetsformedlingen.se/platsbanken/anno...,https://arbetsformedlingen.se/rest/agas/api/v1...,Terri söker erfaren konstruktör - Forma framti...,2025-05-25T23:59:59,1,...,"{'municipality': 'Växjö', 'municipality_code':...","{'skills': [], 'languages': [], 'work_experien...","{'skills': [], 'languages': [], 'work_experien...","[{'name': None, 'description': 'Linnéa Furudah...",2025-05-02T09:19:06,2025-05-25T23:59:59,False,,VIA_PLATSBANKEN_DXA,1746170346534
4,1.0,29672116,46-556902-6767-421813681471c73e0b0,,[],https://arbetsformedlingen.se/platsbanken/anno...,https://arbetsformedlingen.se/rest/agas/api/v1...,Terri söker erfaren konstruktör - Forma framti...,2025-05-25T23:59:59,1,...,"{'municipality': 'Älmhult', 'municipality_code...","{'skills': [], 'languages': [], 'work_experien...","{'skills': [], 'languages': [], 'work_experien...","[{'name': None, 'description': 'Linnéa Furudah...",2025-05-02T09:18:36,2025-05-25T23:59:59,False,,VIA_PLATSBANKEN_DXA,1746170316280


In [15]:
# Shape of the array of distinct ad ids on page one (i.e. how many unique ids there are).
data_page_one["id"].unique().shape

(100,)

##### define specific query parameters for second page

In [16]:
# Second page: skip the 100 results already fetched, keep the shared parameters.
offset = 100
page_params = {**params, "offset": offset}

##### extract data for second page

In [17]:
# Fetch the second page; `data` is rebound to this new response.
data = get_ads(url_for_search, page_params)

In [18]:
# Table of the second page's hits, one row per ad dict.
data_page_two = pd.DataFrame(data["hits"])

In [19]:
# Shape of the array of distinct ad ids on page two.
data_page_two["id"].unique().shape

(100,)

##### check if page one and two are all unique job ads

In [20]:
# Stack both pages; ignore_index=True gives the result a fresh 0..n-1 index
# instead of repeating each page's own row labels.
data_combined = pd.concat([data_page_one, data_page_two],ignore_index=True)

In [21]:
# If no ad appears on both pages, the number of distinct ids equals the number of rows.
data_combined["id"].unique().shape

(200,)

##### generator function called jobsearch_resource()
- it uses a while loop to extract data page by page
- in each iteration, it fetches one page of data and yields that page's job ads one at a time
- when we call this generator function, we get a generator object -> the generator object remembers where it paused (its local state, e.g. the current offset), not the ads it has already yielded
- data from a generator object can only be consumed ONCE; afterwards the generator is exhausted and yields nothing more

In [None]:
def jobsearch_resource(params):
    """Yield job ads from the JobSearch ``/search`` endpoint, one ad at a time.

    Pages through the results by raising ``offset`` in steps of ``limit``.
    Paging ends at the first empty page, at the first page that is not full,
    or once the offset has passed the hard-coded cap.

    Parameters
    ----------
    params : dict
        Query parameters shared by every page request. ``"limit"`` is the
        page size; when it is missing, 100 is used and sent to the server.
        An ``"offset"`` entry, if present, is overridden. The dict itself
        is not modified.

    Yields
    ------
    object
        Each element of the ``"hits"`` list of every fetched page, in order.
    """
    url = "https://jobsearch.api.jobtechdev.se"
    url_for_search = f"{url}/search"
    limit = params.get("limit", 100)
    # NOTE(review): presumably mirrors the API's ceiling on `offset` — confirm.
    last_paged_offset = 1900
    offset = 0

    while True:
        # Always send `limit` explicitly. If the caller omitted it and it were
        # left out of the request, the server would apply its own default page
        # size (NOTE(review): 10 according to the public API docs — confirm),
        # the "page is not full" check below would compare against 100, and
        # paging would stop after the first page.
        page_params = dict(params, limit=limit, offset=offset)

        data = get_ads(url_for_search, page_params)
        hits = data.get("hits", [])

        # no data for this page: exit here
        if not hits:
            break

        yield from hits

        # the page is not full (or the offset cap was passed): exit here
        if len(hits) < limit or offset > last_paged_offset:
            break

        # move on to the next page
        offset += limit

##### illustration of what a generator function can remember when there is a loop inside the function

In [22]:
# create a simple generator function with a loop inside

def count_up_to(n):
    """Yield 0, 1, 2, ... for as long as the value is smaller than ``n``."""
    current = 0
    while True:
        # Bound reached: end the generator.
        if not current < n:
            return
        yield current
        current += 1

In [23]:
# create one generator object by calling the generator function
# (nothing is yielded yet; values are produced only when requested)
gen = count_up_to(3)

In [24]:
# begin consuming values from the generator object: request the first one
next(gen)

0

In [25]:
# continue consuming the remaining values from the same generator object;
# it resumes where it paused, so the first value is not produced again

for nr in gen:
    print(nr)

1
2


In [None]:
# after all values have been consumed ONCE, the generator is exhausted:
# this loop body never runs and nothing is printed

for nr in gen:
    print(nr)

In [27]:
# test: create one generator object from jobsearch_resource() and read the first job; after that, the first job can no longer be seen