In [7]:
import json
import os
import requests
from datetime import datetime
import math
from loguru import logger
import pandas as pd

In [10]:
ADZUNA_APP_ID = os.getenv('80393870')
ADZUNA_APP_KEY = os.getenv('875441c752735976f76161157548e0f0')

# Define the API endpoint and base parameters
url = "https://api.adzuna.com/v1/api/jobs/ca/search/"
base_params = {
    'app_id': ADZUNA_APP_ID,
    'app_key': ADZUNA_APP_KEY,
    'results_per_page': 50,  # Maximum allowed results per page
    'what_phrase': "data engineer", # an entire phrase which must be found in the description or title
    'max_days_old': 2,
    'sort_by': "date"
}

# Initialize a list to store all job postings
all_job_postings = []

# Make the first request to determine the total number of pages
logger.info("Making the first request to determine the total number of pages")
response = requests.get(f"{url}1", params=base_params)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()  # Parse the JSON response
    total_results = data['count']  # Get the total number of results
    results_per_page = base_params['results_per_page']
    
    # Calculate the total number of pages
    total_pages = math.ceil(total_results / results_per_page)
    logger.info(f"Total number of page = {total_pages}")
    
    # Store the results from the first page
    all_job_postings.extend(data['results'])

    # Loop through the remaining pages and request data from each
    logger.info("Looping through the remaining pages and request data from each")
    for page in range(2, total_pages + 1):  # Start from page 2 to total_pages
        response = requests.get(f"{url}{page}", params=base_params)
        
        # Check if the request was successful
        if response.status_code == 200:
            page_data = response.json()
            # Append job postings from this page to the list
            all_job_postings.extend(page_data['results'])
        else:
            logger.error(f"Error fetching page {page}: {response.status_code}, {response.text}")
else:
    logger.error(f"Error: {response.status_code}, {response.text}")

# Now all_job_postings contains data from all pages
logger.info(f"Total jobs retrieved: {len(all_job_postings)}")


# Transformation: picking up only necessary fields
parsed_jobs = []
for job in all_job_postings:
      parsed_jobs.append(
        dict(
            job_id = job['id'],
            job_title = job['title'],
            job_location = job['location']['display_name'],
            job_company = job['company']['display_name'],
            job_category = job['category']['label'],
            job_description = job['description'],
            job_url = job['redirect_url'],
            job_created = job['created']
        )
      )

jobs_df = pd.DataFrame.from_dict(parsed_jobs)
# jobs_df['job_created'] = pd.to_datetime(jobs_df['job_created'])

logger.info("Done")

[32m2024-09-29 19:25:31.103[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mMaking the first request to determine the total number of pages[0m
[32m2024-09-29 19:25:31.238[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m48[0m - [31m[1mError: 400, <!DOCTYPE html>
<html>
<!-- This file is managed by Chef -->
    <head>
        <meta charset="UTF-8">
        <title>Uh oh, something isn't right</title>
        <meta name="HandheldFriendly" content="True">
        <meta name="MobileOptimized" content="320">
        <meta name="format-detection" content="telephone=no">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta http-equiv="cleartype" content="on">
        <style type="text/css">
            html { margin:0; padding:0; }
            body {
                margin:0; padding:20px;
                font-family:Arial, "Helvetica Neue", Helvetica, sans-serif;
                font-size:16px; co