### Job Extraction Pipeline


In [1]:
import pandas as pd
import json
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

# Import helper functions
from jobs_helper import scrape_single_config

# Logging: INFO-level records, each prefixed with a timestamp and level name
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

# Input (site configurations) and output (scraped jobs) locations
SITE_CONFIGS_PATH = './data/site_configs_raw.json'
OUTPUT_JOBS_CSV = './data/avature_jobs.csv'
OUTPUT_JOBS_JSON = './data/avature_jobs.json'

# Scraping configuration: size of the thread pool used for the extraction step
MAX_WORKERS = 5

# Echo the effective settings so the run is self-describing
print(
    "Configuration loaded",
    f"Input: {SITE_CONFIGS_PATH}",
    f"Output: {OUTPUT_JOBS_CSV}",
    sep="\n",
)


Configuration loaded
Input: ./data/site_configs_raw.json
Output: ./data/avature_jobs.csv


In [2]:
# Load site configurations.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open(SITE_CONFIGS_PATH, 'r', encoding='utf-8') as f:
    all_configs = json.load(f)

df_configs = pd.DataFrame(all_configs)

# Everything below selects on 'status'; fail with a clear message if the
# loaded records do not carry that field (e.g. an empty config list).
if 'status' not in df_configs.columns:
    raise KeyError(
        f"No 'status' field found in {SITE_CONFIGS_PATH}; "
        "cannot select scrapable endpoints"
    )

print("="*60)
print("SITE CONFIGURATIONS LOADED")
print("="*60)

# Filter for scrapable endpoints: 'success' and 'partial' records are kept as
# two separate lists of plain dicts, all other statuses are ignored.
success_configs = df_configs[df_configs['status'] == 'success'].to_dict('records')
partial_configs = df_configs[df_configs['status'] == 'partial'].to_dict('records')

print("\nEndpoint Status:")
print(f"  SUCCESS (paginated): {len(success_configs)}")
print(f"  PARTIAL (HTML scrape): {len(partial_configs)}")
print(f"  Total to scrape: {len(success_configs) + len(partial_configs)}")

# Show a few records of each kind as a sanity check
print("\nSample SUCCESS endpoints:")
for config in success_configs[:3]:
    print(f"  {config['tenant']}: {config['endpoint']}")

if len(partial_configs) > 0:
    print("\nSample PARTIAL endpoints:")
    for config in partial_configs[:3]:
        # 'sample_job_ids' may be absent on a record, hence .get with a fallback
        print(f"  {config['tenant']}: {config.get('sample_job_ids', 'N/A')}")


SITE CONFIGURATIONS LOADED

Endpoint Status:
  SUCCESS (paginated): 179
  PARTIAL (HTML scrape): 9
  Total to scrape: 188

Sample SUCCESS endpoints:
  ally.avature.net: https://ally.avature.net/careers/SearchJobs
  a2milkkf.avature.net: https://a2milkkf.avature.net/careers/SearchJobs
  amerilife.avature.net: https://amerilife.avature.net/careers/SearchJobs

Sample PARTIAL endpoints:
  arcbest.avature.net: ['27800', '27986', '27573', '27984', '27985']
  baufest.avature.net: ['5773', '5763', '5779', '5775', '5767']
  deltaflightattendants.avature.net: ['15', '16']


In [3]:
print("="*60)
print("STARTING JOB EXTRACTION")
print("="*60)

all_jobs = []
failed_configs = []  # configs whose future raised (or whose result could not be merged)
configs_to_scrape = success_configs + partial_configs
total_sites = len(configs_to_scrape)

# NOTE(review): the captured run log shows several workers emitting "Page 1"
# for the same tenant at overlapping times — presumably configs_to_scrape can
# hold more than one entry per tenant; confirm that is intended.

# Rough estimate only: assumes 30 seconds per endpoint and does not account
# for the MAX_WORKERS-way parallelism below.
EST_SECONDS_PER_SITE = 30

print(f"\nProcessing {total_sites} endpoints...")
print(f"Estimated time: {total_sites * EST_SECONDS_PER_SITE / 60:.1f} minutes\n")

start_time = time.time()

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each future back to the config it was submitted with, so a failure
    # can be attributed to a tenant.
    future_to_config = {
        executor.submit(scrape_single_config, config): config
        for config in configs_to_scrape
    }

    completed = 0
    for future in as_completed(future_to_config):
        config = future_to_config[future]
        # Count every finished future, successful or not, so the progress
        # counter always reaches the total.
        completed += 1
        try:
            jobs = future.result()
            all_jobs.extend(jobs)
        except Exception as e:
            # Best-effort: one failing endpoint must not abort the whole run,
            # but it is recorded so the summary does not overstate coverage.
            failed_configs.append(config)
            logger.error(f"Future failed for {config['tenant']}: {e}")

        if completed % 10 == 0:
            elapsed = time.time() - start_time
            jobs_so_far = len(all_jobs)
            print(f"\nProgress: {completed}/{total_sites} sites | "
                  f"{jobs_so_far:,} jobs | {elapsed/60:.1f} min elapsed")

elapsed_time = time.time() - start_time
sites_ok = total_sites - len(failed_configs)

print("\n" + "="*60)
print("SCRAPING COMPLETE")
print("="*60)
print(f"\nTotal time: {elapsed_time/60:.1f} minutes")
print(f"Total jobs collected: {len(all_jobs):,}")
print(f"Sites scraped: {sites_ok}")
if failed_configs:
    print(f"Sites failed: {len(failed_configs)} of {total_sites}")
    print(f"  Failed tenants: {', '.join(str(c['tenant']) for c in failed_configs)}")
# Average over the sites that actually produced a result
if sites_ok > 0:
    print(f"Average: {len(all_jobs)/sites_ok:.1f} jobs/site")


2026-02-08 15:50:48,222 - INFO - Scraping ally.avature.net...
2026-02-08 15:50:48,240 - INFO - Scraping a2milkkf.avature.net...
2026-02-08 15:50:48,245 - INFO - Scraping amerilife.avature.net...
2026-02-08 15:50:48,245 - INFO - Scraping amswh.avature.net...
2026-02-08 15:50:48,247 - INFO - Scraping advocateaurorahealth.avature.net...


STARTING JOB EXTRACTION

Processing 188 endpoints...
Estimated time: 94.0 minutes



2026-02-08 15:50:52,732 - INFO - ally.avature.net: Page 1 - 28 new jobs, 11 duplicates (total: 28)
2026-02-08 15:50:53,715 - INFO - advocateaurorahealth.avature.net: Page 1 - 30 new jobs, 11 duplicates (total: 30)
2026-02-08 15:50:53,891 - INFO - amerilife.avature.net: Page 1 - 23 new jobs, 10 duplicates (total: 23)
2026-02-08 15:50:55,515 - INFO - a2milkkf.avature.net: Page 1 - 33 new jobs, 0 duplicates (total: 33)
2026-02-08 15:50:55,723 - INFO - amswh.avature.net: Page 1 - 23 new jobs, 9 duplicates (total: 23)
2026-02-08 15:50:57,734 - INFO - advocateaurorahealth.avature.net: Page 2 - 35 new jobs, 12 duplicates (total: 65)
2026-02-08 15:50:57,965 - INFO - ally.avature.net: Page 2 - 27 new jobs, 12 duplicates (total: 55)
2026-02-08 15:50:59,297 - INFO - amerilife.avature.net: Page 2 - 23 new jobs, 10 duplicates (total: 46)
2026-02-08 15:51:00,495 - INFO - a2milkkf.avature.net: Page 2 - 3 new jobs, 0 duplicates (total: 36)
2026-02-08 15:51:01,219 - INFO - ally.avature.net: Page 3 - 3 


Progress: 10/188 sites | 1,765 jobs | 3.4 min elapsed


2026-02-08 15:54:10,812 - INFO - bmcrecruit.avature.net: Page 2 - 25 new jobs, 6 duplicates (total: 44)
2026-02-08 15:54:11,316 - INFO - bloomberg.avature.net: Page 4 - 13 new jobs, 13 duplicates (total: 55)
2026-02-08 15:54:11,787 - INFO - auspost.avature.net: Page 4 - 15 new jobs, 0 duplicates (total: 153)
2026-02-08 15:54:13,423 - INFO - berenberg.avature.net: Page 5 - 6 new jobs, 0 duplicates (total: 50)
2026-02-08 15:54:13,954 - INFO - bradyplus.avature.net: Page 1 - 13 new jobs, 6 duplicates (total: 13)
2026-02-08 15:54:13,994 - INFO - bmcrecruit.avature.net: Page 3 - 20 new jobs, 5 duplicates (total: 64)
2026-02-08 15:54:15,216 - INFO - bloomberg.avature.net: Page 5 - 13 new jobs, 13 duplicates (total: 68)
2026-02-08 15:54:16,443 - INFO - auspost.avature.net: Page 5 - 15 new jobs, 0 duplicates (total: 168)
2026-02-08 15:54:18,354 - INFO - bmcrecruit.avature.net: Page 4 - 14 new jobs, 0 duplicates (total: 78)
2026-02-08 15:54:18,434 - INFO - bradyplus.avature.net: Page 2 - 19 new


Progress: 20/188 sites | 4,087 jobs | 6.2 min elapsed


2026-02-08 15:57:02,512 - INFO - cisco2.avature.net: Page 1 - 8 new jobs, 0 duplicates (total: 8)
2026-02-08 15:57:03,527 - INFO - ciscotrainingats.avature.net: Page 2 - 7 new jobs, 0 duplicates (total: 14)
2026-02-08 15:57:05,652 - INFO - cisco2.avature.net: Page 1 - 9 new jobs, 0 duplicates (total: 9)
2026-02-08 15:57:08,015 - INFO - cisco2.avature.net: Page 2 - 8 new jobs, 0 duplicates (total: 16)
2026-02-08 15:57:09,157 - INFO - ciscotrainingats.avature.net: Page 1 - 7 new jobs, 0 duplicates (total: 7)
2026-02-08 15:57:09,377 - INFO - ciscotrainingats.avature.net: Page 3 - 7 new jobs, 0 duplicates (total: 21)
2026-02-08 15:57:09,430 - INFO - ciusss.avature.net: Page 1 - 47 new jobs, 12 duplicates (total: 47)
2026-02-08 15:57:10,063 - INFO - cisco2.avature.net: Page 2 - 9 new jobs, 0 duplicates (total: 18)
2026-02-08 15:57:13,035 - INFO - cisco2.avature.net: Page 3 - 8 new jobs, 0 duplicates (total: 24)
2026-02-08 15:57:13,393 - INFO - ciusss.avature.net: Page 2 - 47 new jobs, 12 du


Progress: 30/188 sites | 6,283 jobs | 9.8 min elapsed


2026-02-08 16:00:35,912 - INFO - deloittecm.avature.net: Page 1 - 10 new jobs, 6 duplicates (total: 10)
2026-02-08 16:00:37,933 - INFO - deloittepng.avature.net: Page 4 - 4 new jobs, 0 duplicates (total: 16)
2026-02-08 16:00:38,670 - INFO - cyclecarriage.avature.net: Page 3 - 4 new jobs, 0 duplicates (total: 103)
2026-02-08 16:00:38,707 - INFO - deloitteus.avature.net: Page 1 - 19 new jobs, 0 duplicates (total: 19)
2026-02-08 16:00:40,315 - INFO - deloittecm.avature.net: Page 2 - 10 new jobs, 6 duplicates (total: 20)
2026-02-08 16:00:42,842 - INFO - deloittepng.avature.net: Page 5 - 4 new jobs, 0 duplicates (total: 20)
2026-02-08 16:00:43,976 - INFO - deloitteus.avature.net: Page 2 - 19 new jobs, 0 duplicates (total: 38)
2026-02-08 16:00:44,051 - INFO - demosergioaguado.avature.net: Page 1 - 63 new jobs, 0 duplicates (total: 63)
2026-02-08 16:00:44,094 - INFO - cyclecarriage.avature.net: Page 4 - 4 new jobs, 0 duplicates (total: 107)
2026-02-08 16:00:44,127 - INFO - deloittecm.avature.


Progress: 40/188 sites | 7,992 jobs | 13.5 min elapsed


2026-02-08 16:04:16,889 - INFO - epic.avature.net: Page 3 - 5 new jobs, 8 duplicates (total: 23)
2026-02-08 16:04:17,278 - INFO - epic.avature.net: Page 2 - 5 new jobs, 8 duplicates (total: 18)
2026-02-08 16:04:17,786 - INFO - dosist.avature.net: Page 9 - 4 new jobs, 0 duplicates (total: 36)
2026-02-08 16:04:19,359 - INFO - fmlogistic.avature.net: Page 2 - 6 new jobs, 0 duplicates (total: 19)
2026-02-08 16:04:20,465 - INFO - epic.avature.net: Page 4 - 5 new jobs, 8 duplicates (total: 28)
2026-02-08 16:04:20,640 - INFO - epic.avature.net: Page 3 - 5 new jobs, 8 duplicates (total: 23)
2026-02-08 16:04:21,571 - INFO - dosist.avature.net: Page 10 - 4 new jobs, 0 duplicates (total: 40)
2026-02-08 16:04:23,541 - INFO - fmlogistic.avature.net: Page 3 - 6 new jobs, 0 duplicates (total: 25)
2026-02-08 16:04:24,764 - INFO - epic.avature.net: Page 4 - 5 new jobs, 8 duplicates (total: 28)
2026-02-08 16:04:24,792 - INFO - epic.avature.net: Page 5 - 5 new jobs, 8 duplicates (total: 33)
2026-02-08 16


Progress: 50/188 sites | 9,328 jobs | 16.8 min elapsed


2026-02-08 16:07:34,524 - INFO - ikea.avature.net: Page 2 - 3 new jobs, 0 duplicates (total: 18)
2026-02-08 16:07:35,074 - INFO - gpshospitality.avature.net: Page 5 - 24 new jobs, 10 duplicates (total: 120)
2026-02-08 16:07:35,708 - INFO - ea.avature.net: Page 12 - 18 new jobs, 0 duplicates (total: 1406)
2026-02-08 16:07:37,297 - INFO - frequentis.avature.net: Page 6 - 7 new jobs, 2 duplicates (total: 154)
2026-02-08 16:07:39,650 - INFO - ikea.avature.net: Page 3 - 3 new jobs, 0 duplicates (total: 21)
2026-02-08 16:07:39,766 - INFO - gpshospitality.avature.net: Page 6 - 24 new jobs, 10 duplicates (total: 144)
2026-02-08 16:07:41,062 - INFO - frequentis.avature.net: Page 7 - 7 new jobs, 2 duplicates (total: 161)
2026-02-08 16:07:41,576 - INFO - ea.avature.net: Page 13 - 18 new jobs, 0 duplicates (total: 1424)
2026-02-08 16:07:42,775 - INFO - ikea.avature.net: Page 1 - 62 new jobs, 20 duplicates (total: 62)
2026-02-08 16:07:44,535 - INFO - ikea.avature.net: Page 4 - 3 new jobs, 0 duplica


Progress: 60/188 sites | 18,488 jobs | 21.0 min elapsed


2026-02-08 16:11:51,207 - INFO - infor.avature.net: Page 7 - 14 new jobs, 0 duplicates (total: 110)
2026-02-08 16:11:53,080 - INFO - infor.avature.net: Page 4 - 14 new jobs, 0 duplicates (total: 68)
2026-02-08 16:11:53,322 - INFO - ikea.avature.net: Page 11 - 62 new jobs, 20 duplicates (total: 682)
2026-02-08 16:11:54,551 - INFO - ikea.avature.net: Page 12 - 61 new jobs, 20 duplicates (total: 732)
2026-02-08 16:11:55,915 - INFO - infor.avature.net: Page 8 - 14 new jobs, 0 duplicates (total: 124)
2026-02-08 16:11:56,622 - INFO - infor.avature.net: Page 5 - 14 new jobs, 0 duplicates (total: 82)
2026-02-08 16:11:59,261 - INFO - ikea.avature.net: Page 12 - 62 new jobs, 20 duplicates (total: 744)
2026-02-08 16:12:00,059 - INFO - infor.avature.net: Page 9 - 14 new jobs, 0 duplicates (total: 138)
2026-02-08 16:12:00,858 - INFO - infor.avature.net: Page 6 - 14 new jobs, 0 duplicates (total: 96)
2026-02-08 16:12:01,211 - INFO - ikea.avature.net: Page 13 - 61 new jobs, 20 duplicates (total: 793)


Progress: 70/188 sites | 23,443 jobs | 25.6 min elapsed


2026-02-08 16:16:28,644 - INFO - infor.avature.net: Page 18 - 14 new jobs, 0 duplicates (total: 403)
2026-02-08 16:16:28,822 - INFO - justicejobs.avature.net: Collected 270 unique jobs via HTML
2026-02-08 16:16:28,826 - INFO - Scraping koch.avature.net...
2026-02-08 16:16:29,056 - INFO - kellybridge.avature.net: Page 12 - 45 new jobs, 0 duplicates (total: 540)
2026-02-08 16:16:30,883 - INFO - justicejobs.avature.net: Page 17 - 10 new jobs, 0 duplicates (total: 225)
2026-02-08 16:16:34,694 - INFO - kellybridge.avature.net: Page 13 - 45 new jobs, 0 duplicates (total: 585)
2026-02-08 16:16:34,726 - INFO - infor.avature.net: Page 19 - 14 new jobs, 0 duplicates (total: 417)
2026-02-08 16:16:35,197 - INFO - justicejobs.avature.net: Page 18 - 10 new jobs, 0 duplicates (total: 235)
2026-02-08 16:16:36,355 - INFO - koch.avature.net: Page 1 - 44 new jobs, 0 duplicates (total: 44)
2026-02-08 16:16:39,906 - INFO - kellybridge.avature.net: Page 14 - 45 new jobs, 0 duplicates (total: 630)
2026-02-08


Progress: 80/188 sites | 26,008 jobs | 29.0 min elapsed


2026-02-08 16:19:48,389 - INFO - lindner.avature.net: Page 20 - 3 new jobs, 0 duplicates (total: 80)
2026-02-08 16:19:48,588 - INFO - lol.avature.net: Page 14 - 3 new jobs, 0 duplicates (total: 52)
2026-02-08 16:19:48,908 - INFO - loa.avature.net: Page 10 - 79 new jobs, 40 duplicates (total: 786)
2026-02-08 16:19:51,229 - INFO - lindner.avature.net: Collected 80 unique jobs via HTML
2026-02-08 16:19:51,229 - INFO - Scraping manpowergroup.avature.net...
2026-02-08 16:19:53,144 - INFO - lawson.avature.net: Page 18 - 7 new jobs, 0 duplicates (total: 138)
2026-02-08 16:19:53,185 - INFO - lol.avature.net: Page 15 - 3 new jobs, 0 duplicates (total: 55)
2026-02-08 16:19:54,269 - INFO - lol.avature.net: Page 1 - 13 new jobs, 10 duplicates (total: 13)
2026-02-08 16:19:54,520 - INFO - loa.avature.net: Page 11 - 69 new jobs, 40 duplicates (total: 855)
2026-02-08 16:19:57,200 - INFO - lawson.avature.net: Page 19 - 7 new jobs, 0 duplicates (total: 145)
2026-02-08 16:19:57,499 - INFO - lol.avature.n


Progress: 90/188 sites | 29,083 jobs | 32.3 min elapsed


2026-02-08 16:23:07,876 - INFO - mercadona.avature.net: Page 13 - 10 new jobs, 0 duplicates (total: 411)
2026-02-08 16:23:08,173 - INFO - metrobank.avature.net: Page 17 - 5 new jobs, 0 duplicates (total: 90)
2026-02-08 16:23:08,994 - INFO - mercadona.avature.net: Page 16 - 10 new jobs, 0 duplicates (total: 441)
2026-02-08 16:23:12,192 - INFO - mhcta.avature.net: Page 1 - 79 new jobs, 14 duplicates (total: 79)
2026-02-08 16:23:12,291 - INFO - missionpethealth.avature.net: Page 9 - 33 new jobs, 6 duplicates (total: 296)
2026-02-08 16:23:14,021 - INFO - mercadona.avature.net: Page 14 - 10 new jobs, 0 duplicates (total: 421)
2026-02-08 16:23:14,238 - INFO - metrobank.avature.net: Page 18 - 5 new jobs, 0 duplicates (total: 95)
2026-02-08 16:23:14,254 - INFO - mercadona.avature.net: Page 17 - 10 new jobs, 0 duplicates (total: 451)
2026-02-08 16:23:16,222 - INFO - mhcta.avature.net: Page 2 - 78 new jobs, 15 duplicates (total: 157)
2026-02-08 16:23:17,548 - INFO - missionpethealth.avature.net:


Progress: 100/188 sites | 32,930 jobs | 35.6 min elapsed


2026-02-08 16:26:27,556 - INFO - philips.avature.net: Page 8 - 1 new jobs, 2 duplicates (total: 9)
2026-02-08 16:26:28,056 - INFO - optavise.avature.net: Page 14 - 8 new jobs, 0 duplicates (total: 157)
2026-02-08 16:26:28,245 - INFO - onecall.avature.net: Page 17 - 3 new jobs, 0 duplicates (total: 80)
2026-02-08 16:26:32,150 - INFO - plantemoran.avature.net: Page 1 - 2 new jobs, 0 duplicates (total: 2)
2026-02-08 16:26:32,475 - INFO - philips.avature.net: Page 9 - 1 new jobs, 2 duplicates (total: 10)
2026-02-08 16:26:32,490 - INFO - optavise.avature.net: Page 15 - 8 new jobs, 0 duplicates (total: 165)
2026-02-08 16:26:32,522 - INFO - onecall.avature.net: Page 18 - 3 new jobs, 0 duplicates (total: 83)
2026-02-08 16:26:37,093 - INFO - onecall.avature.net: Page 19 - 3 new jobs, 0 duplicates (total: 86)
2026-02-08 16:26:37,098 - INFO - optavise.avature.net: Page 16 - 8 new jobs, 0 duplicates (total: 173)
2026-02-08 16:26:37,544 - INFO - philips.avature.net: Page 10 - 1 new jobs, 2 duplicat


Progress: 110/188 sites | 33,761 jobs | 39.2 min elapsed


2026-02-08 16:30:02,742 - INFO - platinion.avature.net: Page 15 - 2 new jobs, 0 duplicates (total: 48)
2026-02-08 16:30:06,277 - INFO - pontoonsolutions.avature.net: Page 1 - 17 new jobs, 15 duplicates (total: 17)
2026-02-08 16:30:06,781 - INFO - platinion.avature.net: Page 16 - 2 new jobs, 0 duplicates (total: 50)
2026-02-08 16:30:07,917 - INFO - pontoonsolutions.avature.net: Page 1 - 17 new jobs, 15 duplicates (total: 17)
2026-02-08 16:30:09,735 - INFO - pepsicoglobalpontoon.avature.net: Page 2 - 45 new jobs, 0 duplicates (total: 90)
2026-02-08 16:30:10,548 - INFO - pepsicoglobalpontoon.avature.net: Page 16 - 3 new jobs, 0 duplicates (total: 246)
2026-02-08 16:30:10,874 - INFO - pontoonsolutions.avature.net: Page 2 - 2 new jobs, 0 duplicates (total: 19)
2026-02-08 16:30:11,901 - INFO - platinion.avature.net: Page 17 - 2 new jobs, 0 duplicates (total: 52)
2026-02-08 16:30:13,020 - INFO - pontoonsolutions.avature.net: Page 2 - 2 new jobs, 0 duplicates (total: 19)
2026-02-08 16:30:15,99


Progress: 120/188 sites | 36,854 jobs | 43.1 min elapsed


2026-02-08 16:33:52,484 - INFO - radpartners.avature.net: Page 18 - 2 new jobs, 0 duplicates (total: 120)
2026-02-08 16:33:55,816 - INFO - resourcebank.avature.net: Page 1 - 31 new jobs, 20 duplicates (total: 31)
2026-02-08 16:33:56,620 - INFO - resourcebank.avature.net: Page 5 - 3 new jobs, 0 duplicates (total: 31)
2026-02-08 16:33:56,743 - INFO - regis.avature.net: Page 5 - 3 new jobs, 0 duplicates (total: 105)
2026-02-08 16:33:57,497 - INFO - radpartners.avature.net: Page 19 - 2 new jobs, 0 duplicates (total: 122)
2026-02-08 16:33:58,944 - INFO - resourcebank.avature.net: Page 1 - 21 new jobs, 0 duplicates (total: 21)
2026-02-08 16:34:00,364 - INFO - resourcebank.avature.net: Page 2 - 31 new jobs, 20 duplicates (total: 62)
2026-02-08 16:34:00,788 - INFO - regis.avature.net: Page 6 - 3 new jobs, 0 duplicates (total: 108)
2026-02-08 16:34:01,421 - INFO - resourcebank.avature.net: Page 6 - 3 new jobs, 0 duplicates (total: 34)
2026-02-08 16:34:02,717 - INFO - radpartners.avature.net: Pa


Progress: 130/188 sites | 37,969 jobs | 46.6 min elapsed


2026-02-08 16:37:26,841 - INFO - sandboxdeloitteglobal.avature.net: Page 5 - 2 new jobs, 0 duplicates (total: 10)
2026-02-08 16:37:26,883 - INFO - sandboxally.avature.net: Page 4 - 3 new jobs, 0 duplicates (total: 86)
2026-02-08 16:37:28,364 - INFO - sandboxcredicorpats.avature.net: Page 7 - 10 new jobs, 0 duplicates (total: 83)
2026-02-08 16:37:29,175 - INFO - sandboxdelta.avature.net: Page 1 - 24 new jobs, 10 duplicates (total: 24)
2026-02-08 16:37:29,835 - INFO - sandbox2lenovo.avature.net: Page 20 - 30 new jobs, 21 duplicates (total: 601)
2026-02-08 16:37:31,836 - INFO - sandboxdeloitteglobal.avature.net: Page 6 - 2 new jobs, 0 duplicates (total: 12)
2026-02-08 16:37:32,492 - INFO - sandbox2lenovo.avature.net: Collected 601 unique jobs via HTML
2026-02-08 16:37:32,495 - INFO - Scraping sandboxone800flowers.avature.net...
2026-02-08 16:37:32,737 - INFO - sandboxally.avature.net: Page 5 - 3 new jobs, 0 duplicates (total: 89)
2026-02-08 16:37:32,793 - INFO - sandboxdelta.avature.net: 


Progress: 140/188 sites | 40,670 jobs | 49.7 min elapsed


2026-02-08 16:40:33,284 - INFO - stagingunifi.avature.net: Page 1 - 15 new jobs, 0 duplicates (total: 15)
2026-02-08 16:40:34,237 - INFO - sandboxmgl.avature.net: Page 18 - 1 new jobs, 0 duplicates (total: 109)
2026-02-08 16:40:35,247 - INFO - sandboxunifi.avature.net: Page 4 - 20 new jobs, 23 duplicates (total: 76)
2026-02-08 16:40:37,631 - INFO - stagingkoch.avature.net: Page 1 - 44 new jobs, 0 duplicates (total: 44)
2026-02-08 16:40:38,284 - INFO - stagingunifi.avature.net: Page 2 - 3 new jobs, 0 duplicates (total: 18)
2026-02-08 16:40:38,805 - INFO - stagingxerox.avature.net: Page 1 - 26 new jobs, 10 duplicates (total: 26)
2026-02-08 16:40:39,103 - INFO - sandboxmgl.avature.net: Page 19 - 1 new jobs, 0 duplicates (total: 110)
2026-02-08 16:40:40,374 - INFO - sandboxunifi.avature.net: Page 5 - 19 new jobs, 24 duplicates (total: 95)
2026-02-08 16:40:42,645 - INFO - stagingkoch.avature.net: Page 2 - 44 new jobs, 0 duplicates (total: 88)
2026-02-08 16:40:43,298 - INFO - stagingunifi.av


Progress: 150/188 sites | 42,652 jobs | 53.2 min elapsed


2026-02-08 16:43:59,637 - INFO - transcom.avature.net: Page 2 - 35 new jobs, 12 duplicates (total: 71)
2026-02-08 16:44:00,741 - INFO - tmf.avature.net: Page 19 - 3 new jobs, 0 duplicates (total: 82)
2026-02-08 16:44:03,416 - INFO - tql.avature.net: Page 7 - 8 new jobs, 6 duplicates (total: 56)
2026-02-08 16:44:03,594 - INFO - traderjoes.avature.net: Page 6 - 2 new jobs, 0 duplicates (total: 192)
2026-02-08 16:44:04,695 - INFO - transcom.avature.net: Page 3 - 35 new jobs, 12 duplicates (total: 106)
2026-02-08 16:44:04,700 - INFO - tmf.avature.net: Page 20 - 3 new jobs, 0 duplicates (total: 85)
2026-02-08 16:44:07,651 - INFO - tmf.avature.net: Collected 85 unique jobs via HTML
2026-02-08 16:44:07,651 - INFO - Scraping uclahealth.avature.net...
2026-02-08 16:44:08,000 - INFO - traderjoes.avature.net: Page 7 - 2 new jobs, 0 duplicates (total: 194)
2026-02-08 16:44:08,338 - INFO - tql.avature.net: Page 8 - 8 new jobs, 6 duplicates (total: 64)
2026-02-08 16:44:08,422 - INFO - uatmonadelphou


Progress: 160/188 sites | 43,842 jobs | 56.5 min elapsed


2026-02-08 16:47:16,121 - INFO - unifi.avature.net: Page 4 - 2 new jobs, 0 duplicates (total: 22)
2026-02-08 16:47:16,148 - INFO - unifi.avature.net: Page 8 - 5 new jobs, 0 duplicates (total: 69)
2026-02-08 16:47:16,954 - INFO - unifi.avature.net: Page 8 - 16 new jobs, 27 duplicates (total: 157)
2026-02-08 16:47:18,378 - INFO - uirevision.avature.net: Page 18 - 4 new jobs, 0 duplicates (total: 72)
2026-02-08 16:47:20,308 - INFO - unifi.avature.net: Page 9 - 5 new jobs, 0 duplicates (total: 74)
2026-02-08 16:47:21,127 - INFO - unifi.avature.net: Page 5 - 2 new jobs, 0 duplicates (total: 24)
2026-02-08 16:47:21,976 - INFO - unifi.avature.net: Page 9 - 18 new jobs, 25 duplicates (total: 175)
2026-02-08 16:47:22,266 - INFO - uirevision.avature.net: Page 19 - 4 new jobs, 0 duplicates (total: 76)
2026-02-08 16:47:22,781 - INFO - vanoord.avature.net: Page 1 - 10 new jobs, 6 duplicates (total: 10)
2026-02-08 16:47:24,203 - INFO - unifi.avature.net: Page 10 - 5 new jobs, 0 duplicates (total: 79


Progress: 170/188 sites | 44,802 jobs | 59.7 min elapsed


2026-02-08 16:50:30,049 - INFO - voutiqueintegrations.avature.net: Page 9 - 4 new jobs, 0 duplicates (total: 36)
2026-02-08 16:50:30,473 - INFO - voutiquetraining.avature.net: Page 9 - 3 new jobs, 0 duplicates (total: 27)
2026-02-08 16:50:32,306 - INFO - walmartsourcingeu.avature.net: Page 5 - 2 new jobs, 0 duplicates (total: 10)
2026-02-08 16:50:32,521 - INFO - voutiqueintegrations.avature.net: Page 16 - 4 new jobs, 0 duplicates (total: 64)
2026-02-08 16:50:35,382 - INFO - wickes.avature.net: Page 1 - 23 new jobs, 20 duplicates (total: 23)
2026-02-08 16:50:35,737 - INFO - voutiqueintegrations.avature.net: Page 10 - 4 new jobs, 0 duplicates (total: 40)
2026-02-08 16:50:35,737 - INFO - voutiquetraining.avature.net: Page 10 - 3 new jobs, 0 duplicates (total: 30)
2026-02-08 16:50:35,986 - INFO - walmartsourcingeu.avature.net: Page 6 - 2 new jobs, 0 duplicates (total: 12)
2026-02-08 16:50:37,341 - INFO - voutiqueintegrations.avature.net: Page 17 - 4 new jobs, 0 duplicates (total: 68)
2026-


Progress: 180/188 sites | 45,190 jobs | 62.0 min elapsed


2026-02-08 16:52:50,548 - INFO - zungfu.avature.net: Page 13 - 4 new jobs, 0 duplicates (total: 88)
2026-02-08 16:52:51,411 - INFO - workmyway.avature.net: Page 11 - 18 new jobs, 0 duplicates (total: 367)
2026-02-08 16:52:54,224 - INFO - xerox.avature.net: Page 14 - 6 new jobs, 0 duplicates (total: 264)
2026-02-08 16:52:55,969 - INFO - zungfu.avature.net: Page 14 - 4 new jobs, 0 duplicates (total: 92)
2026-02-08 16:52:59,654 - INFO - xerox.avature.net: Page 15 - 6 new jobs, 0 duplicates (total: 270)
2026-02-08 16:52:59,967 - INFO - workmyway.avature.net: Page 12 - 18 new jobs, 0 duplicates (total: 385)
2026-02-08 16:53:01,296 - INFO - zungfu.avature.net: Page 15 - 4 new jobs, 0 duplicates (total: 96)
2026-02-08 16:53:05,354 - INFO - xerox.avature.net: Page 16 - 6 new jobs, 0 duplicates (total: 276)
2026-02-08 16:53:06,171 - INFO - delta.avature.net: Collected 5 jobs from partial
2026-02-08 16:53:06,173 - INFO - Scraping jawood.avature.net (partial - 1 jobs)...
2026-02-08 16:53:06,614 -


SCRAPING COMPLETE

Total time: 63.2 minutes
Total jobs collected: 46,151
Sites scraped: 188
Average: 245.5 jobs/site


In [7]:
def _drop_keyed_duplicates(frame, keys):
    """Drop rows that repeat an earlier row's values in `keys`, keeping the first.

    Rows with a missing value in any key column are always kept: pandas treats
    missing keys as equal to each other, which would otherwise collapse every
    record lacking an identifier into a single row.
    """
    has_keys = frame[keys].notna().all(axis=1)
    is_repeat = frame.duplicated(subset=keys, keep='first')
    return frame[~(has_keys & is_repeat)]


df_jobs = pd.DataFrame(all_jobs)

print("\n" + "="*60)
print("DEDUPLICATION")
print("="*60)

before_dedup = len(df_jobs)
print(f"\nOriginal jobs collected: {before_dedup:,}")

# With no records the frame has no columns; stop with a clear message before
# any output file is written.
if before_dedup == 0:
    raise ValueError("No jobs were collected; nothing to deduplicate or save")

# Step 1: Remove same-tenant duplicates
df_jobs_step1 = _drop_keyed_duplicates(df_jobs, ['tenant', 'job_id'])
removed_step1 = before_dedup - len(df_jobs_step1)
print(f"Step 1 - Same-tenant duplicates: {removed_step1:,} removed")

# Step 2: Remove cross-tenant duplicates
# NOTE(review): this keys on job_id alone. If job IDs are only unique within a
# tenant (the sample IDs printed earlier, e.g. '15' and '16', suggest they
# might be), unrelated jobs from different tenants are dropped here — confirm.
df_jobs_step2 = _drop_keyed_duplicates(df_jobs_step1, ['job_id'])
removed_step2 = len(df_jobs_step1) - len(df_jobs_step2)
print(f"Step 2 - Cross-tenant duplicates: {removed_step2:,} removed")

# Step 3: Remove URL duplicates
df_jobs_final = _drop_keyed_duplicates(df_jobs_step2, ['job_url'])
removed_step3 = len(df_jobs_step2) - len(df_jobs_final)
print(f"Step 3 - URL duplicates: {removed_step3:,} removed")

removed_total = before_dedup - len(df_jobs_final)
print("\nFinal result:")
print(f"  Before: {before_dedup:,}")
print(f"  After: {len(df_jobs_final):,}")
print(f"  Removed: {removed_total:,} ({removed_total/before_dedup*100:.1f}%)")

df_jobs = df_jobs_final

# Save files
df_jobs.to_csv(OUTPUT_JOBS_CSV, index=False)
print(f"\nSaved CSV: {OUTPUT_JOBS_CSV}")

# Missing values are converted to None so they are written as JSON null;
# json.dump would otherwise emit bare NaN tokens, which are not valid JSON.
json_records = df_jobs.astype(object).where(df_jobs.notna(), None).to_dict('records')
with open(OUTPUT_JOBS_JSON, 'w', encoding='utf-8') as f:
    json.dump(json_records, f, indent=2)
print(f"Saved JSON: {OUTPUT_JOBS_JSON}")



DEDUPLICATION

Original jobs collected: 46,151
Step 1 - Same-tenant duplicates: 40,196 removed
Step 2 - Cross-tenant duplicates: 325 removed
Step 3 - URL duplicates: 0 removed

Final result:
  Before: 46,151
  After: 5,630
  Removed: 40,521 (87.8%)

Saved CSV: ./data/avature_jobs.csv
Saved JSON: ./data/avature_jobs.json


In [8]:
print("\n" + "="*60)
print("DATA QUALITY REPORT")
print("="*60)

# Share of non-null values for each field of interest, as a percentage
print("\nField Completeness:")
total_rows = len(df_jobs)
for field in ('job_title', 'job_url', 'job_id', 'location', 'job_description'):
    pct_filled = df_jobs[field].notna().sum() / total_rows * 100
    print(f"  {field}: {pct_filled:.1f}%")

# Rows lacking a title or a URL are the only conditions reported as issues
no_title = df_jobs[df_jobs['job_title'].isna()]
no_url = df_jobs[df_jobs['job_url'].isna()]

issues = [
    f"{len(missing)} jobs missing {what}"
    for missing, what in ((no_title, "titles"), (no_url, "URLs"))
    if len(missing) > 0
]

if not issues:
    print("\nNo critical issues found")
else:
    print("\nIssues Found:")
    for issue in issues:
        print(f"  {issue}")



DATA QUALITY REPORT

Field Completeness:
  job_title: 100.0%
  job_url: 100.0%
  job_id: 100.0%
  location: 27.6%
  job_description: 0.4%

No critical issues found
