## 1. Get the list of the jobs

In [51]:
import requests


# Step 1: Fetch the areas from the API and extract the set of IDs for Russia
def get_russian_area_ids():
    url = 'https://api.hh.ru/areas'
    response = requests.get(url)
    if response.status_code == 200:
        areas = response.json()
        russian_area_ids = set()

        def extract_ids(area):
            if area['id'] == '113':  # Russia ID
                for sub_area in area['areas']:
                    russian_area_ids.add(sub_area['id'])
                    for sub_sub_area in sub_area['areas']:
                        russian_area_ids.add(sub_sub_area['id'])
            else:
                for sub_area in area['areas']:
                    extract_ids(sub_area)

        for area in areas:
            extract_ids(area)

        return russian_area_ids
    else:
        print(f"Failed to fetch areas. Status code: {response.status_code}")
        return set()

russian_area_ids = get_russian_area_ids()
len(russian_area_ids)

8578

In [44]:
import random
import time
import requests
import json
import csv
from datetime import datetime, timedelta

from tqdm import tqdm

API_BASE_URL = "https://api.hh.ru/vacancies"
DATE_FORMAT = "%Y-%m-%d"
START_DATE = "2024-10-01"
END_DATE = "2024-11-25"
# all roles for IT in hh:
PROFESSIONAL_ROLES = ['156', '160', '10', '12', '150', '25', '165', '34', '36', '73', '155', '96', '164', '104', '157', '107', '112', '113', '148', '114', '116', '121', '124', '125', '126']
PER_PAGE = 100
MAX_PAGES = 20

def get_vacancies(date_from, date_to, page):
    url = f"{API_BASE_URL}?only_with_salary=true&date_from={date_from}&date_to={date_to}&page={page}&per_page={PER_PAGE}"
    for role in PROFESSIONAL_ROLES:
        url += f"&professional_role={role}"
    # Headers to mimic Firefox on Mac
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'TE': 'Trailers'
    }
    max_retries = 5
    backoff_factor = 1

    for _ in range(max_retries):
        try:
            # Fetch the page content with a timeout of 1 minute
            response = requests.get(url, headers=headers, timeout=60)
            if response.status_code == 200:
                job = response.json()
                return job
            else:
                print(f"Failed to fetch for page {page} date {date_from}. Status code: {response.status_code}")
                return {'items': []}
        except requests.RequestException as e:
            print(f"Request failed: {e}. Retrying in {backoff_factor} seconds...")
            time.sleep(backoff_factor)
            backoff_factor *= 2  # Exponential backoff

    raise Exception(f"Failed to fetch page {page} for date {date_from} after {max_retries} attempts")

def get_vacancy_details(vacancy_id):
    url = f"{API_BASE_URL}/{vacancy_id}"
    # Headers to mimic Firefox on Mac
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'TE': 'Trailers'
    }

    max_retries = 5
    backoff_factor = 1

    for _ in range(max_retries):
        try:
            # Fetch the page content with a timeout of 1 minute
            response = requests.get(url, headers=headers, timeout=60)
            if response.status_code == 200:
                job = response.json()
                return job
            else:
                print(f"Failed to fetch job details. Status code: {response.status_code}")
                return {
                    'id': vacancy_id,
                    'error': f"Failed to fetch job details. Status code: {response.status_code}",
                    'description': 'null',
                }
                # raise Exception(f"Failed to fetch vacancy details for {vacancy_id}")
        except requests.RequestException as e:
            print(f"Request failed: {e}. Retrying in {backoff_factor} seconds...")
            time.sleep(backoff_factor)
            backoff_factor *= 2  # Exponential backoff

    raise Exception(f"Failed to fetch vacancy details for {vacancy_id} after {max_retries} attempts")

def collect_vacancies():
    current_date = datetime.strptime(START_DATE, DATE_FORMAT)
    end_date = datetime.strptime(END_DATE, DATE_FORMAT)
    all_vacancies = []

    while current_date <= end_date:
        date_from = current_date.strftime(DATE_FORMAT)
        date_to = (current_date + timedelta(days=1)).strftime(DATE_FORMAT)
        page = 0
        print(f"Collecting vacancies for {date_from}...")

        while True:
            vacancies = get_vacancies(date_from, date_to, page)
            if not vacancies['items']:
                break
            all_vacancies.extend(vacancies['items'])
            if len(vacancies['items']) < PER_PAGE or page >= MAX_PAGES - 1:
                break
            page += 1

        current_date += timedelta(days=1)

    return all_vacancies

def collect_vacancy_details(vacancies):
    all_details = []
    for vacancy in tqdm(vacancies, desc="Collecting vacancy details"):
        details = get_vacancy_details(vacancy['id'])
        # sleep for a random time between 0.5 and 1 seconds
        time.sleep(0.5 + 0.5 * random.random())
        all_details.append(details)
    return all_details

def process_vacancies_in_batches(vacancies, batch_size=100, start_batch=0):
    for i in tqdm(range(start_batch * batch_size, len(vacancies), batch_size), desc="Processing batches"):
        batch = vacancies[i:i + batch_size]
        # batch_vacancies = [{'id': vacancy_id} for vacancy_id in batch]
        batch_details = collect_vacancy_details(batch)

        # Save the batch details to a JSON file
        batch_file_name = f"../data/hh_18000_24000/hh_vacancies_batch_{i // batch_size}.json"
        with open(batch_file_name, 'w', encoding='utf-8') as f:
            json.dump(batch_details, f, ensure_ascii=False, indent=4)
        print(f"Saved batch {i // batch_size} to {batch_file_name}")

def save_to_csv(vacancies, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["job_title", "link", "company", "technologies", "description_text", "salary_from", "salary_to", "currency", "locations", "url", "salary"])

        for vacancy in vacancies:
            job_title = vacancy.get("name", "")
            link = vacancy.get("alternate_url", "")
            company = vacancy.get("employer", {}).get("name", "")
            technologies = ", ".join([skill["name"] for skill in vacancy.get("key_skills", [])])
            description_text = vacancy.get("description", "").replace("\n", " ").replace("\r", " ")
            salary = vacancy.get("salary", {})
            salary_from = salary.get("from", "")
            salary_to = salary.get("to", "")
            currency = salary.get("currency", "")
            locations = vacancy.get("area", {}).get("name", "")
            url = vacancy.get("url", "")
            salary_info = f"{salary_from} - {salary_to} {currency}"

            writer.writerow([job_title, link, company, technologies, description_text, salary_from, salary_to, currency, locations, url, salary_info])

### Data cleaning

In [55]:
# open all_vacancies.json and find len(all_vacancies) to get the number of vacancies
jobs = []
with open('hh_all_vacancies_extra_roles.json', 'r') as f:
    jobs = json.load(f)

print(len(jobs))

34951


In [56]:
# create a set of job ids and check if there are any duplicates
job_ids = set()
for job in jobs:
    job_ids.add(job['id'])

print(len(job_ids))

18676


In [57]:
from tqdm import tqdm
# write a script that removes duplicates from all_vacancies.json
# and saves the result to all_vacancies_no_duplicates.json
# for each job, check if the id is already fetched
# if yes, check the published_at, keep the latest one
# datess are strings, so compare them as datetime objects
# date format is 2024-10-07T18:23:45+0300

jobs_no_duplicates = {}
for job in tqdm(jobs):
    if job['id'] not in jobs_no_duplicates:
        jobs_no_duplicates[job['id']] = job
    else:
        if datetime.strptime(job['published_at'], "%Y-%m-%dT%H:%M:%S%z") > datetime.strptime(jobs_no_duplicates[job['id']]['published_at'], "%Y-%m-%dT%H:%M:%S%z"):
            jobs_no_duplicates[job['id']] = job

100%|██████████| 34951/34951 [00:00<00:00, 81324.13it/s]


In [58]:
print(len(jobs_no_duplicates))

18676


In [59]:
# save the result to all_vacancies_no_duplicates.json
with open('hh_all_vacancies_extra_roles_no_duplicates.json', 'w') as f:
    json.dump(list(jobs_no_duplicates.values()), f, ensure_ascii=False, indent=4)

In [60]:
# filter out the jobs that are in Russia
jobs_no_duplicates_ru = [job for job in jobs_no_duplicates.values() if job['area']['id'] in russian_area_ids]

In [61]:
len(jobs_no_duplicates_ru)

17334

In [62]:
jobs_no_duplicates_ru[0]

{'id': '108664089',
 'premium': False,
 'name': 'Project-менеджер',
 'department': None,
 'has_test': False,
 'response_letter_required': False,
 'area': {'id': '15',
  'name': 'Астрахань',
  'url': 'https://api.hh.ru/areas/15'},
 'salary': {'from': 35000, 'to': 50000, 'currency': 'RUR', 'gross': True},
 'type': {'id': 'open', 'name': 'Открытая'},
 'address': {'city': 'Астрахань',
  'street': 'улица Николая Островского',
  'building': '148Е',
  'lat': 46.33419042214568,
  'lng': 48.064644619712105,
  'description': None,
  'raw': 'Астрахань, улица Николая Островского, 148Е',
  'metro': None,
  'metro_stations': [],
  'id': '16137035'},
 'response_url': None,
 'sort_point_distance': None,
 'published_at': '2024-10-15T11:29:43+0300',
 'created_at': '2024-10-15T11:29:43+0300',
 'archived': False,
 'apply_alternate_url': 'https://hh.ru/applicant/vacancy_response?vacancyId=108664089',
 'show_logo_in_search': None,
 'insider_interview': None,
 'url': 'https://api.hh.ru/vacancies/108664089?ho

In [63]:
# save the result to all_vacancies_no_duplicates_ru.json
with open('hh_all_vacancies_extra_roles_no_duplicates_ru.json', 'w') as f:
    json.dump(jobs_no_duplicates_ru, f, ensure_ascii=False, indent=4)

In [67]:
jobs_no_duplicates_ru_extra_roles = jobs_no_duplicates_ru

In [68]:
with open('hh_all_vacancies_no_duplicates_ru.json', 'r') as f:
    jobs_no_duplicates_ru_base = json.load(f)
    # json.dump(jobs_no_duplicates_ru, f, ensure_ascii=False, indent=4)

In [69]:
len(jobs_no_duplicates_ru_base)

14950

In [71]:
jobs_no_duplicates_ru_combined = jobs_no_duplicates_ru_base + jobs_no_duplicates_ru_extra_roles
len(jobs_no_duplicates_ru_combined)

32284

In [72]:
job_ids = set()
for job in jobs_no_duplicates_ru_combined:
    job_ids.add(job['id'])

len(job_ids)

32283

In [73]:
# remove duplicates
jobs_no_duplicates_ru_combined_no_duplicates = {}
for job in tqdm(jobs_no_duplicates_ru_combined):
    if job['id'] not in jobs_no_duplicates_ru_combined_no_duplicates:
        jobs_no_duplicates_ru_combined_no_duplicates[job['id']] = job
    else:
        if datetime.strptime(job['published_at'], "%Y-%m-%dT%H:%M:%S%z") > datetime.strptime(jobs_no_duplicates_ru_combined_no_duplicates[job['id']]['published_at'], "%Y-%m-%dT%H:%M:%S%z"):
            jobs_no_duplicates_ru_combined_no_duplicates[job['id']] = job

len(jobs_no_duplicates_ru_combined_no_duplicates)

100%|██████████| 32284/32284 [00:00<00:00, 345608.11it/s]


32283

In [74]:
jobs_no_duplicates_ru_combined_no_duplicates = [job for job in jobs_no_duplicates_ru_combined_no_duplicates.values()]

In [75]:
# save the result to hh_all_vacancies_no_duplicates_ru_combined.json
with open('hh_all_vacancies_no_duplicates_ru_combined.json', 'w') as f:
    json.dump(jobs_no_duplicates_ru_combined_no_duplicates, f, ensure_ascii=False, indent=4)

In [51]:
with open('../data/hh_all_vacancies_no_duplicates_ru_combined.json', 'r') as f:
    all_vacancies = json.load(f)

all_vacancies[0]

{'id': '108298685',
 'premium': False,
 'name': 'Senior Python разработчик',
 'department': None,
 'has_test': False,
 'response_letter_required': False,
 'area': {'id': '1', 'name': 'Москва', 'url': 'https://api.hh.ru/areas/1'},
 'salary': {'from': None, 'to': 370000, 'currency': 'RUR', 'gross': True},
 'type': {'id': 'open', 'name': 'Открытая'},
 'address': None,
 'response_url': None,
 'sort_point_distance': None,
 'published_at': '2024-10-07T18:23:45+0300',
 'created_at': '2024-10-07T18:23:45+0300',
 'archived': True,
 'apply_alternate_url': 'https://hh.ru/applicant/vacancy_response?vacancyId=108298685',
 'show_logo_in_search': None,
 'insider_interview': None,
 'url': 'https://api.hh.ru/vacancies/108298685?host=hh.ru',
 'alternate_url': 'https://hh.ru/vacancy/108298685',
 'relations': [],
 'employer': {'id': '2639497',
  'name': 'АйТиКвик',
  'url': 'https://api.hh.ru/employers/2639497',
  'alternate_url': 'https://hh.ru/employer/2639497',
  'logo_urls': {'original': 'https://img.

In [52]:
len(all_vacancies)

32283

## 2. Get the info for each job

Example for processing jobs # 18000 to 24000, change to the slice you need!

In [54]:
# Step 3: Collect details for each vacancy
all_vacancy_details = process_vacancies_in_batches(all_vacancies[18000:24000], batch_size=100, start_batch=35)

Collecting vacancy details: 100%|██████████| 100/100 [02:40<00:00,  1.61s/it]
Processing batches:   4%|▍         | 1/25 [02:40<1:04:16, 160.71s/it]

Saved batch 35 to ../data/hh_18000_24000/hh_vacancies_batch_35.json


Collecting vacancy details: 100%|██████████| 100/100 [01:58<00:00,  1.18s/it]
Processing batches:   8%|▊         | 2/25 [04:39<52:02, 135.76s/it]  

Saved batch 36 to ../data/hh_18000_24000/hh_vacancies_batch_36.json


Collecting vacancy details: 100%|██████████| 100/100 [01:57<00:00,  1.18s/it]
Processing batches:  12%|█▏        | 3/25 [06:36<46:44, 127.49s/it]

Saved batch 37 to ../data/hh_18000_24000/hh_vacancies_batch_37.json




Failed to fetch job details. Status code: 404




Failed to fetch job details. Status code: 404


Collecting vacancy details: 100%|██████████| 100/100 [02:05<00:00,  1.26s/it]
Processing batches:  16%|█▌        | 4/25 [08:42<44:23, 126.85s/it]

Saved batch 38 to ../data/hh_18000_24000/hh_vacancies_batch_38.json


Collecting vacancy details: 100%|██████████| 100/100 [01:59<00:00,  1.20s/it]
Processing batches:  20%|██        | 5/25 [10:42<41:26, 124.32s/it]

Saved batch 39 to ../data/hh_18000_24000/hh_vacancies_batch_39.json


Collecting vacancy details: 100%|██████████| 100/100 [01:57<00:00,  1.18s/it]
Processing batches:  24%|██▍       | 6/25 [12:40<38:39, 122.07s/it]

Saved batch 40 to ../data/hh_18000_24000/hh_vacancies_batch_40.json


Collecting vacancy details: 100%|██████████| 100/100 [02:02<00:00,  1.23s/it]
Processing batches:  28%|██▊       | 7/25 [14:42<36:42, 122.34s/it]

Saved batch 41 to ../data/hh_18000_24000/hh_vacancies_batch_41.json


Collecting vacancy details: 100%|██████████| 100/100 [02:08<00:00,  1.29s/it]
Processing batches:  32%|███▏      | 8/25 [16:51<35:15, 124.46s/it]

Saved batch 42 to ../data/hh_18000_24000/hh_vacancies_batch_42.json


Collecting vacancy details: 100%|██████████| 100/100 [02:01<00:00,  1.21s/it]
Processing batches:  36%|███▌      | 9/25 [18:53<32:56, 123.54s/it]

Saved batch 43 to ../data/hh_18000_24000/hh_vacancies_batch_43.json


Collecting vacancy details: 100%|██████████| 100/100 [02:27<00:00,  1.47s/it]
Processing batches:  40%|████      | 10/25 [21:20<32:43, 130.92s/it]

Saved batch 44 to ../data/hh_18000_24000/hh_vacancies_batch_44.json




Failed to fetch job details. Status code: 404


Collecting vacancy details: 100%|██████████| 100/100 [02:27<00:00,  1.47s/it]
Processing batches:  44%|████▍     | 11/25 [23:48<31:42, 135.92s/it]

Saved batch 45 to ../data/hh_18000_24000/hh_vacancies_batch_45.json


Collecting vacancy details: 100%|██████████| 100/100 [01:57<00:00,  1.17s/it]
Processing batches:  48%|████▊     | 12/25 [25:45<28:12, 130.20s/it]

Saved batch 46 to ../data/hh_18000_24000/hh_vacancies_batch_46.json


Collecting vacancy details: 100%|██████████| 100/100 [02:07<00:00,  1.27s/it]
Processing batches:  52%|█████▏    | 13/25 [27:52<25:52, 129.38s/it]

Saved batch 47 to ../data/hh_18000_24000/hh_vacancies_batch_47.json


Collecting vacancy details: 100%|██████████| 100/100 [01:58<00:00,  1.18s/it]
Processing batches:  56%|█████▌    | 14/25 [29:51<23:07, 126.11s/it]

Saved batch 48 to ../data/hh_18000_24000/hh_vacancies_batch_48.json


Collecting vacancy details: 100%|██████████| 100/100 [02:01<00:00,  1.21s/it]
Processing batches:  60%|██████    | 15/25 [31:52<20:45, 124.59s/it]

Saved batch 49 to ../data/hh_18000_24000/hh_vacancies_batch_49.json




Request failed: HTTPSConnectionPool(host='api.hh.ru', port=443): Max retries exceeded with url: /vacancies/110960359 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116eeb8d0>: Failed to resolve 'api.hh.ru' ([Errno 8] nodename nor servname provided, or not known)")). Retrying in 1 seconds...
Request failed: HTTPSConnectionPool(host='api.hh.ru', port=443): Max retries exceeded with url: /vacancies/110960359 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116f08650>: Failed to resolve 'api.hh.ru' ([Errno 8] nodename nor servname provided, or not known)")). Retrying in 2 seconds...
Request failed: HTTPSConnectionPool(host='api.hh.ru', port=443): Max retries exceeded with url: /vacancies/110960359 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116f3e210>: Failed to resolve 'api.hh.ru' ([Errno 8] nodename nor servname provided, or not known)")). Retrying in 4 seconds...


Collecting vacancy details: 100%|██████████| 100/100 [02:19<00:00,  1.39s/it]
Processing batches:  64%|██████▍   | 16/25 [34:11<19:20, 128.95s/it]

Saved batch 50 to ../data/hh_18000_24000/hh_vacancies_batch_50.json




Failed to fetch job details. Status code: 404


Collecting vacancy details: 100%|██████████| 100/100 [02:10<00:00,  1.31s/it]
Processing batches:  68%|██████▊   | 17/25 [36:22<17:15, 129.48s/it]

Saved batch 51 to ../data/hh_18000_24000/hh_vacancies_batch_51.json


Collecting vacancy details: 100%|██████████| 100/100 [02:19<00:00,  1.40s/it]
Processing batches:  72%|███████▏  | 18/25 [38:42<15:28, 132.61s/it]

Saved batch 52 to ../data/hh_18000_24000/hh_vacancies_batch_52.json




Request failed: HTTPSConnectionPool(host='api.hh.ru', port=443): Read timed out. (read timeout=60). Retrying in 1 seconds...


Collecting vacancy details: 100%|██████████| 100/100 [03:27<00:00,  2.08s/it]
Processing batches:  76%|███████▌  | 19/25 [42:09<15:31, 155.21s/it]

Saved batch 53 to ../data/hh_18000_24000/hh_vacancies_batch_53.json


Collecting vacancy details: 100%|██████████| 100/100 [02:29<00:00,  1.50s/it]
Processing batches:  80%|████████  | 20/25 [44:39<12:48, 153.61s/it]

Saved batch 54 to ../data/hh_18000_24000/hh_vacancies_batch_54.json




Request failed: HTTPSConnectionPool(host='api.hh.ru', port=443): Read timed out. (read timeout=60). Retrying in 1 seconds...


Collecting vacancy details: 100%|██████████| 100/100 [03:40<00:00,  2.20s/it]
Processing batches:  84%|████████▍ | 21/25 [48:20<11:34, 173.68s/it]

Saved batch 55 to ../data/hh_18000_24000/hh_vacancies_batch_55.json


Collecting vacancy details: 100%|██████████| 100/100 [02:22<00:00,  1.43s/it]
Processing batches:  88%|████████▊ | 22/25 [50:42<08:13, 164.34s/it]

Saved batch 56 to ../data/hh_18000_24000/hh_vacancies_batch_56.json


Collecting vacancy details: 100%|██████████| 100/100 [02:27<00:00,  1.48s/it]
Processing batches:  92%|█████████▏| 23/25 [53:10<05:18, 159.45s/it]

Saved batch 57 to ../data/hh_18000_24000/hh_vacancies_batch_57.json


Collecting vacancy details: 100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
Processing batches:  96%|█████████▌| 24/25 [55:23<02:31, 151.25s/it]

Saved batch 58 to ../data/hh_18000_24000/hh_vacancies_batch_58.json




Request failed: HTTPSConnectionPool(host='api.hh.ru', port=443): Read timed out. (read timeout=60). Retrying in 1 seconds...


Collecting vacancy details: 100%|██████████| 100/100 [03:28<00:00,  2.09s/it]
Processing batches: 100%|██████████| 25/25 [58:51<00:00, 141.27s/it]

Saved batch 59 to ../data/hh_18000_24000/hh_vacancies_batch_59.json





### Postprocessing/cleaning

In [58]:
# write a script that reads all the hh_vacancies_batch_*.json files
# and combines them into a single list of dictionaries
# and saves the result to all_vacancy_details.json

import json
import os
%cd ../data/hh_24000_-1/
all_vacancy_details = []
for file in os.listdir():
    # if file.startswith("hh_vacancies_batch_") and file.endswith(".json"):
    if (file.startswith("hh_vacancies_batch_") or file.startswith("hh_24000_-1")) and file.endswith(".json"):
        with open(file, 'r') as f:
            all_vacancy_details.extend(json.load(f))

%cd ../../scraping/

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


/Users/dmitrii.shiriaev/Code/learning/ods-nlp-2024/final-project/data/hh_24000_-1
/Users/dmitrii.shiriaev/Code/learning/ods-nlp-2024/final-project/scraping


In [59]:
len(all_vacancy_details), len(set([job['id'] for job in all_vacancy_details]))

(8283, 8283)

In [60]:
with open('../data/all_vacancy_details_24000_-1.json', 'w') as f:
    json.dump(all_vacancy_details, f, ensure_ascii=False, indent=4)

In [86]:
all_vacancy_details[0]

{'id': '109755906',
 'premium': False,
 'billing_type': {'id': 'standard', 'name': 'Стандарт'},
 'relations': [],
 'name': 'Системный администратор',
 'insider_interview': None,
 'response_letter_required': False,
 'area': {'id': '19', 'name': 'Брянск', 'url': 'https://api.hh.ru/areas/19'},
 'salary': {'from': 50000, 'to': None, 'currency': 'RUR', 'gross': False},
 'type': {'id': 'open', 'name': 'Открытая'},
 'address': {'city': 'Брянск',
  'street': 'Фосфоритная улица',
  'building': '1В',
  'lat': 53.259676,
  'lng': 34.476164,
  'description': None,
  'raw': 'Брянск, Фосфоритная улица, 1В',
  'metro': None,
  'metro_stations': []},
 'allow_messages': True,
 'experience': {'id': 'between1And3', 'name': 'От 1 года до 3 лет'},
 'schedule': {'id': 'fullDay', 'name': 'Полный день'},
 'employment': {'id': 'full', 'name': 'Полная занятость'},
 'department': None,
 'contacts': None,
 'description': '<p><strong>Компания «Фермер»</strong> основана в Брянске в 2004 году.</p> <p>Мы считаемся од

### Combine the batches

In [61]:
f1 = "../data/hh_0_6000/all_vacancy_details_0_6000.json"
f2 = "../data/hh_6000_12000/all_vacancy_details_6000_12000.json"
f3 = "../data/hh_12000_18000/all_vacancy_details_12000_18000.json"
f4 = "../data/hh_18000_24000/all_vacancy_details_18000_24000.json"
f5 = "../data/hh_24000_-1/all_vacancy_details_24000_-1.json"
# open all_vacancy_details_0_6000.json, all_vacancy_details_6000_12000.json, all_vacancy_details_12000_18000.json
# and combine them into a single list of dictionaries

all_vacancy_details = []
with open(f1, 'r') as f:
    all_vacancy_details.extend(json.load(f))

with open(f2, 'r') as f:
    all_vacancy_details.extend(json.load(f))

with open(f3, 'r') as f:
    all_vacancy_details.extend(json.load(f))

with open(f4, 'r') as f:
    all_vacancy_details.extend(json.load(f))

with open(f5, 'r') as f:
    all_vacancy_details.extend(json.load(f))

len(all_vacancy_details)

32283

### Extra cleaning

In [62]:
len(all_vacancy_details), len(set([job['id'] for job in all_vacancy_details]))

(32283, 32283)

In [63]:
# save the result to all_vacancy_details_0_18000.json
with open("../data/all_vacancy_details_combined.json", 'w') as f:
    json.dump(all_vacancy_details, f, ensure_ascii=False, indent=4)

In [64]:
all_jobs_json = []
with open('../data/hh_all_vacancies_no_duplicates_ru_combined.json', 'r') as f:
    all_jobs_json = json.load(f)

len(all_jobs_json)

32283

In [65]:
# get the set intersection of the ids from all_jobs_json and all_vacancy_details
job_ids = set([job['id'] for job in all_jobs_json])
details_ids = set([job['id'] for job in all_vacancy_details])
test_set = set([str(i) for i in range(18000)])

intersection = job_ids.intersection(details_ids)
print(len(intersection))

intersection = test_set.intersection(details_ids)
print(len(intersection))

32283
0


In [66]:
# sort all_vacancy_details by id to match the order of all_jobs_json
# so that the order in all_vacancy_details matches the order in all_jobs_json
# in all_jobs_json, the jobs are not sorted by id
# use map to get the index of the job in all_jobs_json
# then sort all_vacancy_details by the index
# then save the result to all_vacancy_details_sorted.json

job_id_mapping = {job['id']: job for job in all_vacancy_details}

all_vacancy_details_sorted = []

for job in all_jobs_json:
    all_vacancy_details_sorted.append(job_id_mapping[job['id']])

print(len(all_vacancy_details_sorted))
count = 0
for id1, id2 in zip(all_jobs_json, all_vacancy_details_sorted):
    if id1['id'] != id2['id']:
        count += 1
print(count)

32283
0


In [67]:
# save the result to all_vacancy_details_combined_sorted.json

with open('../data/all_vacancy_details_combined_sorted.json', 'w') as f:
    json.dump(all_vacancy_details_sorted, f, ensure_ascii=False, indent=4)

In [34]:
# check if there are mismatch triplets: id, published_at, name

job_ids = set()
for job in all_jobs_json[:18000]:
    job_ids.add((job['id'], job.get('name')))
mismatch_triplets = []
for job in all_vacancy_details:
    if (job['id'], job.get('name')) not in job_ids:
        mismatch_triplets.append((job['id'], job.get('name')))
print(len(mismatch_triplets))
mismatch_triplets[:10]

119


[('109868969', None),
 ('104147603', 'Системный администратор со знанием 1С'),
 ('110634418', 'Системный администратор'),
 ('110761403',
  'Старший JS-разработчик 3D- Middle Frontend Developer / ASP.NET RESTful API-JS/TS-React-gRPC-Three.js'),
 ('110833122', 'Руководитель группы 1С'),
 ('109704302', 'Офис-менеджер в IT компанию(центр)на неполный рабочий день'),
 ('109758868',
  'Junior системный инженер/DevOps инженер/системный администратор'),
 ('110804828', 'Инженер-программист'),
 ('108059937', None),
 ('107346869', 'Backend разработчик (PHP, Laravel - middle, middle+)')]

In [16]:
first_ids = [job['id'] for job in all_jobs_json[:18000]]
len(first_ids), len(set(first_ids))

(18000, 18000)

In [18]:
first_ids_details = [job['id'] for job in all_vacancy_details]
len(first_ids_details), len(set(first_ids_details))

(18000, 18000)

In [22]:
count = 0
for id1, id2 in zip(first_ids, first_ids_details):
    if id1 != id2:
        count += 1

count


18000

In [25]:
# get the set intersection of the ids from all_jobs_json and all_vacancy_details
job_ids = set([job['id'] for job in all_jobs_json[:18000]])
details_ids = set([job['id'] for job in all_vacancy_details[:18000]])
test_set = set([str(i) for i in range(18000)])

intersection = job_ids.intersection(details_ids)
print(len(intersection))

intersection = test_set.intersection(details_ids)
print(len(intersection))

18000
0


### Save resulting dataset as csv

In [2]:
import json
import csv
import re
from bs4 import BeautifulSoup

def clean_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text(separator=' ')


# Function to extract required information from a single vacancy
def extract_vacancy_info(vacancy):
    title = vacancy.get('name', '')
    url = vacancy.get('alternate_url', '')
    company = vacancy.get('employer', {}).get('name', '')
    skills = ', '.join(skill['name'] for skill in vacancy.get('key_skills', []))
    description = vacancy.get('description', '').replace('\n', ' ').replace('\r', ' ')
    description = clean_html_tags(description)
    salary_from = vacancy.get('salary', {}).get('from', '')
    salary_to = vacancy.get('salary', {}).get('to', '')
    currency = vacancy.get('salary', {}).get('currency', '')
    area = vacancy.get('area', {}).get('name', '')
    experience = vacancy.get('experience', {}).get('id', '')
    if experience == 'noExperience':
        experience_from = 0
        experience_to = 0
    # if there is match to 'from\d+to\d+' pattern, extract the values
    elif re.match(r'between(\d+)And(\d+)', experience):
        experience_from, experience_to = re.match(r'between(\d+)And(\d+)', experience).groups()
    elif re.match(r'moreThan(\d+)', experience):
        experience_from = re.match(r'moreThan(\d+)', experience).group(1)
        experience_to = -1
    else:
        experience_from = None
        experience_to = None
    

    
    return [url, title, area, company, skills, description, salary_from, salary_to, currency, experience_from, experience_to]

# Function to process JSON files and save the extracted information to a CSV file
def process_json_filee(json_file, output_csv):
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([
                            'url', 
                            'title', 
                            'area',
                            'company', 'skills', 'description', 'salary_from', 'salary_to', 'currency', 'experience_from', 'experience_to'])
        
        with open(json_file, 'r', encoding='utf-8') as f:
            vacancies = json.load(f)
            for vacancy in vacancies:
                if vacancy.get('salary') is not None:
                    csvwriter.writerow(extract_vacancy_info(vacancy))

# Example usage
json_file = '../data/all_vacancy_details_combined.json'
output_csv = 'vacancies_combined_processed.csv'
process_json_filee(json_file, output_csv)

In [92]:
import json

with open('../data/hh_all_vacancies_no_duplicates_ru_combined.json', 'r') as f:
    all_vacancies = json.load(f)

len(all_vacancies), len(set([job['id'] for job in all_vacancies]))

(32283, 32283)