In [1]:
import os
import json
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from readability import Document
from urllib.parse import urlparse
import time
import random
from requests.exceptions import RequestException

In [2]:
US_STATE_ABBR = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',
    'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
    'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}


In [3]:
def extract_state_from_filename(filename):
    """Extract full state name from filename and map to abbreviation."""
    name = filename.replace("results_", "").replace(".json", "").strip()
    return US_STATE_ABBR.get(name, "Unknown")

def extract_jobs_from_json(file_path, state_abbr):
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"⚠️ Skipped invalid JSON file: {file_path}")
            return []

    all_jobs = []
    for block in data:
        source_url = block.get("url", "")
        for job in block.get("jobs", []):
            all_jobs.append({
                "state": state_abbr,
                "source_url": source_url,
                "job_id": job.get("job_id", ""),
                "job_name": job.get("job_name", ""),
                "company_name": job.get("company_name", ""),
                "job_location": job.get("job_location", ""),
                "job_url": job.get("job_url", "")
            })
    return all_jobs


def aggregate_all_json_to_csv(json_dir):
    """Aggregate job records from all JSON files into a single CSV."""
    all_records = []

    for filename in os.listdir(json_dir):
        if filename.endswith(".json"):
            file_path = os.path.join(json_dir, filename)
            state = extract_state_from_filename(filename)
            jobs = extract_jobs_from_json(file_path, state)
            all_records.extend(jobs)

    df = pd.DataFrame(all_records)
    print(f"✅ Saved {len(df)} rows")
    return df

In [4]:
input_folder = "results"
df = aggregate_all_json_to_csv(input_folder)

✅ Saved 38101 rows


In [5]:
df.drop_duplicates(subset=["job_id"], inplace=True)
df.reset_index(drop=True, inplace=True)

In [6]:
def extract_workplace(location):
    if pd.isna(location) or location is None:
        return "Unknown"
    elif "(Remote)" in location:
        return "Remote"
    elif "(Hybrid)" in location:
        return "Hybrid"
    elif "(On-site)" in location:
        return "On-site"
    else:
        return "Unknown"
    
df['workplace_type'] = df['job_location'].apply(extract_workplace)

In [7]:
df

Unnamed: 0,state,source_url,job_id,job_name,company_name,job_location,job_url,workplace_type
0,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4304560956,Target Digital Network Analyst II - Utah Data ...,Altamira Technologies Corporation,"Wahiawa, HI (On-site)",https://www.linkedin.com/jobs/view/4304560956/...,On-site
1,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4305161555,Society of Women Engineers WE25 Conference Att...,Jacobs,"Honolulu, HI",https://www.linkedin.com/jobs/view/4305161555/...,Unknown
2,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4304447345,"Graduate Research Intern, Chemistry",DataAnnotation,"Hawaii, United States (Remote)",https://www.linkedin.com/jobs/view/4304447345/...,Remote
3,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4293202567,Materials Engineering/Materials Science - Summ...,Honeywell,United States (Remote),https://www.linkedin.com/jobs/view/4293202567/...,Remote
4,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4294607020,"University, 2026 Summer Games Systems Engineer...",Booz Allen Hamilton,"Honolulu, HI (Hybrid)",https://www.linkedin.com/jobs/view/4294607020/...,Hybrid
...,...,...,...,...,...,...,...,...
11131,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4291961454,Marketing/Communications Intern - Transmission...,Burns & McDonnell,"Kansas City, MO",https://www.linkedin.com/jobs/view/4291961454/...,Unknown
11132,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4308497710,Data Analyst IV,Centene Corporation,"Missouri, United States (Remote)",https://www.linkedin.com/jobs/view/4308497710/...,Remote
11133,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4304439194,Paid Internship -- Software Development (Winte...,Interco,"St Louis, MO (On-site)",https://www.linkedin.com/jobs/view/4304439194/...,On-site
11134,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4288326603,Junior Helpdesk Specialist,Chenega MIOS SBU,"St Louis, MO (On-site)",https://www.linkedin.com/jobs/view/4288326603/...,On-site


In [8]:
def fetch_html_with_retry(url, retries=3, backoff=5):
    """
    Fetches the HTML of a URL with retries and backoff.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=20)  # plain requests
            if response.status_code == 200:
                return response.text
            else:
                print(f"⚠️ Status {response.status_code} for {url}")
        except RequestException as e:
            print(f"⚠️ Error fetching {url} (attempt {attempt+1}): {e}")
        # exponential backoff between retries
        sleep_time = backoff * (attempt + 1)
        print(f"Sleeping {sleep_time}s before retrying...")
        time.sleep(sleep_time)
    return None


def extract_job_description_from_html(html):
    """
    Extracts main text content from a LinkedIn job page HTML using readability.
    """
    try:
        doc = Document(html)
        soup = BeautifulSoup(doc.summary(), 'html.parser')
        text = soup.get_text(separator="\n").strip()
        return text
    except Exception as e:
        print(f"⚠️ Error extracting description: {e}")
        return ""



def fetch_job_descriptions(df, url_col='job_url', sleep_range=(20, 45), milestone=1000):
    job_descriptions = []

    for idx, url in enumerate(df[url_col]):
        html = fetch_html_with_retry(url, retries=3, backoff=5)
        if html:
            job_text = extract_job_description_from_html(html)
        else:
            job_text = "Failed to fetch"

        job_descriptions.append(job_text)

        # milestone reporting
        if (idx + 1) % milestone == 0:
            print(f"✅ {idx + 1} rows finished")

        # random polite delay
        delay = random.uniform(*sleep_range)
        time.sleep(delay)

    df['job_description'] = job_descriptions
    return df

In [9]:
df = fetch_job_descriptions(df)

⚠️ Status 404 for https://www.linkedin.com/jobs/view/4304178996/?eBP=CwEAAAGZnNreWSr22X5IAEM3w4_CzURCGrMsXNFiJkE6tQsaXOmOX70Is8jqXbJK6ISa-Bhp_L48wTst9ano03TiYearWrN4S0p3gTS9NKTNJxl6JokOA_aoKZ4-gL_IVjJwHJodR_sH9GkrGfoUrm7XPMJsArpE75u9keJzauIaC5qU5VIHFx5ANyx7Xo9UHPYUuObJB9pPcTl7ZlUQhI6okCBL0QVk9ADTugPijM83GOh5Xdo0yTSPTSnWgroj-6hKhsmleKaeic0jtrL5NvFHOZnsiEY06sn1Erl2ns7015r6tsMZXSBROkNQ4emTHnijNFXnSXUHF9NrTOFawJJgqRopuQxEQUH8lTJmEmdoA7o9W2j-f8drU7WJTttE2YLXRFPoQxcUD1Al3NqoR0B8WTT7GwQkKTD7Rq64faXiBw2yylsBdOBkxVVw-vDqonHnyrujkYEvos6s47UagUqLR27myQw7c-A0lMj7TcVl01ZYeZMg8I083lLXjN7TW9D3hA&refId=0sPCMB%2BeXJ%2BVWdEW%2F%2FVTFg%3D%3D&trackingId=1X4BWLQ2PR%2BhUQK2FvJcIg%3D%3D&trk=flagship3_search_srp_jobs
Sleeping 5s before retrying...
⚠️ Status 404 for https://www.linkedin.com/jobs/view/4304178996/?eBP=CwEAAAGZnNreWSr22X5IAEM3w4_CzURCGrMsXNFiJkE6tQsaXOmOX70Is8jqXbJK6ISa-Bhp_L48wTst9ano03TiYearWrN4S0p3gTS9NKTNJxl6JokOA_aoKZ4-gL_IVjJwHJodR_sH9GkrGfoUrm7XPMJsArpE75u9keJzauIaC5qU5VIHFx5ANyx7Xo9UHPYUu

In [11]:
df

Unnamed: 0,state,source_url,job_id,job_name,company_name,job_location,job_url,workplace_type,job_description
0,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4304560956,Target Digital Network Analyst II - Utah Data ...,Altamira Technologies Corporation,"Wahiawa, HI (On-site)",https://www.linkedin.com/jobs/view/4304560956/...,On-site,Target Digital Network Analyst\nClearance: TS/...
1,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4305161555,Society of Women Engineers WE25 Conference Att...,Jacobs,"Honolulu, HI",https://www.linkedin.com/jobs/view/4305161555/...,Unknown,"At Jacobs, we're challenging today to reinvent..."
2,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4304447345,"Graduate Research Intern, Chemistry",DataAnnotation,"Hawaii, United States (Remote)",https://www.linkedin.com/jobs/view/4304447345/...,Remote,Explore collaborative articles\n \n\n\n...
3,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4293202567,Materials Engineering/Materials Science - Summ...,Honeywell,United States (Remote),https://www.linkedin.com/jobs/view/4293202567/...,Remote,Job Description\nThe future is what you make i...
4,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4294607020,"University, 2026 Summer Games Systems Engineer...",Booz Allen Hamilton,"Honolulu, HI (Hybrid)",https://www.linkedin.com/jobs/view/4294607020/...,Hybrid,"Job Number: R0225531\nUniversity, 2026 Summer ..."
...,...,...,...,...,...,...,...,...,...
11131,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4291961454,Marketing/Communications Intern - Transmission...,Burns & McDonnell,"Kansas City, MO",https://www.linkedin.com/jobs/view/4291961454/...,Unknown,Description\nOur Engineering Interns are respo...
11132,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4308497710,Data Analyst IV,Centene Corporation,"Missouri, United States (Remote)",https://www.linkedin.com/jobs/view/4308497710/...,Remote,You could be the one who changes everything fo...
11133,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4304439194,Paid Internship -- Software Development (Winte...,Interco,"St Louis, MO (On-site)",https://www.linkedin.com/jobs/view/4304439194/...,On-site,Company Description\nInterco is a global metal...
11134,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4288326603,Junior Helpdesk Specialist,Chenega MIOS SBU,"St Louis, MO (On-site)",https://www.linkedin.com/jobs/view/4288326603/...,On-site,Summary\nJunior Helpdesk Specialist \nSt. Loui...


In [10]:
df.to_csv("data_center_jobs.csv", index=False)