In [1]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from time import sleep

In [2]:
df = pd.read_csv('data_center_jobs.csv')

In [4]:
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

In [5]:
def extract_information_from_html(html):
    """
    Extract key job info from a LinkedIn job HTML page:
    - Seniority level
    - Employment type
    - Job function
    - Industries
    - Publish time
    - Num applicants
    """

    results = {
        "Seniority level": "",
        "Employment type": "",
        "Job function": "",
        "Industries": "",
        "publish_time": "",
        "num_applicants": ""
    }

    soup = BeautifulSoup(html, 'html.parser')

    # --- Publish time & applicants ---
    company = soup.find('h4', class_='top-card-layout__second-subline')
    if company:
        spans = company.find_all('span')
        if len(spans) >= 3:
            results['publish_time'] = spans[2].get_text(strip=True)
        if len(spans) >= 4:
            results['num_applicants'] = spans[3].get_text(strip=True)

    # --- Job description (structured criteria) ---
    description = soup.find('section', class_='description')
    if description:
        job_criteria = description.find('ul', class_='description__job-criteria-list')
        if job_criteria:
            criteria = job_criteria.find_all('li')
            for criterion in criteria:
                key = criterion.find('h3')
                value = criterion.find('span')
                if key and value:
                    key_text = key.get_text(strip=True)
                    val_text = value.get_text(strip=True)
                    if key_text in results:  # only keep the 4 description-related ones
                        results[key_text] = val_text

    return results


In [6]:
def extract_job_info_from_urls(df, url_col="job_url", sleep_between=1.5, max_retries=3):
    extracted_data = []

    for i, url in enumerate(df[url_col]):
        html = None
        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=header, timeout=15)
                if response.status_code == 200:
                    html = response.text
                    break
                else:
                    print(f"{response.status_code} @ row {i}")
            except requests.RequestException as e:
                print(f"{e} @ row {i} (attempt {attempt+1})")
            sleep(sleep_between)  # wait before retry

        if html:
            info = extract_information_from_html(html)
            extracted_data.append(info)
        else:
            extracted_data.append({k: "" for k in [
                "Seniority level", "Employment type", "Job function", 
                "Industries", "publish_time", "num_applicants"
            ]})

        sleep(sleep_between)

    info_df = pd.DataFrame(extracted_data)
    return pd.concat([df.reset_index(drop=True), info_df], axis=1)

In [9]:
df_final = extract_job_info_from_urls(df)
df_final

404 @ row 47
404 @ row 47
404 @ row 47
Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan? @ row 50 (attempt 1)
Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan? @ row 50 (attempt 2)
Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan? @ row 50 (attempt 3)
404 @ row 707
404 @ row 707
404 @ row 707
404 @ row 720
404 @ row 720
404 @ row 720
404 @ row 770
404 @ row 770
404 @ row 770
404 @ row 1383
404 @ row 1383
404 @ row 1383
404 @ row 1458
404 @ row 1458
404 @ row 1458
404 @ row 1461
404 @ row 1461
404 @ row 1461
404 @ row 1528
404 @ row 1528
404 @ row 1528
404 @ row 1761
404 @ row 1761
404 @ row 1761
404 @ row 1836
404 @ row 1836
404 @ row 1836
429 @ row 1932
404 @ row 1940
404 @ row 1940
404 @ row 1940
429 @ row 1966
404 @ row 1986
404 @ row 1986
404 @ row 1986
404 @ row 2345
404 @ row 2345
404 @ row 2345
404 @ row 2781
404 @ row 2781
404 @ row 2781
404 @ row 2782
404 @ row 2782
404 @ row 2782
404 @ row 2884
404 @ row 2884
404 @ r

Unnamed: 0,state,source_url,job_id,job_name,company_name,job_location,job_url,workplace_type,job_description,Seniority level,Employment type,Job function,Industries,publish_time,num_applicants
0,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4.304561e+09,Target Digital Network Analyst II - Utah Data ...,Altamira Technologies Corporation,"Wahiawa, HI (On-site)",https://www.linkedin.com/jobs/view/4304560956/...,On-site,Target Digital Network Analyst\nClearance: TS/...,Entry level,Full-time,Information Technology,Software Development,5 days ago,
1,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4.305162e+09,Society of Women Engineers WE25 Conference Att...,Jacobs,"Honolulu, HI",https://www.linkedin.com/jobs/view/4305161555/...,Unknown,"At Jacobs, we're challenging today to reinvent...",Not Applicable,Full-time,Engineering and Information Technology,"Civil Engineering, Design Services, and IT Ser...",1 week ago,
2,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4.304447e+09,"Graduate Research Intern, Chemistry",DataAnnotation,"Hawaii, United States (Remote)",https://www.linkedin.com/jobs/view/4304447345/...,Remote,Explore collaborative articles\n \n\n\n...,Internship,Contract,"Research, Analyst, and Information Technology",Software Development,2 weeks ago,
3,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4.293203e+09,Materials Engineering/Materials Science - Summ...,Honeywell,United States (Remote),https://www.linkedin.com/jobs/view/4293202567/...,Remote,Job Description\nThe future is what you make i...,Internship,Full-time,Engineering and Information Technology,"Appliances, Electrical, and Electronics Manufa...",2 weeks ago,142 applicants
4,HI,https://www.linkedin.com/jobs/search/?sortBy=R...,4.294607e+09,"University, 2026 Summer Games Systems Engineer...",Booz Allen Hamilton,"Honolulu, HI (Hybrid)",https://www.linkedin.com/jobs/view/4294607020/...,Hybrid,"Job Number: R0225531\nUniversity, 2026 Summer ...",Not Applicable,Internship,Information Technology,IT Services and IT Consulting,1 week ago,58 applicants
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11131,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4.291961e+09,Marketing/Communications Intern - Transmission...,Burns & McDonnell,"Kansas City, MO",https://www.linkedin.com/jobs/view/4291961454/...,Unknown,Description\nOur Engineering Interns are respo...,Not Applicable,Full-time,Marketing and Sales,Construction,2 weeks ago,47 applicants
11132,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4.308498e+09,Data Analyst IV,Centene Corporation,"Missouri, United States (Remote)",https://www.linkedin.com/jobs/view/4308497710/...,Remote,You could be the one who changes everything fo...,Not Applicable,Full-time,Information Technology,Hospitals and Health Care and Insurance,1 week ago,62 applicants
11133,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4.304439e+09,Paid Internship -- Software Development (Winte...,Interco,"St Louis, MO (On-site)",https://www.linkedin.com/jobs/view/4304439194/...,On-site,Company Description\nInterco is a global metal...,Internship,Internship,Engineering and Information Technology,Renewable Energy Semiconductor Manufacturing,2 weeks ago,189 applicants
11134,MO,https://www.linkedin.com/jobs/search/?sortBy=R...,4.288327e+09,Junior Helpdesk Specialist,Chenega MIOS SBU,"St Louis, MO (On-site)",https://www.linkedin.com/jobs/view/4288326603/...,On-site,Summary\nJunior Helpdesk Specialist \nSt. Loui...,Entry level,Full-time,Information Technology,Defense and Space Manufacturing,6 days ago,63 applicants


In [10]:
df_final.to_csv('data_center_jobs_enriched.csv', index=False)