In [1]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from time import sleep

In [2]:
# Load the job listings to enrich. The scraping step further down reads each
# posting's link from this frame (from the `job_url` column by default).
df = pd.read_csv('data_center_jobs.csv')

In [4]:
# Request headers sent with every page fetch: a desktop Chrome User-Agent string.
# The literal is split across adjacent string pieces purely for readability.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    )
}

In [5]:
def extract_information_from_html(html):
    """
    Parse a LinkedIn job posting page and collect its summary fields.

    Returns a dict whose keys are always, in this order:
    "Seniority level", "Employment type", "Job function", "Industries",
    "publish_time", "num_applicants". Any field that cannot be located in
    the page is left as an empty string.
    """
    field_names = (
        "Seniority level",
        "Employment type",
        "Job function",
        "Industries",
        "publish_time",
        "num_applicants",
    )
    results = dict.fromkeys(field_names, "")

    soup = BeautifulSoup(html, 'html.parser')

    # Top-card subline: its third and fourth <span> elements are taken as the
    # publish time and the applicant count, when that many spans exist.
    subline = soup.find('h4', class_='top-card-layout__second-subline')
    subline_spans = subline.find_all('span') if subline else []
    for position, field in ((2, 'publish_time'), (3, 'num_applicants')):
        if position < len(subline_spans):
            results[field] = subline_spans[position].get_text(strip=True)

    # Job-criteria list inside the description section: each <li> pairs an
    # <h3> label with a <span> value.
    description = soup.find('section', class_='description')
    criteria_list = (
        description.find('ul', class_='description__job-criteria-list')
        if description else None
    )
    if not criteria_list:
        return results

    for item in criteria_list.find_all('li'):
        heading = item.find('h3')
        content = item.find('span')
        if not (heading and content):
            continue
        label = heading.get_text(strip=True)
        # Labels that are not already keys of `results` are ignored.
        if label in results:
            results[label] = content.get_text(strip=True)

    return results


In [6]:
def _fetch_job_html(url, row, sleep_between, max_retries):
    """Return the response body for `url`, or None if no attempt got an HTTP 200."""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=header, timeout=15)
            if response.status_code == 200:
                return response.text
            print(f"{response.status_code} @ row {row}")
        except requests.RequestException as e:
            print(f"{e} @ row {row} (attempt {attempt+1})")
        # Pause only when another attempt follows; the caller already pauses
        # once after each URL, so sleeping after the last attempt is wasted time.
        if attempt < max_retries - 1:
            sleep(sleep_between)
    return None


def extract_job_info_from_urls(df, url_col="job_url", sleep_between=1.5, max_retries=3):
    """
    Fetch each job page listed in `df[url_col]` and append the extracted fields.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    url_col : str
        Name of the column holding the job page URLs.
    sleep_between : float
        Seconds to pause between retries and after each fetched URL.
    max_retries : int
        Maximum number of GET attempts per URL.

    Returns
    -------
    pandas.DataFrame
        `df` with its index reset, followed by the columns "Seniority level",
        "Employment type", "Job function", "Industries", "publish_time" and
        "num_applicants". These columns are present even when `df` is empty.
        Rows whose URL is missing/blank, or whose page could not be fetched,
        get empty strings in all six columns.

    Requests are sent with the module-level `header` dict and a 15 second timeout.
    """
    info_columns = [
        "Seniority level", "Employment type", "Job function",
        "Industries", "publish_time", "num_applicants",
    ]
    extracted_data = []

    for i, url in enumerate(df[url_col]):
        # A missing or blank URL can never be fetched, so record an empty row
        # without spending the retry/sleep budget on it.
        if pd.isna(url) or not str(url).strip():
            print(f"missing URL @ row {i}")
            extracted_data.append(dict.fromkeys(info_columns, ""))
            continue

        html = _fetch_job_html(url, i, sleep_between, max_retries)
        if html:
            extracted_data.append(extract_information_from_html(html))
        else:
            extracted_data.append(dict.fromkeys(info_columns, ""))

        sleep(sleep_between)

    # Passing `columns` keeps the output schema fixed even when there are no rows;
    # a DataFrame built from an empty list alone would have no columns at all.
    info_df = pd.DataFrame(extracted_data, columns=info_columns)
    return pd.concat([df.reset_index(drop=True), info_df], axis=1)

In [None]:
# Enrich the listings with fields scraped from each job page. Pages are requested
# row by row with pauses in between, so this cell can take a long time to finish.
df_final = extract_job_info_from_urls(df)
df_final  # display the enriched table

In [None]:
# Save the enriched table; index=False keeps the row index out of the CSV.
df_final.to_csv('data_center_jobs_enriched.csv', index=False)