In [1]:
import re, time, random, json
from html import unescape
from crawler import linkedin_common_crawler, login_linkedin
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
from webdriver_manager.chrome import ChromeDriverManager
import pickle
import os
from getpass import getpass

In [2]:
# Load the previously scraped job listings; the enrichment cell further down reads the
# `job_url` column of this frame.
# NOTE(review): the saved output of the last cell shows an "Unnamed: 0" column, which usually
# means the CSV was written together with its index. Consider `index_col=0` here if no other
# cell relies on that column. TODO confirm.
df = pd.read_csv("Raw_linkedin_jobs.csv")

In [3]:
# Obtain the browser session used by every scraping helper below via the project's
# `crawler.login_linkedin` helper (presumably returns a Selenium WebDriver; verify in crawler).
# NOTE(review): the saved run of this cell ended in a WebDriverException ("Can not connect to
# the Service .../chromedriver"), so `driver` was not created by this recorded execution.
driver = login_linkedin() 

WebDriverException: Message: Can not connect to the Service /Users/50357691/.wdm/drivers/chromedriver/mac64/140.0.7339.207/chromedriver-mac-arm64/chromedriver


In [None]:
# ---------- optional cookie helpers ----------
def save_cookies(driver, path="li_cookies.pkl"):
    """Pickle the driver's current cookies to ``path``.

    Args:
        driver: Object exposing ``get_cookies()`` (a Selenium WebDriver in this notebook).
        path: Destination file; overwritten if it already exists.
    """
    # Fetch first so a failing driver call does not truncate an existing cookie file.
    cookies = driver.get_cookies()
    # The context manager flushes and closes the handle even if pickling raises.
    with open(path, "wb") as fh:
        pickle.dump(cookies, fh)
    print(f"Saved cookies → {path}")

def load_cookies(driver, path="li_cookies.pkl"):
    """Restore cookies pickled at ``path`` into ``driver`` and open the LinkedIn feed.

    Args:
        driver: Object exposing ``get(url)`` and ``add_cookie(dict)``.
        path: Pickle file holding a list of cookie dicts.

    Returns:
        ``False`` when ``path`` does not exist, otherwise ``True``.
    """
    if not os.path.exists(path):
        print("No cookie file found.")
        return False

    # SECURITY: pickle.load can execute arbitrary code embedded in the file. Only load
    # cookie files that were written by this notebook on a trusted machine.
    with open(path, "rb") as fh:
        cookies = pickle.load(fh)

    # Cookies can only be added while the browser is on the matching domain.
    driver.get("https://www.linkedin.com/")
    rejected = 0
    for cookie in cookies:
        # The stored 'sameSite' value is dropped before handing the cookie to the driver.
        cookie.pop('sameSite', None)
        try:
            driver.add_cookie(cookie)
        except Exception:
            # Best effort: one bad cookie must not abort the restore, but do report it.
            rejected += 1
    if rejected:
        print(f"Skipped {rejected} cookie(s) the driver rejected.")

    driver.get("https://www.linkedin.com/feed/")
    time.sleep(2)
    return True

# ---------- parsing config ----------
# Maps a lower-cased heading line found in a job description to the output column its
# following text is filed under. Several spellings collapse onto the same column.
_HEADING_MAP = {
    "responsibilities": "Responsibilities",
    "what you will do": "Responsibilities",
    "what you'll do": "Responsibilities",
    "what you do": "Responsibilities",
    "duties": "Responsibilities",
    "key duties": "Responsibilities",
    "role & responsibilities": "Responsibilities",
    "role and responsibilities": "Responsibilities",
    "requirements": "Requirements",
    "must have": "QualificationsRequired",
    "you have": "QualificationsRequired",
    "required qualifications": "QualificationsRequired",
    "minimum qualifications": "QualificationsRequired",
    "basic qualifications": "QualificationsRequired",
    "qualifications": "QualificationsRequired",
    "preferred qualifications": "QualificationsPreferred",
    "preferred": "QualificationsPreferred",
    "nice to have": "QualificationsPreferred",
    "bonus": "QualificationsPreferred",
    "plus": "QualificationsPreferred",
}
# Matches a line that consists solely of one of the headings above (case-insensitive),
# optionally followed by a single colon and surrounding whitespace. Keys are sorted
# longest-first so longer alternatives are tried before their shorter substrings.
_heading_regex = re.compile(r"^\s*(?:" + r"|".join(re.escape(k) for k in sorted(_HEADING_MAP, key=len, reverse=True)) + r")\s*:?\s*$", re.IGNORECASE)
# Matches a list marker at the start of a line: either a run of bullet glyphs followed by
# whitespace, or digits followed by "." or ")" and whitespace.
_bullet_like = re.compile(r"^\s*[-–•●◦▪■☐✅▶︎»]+(?:\s+|\t)|^\s*\d+[\.\)]\s+")

def _sleep(a=0.6, b=1.2): time.sleep(random.uniform(a,b))
def _clean_text(t):
    t = re.sub(r"\r", "\n", t); t = re.sub(r"[ \t]+\n", "\n", t); t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()
def _normalize_bullets_block(text):
    """Strip every line of ``text`` and remove a leading list marker where one is present.

    Returns the processed lines re-joined with newlines, with outer whitespace stripped.
    """
    cleaned = []
    for raw in text.splitlines():
        stripped = raw.strip()
        # Substitution leaves lines without a leading marker unchanged.
        cleaned.append(_bullet_like.sub("", stripped).strip())
    return "\n".join(cleaned).strip()
def wait(driver, t=15):
    """Return a ``WebDriverWait`` for ``driver`` constructed with ``t`` as its timeout argument."""
    waiter = WebDriverWait(driver, t)
    return waiter
def _scroll_into_view(driver, el):
    """Scroll ``el`` to the vertical centre of the viewport, then pause briefly."""
    script = "arguments[0].scrollIntoView({block:'center'});"
    driver.execute_script(script, el)
    _sleep(0.2, 0.4)

# ---------- expand "About the job" ----------
def expand_about_section(driver):
    """Click every visible 'See more' control in the job-description area.

    The description section is looked up with several XPath candidates (12 s wait each).
    If none is found, the whole ``<body>`` is searched instead. Errors raised while
    handling one XPath's buttons are ignored so the remaining XPaths are still tried.
    A one-line summary of the number of clicks is printed.
    """
    section_xpaths = (
        "//section[.//h2[contains(translate(.,'ABOUTHEJOB','abouthejob'),'about the job')]]",
        "//section[contains(@aria-label,'About the job') or contains(@class,'jobs-description')]",
        "//div[contains(@class,'jobs-description')]//section",
    )
    button_xpaths = (
        ".//button[contains(., 'See more') or .//span[contains(.,'See more')]]",
        ".//a[contains(., 'See more') or .//span[contains(.,'See more')]]",
        ".//button[contains(@aria-expanded,'false') and (contains(.,'See more') or contains(@aria-label,'See more'))]",
    )

    sec = None
    for section_xp in section_xpaths:
        try:
            sec = wait(driver, 12).until(EC.presence_of_element_located((By.XPATH, section_xp)))
            _scroll_into_view(driver, sec)
        except TimeoutException:
            continue
        break

    # Fall back to the whole page when no description section was located.
    root = sec if sec else driver.find_element(By.TAG_NAME, "body")

    clicked = 0
    for button_xp in button_xpaths:
        try:
            for btn in root.find_elements(By.XPATH, button_xp):
                if btn.is_displayed():
                    _scroll_into_view(driver, btn)
                    _sleep(0.1, 0.2)
                    try:
                        btn.click()
                    except ElementClickInterceptedException:
                        # Something overlays the button; click it through JavaScript instead.
                        driver.execute_script("arguments[0].click();", btn)
                    clicked += 1
                    _sleep(0.15, 0.35)
        except Exception:
            pass

    if clicked:
        print(f"[expand] Clicked {clicked} 'See more' buttons.")
    else:
        print("[expand] No 'See more' buttons (maybe already expanded).")

# ---------- hidden JSON helpers ----------
def _flatten_strings(x):
    if isinstance(x, (list, tuple)):
        return "\n".join(_flatten_strings(i) for i in x if i is not None).strip()
    return str(x) if x is not None else ""
def _strip_html(html):
    if not html: return ""
    html = unescape(html)
    html = re.sub(r"<\s*br\s*/?>", "\n", html, flags=re.I)
    html = re.sub(r"</\s*p\s*>", "\n\n", html, flags=re.I)
    html = re.sub(r"<[^>]+>", "", html)
    html = re.sub(r"\n{3,}", "\n\n", html)
    return html.strip()
def _walk_find(obj, keys=("formattedDescription","description","plainDescription")):
    if isinstance(obj, dict):
        for k in keys:
            if k in obj and obj[k]: return obj[k]
        for v in obj.values():
            got = _walk_find(v, keys)
            if got: return got
    elif isinstance(obj, list):
        for v in obj:
            got = _walk_find(v, keys)
            if got: return got
    return None

# ---------- extraction (DOM → bpr-guid → JSON-LD → body text) ----------
def _about_from_dom(driver, timeout=10):
    """Return cleaned text from the first description selector that yields text, else None."""
    selectors = [
        (By.CSS_SELECTOR, "section[aria-label*='About the job'] .show-more-less-html__markup"),
        (By.CSS_SELECTOR, "section[aria-label*='About the job'] [data-test-description]"),
        (By.CSS_SELECTOR, "section[aria-label*='About the job'] .jobs-description__content"),
        (By.CSS_SELECTOR, "section[aria-label*='About the job'] .jobs-description__container"),
        (By.CSS_SELECTOR, "div.show-more-less-html__markup"),
        (By.CSS_SELECTOR, ".jobs-description__content .show-more-less-html__markup"),
        (By.XPATH, "//section[.//h2[contains(translate(.,'ABOUTHEJOB','abouthejob'),'about the job')]]//*[self::div or self::article][string-length(normalize-space())>0]"),
    ]
    for by, sel in selectors:
        try:
            el = wait(driver, timeout).until(EC.presence_of_element_located((by, sel)))
            _scroll_into_view(driver, el)
            txt = (el.get_attribute("innerText") or el.text or "").strip()
        except TimeoutException:
            continue
        if txt:
            print(f"[extract] DOM matched {by} {sel}")
            return _clean_text(txt)
    return None


def _about_from_bpr_json(driver):
    """Return the description found in a ``<code id="bpr-guid-...">`` JSON payload, else None."""
    try:
        for code_el in driver.find_elements(By.CSS_SELECTOR, "code[id^='bpr-guid-']"):
            raw = (code_el.get_attribute("innerHTML") or "").strip()
            # A JSON document can only start with an object or an array.
            if not raw or raw[0] not in "{[":
                continue
            # Cheap substring pre-filter before paying for a full JSON parse.
            if not any(marker in raw for marker in ("description", "jobPosting", "decoratedJobPosting")):
                continue
            try:
                data = json.loads(raw)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            html_block = _walk_find(data, keys=("formattedDescription","description","plainDescription"))
            if html_block:
                print("[extract] Using JSON fallback (bpr-guid).")
                return _clean_text(_strip_html(_flatten_strings(html_block)))
    except Exception as e:
        print(f"[extract] JSON bpr error: {e}")
    return None


def _about_from_json_ld(driver):
    """Return the ``description`` of a JSON-LD ``<script>`` payload, else None."""
    def _ld_desc(d):
        return d.get("description") if isinstance(d, dict) else None

    try:
        for script_el in driver.find_elements(By.CSS_SELECTOR, "script[type='application/ld+json']"):
            raw = (script_el.get_attribute("innerHTML") or "").strip()
            if not raw or '"description"' not in raw:
                continue
            try:
                data = json.loads(raw)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(data, list):
                for item in data:
                    desc = _ld_desc(item)
                    if desc:
                        print("[extract] Using JSON-LD fallback (list).")
                        return _clean_text(_strip_html(desc))
            else:
                desc = _ld_desc(data)
                if desc:
                    print("[extract] Using JSON-LD fallback (dict).")
                    return _clean_text(_strip_html(desc))
    except Exception as e:
        print(f"[extract] JSON-LD error: {e}")
    return None


def _about_from_body_text(driver):
    """Return the slice of the page's visible text that follows 'about the job', else None."""
    body_text = (driver.execute_script("return document.body.innerText") or "").strip()
    if not body_text:
        return None
    t = re.sub(r"[ \t]+", " ", body_text)
    t = re.sub(r"\n{3,}", "\n\n", t)
    m = re.search(r"(about the job)\s*[:：]?\s*(.+)", t, flags=re.I|re.S)
    if not m:
        return None
    after = m.group(2)
    # The description ends where one of these follow-up page sections begins.
    stops = [r"\n\s*benefits\b", r"\n\s*pay\s+or\s+salary\b", r"\n\s*posted by\b",
             r"\n\s*seniority level\b", r"\n\s*job function\b", r"\n\s*industr(y|ies)\b",
             r"\n\s*skills\b", r"\n\s*company overview\b", r"\n\s*more about\b"]
    stop_m = re.compile("|".join(stops), flags=re.I).search(after)
    about = (after[:stop_m.start()] if stop_m else after).strip()
    print("[extract] Using body-text fallback slice.")
    return _clean_text(about)


def extract_about_text(driver, dom_timeout=10):
    """Extract the job description text from the currently loaded page.

    Strategies are tried in order and the first one that produces a result wins:
    visible DOM selectors, ``<code id="bpr-guid-...">`` JSON, JSON-LD ``<script>``
    blocks, and finally a slice of the page's visible text.

    Args:
        driver: Selenium driver positioned on a job page.
        dom_timeout: Seconds to wait for EACH DOM selector. Selectors are tried one after
            another, so a page matching none of them costs ``7 * dom_timeout`` seconds
            before the JSON fallbacks run. Lower this to speed up batch runs.

    Returns:
        The cleaned description text, or ``""`` when no strategy found content.
    """
    # extra time for hydration
    try:
        wait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "main#main, body")))
    except TimeoutException:
        pass

    strategies = (
        lambda: _about_from_dom(driver, dom_timeout),
        lambda: _about_from_bpr_json(driver),
        lambda: _about_from_json_ld(driver),
        lambda: _about_from_body_text(driver),
    )
    for strategy in strategies:
        text = strategy()
        # None means "strategy did not apply"; an empty string is still a final answer.
        if text is not None:
            return text

    print("[extract] No content found.")
    return ""

# ---------- parsing → 5 columns ----------
def split_into_sections(full_text):
    """Split a job description into the five output columns using heading lines.

    Lines matching ``_heading_regex`` switch the current column; all other lines are
    appended to it. Text before the first heading goes to ``"Description"``.

    Args:
        full_text: Description text, optionally prefixed with "About the job:".

    Returns:
        Dict with keys ``Description``, ``Responsibilities``, ``QualificationsRequired``,
        ``QualificationsPreferred`` and ``Requirements``. When no recognised heading
        produced content, everything is returned under ``Description``.
    """
    columns = ["Description", "Responsibilities", "QualificationsRequired",
               "QualificationsPreferred", "Requirements"]

    full_text = re.sub(r"^\s*about the job\s*:\s*", "", full_text, flags=re.I).strip()
    full_text = _normalize_bullets_block(_clean_text(full_text))

    sections, buf, current_key = [], [], "Description"
    for raw_line in full_text.splitlines():
        line = raw_line.strip()
        if _heading_regex.match(line):
            if buf:
                sections.append((current_key, "\n".join(buf).strip()))
                buf = []
            # The heading regex tolerates a trailing colon ("Requirements:"), but the map
            # keys have none. Drop it before the lookup; indexing with the raw line would
            # raise KeyError for every such heading.
            heading = re.sub(r"\s*:?\s*$", "", line, count=1).lower()
            current_key = _HEADING_MAP.get(heading, current_key)
        else:
            buf.append(raw_line)
    if buf:
        sections.append((current_key, "\n".join(buf).strip()))

    if not any(k != "Description" for k, _ in sections):
        return {"Description": full_text, "Responsibilities": "", "QualificationsRequired": "",
                "QualificationsPreferred": "", "Requirements": ""}

    out = {k: "" for k in columns}
    for k, v in sections:
        v = v.strip()
        if not v:
            continue
        # A generic "qualifications" block that talks about optional extras is filed
        # under the preferred column instead.
        if k == "QualificationsRequired" and re.search(r"\b(preferred|nice to have|bonus|plus)\b", v, flags=re.I):
            k = "QualificationsPreferred"
        out[k] = (out[k] + ("\n\n" if out[k] else "") + v).strip()
    return out

def parse_about_to_columns(about_text):
    """Parse ``about_text`` into sections and guarantee all five column keys are present."""
    columns = split_into_sections(about_text)
    expected = ("Description", "Responsibilities", "QualificationsRequired",
                "QualificationsPreferred", "Requirements")
    for name in expected:
        if name not in columns:
            columns[name] = ""
    return columns

# ---------- public API ----------
def load_job_and_get_about(driver, url, timeout=25):
    """Open ``url``, expand the description section and return its extracted text.

    A missing ``main``/``body`` element within ``timeout`` seconds is only reported;
    extraction is attempted regardless.
    """
    driver.get(url)
    page_ready = EC.presence_of_element_located((By.CSS_SELECTOR, "main#main, body"))
    try:
        wait(driver, timeout).until(page_ready)
    except TimeoutException:
        print("[warn] main/body not detected in time; continuing.")
    _sleep(0.8, 1.6)
    expand_about_section(driver)
    about_text = extract_about_text(driver)
    return about_text

def enrich_with_about_columns(driver, df, url_col="job_url", max_urls=None):
    """Scrape each job URL in ``df`` and left-merge the parsed description columns onto it.

    Args:
        driver: Selenium driver handed to ``load_job_and_get_about``.
        df: Input frame; must contain ``url_col``.
        url_col: Name of the column holding job URLs. Only values starting with
            ``"http"`` are scraped.
        max_urls: If given, scrape at most this many distinct URLs (in first-seen order).

    Returns:
        A new frame with the rows of ``df`` plus the columns ``Description``,
        ``Responsibilities``, ``QualificationsRequired``, ``QualificationsPreferred``
        and ``Requirements``. Rows whose URL was not scraped hold NaN there; URLs whose
        scrape failed hold empty strings.

    Raises:
        AssertionError: If ``url_col`` is not a column of ``df``.
    """
    section_cols = ["Description", "Responsibilities", "QualificationsRequired",
                    "QualificationsPreferred", "Requirements"]
    assert url_col in df.columns, f"Column '{url_col}' not in df. Columns: {df.columns.tolist()}"

    is_http = df[url_col].astype(str).str.startswith("http")
    # Scrape each distinct URL once. Duplicates would cost extra page loads and multiply
    # rows in the merge below (many-to-many on the URL key). dict.fromkeys keeps order.
    urls = list(dict.fromkeys(df.loc[is_http, url_col].tolist()))
    if max_urls is not None:
        urls = urls[:max_urls]

    rows = []
    for i, url in enumerate(urls, 1):
        print(f"\n[{i}/{len(urls)}] {url}")
        try:
            about = load_job_and_get_about(driver, url)
            cols = parse_about_to_columns(about)
        except Exception as e:
            # One failing page must not abort the batch; record an empty row instead.
            print(f"[error] {e.__class__.__name__}: {e}")
            cols = {k: "" for k in section_cols}
        rows.append({url_col: url, **cols})
        _sleep(0.8, 1.4)

    # Explicit columns keep the merge key present even when nothing was scraped;
    # pd.DataFrame([]) has no columns at all and would make the merge raise KeyError.
    out_df = pd.DataFrame(rows, columns=[url_col, *section_cols])
    return df.merge(out_df, on=url_col, how="left")

def debug_one(driver, url):
    """Load a single job page, run the expand/extract steps and print a preview.

    Prints the URL and the first 600 characters of the extracted text, and returns
    the full extracted text.
    """
    print("URL:", url)
    driver.get(url)
    page_ready = EC.presence_of_element_located((By.CSS_SELECTOR, "main#main, body"))
    try:
        WebDriverWait(driver, 25).until(page_ready)
    except TimeoutException:
        print("[debug] main/body did not appear; continuing.")
    time.sleep(1.0)
    expand_about_section(driver)
    about = extract_about_text(driver)
    preview = (about or "")[:600]
    print("\n---- ABOUT (first 600 chars) ----\n", preview)
    return about

In [None]:
# Smoke test: enrich only the first two listings and display the resulting frame.
test_df = enrich_with_about_columns(driver, df.head(2), url_col="job_url")
test_df


[1/2] https://www.linkedin.com/jobs/view/4092074034/?eBP=CwEAAAGZhxZCI9vwBpBI6eId1Zx3Mf1JuIGWmnJtjEnsoAv9T0iE-htd73q_t3lftPPZfTYGC77a3gd-El5lFq0fQe5-YQRZYOEye0d9j9YoYnNmAqtZaQVFwBJnInY4rCW0w_T_GGlzvXzvIJPRiysjjagNKVTadpohwooei1qs1dROGU4GrpafBr73-HM4txURoUM1FmtAjmnkSTtnTNtH0sA3j4Mr4yV537kSf6owb6lMUjH3_d8RP75_gigJw_ZJeFzlbPdNrg3L2nQxcYG4KMOksHtPqlAvh1_xHdGuRJtHpgn5RL5eBdJAWDLN-9HDwfGLR5BT2BiAFmH2peHSiPC8-ZKtkXqpmSqlJ1q1CuaXyaVDIrIoQH3acaDssAH_dudZK-lQQdo9y3Y15FH2_kLNLgsh7jixxrgZOxBp4CcKgak97sKHBpB-dIBQ7_1na97mJzguUqIkkabdacPYmOCQFnPYaC3ujBsc-DBlfngM_gftr6r7uvzt_a7tI_PFKQA&refId=AFK0i5jcJKRfsl58VjGnCQ%3D%3D&trackingId=UbrP4fMEOUvYe1oeK2HrzQ%3D%3D&trk=flagship3_search_srp_jobs
[expand] No 'See more' buttons (maybe already expanded).
[extract] DOM matched css selector div.show-more-less-html__markup

[2/2] https://www.linkedin.com/jobs/view/4150887505/?eBP=CwEAAAGZhxZCI2hKoJCF4gOaMS9-HIpIFrV5_aYaZvE4Jn79WmAMtMTp9xIFLCeMZRLT6IrO9xpGav6VuH3ZQIwheDhImSPJEny9puU3iShGo6oddcgbZxpeGaYI8dRzYGuo2Re

Unnamed: 0,job_id,job_name,company_name,job_location,job_metadata,job_url,original_url,Description,Responsibilities,QualificationsRequired,QualificationsPreferred,Requirements
0,4092074000.0,Experienced Apprentice Electrician - Commercial,Weifield Group Contracting | A LOENBRO Company,"Centennial, CO (On-site)","$23/hr - $29/hr · 401(k), Medical, Vision, +1 ...",https://www.linkedin.com/jobs/view/4092074034/...,https://www.linkedin.com/jobs/search/?keywords...,Brief Description\n\n*This posting is for Expe...,While continuing your education in the evening...,,,
1,4150888000.0,2025 Summer Civil Engineer Assistant,BKF Engineers,"Pleasanton, CA (On-site)",$20/hr - $31.47/hr,https://www.linkedin.com/jobs/view/4150887505/...,https://www.linkedin.com/jobs/search/?keywords...,Are you a passionate civil engineering student...,,,,Currently enrolled in B.S. Civil Engineering o...
