In [None]:
import re
from collections import Counter

import pandas as pd

In [None]:
# Never truncate long text cells when a frame or series is displayed.
pd.set_option("display.max_colwidth", None)

# Read the jobs CSV and draw a reproducible 300-row sample (fixed seed).
jobs = pd.read_csv("../tech_jobs_clean.csv")
sample_jobs = jobs.sample(n=300, random_state=42)

# Independent snapshot of the sample, used later as the "before" baseline.
sample_jobs_before = sample_jobs.copy()

In [None]:
# Finds a cue phrase ("experience with", "knowledge of", ...) and captures the
# text that follows it, up to the first delimiter, in the named group "chunk".
#
# The leading \b keeps cues from matching inside longer words, e.g. the
# "familiar with" inside "unfamiliar with" or "skilled in" inside "unskilled in".
#
# Note: a period always ends the chunk, so a dotted name such as "node.js" is
# captured only up to the dot.
CUE_PATTERN = re.compile(
    r"""
    \b                                     # cue must start on a word boundary
    (?:
        experience\s+(?:with|in) |
        proficiency\s+in |
        knowledge\s+of |
        familiar(?:ity)?\s+with |
        skilled\s+in |
        expertise\s+in |
        working\s+knowledge\s+of |
        hands[-\s]?on\s+experience\s+(?:with|in)
    )
    \s+                                   # whitespace after cue
    (?P<chunk>                             # capture the chunk
        .*?                                # non-greedy
    )
    (?=                                    # stop when we hit a delimiter
        [\.\;\n\r] |                       # period/semicolon/newline
        \u2022 |                           # bullet •
        \s-\s |                            # " - " often used in listings
        $                                   # or end of string
    )
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL
)

In [None]:
def extract_skill_chunks_from_description(text: str, max_chunks: int = 10) -> list[str]:
    """Return the distinct phrases that follow a cue match in ``text``.

    Every ``CUE_PATTERN`` match contributes its ``chunk`` group after
    whitespace is collapsed and surrounding separators are stripped. Chunks
    shorter than two characters are ignored, and duplicates are removed
    case-insensitively while keeping first-seen order and spelling.

    Args:
        text: Free-text description. Anything that is not a non-blank string
            (``None``, NaN, numbers, lists, ...) yields an empty list.
        max_chunks: Upper bound on the number of *distinct* chunks returned.
            Values <= 0 yield an empty list.

    Returns:
        A list of at most ``max_chunks`` cleaned chunk strings.
    """
    # Check the type first: it rejects None/NaN/other non-strings without
    # calling pd.isna on array-likes, whose result has no single truth value.
    if not isinstance(text, str) or not text.strip():
        return []
    if max_chunks <= 0:
        return []

    seen = set()
    out = []
    for m in CUE_PATTERN.finditer(text):
        chunk = m.group("chunk").strip()
        chunk = re.sub(r"\s+", " ", chunk)
        chunk = chunk.strip(" :,-–—•*")
        if len(chunk) < 2:
            continue

        # Dedupe while scanning so repeated chunks do not use up the budget.
        key = chunk.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append(chunk)

        if len(out) >= max_chunks:
            break
    return out

In [None]:
def add_description_chunks_to_skills_desc(df: pd.DataFrame,
                                         desc_col="description",
                                         skills_col="skills_desc") -> pd.DataFrame:
    """Append skill phrases extracted from ``desc_col`` to ``skills_col``.

    For each row, the chunks returned by
    ``extract_skill_chunks_from_description`` are joined with ``"; "`` and
    appended to the existing ``skills_col`` text (NaN is treated as ``""``).
    Chunks that already appear as a ``;``-separated entry of the existing text
    (case-insensitive) are skipped, so applying the function again to an
    already-enriched frame does not duplicate them.

    Args:
        df: Frame to enrich. It is modified in place: ``skills_col`` is
            created if absent and then overwritten.
        desc_col: Column holding the free-text description. If the column is
            absent, no chunks are added.
        skills_col: Column holding the skills text to extend.

    Returns:
        The same ``df`` object that was passed in.
    """
    # make sure skills_desc exists
    if skills_col not in df.columns:
        df[skills_col] = ""

    def _merge(existing, desc) -> str:
        existing = "" if pd.isna(existing) else str(existing)

        chunks = extract_skill_chunks_from_description(desc)
        already_listed = {part.strip().lower() for part in existing.split(";")}
        new_chunks = [c for c in chunks if c.lower() not in already_listed]
        if not new_chunks:
            return existing

        separator = "; " if existing.strip() else ""
        return (existing + separator + "; ".join(new_chunks)).strip()

    # Only two columns are needed, so iterate over them directly instead of
    # building a Series per row with df.apply(axis=1).
    if desc_col in df.columns:
        descriptions = df[desc_col]
    else:
        descriptions = [None] * len(df)

    df[skills_col] = [
        _merge(existing, desc)
        for existing, desc in zip(df[skills_col], descriptions)
    ]
    return df

# The function updates the frame it receives and returns that same object;
# rebinding the name keeps the data lineage explicit.
sample_jobs = add_description_chunks_to_skills_desc(df=sample_jobs)

In [None]:
# Rows of the untouched sample whose skills_desc is NaN, empty or whitespace-only.
missing_total_before = (
    sample_jobs_before["skills_desc"].fillna("").astype(str).str.strip() == ""
).sum()

print(f"Total missing before: {missing_total_before}")

In [None]:
# Enrich a fresh copy so sample_jobs_before stays available as the baseline.
sample_jobs_after = add_description_chunks_to_skills_desc(sample_jobs_before.copy())

In [None]:
# Same emptiness check as for the baseline, now on the enriched copy.
missing_total_after = (
    sample_jobs_after["skills_desc"].fillna("").astype(str).str.strip() == ""
).sum()

print(f"Total missing after: {missing_total_after}")

In [None]:
# First 10 skills_desc values of the baseline sample (before any chunks were appended).
sample_jobs_before["skills_desc"].head(10)

In [None]:
# First 10 skills_desc values of the enriched copy, for comparison with the baseline.
sample_jobs_after["skills_desc"].head(10)

In [None]:
# Character length of skills_desc per row; NaN counts as length 0.
# astype(str) matches the other skills_desc checks in this notebook: without
# it, a non-string entry gets a NaN length from .str.len() and silently drops
# out of the means below.
before_len = sample_jobs_before["skills_desc"].fillna("").astype(str).str.len()
after_len = sample_jobs_after["skills_desc"].fillna("").astype(str).str.len()

print("Average length BEFORE:", before_len.mean())
print("Average length AFTER:", after_len.mean())
print("Average increase:", (after_len - before_len).mean())

In [None]:
# Extract the chunks once more into their own column so they can be inspected
# separately from the merged skills_desc text.
sample_jobs_after["extracted_chunks"] = sample_jobs_after["description"].apply(
    extract_skill_chunks_from_description
)

# Flatten all chunks
all_chunks = [chunk for row in sample_jobs_after["extracted_chunks"] for chunk in row]

print("Total chunks extracted:", len(all_chunks))

# Look at most common ones (Counter is imported in the notebook's import cell)
Counter(all_chunks).most_common(15)

In [None]:
# Lower-case keywords used to decide whether an extracted chunk mentions a
# known skill. Grouping below is for readability only; it is one flat set.
tech_skills = {
    # languages and markup
    "python", "java", "c++", "c#", "javascript", "typescript",
    "sql", "html", "css", "matlab", "bash", "c shell",
    # frameworks and libraries
    "react", "angular", "vue", "node.js", "spring", "django", "flask",
    "numpy", "pandas", "scikit-learn", "tensorflow", "keras", "pytorch",
    # data / ML topics
    "machine learning", "deep learning", "data analysis",
    "natural language processing", "nlp",
    # operating systems, containers, cloud, delivery
    "linux", "unix", "docker", "kubernetes",
    "aws", "azure", "gcp", "git", "ci/cd",
    # engineering and controls tooling
    "autocad", "solidworks", "simulink", "plc", "control systems",
    "siemens apogee", "siemens desigo", "vsam",
    # process and management
    "six sigma", "lean", "quality assurance", "project management",
    "agile", "scrum", "jira", "confluence",
    # testing
    "pytest", "junit", "selenium", "cypress", "jest",
    "test automation", "unit testing", "integration testing",
    # other
    "security clearance",
}

In [None]:
def chunk_contains_tech_skill(chunk: str, tech_skills: set[str]) -> bool:
    """Return True if ``chunk`` mentions any of ``tech_skills`` as a whole term.

    Matching is case-insensitive and token-aware: a skill only counts when it
    is not embedded in a longer alphanumeric word. Plain substring matching
    would accept "git" inside "digital" or "lean" inside "clean".

    Args:
        chunk: Text to inspect.
        tech_skills: Skill keywords; blank entries are ignored.

    Returns:
        True on the first skill found, otherwise False.
    """
    chunk_lower = chunk.lower()
    for skill in tech_skills:
        needle = skill.strip().lower()
        if not needle:
            continue

        # Guard only the edges that are alphanumeric, so skills ending in
        # symbols ("c++", "c#") still match when followed by other characters.
        left_guard = r"(?<![a-z0-9])" if needle[0].isalnum() else ""
        right_guard = r"(?![a-z0-9])" if needle[-1].isalnum() else ""
        if re.search(left_guard + re.escape(needle) + right_guard, chunk_lower):
            return True
    return False

In [None]:
# Chunks that mention at least one keyword from tech_skills.
valid_chunks = [c for c in all_chunks if chunk_contains_tech_skill(c, tech_skills)]
print(f"Valid chunks that contain tech skills: {len(valid_chunks)}")

# Share of extracted chunks that hit a keyword; report 0 when nothing was extracted.
if all_chunks:
    print("Precision:", len(valid_chunks) / len(all_chunks))
else:
    print("Precision:", 0)

In [None]:
# Show 10 randomly chosen rows (fixed seed) of title + skills_desc from sample_jobs_after
display(sample_jobs_after.loc[:, ["title", "skills_desc"]].sample(n=10, random_state=42))

In [None]:
# Load the full postings file and keep an independent copy as the baseline.
all_jobs = pd.read_csv("../postings.csv")
all_jobs_before = all_jobs.copy()

In [None]:
# Rows of the full baseline whose skills_desc is NaN, empty or whitespace-only.
missing_skills_before = (
    all_jobs_before["skills_desc"].fillna("").astype(str).str.strip() == ""
).sum()

In [None]:
print(f"Missing before on full dataset: {missing_skills_before}")

In [None]:
# Enrich a fresh copy so all_jobs_before stays available as the baseline.
all_jobs_after = add_description_chunks_to_skills_desc(all_jobs_before.copy())

In [None]:
# Same emptiness check as for the baseline, now on the enriched full dataset.
missing_skills_after = (
    all_jobs_after["skills_desc"].fillna("").astype(str).str.strip() == ""
).sum()

In [None]:
print(f"Missing after on full dataset: {missing_skills_after}")

Missing Experience Level Values