In [None]:
import spacy
from spacy.matcher import PhraseMatcher
import pandas as pd

In [None]:

# Load the small English pipeline with the NER and dependency-parser
# components disabled.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["ner", "parser"]
)

jobs = pd.read_csv("../tech_jobs_clean.csv")

# Work on a reproducible random subset (fixed seed). The requested size is
# capped at the number of available rows: DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling without replacement).
sample_jobs = jobs.sample(min(300, len(jobs)), random_state=42)


In [None]:
# Lowercase skill phrases to search for, grouped by rough topic for
# readability; the groups are flattened into a single set.
tech_skills = {
    skill
    for group in (
        # Programming, query and markup languages
        ("python", "java", "c++", "c#", "javascript", "typescript",
         "sql", "html", "css"),
        # Web frameworks and runtimes
        ("react", "angular", "vue", "node.js", "spring", "django", "flask"),
        # Data science and machine learning
        ("machine learning", "deep learning", "data analysis",
         "natural language processing", "nlp",
         "numpy", "pandas", "scikit-learn",
         "tensorflow", "keras", "pytorch"),
        # Operating systems, containers, cloud and version control
        ("linux", "unix", "bash", "docker", "kubernetes",
         "aws", "azure", "gcp", "git", "ci/cd"),
        # Engineering and controls tooling
        ("autocad", "solidworks", "matlab", "simulink",
         "plc", "control systems"),
        # Process, quality and project management
        ("six sigma", "lean", "quality assurance", "project management",
         "agile", "scrum", "jira", "confluence"),
        # Testing
        ("pytest", "junit", "selenium", "cypress", "jest",
         "test automation", "unit testing", "integration testing"),
    )
    for skill in group
}

In [None]:
# Tokenize each skill phrase (tokenizer only, via make_doc) to obtain the
# pattern Docs the PhraseMatcher expects.
patterns = list(map(nlp.make_doc, tech_skills))

# Compare on the LOWER token attribute so matching ignores letter case.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("SKILLS", patterns)

In [None]:
def extract_skills(texts):
    """Find the known skill phrases mentioned in each text.

    Uses the module-level ``nlp`` pipeline and ``matcher``.

    Parameters
    ----------
    texts : iterable of str
        Texts to scan; they are streamed through ``nlp.pipe`` in batches
        of 64.

    Returns
    -------
    list of list of str
        One entry per input text, in input order: the distinct matched
        phrases, lowercased and sorted alphabetically.
    """
    def _skills_in(doc):
        # A set collapses repeated mentions of the same phrase.
        hits = set()
        for _match_id, begin, stop in matcher(doc):
            hits.add(doc[begin:stop].text.lower())
        return sorted(hits)

    return [_skills_in(doc) for doc in nlp.pipe(texts, batch_size=64)]

In [None]:
# Missing values become empty strings so every row yields a (possibly empty)
# skill list.
title_skills = extract_skills(
    sample_jobs['title'].fillna("").astype(str).tolist()
)
desc_skills = extract_skills(
    sample_jobs['description'].fillna("").astype(str).tolist()
)

# Per posting: sorted union of the skills found in description and title.
sample_jobs['extracted_skills'] = [
    sorted(set(d).union(t)) for d, t in zip(desc_skills, title_skills)
]

In [None]:
# Number of distinct skills found in each posting.
sample_jobs['extracted_skills'].map(len)

In [None]:
# Each cell holds a Python list, which is unhashable, so calling
# value_counts() on the column directly raises TypeError. Convert every list
# to a tuple first; the result counts how often each exact skill combination
# occurs.
print(sample_jobs['extracted_skills'].map(tuple).value_counts())

In [None]:
# Show the extracted skills next to the title and description they came from.
print(sample_jobs.loc[:, ['title', 'extracted_skills', 'description']])

In [None]:
# Frequency of each job title in the sample.
print(sample_jobs.loc[:, 'title'].value_counts())