In [None]:
import spacy
from spacy.matcher import PhraseMatcher
import pandas as pd
from collections import Counter

In [None]:

# Lightweight pipeline used for phrase matching: both the NER component and
# the dependency parser are switched off.
nlp_fast = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Second pipeline that keeps the parser enabled (only NER is off); it is the
# one used by common_words() to iterate over doc.noun_chunks.
nlp_parser = spacy.load("en_core_web_sm", disable=["ner"])

# Load the job postings and draw a fixed-seed sample of 300 rows so that
# re-running the notebook works on the same subset.
jobs = pd.read_csv("../tech_jobs_clean.csv")
sample_jobs = jobs.sample(n=300, random_state=42)


In [None]:
# Lower-case vocabulary of skill terms searched for in the job postings.
# Entries are grouped loosely by theme for readability only; the grouping has
# no effect on matching.
tech_skills = {
    # Programming, query and markup languages
    "c#", "c++", "css", "html", "java", "javascript",
    "python", "sql", "typescript",
    # Web / application frameworks
    "angular", "django", "flask", "node.js", "react", "spring", "vue",
    # Data analysis and machine learning
    "data analysis", "deep learning", "keras", "machine learning",
    "natural language processing", "nlp", "numpy", "pandas",
    "pytorch", "scikit-learn", "tensorflow",
    # Operating systems and shells
    "bash", "c shell", "linux", "unix",
    # Cloud, containers and delivery tooling
    "aws", "azure", "ci/cd", "docker", "gcp", "git", "kubernetes",
    # Engineering / controls tools
    "autocad", "control systems", "matlab", "plc",
    "siemens apogee", "siemens desigo", "simulink", "solidworks",
    # Process and project management
    "agile", "confluence", "jira", "lean",
    "project management", "quality assurance", "scrum", "six sigma",
    # Testing
    "cypress", "integration testing", "jest", "junit",
    "pytest", "selenium", "test automation", "unit testing",
    # Other terms
    "security clearance", "vsam",
}

In [None]:
# Case-insensitive phrase matcher over the skill vocabulary (attr="LOWER"
# compares the lower-cased form of each token).
# The matcher is built on nlp_fast's vocab because extract_skills() applies it
# to docs produced by nlp_fast; the previous reference to an undefined `nlp`
# raised NameError on a fresh kernel.
matcher = PhraseMatcher(nlp_fast.vocab, attr="LOWER")
# make_doc only tokenizes, which is all that is needed to build the patterns.
patterns = [nlp_fast.make_doc(s) for s in tech_skills]
matcher.add("SKILLS", patterns)

In [None]:
def extract_skills(texts):
    """Find the skill phrases mentioned in each text.

    Each text is run through ``nlp_fast`` (in batches of 64) and scanned with
    the module-level ``matcher``.

    Parameters
    ----------
    texts : iterable of str
        Raw texts to scan.

    Returns
    -------
    list of list of str
        One entry per input text, in input order: the distinct matched
        spans, lower-cased and sorted alphabetically.
    """
    per_text_skills = []
    for parsed in nlp_fast.pipe(texts, batch_size=64):
        hits = set()
        # Each match is (match_id, start_token, end_token); only the span matters.
        for _match_id, tok_start, tok_end in matcher(parsed):
            hits.add(parsed[tok_start:tok_end].text.lower())
        per_text_skills.append(sorted(hits))
    return per_text_skills

In [None]:
# Scan descriptions and titles separately; missing values become empty
# strings so every row yields a (possibly empty) skill list.
desc_skills = extract_skills(
    sample_jobs["description"].fillna("").astype(str).tolist()
)
title_skills = extract_skills(
    sample_jobs["title"].fillna("").astype(str).tolist()
)

# Per posting: sorted union of the skills found in its description and title.
sample_jobs["extracted_skills"] = [
    sorted({*from_desc, *from_title})
    for from_desc, from_title in zip(desc_skills, title_skills)
]

In [None]:
# Number of distinct skills matched for each sampled posting.
sample_jobs["extracted_skills"].apply(len)

In [None]:
# How many postings share each exact combination of skills.
# The column holds Python lists, which are unhashable and make value_counts()
# raise TypeError, so each list is converted to a tuple before counting.
print(sample_jobs['extracted_skills'].apply(tuple).value_counts())

In [None]:
# Show title, matched skills and description together for the sampled postings.
print(
    sample_jobs[["title", "extracted_skills", "description"]]
)

In [None]:
# Frequency of each job title within the sample.
print(
    sample_jobs["title"].value_counts()
)

In [None]:
# Lower-case phrases that common_words() drops when the whole noun chunk is
# exactly equal to one of them. Grouped by theme for readability only.
DOMAIN_STOPWORDS = {
    # Generic job-posting vocabulary
    "ability", "application", "benefits", "business", "company",
    "customer", "customers", "design", "develop", "development",
    "diversity", "employee", "employees", "employment", "engineer",
    "engineering", "experience", "include", "information", "job",
    "knowledge", "management", "opportunity", "part", "people",
    "position", "procedures", "product", "project", "provide",
    "regard", "require", "responsibilities", "role", "service",
    "skill", "skills", "solutions", "support", "team",
    "technical", "technology", "training", "work", "year",
    # Pronouns and determiners
    "that", "them", "they", "this", "what", "which", "who", "you",
    # Generic multi-word chunks
    "the ability", "the company", "the world",
    "this position", "this role",
    # Equal-opportunity boilerplate
    "age", "color", "disability", "equal opportunity",
    "gender identity", "national origin", "opportunity employer",
    "race", "religion", "sex", "sexual orientation", "veteran status",
}

In [None]:
def common_words(texts, top_n=20, nlp=None, stopwords=None):
    """Return the most frequent noun-chunk phrases across ``texts``.

    Every noun chunk of every document is lower-cased and stripped, then
    counted if it is 4-40 characters long, is not in the stop list and does
    not start with ``"this "``. Counts are total occurrences across all
    documents.

    Parameters
    ----------
    texts : iterable of str
        Raw texts to analyse.
    top_n : int, default 20
        Maximum number of phrases to return.
    nlp : optional
        Pipeline object exposing ``pipe(texts, batch_size=...)`` whose docs
        provide ``noun_chunks``. Defaults to the module-level ``nlp_parser``.
    stopwords : container of str, optional
        Phrases to exclude. Defaults to the module-level ``DOMAIN_STOPWORDS``.

    Returns
    -------
    list of (str, int)
        ``(phrase, count)`` pairs, most frequent first.
    """
    # Resolve the defaults lazily so the globals are only needed when used.
    pipeline = nlp_parser if nlp is None else nlp
    excluded = DOMAIN_STOPWORDS if stopwords is None else stopwords

    phrase_counts = Counter()

    for doc in pipeline.pipe(texts, batch_size=64):
        for chunk in doc.noun_chunks:
            phrase = chunk.text.lower().strip()

            # The filter must run for every chunk. It previously sat outside
            # this loop, so only the last chunk of each document was counted
            # and documents without chunks reused a stale phrase.
            if (
                4 <= len(phrase) <= 40
                and phrase not in excluded
                and not phrase.startswith("this ")
            ):
                phrase_counts[phrase] += 1

    return phrase_counts.most_common(top_n)

In [None]:
# Most frequent noun phrases, computed separately for descriptions and titles;
# missing values are replaced by empty strings first.
common_desc = common_words(
    sample_jobs["description"].fillna("").astype(str).tolist()
)
common_title = common_words(
    sample_jobs["title"].fillna("").astype(str).tolist()
)

In [None]:
# Display the (phrase, count) pairs returned by common_words() for the job descriptions.
common_desc

In [None]:
# Display the (phrase, count) pairs returned by common_words() for the job titles.
common_title