In [None]:
import spacy
from spacy.matcher import PhraseMatcher
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:

# Configuration gathered in one place so the run is easy to tune and reproduce.
SPACY_MODEL = "en_core_web_sm"
JOBS_CSV = "../tech_jobs_clean.csv"
SAMPLE_SIZE = 300
RANDOM_SEED = 42

# Lighter pipeline (NER and parser disabled) used for phrase matching.
nlp_fast = spacy.load(SPACY_MODEL, disable=["ner", "parser"])
# Pipeline that keeps the dependency parser, which `doc.noun_chunks` relies on.
nlp_parser = spacy.load(SPACY_MODEL, disable=["ner"])

jobs = pd.read_csv(JOBS_CSV)
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the sample at the data size.
sample_jobs = jobs.sample(n=min(SAMPLE_SIZE, len(jobs)), random_state=RANDOM_SEED)


In [None]:
# Vocabulary of skill phrases to look for in job postings. Entries are written
# in lowercase; being a set, repeated entries would collapse to one anyway.
tech_skills = {
    # Programming, query and markup languages
    "python", "java", "c++", "c#", "javascript", "typescript", "sql", "html", "css",
    # Web / application frameworks
    "react", "angular", "vue", "node.js", "spring", "django", "flask",
    # Data science and machine learning
    "machine learning", "deep learning", "data analysis",
    "natural language processing", "nlp",
    "numpy", "pandas", "scikit-learn", "tensorflow", "keras", "pytorch",
    # Operating systems and shells
    "linux", "unix", "bash", "c shell",
    # Cloud, containers and delivery tooling
    "docker", "kubernetes", "aws", "azure", "gcp", "git", "ci/cd",
    # Engineering, CAD and controls
    "autocad", "solidworks", "matlab", "simulink", "plc", "control systems",
    "siemens apogee", "siemens desigo",
    # Process and project management
    "six sigma", "lean", "quality assurance", "project management",
    "agile", "scrum", "jira", "confluence",
    # Testing
    "pytest", "junit", "selenium", "cypress", "jest",
    "test automation", "unit testing", "integration testing",
    # Other
    "vsam", "security clearance",
}

In [None]:
# Canonical skill name -> list of spellings/aliases that should be reported
# under that canonical name. Each list also contains the canonical name itself.
SKILL_VARIATIONS = {
    # Languages
    "javascript": ["javascript", "js", "node.js", "nodejs", "node js"],
    "typescript": ["typescript", "ts"],
    "python": ["python", "python3", "python 3", "py"],
    # Databases
    "sql": ["sql", "mysql", "postgresql", "postgres", "t-sql", "pl/sql"],
    "nosql": ["nosql", "no-sql", "mongodb", "cassandra", "redis"],
    # Cloud providers
    "aws": ["aws", "amazon web services", "ec2", "s3", "lambda"],
    "azure": ["azure", "microsoft azure"],
    "gcp": ["gcp", "google cloud", "google cloud platform"],
    # DevOps
    "docker": ["docker", "containerization", "containers"],
    "kubernetes": ["kubernetes", "k8s"],
    "ci/cd": ["ci/cd", "ci cd", "continuous integration", "continuous deployment"],
    # Front-end frameworks
    "react": ["react", "react.js", "reactjs"],
    "angular": ["angular", "angularjs", "angular.js"],
    "vue": ["vue", "vue.js", "vuejs"],
}

In [None]:
# Reverse lookup: each lowercased alias maps back to its canonical skill name.
# If an alias were listed under two skills, the later entry would win.
variant_to_canonical = {
    alias.lower(): skill_name
    for skill_name, aliases in SKILL_VARIATIONS.items()
    for alias in aliases
}

In [None]:
# Phrase matcher that compares tokens on their LOWER attribute.
matcher = PhraseMatcher(nlp_fast.vocab, attr="LOWER")

# Turn every phrase into a Doc with make_doc, then register the base skills
# and the known aliases under separate labels.
patterns = list(map(nlp_fast.make_doc, tech_skills))
variation_patterns = list(map(nlp_fast.make_doc, variant_to_canonical))

matcher.add("SKILLS", patterns)
matcher.add("SKILL_VARIANTS", variation_patterns)

In [None]:
def extract_skills(texts):
    """Return the distinct skills found in each text.

    Every text is run through ``nlp_fast`` (in batches of 64) and scanned with
    the module-level ``matcher``. The lowercased text of each matched span is
    translated through ``variant_to_canonical``; spans with no entry there are
    kept as-is.

    Parameters
    ----------
    texts : iterable of str
        Texts to scan.

    Returns
    -------
    list of list of str
        One alphabetically sorted, duplicate-free list of skills per input
        text, in input order.
    """
    results = []
    for parsed in nlp_fast.pipe(texts, batch_size=64):
        matched_texts = (
            parsed[begin:stop].text.lower() for _, begin, stop in matcher(parsed)
        )
        # A set removes repeats, e.g. several aliases of the same skill.
        hits = {variant_to_canonical.get(raw, raw) for raw in matched_texts}
        results.append(sorted(hits))
    return results

In [None]:
# Scan descriptions and titles separately; missing values become empty strings.
description_texts = sample_jobs['description'].fillna("").astype(str).tolist()
desc_skills = extract_skills(description_texts)
title_texts = sample_jobs['title'].fillna("").astype(str).tolist()
title_skills = extract_skills(title_texts)

# A job's skills are the sorted union of those found in its description and title.
sample_jobs['extracted_skills'] = [
    sorted({*from_desc, *from_title})
    for from_desc, from_title in zip(desc_skills, title_skills)
]

In [None]:
# Number of extracted skills for each sampled job.
sample_jobs['extracted_skills'].map(len)

In [None]:
# value_counts() hashes the values it counts and lists are unhashable, so each
# skill list is converted to a tuple before counting identical combinations.
print(sample_jobs['extracted_skills'].apply(tuple).value_counts())

In [None]:
# Show the extracted skills next to the title and description they came from.
preview_columns = ['title', 'extracted_skills', 'description']
print(sample_jobs[preview_columns])

In [None]:
# How often each job title occurs in the sample.
title_frequencies = sample_jobs['title'].value_counts()
print(title_frequencies)

In [None]:
# Lowercase phrases excluded from the common-phrase counts.
DOMAIN_STOPWORDS = {
    # Generic job-posting vocabulary
    "experience", "work", "team", "include", "provide", "require",
    "skill", "skills", "year", "opportunity", "business",
    "customer", "customers", "support", "product", "project",
    "development", "engineering", "design", "position", "ability",
    "job", "company", "role", "employee", "employees",
    "knowledge", "service", "technical", "technology", "information",
    "management", "application", "develop", "engineer", "employment",
    "benefits", "part", "responsibilities", "regard", "people",
    "solutions", "procedures", "training", "diversity",
    # Pronouns and determiners
    "you", "that", "what", "who", "which", "they", "this", "them",
    # Boilerplate phrases
    "this position", "this role", "the ability", "the company", "the world",
    # Equal-opportunity statement wording
    "equal opportunity", "opportunity employer",
    "race", "color", "religion", "sex", "gender identity",
    "sexual orientation", "national origin", "age", "disability",
    "veteran status",
}

In [None]:
def common_words(texts, top_n=20):
    """Return the most frequent noun phrases across ``texts``.

    Each text is parsed with ``nlp_parser`` (in batches of 64) and every noun
    chunk is lowercased and stripped. A phrase is counted once per occurrence
    when it is 4 to 40 characters long, is not in ``DOMAIN_STOPWORDS`` and
    does not start with ``"this "``.

    Parameters
    ----------
    texts : iterable of str
        Texts to analyse.
    top_n : int, optional
        Number of phrases to return (default 20).

    Returns
    -------
    list of (str, int)
        ``(phrase, count)`` pairs, most frequent first.
    """
    phrase_counts = Counter()

    for doc in nlp_parser.pipe(texts, batch_size=64):
        for chunk in doc.noun_chunks:
            phrase = chunk.text.lower().strip()

            # The filter has to run for every chunk. When it sat outside this
            # loop only the last chunk of each document was counted, and a
            # document without chunks reused the previous document's phrase
            # (or failed if it came first).
            if (
                4 <= len(phrase) <= 40
                and phrase not in DOMAIN_STOPWORDS
                and not phrase.startswith("this ")
            ):
                phrase_counts[phrase] += 1

    return phrase_counts.most_common(top_n)

In [None]:
# Most frequent noun phrases in the descriptions and in the titles;
# missing values are treated as empty strings.
common_desc, common_title = (
    common_words(sample_jobs[column].fillna("").astype(str).tolist())
    for column in ('description', 'title')
)

In [None]:
# Display the most frequent noun phrases found in the job descriptions.
common_desc

In [None]:
# Display the most frequent noun phrases found in the job titles.
common_title

Skill Statistics

In [None]:
# Count how often each exact combination of skills occurs. value_counts()
# hashes its values and lists are unhashable, so count tuples instead.
skill_counts = sample_jobs['extracted_skills'].apply(tuple).value_counts()
skill_counts

In [None]:
# Split the sample by whether at least one skill was extracted.
skill_totals = sample_jobs['extracted_skills'].map(len)
jobs_with_skills = sample_jobs.loc[skill_totals > 0]
jobs_without_skills = sample_jobs.loc[skill_totals == 0]

In [None]:
# Report the size of each group.
print(
    f"Jobs with skills: {len(jobs_with_skills)}\n"
    f"Jobs without skills: {len(jobs_without_skills)}"
)

In [None]:
# Store the number of extracted skills per job as its own column.
sample_jobs['num_skills'] = sample_jobs['extracted_skills'].map(len)

In [None]:
# Summary statistics of the per-job skill count.
skills_per_job = sample_jobs['num_skills']
for stat_label, stat_value in (
    ("Mean", skills_per_job.mean()),
    ("Median", skills_per_job.median()),
    ("Min", skills_per_job.min()),
    ("Max", skills_per_job.max()),
):
    print(f"{stat_label} number of skills per job:", stat_value)

In [None]:
# Number of jobs for each skill count, ordered by skill count.
num_skills_distribution = sample_jobs['num_skills'].value_counts().sort_index()
print(num_skills_distribution)

In [None]:
# Flatten the per-job skill lists into one list (one entry per job/skill pair).
all_skills = [
    skill
    for job_skills in sample_jobs['extracted_skills']
    for skill in job_skills
]

In [None]:
# Count in how many jobs each skill appears and list the 20 most frequent.
skill_freq = Counter(all_skills)
top_twenty = skill_freq.most_common(20)
for name, count in top_twenty:
    print(name, count, sep=": ")

Visualize

In [None]:
# 2x2 grid of panels; the top-left panel shows how many jobs have each skill count.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
ax1 = ax[0, 0]
jobs_per_skill_count = sample_jobs['num_skills'].value_counts().sort_index()
jobs_per_skill_count.plot(kind='bar', ax=ax1)
ax1.set_title("Distribution of Number of Skills per Job")
ax1.set_xlabel("Number of Skills")
ax1.set_ylabel("Number of Jobs")
ax1.grid(axis='y', alpha=0.3)

In [None]:
# Top-right panel: share of jobs with at least one extracted skill.
ax2 = ax[0, 1]
n_with, n_without = len(jobs_with_skills), len(jobs_without_skills)
has_skills = [n_with, n_without]
labels = [f'With Skills ({n_with})', f'No Skills ({n_without})']
colors = ['#4CAF50', '#F44336']
ax2.pie(has_skills, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
ax2.set_title("Proportion of Jobs with Extracted Skills")

In [None]:
# Bottom-left panel: horizontal bars for the 15 most frequent skills.
ax3 = ax[1, 0]
top_skills = skill_freq.most_common(15)
# Use a dedicated name for the counts so the `skill_counts` Series computed
# earlier in the notebook is not silently replaced by a tuple. Also guard
# against an empty Counter, for which zip(*...) yields nothing to unpack.
if top_skills:
    skill_names, top_skill_counts = zip(*top_skills)
else:
    skill_names, top_skill_counts = (), ()
ax3.barh(skill_names, top_skill_counts, color='teal')
ax3.set_title('Top 15 Most Common Skills', fontsize=14, fontweight='bold')
ax3.set_xlabel('Frequency')
# Put the most frequent skill at the top of the chart.
ax3.invert_yaxis()

In [None]:
# Bottom-right panel: histogram of skill counts for jobs with at least one skill.
ax4 = ax[1, 1]
nonzero_skill_counts = sample_jobs.loc[sample_jobs['num_skills'] > 0, 'num_skills']
ax4.hist(nonzero_skill_counts, bins=20, color='coral', edgecolor='black')
ax4.set_title('Skills per Job (Excluding Jobs with 0 Skills)', fontsize=14, fontweight='bold')
ax4.set_xlabel('Number of Skills')
ax4.set_ylabel('Frequency')

In [None]:
# The figure was created in an earlier cell. With the inline backend it is
# closed once that cell finishes, so plt.tight_layout()/plt.show() would act
# on a new, empty current figure. Work on `fig` directly and end the cell
# with it so the completed 2x2 figure is rendered.
fig.tight_layout()
fig