In [None]:
pip install spacy

In [None]:
import re
import pandas as pd
import spacy
import matplotlib.pyplot as plt

In [None]:
# Never truncate long text columns when a frame is displayed.
pd.options.display.max_colwidth = None

# Read the raw postings and preview the first rows.
df = pd.read_csv('../postings.csv')
print(df.head())


In [None]:
# Keep only the columns this notebook works with.
jobs = df.loc[
    :,
    [
        'job_id',
        'title',
        'description',
        'skills_desc',
        'location',
        'formatted_experience_level',
        'formatted_work_type',
        'remote_allowed',
        'company_id',
    ],
]
print(jobs.head())

In [None]:
# Drop postings that lack a description or a company id, then treat a
# missing `remote_allowed` value as 0.
jobs = (
    jobs
    .dropna(subset=['description', 'company_id'])
    .assign(remote_allowed=lambda frame: frame['remote_allowed'].fillna(0))
)

In [None]:
# Number of missing values remaining in each column.
jobs.isna().sum()

In [None]:
# Lowercase keywords looked up (as substrings) in lowercased job titles.
tech_keywords = [
    # role nouns
    "software",
    "engineer",
    "developer",
    "programmer",
    # data / ML
    "data",
    "machine learning",
    "ml",
    "ai",
    # application stack
    "backend",
    "frontend",
    "full stack",
    # infrastructure
    "cloud",
    "devops",
    "platform",
]

In [None]:
def is_tech_job(title, keywords=None):
    """Return 1 if ``title`` contains any tech keyword, otherwise 0.

    Matching is a case-insensitive *substring* test on the lowercased
    title, so short keywords also hit inside longer words (for example
    "ai" is found in "retail").

    Parameters
    ----------
    title : str or missing value
        Job title to classify. Missing values (``None`` / NaN) yield 0.
    keywords : iterable of str, optional
        Lowercase keywords to look for. When omitted, the notebook-level
        ``tech_keywords`` list is used.

    Returns
    -------
    int
        1 when at least one keyword occurs in the title, else 0.
    """
    if pd.isna(title):
        return 0
    if keywords is None:
        # Resolved at call time so the default follows the notebook's list.
        keywords = tech_keywords
    lowered = title.lower()
    return int(any(keyword in lowered for keyword in keywords))
# Flag each posting (1 = tech, 0 = not) from its title.
jobs['is_tech_job'] = jobs['title'].map(is_tech_job)

In [None]:
# Keep only the postings flagged as tech jobs.
tech_jobs = jobs.loc[jobs['is_tech_job'].eq(1)]

In [None]:
# Cumulative share of postings covered by the k most frequent raw titles.
title_counts = tech_jobs["title"].value_counts()
cum_dist = title_counts.cumsum() / title_counts.sum()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(len(cum_dist)), cum_dist.values)
ax.set_xlabel("Job Titles (sorted by frequency)")
ax.set_ylabel("Cumulative Proportion")
ax.set_title("Cumulative Distribution of Job Titles in Tech Postings")
ax.grid(True)
plt.show()


In [None]:
# Whole-word patterns that mark a title as a tech role, grouped by theme.
tech_patterns = [
    # general software roles
    r"\bsoftware\b",
    r"\bswe\b",
    r"\bdeveloper\b",
    r"\bdev\b",
    r"\bengineer\b",
    r"\bsdet\b",
    r"\bprogrammer\b",
    # data and analytics
    r"\bdata\b",
    r"\banalytics?\b",
    # machine learning / AI
    r"\bmachine\s*learning\b",
    r"\bml\b",
    r"\bai\b",
    r"\bllm\b",
    # application stack
    r"\bbackend\b",
    r"\bfront\s*end\b|\bfrontend\b",
    r"\bfull\s*stack\b|\bfullstack\b",
    # infrastructure
    r"\bdevops\b",
    r"\bcloud\b",
    r"\bplatform\b",
    # security and quality
    r"\bsecurity\b",
    r"\bqa\b|\btest\b|\bautomation\b",
]

# A single case-insensitive alternation over every pattern above.
tech_regex = re.compile("|".join(tech_patterns), re.IGNORECASE)

# Flag every posting whose title matches `tech_regex`. Missing titles are
# replaced by "" first, so they never match. This overwrites any existing
# "is_tech_job" column on `jobs`.
jobs["is_tech_job"] = jobs["title"].fillna("").apply(
    lambda title: int(bool(tech_regex.search(title)))
)

# Take an explicit copy: columns are added to `tech_jobs` later, and doing
# that on a boolean-mask slice of `jobs` raises SettingWithCopyWarning.
tech_jobs = jobs[jobs["is_tech_job"] == 1].copy()

In [None]:
# Seniority / level tokens removed before matching. The optional trailing
# period also removes the dot of abbreviations such as "Sr." and "Jr.", which
# would otherwise survive as a stray "." in the normalized title.
_SENIORITY_RE = re.compile(
    r"\b(senior|sr|junior|jr|lead|principal|staff|i{1,3}|iv|v)\b\.?"
)
_LEVEL_RE = re.compile(r"\b(level)\s*\d+\b")
# Separator characters that are replaced by a single space.
_SEPARATOR_RE = re.compile(r"[\(\)\[\]\|,/\\\-]+")
_WHITESPACE_RE = re.compile(r"\s+")

# (pattern, canonical label) pairs, checked in order; the first match wins.
_CANONICAL_TITLES = (
    (re.compile(r"\b(data scientist|data science)\b"), "data scientist"),
    (re.compile(r"\b(data engineer|data engineering)\b"), "data engineer"),
    (
        re.compile(r"\b(machine learning|ml)\b.*\bengineer\b|\bml engineer\b"),
        "machine learning engineer",
    ),
    (re.compile(r"\bsoftware\b.*\bengineer\b|\bswe\b"), "software engineer"),
    (re.compile(r"\b(frontend|front end)\b"), "frontend engineer"),
    (re.compile(r"\b(backend|back end)\b"), "backend engineer"),
    (re.compile(r"\b(full stack|fullstack)\b"), "full stack engineer"),
)


def normalize_title(title: str) -> str:
    """Collapse a raw job title into a lowercase, normalized label.

    Steps, in order:
      1. lowercase the title;
      2. remove seniority words, roman-numeral levels and "level N" markers
         (including the period of abbreviations like "Sr.");
      3. replace separator punctuation with spaces and squeeze whitespace;
      4. return the first matching canonical label, or the cleaned title
         itself when no rule matches.

    Parameters
    ----------
    title : str or missing value
        Raw job title. Missing values (``None`` / NaN) are returned unchanged.

    Returns
    -------
    str
        The canonical label or the cleaned title (or the original missing
        value when ``title`` is missing).
    """
    if pd.isna(title):
        return title

    cleaned = title.lower()
    cleaned = _SENIORITY_RE.sub("", cleaned)
    cleaned = _LEVEL_RE.sub("", cleaned)
    # Separators are turned into spaces before the rules run, so a title such
    # as "front-end" is seen as "front end".
    cleaned = _SEPARATOR_RE.sub(" ", cleaned)
    cleaned = _WHITESPACE_RE.sub(" ", cleaned).strip()

    for pattern, label in _CANONICAL_TITLES:
        if pattern.search(cleaned):
            return label

    return cleaned  # fallback: no canonical rule matched

# Store the normalized title of every tech posting in a new column.
tech_jobs["title_norm"] = tech_jobs["title"].map(normalize_title)


In [None]:
# Cumulative share of postings covered by the k most frequent normalized titles.
counts = tech_jobs["title_norm"].value_counts()
cum_dist = counts.cumsum() / counts.sum()

# `plt` comes from the notebook's import cell; it is not re-imported here.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(len(cum_dist)), cum_dist.values)
ax.set_xlabel("Normalized job titles (sorted by frequency)")
ax.set_ylabel("Cumulative proportion")
ax.set_title("Cumulative Distribution of Normalized Tech Job Titles")
ax.grid(True)
plt.show()


In [None]:
# Distinct title counts after and before normalization.
norm_unique = tech_jobs["title_norm"].nunique()
raw_unique = tech_jobs["title"].nunique()

# Fraction (0-1) of distinct titles eliminated by normalization.
reduction_pct = 1 - norm_unique / raw_unique

(raw_unique, norm_unique, reduction_pct)
