In [3]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from pathlib import Path



In [None]:
# Glossary workbook and the column in it that holds the term strings.
GLOSSARY_XLSX = "Computer_science_glossary_terms.xlsx"
TERM_COL = "Computer Science Term"

# Read only the term column, drop empty cells, and keep every term as a plain string.
terms = (
    pd.read_excel(GLOSSARY_XLSX, usecols=[TERM_COL])[TERM_COL]
    .dropna()
    .astype(str)
    .tolist()
)

In [5]:
# A blank English pipeline: no trained components are loaded here.
nlp = spacy.blank("en")

# We use spaCy's PhraseMatcher as a dictionary-based form of Named Entity Recognition.
# Build the matcher: attr="LOWER" makes it compare lowercased token text, and every
# glossary term is registered as a pattern under the single label "CS_TERMS".
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
term_patterns = list(map(nlp.make_doc, terms))
matcher.add("CS_TERMS", term_patterns)

def _to_text(x):
    if isinstance(x, list):
        return "\n".join(map(str, x))
    return "" if pd.isna(x) else str(x)

def extract_terms(series: pd.Series) -> pd.Series:
    """Find glossary terms in every entry of a column.

    Each value is converted to text with ``_to_text`` and run through the
    module-level ``nlp`` pipeline; ``matcher`` is then applied to the
    resulting document.

    Parameters
    ----------
    series : pd.Series
        Values to scan. Missing values are treated as empty text.

    Returns
    -------
    pd.Series
        Same index as ``series``. Each element is a list of the distinct
        matched spans, spelled exactly as they appear in the text and sorted
        case-insensitively.
    """
    texts = series.fillna("").map(_to_text).tolist()
    out = []
    for doc in nlp.pipe(texts, batch_size=64):
        # A set collapses repeated hits of the same surface form within one text.
        found = {doc[start:end].text for _, start, end in matcher(doc)}
        # Surface forms that differ only by case ("Python" / "python") tie on
        # (lower, len); without the final `z` their relative order would fall
        # back to set iteration order, which is not stable between runs.
        out.append(sorted(found, key=lambda z: (z.lower(), len(z), z)))
    # dtype=object keeps the result a column of lists even when `out` is empty.
    return pd.Series(out, index=series.index, dtype=object)

In [None]:
import pandas as pd

jobs = pd.read_csv("linkedin_jobs.csv")

# Columns of the job postings that are scanned for glossary terms.
cols_to_match = [
    "Description",
    "Responsibilities",
    "QualificationsRequired",
    "QualificationsPreferred",
    "Requirements",
]

# Fail up front with the full list of absent columns instead of a KeyError on
# the first missing one after some output columns have already been added.
missing_cols = [c for c in cols_to_match if c not in jobs.columns]
if missing_cols:
    raise KeyError(f"linkedin_jobs.csv is missing expected columns: {missing_cols}")

# Apply the matcher to each text column (description, responsibilities,
# qualifications, requirements) to get the CS terms mentioned in it.
for col in cols_to_match:
    out_col = f"CS_Terms_{col}"
    jobs[out_col] = extract_terms(jobs[col])

match_cols = [f"CS_Terms_{c}" for c in cols_to_match]


def _merge_term_lists(row):
    """Union the per-column term lists of one row into a single sorted list."""
    # Set union replaces the repeated list concatenation of sum(lists, []);
    # non-list cells are skipped, and an all-empty row yields [].
    merged = set().union(*(row[c] for c in match_cols if isinstance(row[c], list)))
    # The final `z` breaks ties between spellings that differ only by case, so
    # the order does not depend on set iteration order.
    return sorted(merged, key=lambda z: (z.lower(), len(z), z))


# One combined, de-duplicated term list per job posting.
jobs["CS_Terms"] = jobs[match_cols].apply(_merge_term_lists, axis=1)

In [None]:
# Save the enriched postings; index=False leaves the DataFrame's row index out of the file.
# Note: CSV is plain text, so the list-valued CS_Terms* cells are stored as strings
# and need parsing if the file is read back.
jobs.to_csv(path_or_buf="LinkedIn_NER.csv", index=False)