In [3]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from pathlib import Path

In [4]:
GLOSSARY_XLSX = "Computer_science_glossary_terms.xlsx"             
TERM_COL      = "Computer Science Term"         
terms = pd.read_excel(GLOSSARY_XLSX, usecols=[TERM_COL])[TERM_COL].dropna().astype(str).tolist()

In [5]:
nlp = spacy.blank("en")                         

# We use Spacy Matcher to do Name Entity Recognition 
# Now we build the metcher pipeline
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("CS_TERMS", [nlp.make_doc(t) for t in terms])

def _to_text(x):
    if isinstance(x, list):
        return "\n".join(map(str, x))
    return "" if pd.isna(x) else str(x)

def extract_cs_terms(df: pd.DataFrame, cols, batch_size: int = 64):
    # Build one text per row by joining target columns
    texts = [
        "\n".join(_to_text(df.at[i, c]) for c in cols)
        for i in df.index
    ]

    out = []
    for doc in nlp.pipe(texts, batch_size=batch_size):
        found = {doc[s:e].text for _, s, e in matcher(doc)}
        out.append(sorted(found, key=lambda z: (z.lower(), len(z))))
    return pd.Series(out, index=df.index)


In [9]:
import pandas as pd

jobs = pd.read_csv("linkedin_data_center_jobs.csv")
cols_to_match = ['job_description']

jobs["cs glossary terms"] = extract_cs_terms(jobs, cols_to_match, batch_size=64)
jobs["cs glossary terms"] = jobs["cs glossary terms"].apply(lambda xs: " | ".join(xs))

In [10]:
jobs.head()

Unnamed: 0,job_title,company_name,Salary ($),job_description,num_applicants,job_url,Seniority level,Employment type,Job function,Industries,workplace_type,State,publish_date,cs glossary terms
0,Imaging Student Intern PRN,Alaska Regional Hospital,,DescriptionIntroductionDo you have the PRN car...,,https://www.linkedin.com/jobs/view/4239122420/...,Internship,Part-time,Health Care Provider,Hospitals and Health Care,On-site,AK,10/04/25,
1,DOD SkillBridge Internship – Field Technician,Siemens,,"Here at Siemens, we take pride in enabling sus...",,https://www.linkedin.com/jobs/view/4269469231/...,Mid-Senior level,Full-time,Other,Automation Machinery Manufacturing,Unknown,AK,09/23/25,documentation
2,Geotechnical Engineering Intern - Summer 2026,WSP in the U.S.,,Job DescriptionThis OpportunityWSP USA hosts h...,,https://www.linkedin.com/jobs/view/4286958078/...,Internship,Internship,Engineering and Information Technology,Professional Services,On-site,AK,09/30/25,client | documentation
3,"Field Engineer Intern - Northwest District, He...",Kiewit,46280.0,Requisition ID:177228Job Level:InternshipHome ...,31 applicants,https://www.linkedin.com/jobs/view/4287575140/...,Internship,Internship,Engineering and Information Technology,Construction and Civil Engineering,On-site,AK,10/02/25,
4,Entry Level Engineers - Nationwide,Jacobs,72500.0,"At Jacobs, we're challenging today to reinvent...",,https://www.linkedin.com/jobs/view/4305166349/...,Not Applicable,Full-time,Engineering and Information Technology,"Civil Engineering, Design Services, and IT Ser...",Unknown,AK,09/30/25,client | data center


In [12]:
jobs['cs glossary terms'].value_counts().head(20)

cs glossary terms
                                                      3429
client                                                 721
documentation                                          690
data center | Data Center                              397
Python                                                 397
data center                                            344
client | documentation                                 218
client | data center | Data Center                     218
Data Center                                            195
cloud computing | data center | Data Center            185
client | data center                                   153
database                                               123
data center | Data Center | documentation               94
documentation | Python                                  93
client | data center | Data Center | documentation      80
client | Data Center                                    78
peripheral                            

In [11]:
jobs.to_csv("linkedIn_NER.csv", index=False)