In [1]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from pathlib import Path



In [2]:
GLOSSARY_XLSX = "Computer_science_glossary_terms.xlsx"             
TERM_COL      = "Computer Science Term"         
terms = pd.read_excel(GLOSSARY_XLSX, usecols=[TERM_COL])[TERM_COL].dropna().astype(str).tolist()

In [3]:
nlp = spacy.blank("en")                         

# We use Spacy Matcher to do Name Entity Recognition 
# Now we build the metcher pipeline
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("CS_TERMS", [nlp.make_doc(t) for t in terms])

def _to_text(x):
    if isinstance(x, list):
        return "\n".join(map(str, x))
    return "" if pd.isna(x) else str(x)

def extract_cs_terms(df: pd.DataFrame, cols, batch_size: int = 64):
    # Build one text per row by joining target columns
    texts = [
        "\n".join(_to_text(df.at[i, c]) for c in cols)
        for i in df.index
    ]

    out = []
    for doc in nlp.pipe(texts, batch_size=batch_size):
        found = {doc[s:e].text for _, s, e in matcher(doc)}
        out.append(sorted(found, key=lambda z: (z.lower(), len(z))))
    return pd.Series(out, index=df.index)


In [5]:
import pandas as pd

jobs = pd.read_csv("linkedin_jobs.csv")

# Define which columns to search for CS terms
cols_to_match = [
    "Description",
    "Responsibilities",
    "QualificationsRequired",
    "QualificationsPreferred",
    "Requirements",
]

jobs["CS_Terms"] = extract_cs_terms(jobs, cols_to_match, batch_size=128)
jobs["CS_Terms"] = jobs["CS_Terms"].apply(lambda xs: " | ".join(xs))


In [12]:
jobs

Unnamed: 0,Title,Company,Location,Link,Description,Responsibilities,QualificationsRequired,QualificationsPreferred,Requirements,Salary,EmploymentType,AboutTheCompany,WorkplaceType,Applicants,Posted_date,CS_Terms
0,Machine Learning Intern,Hireshire,,https://www.linkedin.com/jobs/view/4295567445/...,About HireShire\n\nHireShire is a modern staff...,[],[],['Knowledge of BI tools (Power BI / Tableau / ...,['Pursuing (or recently completed) B.Tech/BE/M...,$18/hour,Internship,"Hireshire\n5,119 followers\nFollow\nStaffing a...",Remote,,,client | Python
1,"Fully Remote, Entry - Level Data Entry Job",Dolan Mental Health,"Florida, United States",https://www.linkedin.com/jobs/view/4298234161/...,Exciting Career Opportunity - Join Our Team!\n...,[],['High school diploma or equivalent (college d...,[],[],$35-40/hr,Full Time,"Dolan Mental Health\n11,402 followers\nFollow\...",Remote,26 applicants,2025/09/09,
2,IT Intern,Oxy,"The Woodlands, TX",https://www.linkedin.com/jobs/view/4295526435/...,Oxy is an international energy company with as...,[],['Pursuing a Bachelor’s or Master’s degree in ...,[],[],$16/hr,Intern,"Oxy\n661,874 followers\nFollow\nOil and Gas 10...",Remote,Over 100 people clicked apply,2025/09/10,Software Development
3,IT Intern,Xcel Energy,"Denver, CO",https://www.linkedin.com/jobs/view/4297654935/...,Are you looking for an exciting job where you ...,[],"['3.0 GPA (out of a 4.0 scale) or higher', 'Co...",[],[],$27.20 per hour,Full Time,"Xcel Energy\n136,660 followers\nFollow\nUtilit...",Remote,26 people clicked apply,2025/09/09,
4,"Computer, Computational & Stat Sciences Underg...",Los Alamos National Laboratory,"Los Alamos, NM",https://www.linkedin.com/jobs/view/4204748299/...,What You Will Do\n\nCome join the brightest mi...,"[""Come join the brightest minds at the most in...",[],[],[],$99.5K/yr,Full Time,"Los Alamos National Laboratory\n168,852 follow...",Remote,Over 100 people clicked apply,2025/09/06,Computational Physics | Computer programming |...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,Analytics Specialist,Joni and Friends,"Agoura Hills, CA",https://www.linkedin.com/jobs/view/4296699675/...,Duties:\n\nUnder the supervision of the Data B...,['Under the supervision of the Data Batching T...,['Must possess a vibrant personal relationship...,[],[],$22.00 per hour,Full Time,"Joni and Friends\n7,510 followers\nFollow\nNon...",Hybrid,4 applicants,,database | documentation | Documentation
479,Senior Fullstack Developer (Java/Python/React)...,Synergy Interactive,"Irvine, CA",https://www.linkedin.com/jobs/view/4297573443/...,We are looking for a highly skilled Senior Ful...,[],['Bachelor’s or Master’s degree in Computer Sc...,[],[],$60/hr,Intern,"Synergy Interactive\n525,439 followers\nFollow...",Hybrid,Over 100 applicants,2025/09/09,Java | Python
480,Technical Support Analyst,Winter Park Recruiting,"Orlando, FL",https://www.linkedin.com/jobs/view/4294491217/...,Winter Park Recruiting is a leading recruitmen...,[],"[""performing teams. We believe in personalized...",[],[],$60K/yr,Intern,"Winter Park Recruiting\n4,085 followers\nFollo...",Remote,Over 100 applicants,2025/09/06,client | database | peripheral
481,Enterprise Account Executive,RevPilots,,https://www.linkedin.com/jobs/view/4297244796/...,(This is for a RevPilots' client)\n\nAccount E...,[],"['Required:', ""Bachelor's degree in Business (...","[""Master's degree in a technical or business d...",[],"Salary: $150,000",Full Time,"RevPilots\n25,058 followers\nFollow\nTechnolog...",Remote,,,client | Client


In [11]:
jobs.to_csv("linkedIn_NER.csv", index=False)