In [3]:
import ast
import os
from pathlib import Path

import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher



In [4]:
# Location of the glossary workbook. The default is the original local path;
# set the GLOSSARY_XLSX environment variable to run the notebook on another
# machine without editing this cell.
GLOSSARY_XLSX = os.environ.get(
    "GLOSSARY_XLSX",
    "/Users/50357691/Desktop/Job_Skill_Gap_Analysis/Computer_science_glossary_terms.xlsx",
)
# Header of the worksheet column that holds the glossary terms.
TERM_COL      = "Computer Science Term"

# Load the term column and normalise it before it is turned into matcher
# patterns: collapse runs of whitespace (a padded or double-spaced term would
# tokenize with whitespace tokens and never match ordinary text), drop cells
# that end up empty, and remove exact duplicates while keeping sheet order.
terms = (
    pd.read_excel(GLOSSARY_XLSX, usecols=[TERM_COL])[TERM_COL]
    .dropna()
    .astype(str)
    .str.split()
    .str.join(" ")
    .loc[lambda s: s != ""]
    .drop_duplicates()
    .tolist()
)

In [5]:
# Blank English pipeline, used here to tokenize both the glossary terms and
# the job-posting text.
nlp = spacy.blank("en")

# Named Entity Recognition is done with spaCy's PhraseMatcher.
# Build the matcher: attr="LOWER" compares lower-cased token text, and every
# glossary term is registered as a pattern under the single "CS_TERMS" label.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("CS_TERMS", list(map(nlp.make_doc, terms)))

def _to_text(x):
    if isinstance(x, list):
        return "\n".join(map(str, x))
    return "" if pd.isna(x) else str(x)

def extract_terms(series: pd.Series) -> pd.Series:
    """Find glossary terms in every cell of a text column.

    Each cell is flattened to a string with ``_to_text``, tokenized by the
    module-level ``nlp`` pipeline and scanned with the module-level
    ``matcher``.

    Args:
        series: Column of posting text; missing values are treated as empty
            text.

    Returns:
        A Series with the same index as ``series``. Each cell is a list of
        the distinct matched spans exactly as they appear in the text (so
        "data" and "Data" are separate entries), sorted case-insensitively.
    """
    texts = series.fillna("").map(_to_text).tolist()
    out = []
    for doc in nlp.pipe(texts, batch_size=64):
        # A set removes repeated mentions of the same surface form.
        found = {doc[s:e].text for _, s, e in matcher(doc)}
        # The string itself is the last sort key: case variants tie on
        # (lower, len), and without a tie-breaker their order would follow
        # set iteration order, which is not stable between kernel sessions.
        out.append(sorted(found, key=lambda z: (z.lower(), len(z), z)))
    return pd.Series(out, index=series.index)

In [6]:
import pandas as pd

# Scraped job postings.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests this CSV was saved together with its index - confirm whether
# index_col=0 is wanted here.
jobs = pd.read_csv("linkedin_jobs_demo.csv")

# Text columns of a posting that are scanned for glossary terms.
cols_to_match = [
    "Description",
    "Responsibilities",
    "QualificationsRequired",
    "QualificationsPreferred",
    "Requirements",
]


# Apply the matcher to the job description, responsibilities, qualifications
# and requirements; the CS terms found in each go to a parallel
# "CS_Terms_<column>" column.
for col in cols_to_match:
    out_col = f"CS_Terms_{col}"
    jobs[out_col] = extract_terms(jobs[col])

match_cols = [f"CS_Terms_{c}" for c in cols_to_match]

# Combined term list per posting: the union of that row's per-column lists.
# Rows are walked with zip and merged with a set union, which avoids both
# repeated list concatenation and row-wise DataFrame.apply. Cells that are not
# lists are skipped. The string itself is the last sort key so that case
# variants ("data" / "Data") always come out in the same order.
jobs["CS_Terms"] = pd.Series(
    [
        sorted(
            set().union(*(cell for cell in row if isinstance(cell, list))),
            key=lambda z: (z.lower(), len(z), z),
        )
        for row in zip(*(jobs[c] for c in match_cols))
    ],
    index=jobs.index,
)

In [7]:
# Display the enriched frame: the original posting columns plus the
# per-column CS_Terms_* lists and the combined CS_Terms column.
jobs

Unnamed: 0,Title,Company,Location,Link,Description,Responsibilities,QualificationsRequired,QualificationsPreferred,Requirements,Posted_date,CS_Terms_Description,CS_Terms_Responsibilities,CS_Terms_QualificationsRequired,CS_Terms_QualificationsPreferred,CS_Terms_Requirements,CS_Terms
0,Data Analyst -- Entry Level,CGI,"Dallas, TX",https://www.linkedin.com/jobs/view/4290918685/...,About the job\nPosition Description\n\nLaunch ...,[],"['What You’ll Bring', 'Bachelor’s degree from ...",[],[],2025/08/26,"[class, cloud computing, Computer, Computer Sc...",[],"[cloud computing, Computer, Computer Science, ...",[],[],"[class, cloud computing, Computer, Computer Sc..."
1,Tech Intern,Hewlett Packard Enterprise,"Spring, TX",https://www.linkedin.com/jobs/view/4289266480/...,About the job\nThis role has been designed as ...,"['Management Level Definition:', 'Support seni...",[],[],[],2025/08/26,"[Computer, Computer Science, data, field, Java...","[Computer, Computer Science, field, Java, Pyth...",[],[],[],"[Computer, Computer Science, data, field, Java..."
2,Information Security Intern,SoTalent,"Dallas, TX",https://www.linkedin.com/jobs/view/4290763403/...,About the job\nJob Title : Securities Research...,[],[],['Coursework or exposure to cybersecurity / da...,[],2025/08/28,"[Computer, Computer Science, data]",[],[],[data],[],"[Computer, Computer Science, data]"
3,Data Science Intern,Hireshire,,https://www.linkedin.com/jobs/view/4291119840/...,About the job\nAbout HireShire\n\nHireShire is...,[],[],['Knowledge of BI tools (Power BI / Tableau / ...,['Pursuing (or recently completed) B.Tech/BE/M...,,"[client, Computer, Computer Science, data, Dat...",[],[],"[client, data, Python]","[Computer, Computer Science, data, Data, Data ...","[client, Computer, Computer Science, data, Dat..."
4,Data Science Intern - Summer 2026,Altruist,"Los Angeles, CA",https://www.linkedin.com/jobs/view/4279910921/...,About the job\nAbout Altruist\n\nAltruist is t...,[],['clearing brokerage firm with intuitive softw...,[],[],2025/08/26,"[computer, computer science, data, Data, data ...",[],"[computer, computer science, data, Data, Data ...",[],[],"[computer, computer science, data, Data, data ..."
5,Information Security Analyst Intern,"TrueNorth Companies, L.C.","Cedar Rapids, IA",https://www.linkedin.com/jobs/view/4280490167/...,About the job\nAre you looking for an opportun...,[],"['week paid program, running May', 'August 202...",[],[],2025/08/27,"[Computer, Computer Science, documentation, ex...",[],"[expression, state]",[],[],"[Computer, Computer Science, documentation, ex..."
6,Data Analyst,Paramount Pictures,"Los Angeles, CA",https://www.linkedin.com/jobs/view/4279650100/...,About the job\n#WeAreParamount on a mission to...,"['Lead all aspects of the collection, cleaning...",['Proficiency in data analysis tools and softw...,[],[],2025/08/26,"[collection, computer, computer science, data,...","[collection, data]","[collection, computer, computer science, data,...",[],[],"[collection, computer, computer science, data,..."


In [8]:
# Save the enriched postings to CSV without writing the DataFrame index.
# NOTE(review): the list-valued CS_Terms* columns are presumably stored as
# their string representation - confirm that downstream readers parse them.
jobs.to_csv("LinkedIn_NER_demo.csv", index=False)