In [6]:
# Imports
!pip install requests
import requests
from typing import List, Optional
!pip install spacy

!pip install skillNer
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
import re
import string
import spacy
from bs4 import BeautifulSoup
# Load spaCy language model for lemmatization
light_preproces_nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Step 1: Imports
import spacy
from spacy.matcher import PhraseMatcher

# Step 2: Load the skill DB and SkillExtractor
from skillNer.general_params import SKILL_DB
# from skillNer.general_params import SKILL_DB # This import is no longer needed
from skillNer.skill_extractor_class import SkillExtractor

# Step 3: Load spaCy model and initialize SkillExtractor
heavy_nlp = spacy.load("en_core_web_lg")

# Initialize SkillExtractor without the 'skill_db' argument
skill_extractor = SkillExtractor(heavy_nlp, SKILL_DB, PhraseMatcher)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [8]:
# Api keys. for now
rapid_api_key = '8ffd8240d0msh1d58889c2512c5dp1d4b20jsnc4ac59c1f46d'
rapid_api_host = 'jsearch.p.rapidapi.com'

In [9]:
import requests

def search_jobs_with_filters(
    job_name,
    skills=None,
    location=None,
    experience_level=None,
    employment_type=None,
    remote=None,
    posted_on=None,
    api_key=None
):
    if not job_name:
        raise ValueError("Job name is required.")

    url = "https://jsearch.p.rapidapi.com/search"
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "jsearch.p.rapidapi.com"
    }

    # Combine job name and skills
    query = job_name
    if skills:
        query += " " + " ".join(skills)

    # Base parameters
    params = {
        "query": query,
        "page": "1",
        "num_pages": "1"
    }

    # Optional filters
    if location:
        params["location"] = location
    if experience_level:
        params["experience_level"] = experience_level
    if employment_type:
        params["employment_type"] = employment_type
    if remote:
        params["remote"] = remote
    if posted_on:
        params["date_posted"] = posted_on  # e.g., "today", "3days", "7days", "30days"

    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        return response.json().get("data", [])
    else:
        print(f"❌ Failed: {response.status_code} - {response.text}")
        return []


In [10]:
jobs = search_jobs_with_filters(
    job_name="Data Analyst",
    skills=["SQL", "Tableau", "Python"],
    location="Sydney",
    experience_level="Entry",  # Optional
    employment_type="Full-time",  # Optional
    remote="Remote",  # Optional
    posted_on="week",  # Optional
    api_key=rapid_api_key
)
print(jobs[0]['job_title'])

Data Analyst (SQL, Python, Alteryx, Tableau, Power BI and KNIME)


In [11]:
def get_job_ids_from_search(jobs):
    """
    Extract job IDs from a list of job search results.

    Parameters:
    - jobs (list): List of job dictionaries returned by the job search API.

    Returns:
    - list: List of job IDs.
    """
    job_ids = []
    for job in jobs:
        job_id = job.get("job_id")
        if job_id:
            job_ids.append(job_id)

    return job_ids

In [12]:
# Example of getting jobs from the search function
jobs = search_jobs_with_filters("AI Developer", api_key = rapid_api_key)

# Get job IDs from the search result
job_ids = get_job_ids_from_search(jobs)

In [13]:
def get_job_details_by_id(job_id: str):
    """
    Fetch full job details from the JSearch API using job ID.

    Parameters:
    - job_id (str): The unique identifier of the job.
    - api_key (str): Your JSearch API key from RapidAPI.

    Returns:
    - dict: Full job details if found, else None.
    """
    url = "https://jsearch.p.rapidapi.com/job-details"

    headers = {
        "X-RapidAPI-Key": rapid_api_key,
        "X-RapidAPI-Host": "jsearch.p.rapidapi.com"
    }

    params = {"job_id": job_id}

    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        data = response.json()
        if data.get("data"):
            return data["data"][0]  # Return the first match
        else:
            print("⚠️  No job details found.")
            return None
    else:
        print(f"Failed to fetch job details: {response.status_code} - {response.text}")
        return None

In [14]:
job_details = get_job_details_by_id(job_id=job_ids[1])

In [15]:
def combine_job_text_fields(
    job_title: Optional[str],
    job_description: Optional[str],
    job_highlights: Optional[List[str] or str] = None
) -> str:
    """
    Combine job title, description, and highlights into a single text block.

    Args:
        job_title (str): Title of the job posting.
        job_description (str): Full job description.
        job_highlights (list or str, optional): Highlights as a list of strings or a single string.

    Returns:
        str: Combined string of all text fields.
    """
    parts = []

    if job_title:
        parts.append(str(job_title))

    if job_description:
        parts.append(str(job_description))

    if job_highlights:
        if isinstance(job_highlights, list):
            parts.append(" ".join(str(highlight) for highlight in job_highlights))
        else:
            parts.append(str(job_highlights))

    return "\n".join(parts).strip()

In [16]:
combined_text_input = combine_job_text_fields(
    job_title=job_details.get("job_title"),
    job_description=job_details.get("job_description"),
    job_highlights=job_details.get("job_highlights")
)

In [17]:
combined_text_input[:100]

'AI Full Stack Developer\nPosition Overview:\nThe AI Full Stack Developer will be responsible for build'

In [18]:
def preprocess_job_text(text: str, lemmatize: bool = True) -> str:
    """
    Clean and preprocess a block of job-related text.

    Args:
        text (str): Input raw job-related text (title, description, highlights).
        lemmatize (bool): Whether to lemmatize words (default True).

    Returns:
        str: Preprocessed, clean text.
    """
    if not isinstance(text, str):
        return ""

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)

    # Remove special characters & numbers
    text = re.sub(r"[^a-z\s]", ' ', text)

    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize & remove stopwords + optional lemmatization
    doc = light_preproces_nlp(text)
    tokens = [
        token.lemma_ if lemmatize else token.text
        for token in doc
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]

    return " ".join(tokens)

In [19]:
preprocessed_text_input = preprocess_job_text(combined_text_input)
preprocessed_text_input[:100]

'ai stack developer position overview ai stack developer responsible build maintain ai drive applicat'

In [38]:
preprocessed_text_input

'ai stack developer position overview ai stack developer responsible build maintain ai drive application work entire technology stack involve develop end user interface design deploy end ai system service role require strong programming skill understanding ai ml model ability build end end solution highly responsive scalable efficient key responsibility ai application development design implement ai drive application integrate machine learning model ai algorithm stack web application develop user face feature end modern web technology e g react vue js angular implement end service node js python java etc integrate machine learning model apis database build deploy ai ml model ensure integration production environment optimal performance machine learn model integration collaborate data scientist ai researcher integrate pre train ai model production ready application ensure seamless deployment scaling ai model cloud environment aws gcp azure monitor improve ai model performance ensure acc

# Model Experimentation

1. Fuzzy or exact matching

In [20]:
known_skills = [
    "python", "java", "c++", "pytorch", "tensorflow", "scikit-learn", "nlp",
    "machine learning", "deep learning", "sql", "git", "docker", "kubernetes",
    "linux", "azure", "aws", "gcp", "pandas", "numpy", "react", "javascript",
    "flask", "fastapi", "huggingface", "transformers", "bert", "llm", "spacy",
]

In [21]:
from collections import Counter
from difflib import get_close_matches

def extract_skills_from_text(text: str, known_skills: list) -> list:
    """
    Extract known skills that are present in the job text.

    Args:
        text (str): Preprocessed job text
        known_skills (list): List of skills to match against

    Returns:
        list: Top 10 most relevant skills found in text
    """
    words = text.split()
    skill_matches = []

    for skill in known_skills:
        skill_tokens = skill.split()
        if all(token in words for token in skill_tokens):
            skill_matches.append(skill)
        else:
            # Optionally use fuzzy matching for slight spelling variations
            if get_close_matches(skill, words, n=1, cutoff=0.9):
                skill_matches.append(skill)

    return [skill for skill, _ in Counter(skill_matches).most_common(10)]


In [22]:
extracted_skills = extract_skills_from_text(preprocessed_text_input, known_skills)
extracted_skills

['python',
 'java',
 'pytorch',
 'tensorflow',
 'machine learning',
 'sql',
 'git',
 'docker',
 'kubernetes',
 'azure']

In [23]:
def compare_user_skills(extracted_skills: list, user_skills: list):
    """
    Compare user's skills with extracted job requirements.

    Returns:
        matched_skills, missing_skills
    """
    user_skills_lower = [skill.lower() for skill in user_skills]
    matched = list(set(extracted_skills) & set(user_skills_lower))
    missing = list(set(extracted_skills) - set(user_skills_lower))
    return matched, missing

In [24]:
user_skills = ['machine Lerning', 'azure', 'tensor']

In [25]:
matched_skills, missing_skills = compare_user_skills(extracted_skills=extracted_skills, user_skills=user_skills)

In [26]:
print(f"'Matched skills: {matched_skills}")
print(f"Missing skills: {missing_skills}")

'Matched skills: ['azure']
Missing skills: ['pytorch', 'docker', 'git', 'sql', 'python', 'machine learning', 'kubernetes', 'java', 'tensorflow']


2. SkillsNER - specifically pre-trained for Skill tagging

In [35]:
from collections import Counter

def extract_skills_skillsNer(job_text: str, top_n: int = 10, include_partial: bool = False):
    """
    Extracts and returns the top N core skills from job text.

    Args:
        job_text (str): Combined job text including title, description, etc.
        top_n (int): Number of top skills to return.
        include_partial (bool): Whether to include partial matches in ranking.

    Returns:
        List[str]: Top N skills sorted by frequency and relevance.
    """
    result = skill_extractor.annotate(job_text)  # Pass raw string instead of doc

    full_matches = [match["doc_node_value"] for match in result["results"]["full_matches"]]
    partial_matches = [match["doc_node_value"] for match in result["results"]["partial_matches"]] if include_partial else []

    all_matches = full_matches + partial_matches if include_partial else full_matches

    # Count frequencies and return top N
    most_common_skills = [skill for skill, _ in Counter(all_matches).most_common(top_n)]

    return most_common_skills


In [39]:
extracted_skills_ner = extract_skills_skillsNer(combined_text_input)
extracted_skills_ner

  vec_similarity = token1.similarity(token2)


['front end',
 'back end',
 'vue js',
 'node js',
 'API',
 'user interface',
 'web application',
 'relational database',
 'continuous integration',
 'performance tuning']

In [40]:
matched_skills, missing_skills = compare_user_skills(extracted_skills=extracted_skills_ner, user_skills=user_skills)
print(f"'Matched skills (NER): {matched_skills}")
print(f"Missing skills (NER): {missing_skills}")

'Matched skills (NER): []
Missing skills (NER): ['user interface', 'web application', 'node js', 'relational database', 'back end', 'continuous integration', 'front end', 'performance tuning', 'vue js', 'API']


3. BERT

In [41]:
!pip install transformers
!pip install torch  # PyTorch is required for the transformer models

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [42]:
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline
import torch


In [43]:
# Load pre-trained BERT model and tokenizer
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  # NER model fine-tuned for Named Entity Recognition
model = BertForTokenClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Use the pipeline API for NER
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Device set to use cpu


In [71]:
from transformers import BertTokenizer, BertForTokenClassification
import torch
from collections import Counter

# Load a case-insensitive pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')

def extract_skills_bert(job_text: str, top_n=10):
    # Tokenize the text using BERT tokenizer (with truncation to handle long texts)
    inputs = tokenizer(job_text, return_tensors="pt", truncation=True, max_length=512, padding=True)

    # Perform token classification with BERT
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted token classes (simplification, usually a label ID for skills)
    predictions = outputs[0].argmax(dim=2).squeeze().tolist()

    # Extract tokens from BERT output (mapping token ids back to words)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze().tolist())

    # In a typical model trained for NER, you'd map predicted tokens to skill-related labels
    # Assuming that we can match tokens classified as skills here (you'd need to customize this)
    matched_skills = [tokens[i] for i in range(len(tokens)) if predictions[i] == 1]  # '1' might represent 'skill' in your model

    # Count the frequency of each skill
    skill_counts = Counter(matched_skills)

    # Get the top N most common skills
    top_skills = [skill for skill, _ in skill_counts.most_common(top_n)]

    return top_skills

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [82]:
extracted_skills_bert = extract_skills_bert(preprocessed_text_input)

In [83]:
matched_skills, missing_skills = compare_user_skills(extracted_skills=extracted_skills_bert, user_skills=['Python', 'Angular'])
print(f"'Matched skills (BERT): {matched_skills}")
print(f"Missing skills (BERT): {missing_skills}")

'Matched skills (BERT): []
Missing skills (BERT): ['model', 'developer', 'ai', 'build', '##s', 'api', 'end', 'stack', 'development', 'application']
