In [2]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.35.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Using cached sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
Using cached transformers-4.56.1-py3-none-any.whl (11.6 MB)
Downloading huggingface_hub-0.35.

In [12]:
import pandas as pd
import numpy as np
import spacy
import re
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

sns.set(style="whitegrid")

In [16]:
def clean_text(text):
    """Lightly normalise free text.

    HTML tags, e-mail addresses, URLs and phone-like digit runs are blanked
    out, then every character other than ASCII letters, digits, whitespace,
    '-', '.' and '/' is replaced by a space. Whitespace is collapsed and the
    result is lower-cased.

    Returns "" for any input that is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Applied in this order; each match is replaced by a single space.
    blank_out = (
        r'<.*?>',                  # html tags
        r'\S+@\S+',                # e-mail addresses
        r'http\S+|www\.\S+',       # urls
        r'\+?\d[\d\-\s]{6,}\d',    # phone numbers (approximate)
        r'[^A-Za-z0-9\s\-\./]',    # everything outside the kept character set
    )
    cleaned = text
    for pattern in blank_out:
        cleaned = re.sub(pattern, ' ', cleaned)

    collapsed = re.sub(r'\s+', ' ', cleaned).strip()
    return collapsed.lower()

def truncate(text, n=200):
    """Shorten ``text`` to its first ``n`` characters for display.

    "..." is appended only when something was actually cut off. Any input
    that is not a ``str`` yields "".
    """
    if isinstance(text, str):
        if len(text) > n:
            return text[:n] + "..."
        return text
    return ""

def extract_skills_from_text(text, skills_set):
    """Return the set of skills from ``skills_set`` that occur in ``text``.

    Matching is case-insensitive and whole-token: a skill only counts when it
    is not directly preceded or followed by a word character, so "java" is
    not found inside "javascript".

    Parameters
    ----------
    text : str
        Text to search. Any non-string input yields an empty set.
    skills_set : iterable of str
        Candidate skill names. Blank or non-string entries are skipped.

    Returns
    -------
    set of str
        Lower-cased skill names that were found.
    """
    matched = set()
    if not isinstance(text, str) or not skills_set:
        return matched
    text_low = text.lower()
    for s in skills_set:
        if not isinstance(s, str) or not s.strip():
            continue
        skill = s.lower()
        # Lookarounds instead of \b: \b needs a word character on one side, so
        # it can never match next to the '+' of "c++" or the '.' of ".net".
        # For skills that start and end with a word character the two forms
        # are equivalent.
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        if re.search(pattern, text_low):
            matched.add(skill)
    return matched

In [20]:
import os
import re

# Input CSV locations (absolute paths on the author's machine)
resumes_path = r"C:\Users\USER\Documents\Cleaned_Resume_Full.csv"
jobs_path = r"C:\Users\USER\Documents\fyp dataset\jobstreet_all_job_dataset.csv"

# Load resumes first, then jobs
resumes_df, jobs_df = (pd.read_csv(csv_path) for csv_path in (resumes_path, jobs_path))

print("Datasets successfully loaded!")
print(f"Resumes shape: {resumes_df.shape}")
print(f"Jobs shape: {jobs_df.shape}")

# Column of jobs_df holding the raw job description text
job_col = 'descriptions'

# Add a normalised copy of every job description
cleaned_descriptions = jobs_df[job_col].apply(clean_text)
jobs_df['cleaned_job_description'] = cleaned_descriptions

# Show the first rows of each frame
print("Resumes sample:")
display(resumes_df.head(3))
print("Jobs sample:")
display(jobs_df.head(3))

# Persist the jobs frame, including the new cleaned column, for later cells
jobs_df.to_csv(
    r"C:\Users\USER\Documents\fyp dataset\jobs_with_clean_text.csv",
    index=False,
)

Datasets successfully loaded!
Resumes shape: (2484, 3)
Jobs shape: (69024, 11)
Resumes sample:


Unnamed: 0,Category,Category_Encoded,Cleaned_Resume_str
0,HR,19,hr administratormarketing associate hr adminis...
1,HR,19,hr specialist u hr operation summary versatile...
2,HR,19,hr director summary 20 year experience recruit...


Jobs sample:


Unnamed: 0,job_id,job_title,company,descriptions,location,category,subcategory,role,type,salary,listingDate,cleaned_job_description
0,74630583.0,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",procurement-executive,Contract/Temp,,2024-03-21T05:58:35Z,position purpose manage aspects of procurement...
1,74660602.0,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Petaling,Accounting,Bookkeeping & Small Practice Accounting,executive-assistant,Full time,"RM 2,800 – RM 3,200 per month",2024-03-22T06:52:57Z,we are looking for a account executive/ assist...
2,74655679.0,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Klang District,"Manufacturing, Transport & Logistics",Analysis & Reporting,asset-management-analyst,Full time,,2024-03-22T04:22:43Z,performs detailed data analysis on existing sp...


In [4]:
import pandas as pd

# Load the O*NET skills file (tab-separated text)
skills_path = r"C:\Users\USER\Documents\fyp dataset\db_30_0_text\Skills.txt" 
skills_df = pd.read_csv(skills_path, sep='\t', encoding='utf-8')

# Preview columns
print(skills_df.head())
print(skills_df.columns)

# Extract the unique, lower-cased skill names.
# dropna() keeps a missing 'Element Name' from landing in the set as NaN,
# which would later break string operations on the skills.
external_skills = set(skills_df['Element Name'].dropna().str.lower().unique())

print(f"Loaded {len(external_skills)} O*NET skills")
# A set has no stable iteration order between kernel sessions; sort so the
# preview is the same on every run.
print(sorted(external_skills)[:30])

def extract_skills(text, skill_list):
    """Return the entries of ``skill_list`` that occur in ``text`` as whole tokens.

    ``text`` is converted with ``str()`` and lower-cased; each skill is
    lower-cased for the comparison. A skill only matches when it is not
    directly preceded or followed by a word character, so "writing" is not
    reported for "underwriting". Blank or non-string skills are skipped.

    Returns a list of the matching skills, as given, in the iteration order of
    ``skill_list``.
    """
    text_lower = str(text).lower()
    found = []
    for skill in skill_list:
        if not isinstance(skill, str) or not skill.strip():
            continue
        # A plain `skill in text_lower` test also fires inside longer words.
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
        if re.search(pattern, text_lower):
            found.append(skill)
    return found

# external_skills is a set, whose iteration order can change between kernel
# sessions. Iterating a sorted copy keeps every extracted list in the same
# order on each run.
sorted_external_skills = sorted(external_skills)

jobs_df['extracted_skills'] = (
    jobs_df['cleaned_job_description']
    .fillna('')  # missing descriptions are treated as empty text
    .apply(lambda x: extract_skills(x, sorted_external_skills))
)

print(jobs_df[['job_title', 'extracted_skills']].head(10))

  O*NET-SOC Code Element ID           Element Name Scale ID  Data Value  N  \
0     11-1011.00    2.A.1.a  Reading Comprehension       IM        4.12  8   
1     11-1011.00    2.A.1.a  Reading Comprehension       LV        4.62  8   
2     11-1011.00    2.A.1.b       Active Listening       IM        4.00  8   
3     11-1011.00    2.A.1.b       Active Listening       LV        4.75  8   
4     11-1011.00    2.A.1.c                Writing       IM        4.12  8   

   Standard Error  Lower CI Bound  Upper CI Bound Recommend Suppress  \
0          0.1250          3.8800          4.3700                  N   
1          0.1830          4.2664          4.9836                  N   
2          0.0000          4.0000          4.0000                  N   
3          0.1637          4.4292          5.0708                  N   
4          0.1250          3.8800          4.3700                  N   

  Not Relevant     Date Domain Source  
0          NaN  08/2023       Analyst  
1            N  08

In [29]:
skills_df.head()

Unnamed: 0,O*NET-SOC Code,Element ID,Element Name,Scale ID,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Not Relevant,Date,Domain Source
0,11-1011.00,2.A.1.a,Reading Comprehension,IM,4.12,8,0.125,3.88,4.37,N,,08/2023,Analyst
1,11-1011.00,2.A.1.a,Reading Comprehension,LV,4.62,8,0.183,4.2664,4.9836,N,N,08/2023,Analyst
2,11-1011.00,2.A.1.b,Active Listening,IM,4.0,8,0.0,4.0,4.0,N,,08/2023,Analyst
3,11-1011.00,2.A.1.b,Active Listening,LV,4.75,8,0.1637,4.4292,5.0708,N,N,08/2023,Analyst
4,11-1011.00,2.A.1.c,Writing,IM,4.12,8,0.125,3.88,4.37,N,,08/2023,Analyst


In [24]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.1-cp313-cp313-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.1-cp313-cp313-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 1.7 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 1.7 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 1.6 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 1.5 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.5 MB/s  0:00:01
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.1


In [3]:
import pandas as pd

# Load O*NET Skills dataset
skills_path = r"C:\Users\USER\Documents\fyp dataset\db_30_0_text\Skills.txt"

# Try reading as tab-separated text file
skills_df = pd.read_csv(skills_path, sep='\t', encoding='utf-8', engine='python')

# Clean column names (remove spaces, BOM marks, etc.)
skills_df.columns = skills_df.columns.str.strip().str.replace('\ufeff', '', regex=True)

# Column that holds the skill names. This name was previously used without
# being defined anywhere in the notebook, which raises NameError on a fresh
# kernel.
col_name = 'Element Name'

# Extract the skill names (remove missing values)
skill_list = skills_df[col_name].dropna().unique().tolist()

print(f"\n Loaded {len(skill_list)} unique skills from O*NET")
print("\nSample skills:", skill_list[:20])

# Reload the jobs table that was saved together with its cleaned text column
jobs_path = r"C:\Users\USER\Documents\fyp dataset\jobs_with_clean_text.csv"
jobs_df = pd.read_csv(jobs_path)

# ensure column exists and convert NaNs to empty strings
jobs_df['cleaned_job_description'] = jobs_df['cleaned_job_description'].fillna('').astype(str)


 Loaded 35 unique skills from O*NET

Sample skills: ['Reading Comprehension', 'Active Listening', 'Writing', 'Speaking', 'Mathematics', 'Science', 'Critical Thinking', 'Active Learning', 'Learning Strategies', 'Monitoring', 'Social Perceptiveness', 'Coordination', 'Persuasion', 'Negotiation', 'Instructing', 'Service Orientation', 'Complex Problem Solving', 'Operations Analysis', 'Technology Design', 'Equipment Selection']


In [28]:
from rapidfuzz import fuzz

def fuzzy_skill_match(text, skill_list, threshold=85):
    """Return the skills whose fuzzy partial-ratio score against ``text`` reaches ``threshold``.

    ``text`` is lower-cased before scoring; a non-string ``text`` is treated
    as "". Entries of ``skill_list`` that are not strings, or that are blank
    after stripping, are ignored. Each remaining skill is lower-cased and
    scored with ``fuzz.partial_ratio``. Matching skills are returned as
    given, in their original order.
    """
    haystack = text.lower() if isinstance(text, str) else ""
    return [
        candidate
        for candidate in skill_list
        if isinstance(candidate, str)
        and candidate.strip() != ''
        and fuzz.partial_ratio(candidate.lower(), haystack) >= threshold
    ]

# Fuzzy-match every job description against the skill list
jobs_df['fuzzy_skills'] = (
    jobs_df['cleaned_job_description']
    .fillna('')
    .astype(str)
    .apply(fuzzy_skill_match, skill_list=skill_list)
)
job_fuzzy_skills_list = jobs_df['fuzzy_skills'].to_list()

# Same treatment for the resume texts
resumes_df['resume_skills'] = (
    resumes_df['Cleaned_Resume_str']
    .fillna('')
    .astype(str)
    .apply(fuzzy_skill_match, skill_list=skill_list)
)
resume_skills_list = resumes_df['resume_skills'].to_list()

In [6]:
from sentence_transformers import SentenceTransformer

# Instantiate the pre-trained Sentence-BERT encoder used for all embeddings below
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2',
)




In [7]:
# Quick check of which columns jobs_df currently carries
print(jobs_df.columns)

Index(['job_id', 'job_title', 'company', 'descriptions', 'location',
       'category', 'subcategory', 'role', 'type', 'salary', 'listingDate',
       'cleaned_job_description', 'extracted_skills'],
      dtype='object')


In [9]:
import pandas as pd

jobs_path = r"C:\Users\USER\Documents\fyp dataset\jobs_with_clean_text.csv"
# NOTE: this replaces jobs_df with the CSV contents, so columns that earlier
# cells added in memory only (e.g. 'fuzzy_skills') are not present afterwards.
jobs_df = pd.read_csv(jobs_path)

resume_path = r"C:\Users\USER\Documents\Cleaned_Resume_Full.csv"
resume_df = pd.read_csv(resume_path)

# Text cells that are empty in the CSV may come back as NaN (a float), which
# is not valid input for the encoder. Turn them into empty strings first, the
# same way the earlier jobs-loading cell does.
jobs_df['cleaned_job_description'] = jobs_df['cleaned_job_description'].fillna('').astype(str)
resume_texts = resume_df['Cleaned_Resume_str'].fillna('').astype(str).tolist()
job_texts = jobs_df['cleaned_job_description'].tolist()

# Encode resumes and job descriptions
resume_embeddings = model.encode(resume_texts, convert_to_tensor=True)
job_embeddings = model.encode(job_texts, convert_to_tensor=True)

In [29]:
from sentence_transformers.util import cos_sim

# Define the skill overlap function BEFORE the loop
def skill_overlap_score(resume_skills, job_skills):
    """Fraction of the job's distinct skills that also appear in the resume.

    Parameters
    ----------
    resume_skills, job_skills : iterable of hashable, or None
        Skills found in the resume and in the job. ``None`` is treated as
        "no skills".

    Returns
    -------
    float
        A value in [0.0, 1.0]; 0.0 when the job lists no skills.
    """
    job_set = set(job_skills or ())
    if not job_set:
        return 0.0
    resume_set = set(resume_skills or ())
    # Divide by the number of *distinct* job skills: the numerator is a set
    # intersection, so counting duplicates in the denominator would understate
    # the overlap.
    return len(job_set & resume_set) / len(job_set)

all_results = []

# The skill lists are looked up by the same positional index as the
# embeddings, so both pairs must have one entry per row.
assert len(resume_skills_list) == len(resume_embeddings), "resume skills / embeddings are misaligned"
assert len(job_fuzzy_skills_list) == len(job_embeddings), "job skills / embeddings are misaligned"

# Never ask topk for more entries than there are jobs
top_k = min(5, len(job_embeddings))

for resume_idx, resume_embedding in enumerate(resume_embeddings):
    resume_skills = resume_skills_list[resume_idx]  # skills extracted from the resume

    # Compare this resume against all jobs
    similarities = cos_sim(resume_embedding, job_embeddings)

    # Get the top jobs for this resume by SBERT similarity
    top_scores, top_indices = similarities[0].topk(top_k)

    for score, job_idx in zip(top_scores.tolist(), top_indices.tolist()):
        # Use the list captured when fuzzy matching ran. jobs_df is re-read
        # from CSV in a later cell and then no longer has a 'fuzzy_skills'
        # column, so indexing the frame here raises KeyError.
        job_fuzzy_skills = job_fuzzy_skills_list[job_idx]

        # Compute skill overlap score
        overlap_score = skill_overlap_score(resume_skills, job_fuzzy_skills)

        # Combine SBERT similarity (weight 0.7) with skill overlap (weight 0.3)
        final_score = 0.7 * score + 0.3 * overlap_score

        all_results.append({
            "Resume ID": resume_idx,
            # Positional row index into the jobs table, not the 'job_id' column
            "Job ID": job_idx,
            "Similarity Score": score,
            "Skill Overlap Score": overlap_score,
            "Final Combined Score": final_score,
            # truncate() returns "" for a missing (NaN) description instead of
            # failing on slicing a float, and adds "..." only when text was cut
            "Job Description (Preview)": truncate(jobs_df['descriptions'].iloc[job_idx], 200)
        })

In [30]:
# One row per (resume, recommended job) pair
results_df = pd.DataFrame(all_results)
# Rich display keeps all columns in one table; print() wraps wide frames
# across several text blocks.
display(results_df.head(10))  # preview first 10 results

   Resume ID  Job ID  Similarity Score  Skill Overlap Score  \
0          0   23734          0.860141             0.500000   
1          0   53645          0.835883             0.000000   
2          0   15474          0.824807             0.000000   
3          0   15549          0.823283             0.000000   
4          0    1812          0.821957             0.000000   
5          1   26651          0.866998             0.666667   
6          1   39795          0.866466             0.000000   
7          1   57892          0.866466             0.000000   
8          1   30042          0.857582             1.000000   
9          1   19249          0.856655             0.000000   

   Final Combined Score                          Job Description (Preview)  
0              0.752099  A Human Resources Training Officer is responsi...  
1              0.585118  A Human Resources Executive is responsible for...  
2              0.577365  Description\nAn Assistant Human Resources Mana... 

In [34]:
# Resume to inspect (positional index used as "Resume ID" in all_results)
resume_id = 0
resume_jobs = list(filter(lambda rec: rec['Resume ID'] == resume_id, all_results))

# Highest combined score first
resume_jobs_sorted = list(resume_jobs)
resume_jobs_sorted.sort(key=lambda rec: rec['Final Combined Score'], reverse=True)

# Report the best five matches
for rank, job in enumerate(resume_jobs_sorted[:5], start=1):
    print(
        f"\nRank {rank}",
        f"Final Combined Score: {job['Final Combined Score']:.4f}",
        f"SBERT Similarity: {job['Similarity Score']:.4f}",
        f"Skill Overlap Score: {job['Skill Overlap Score']:.4f}",
        f"Job Description Preview: {job['Job Description (Preview)']}",
        sep="\n",
    )


Rank 1
Final Combined Score: 0.7521
SBERT Similarity: 0.8601
Skill Overlap Score: 0.5000
Job Description Preview: A Human Resources Training Officer is responsible for organising, delivering, and evaluating training programs to deliver an excellent staff experience while support management on proper policies and ...

Rank 2
Final Combined Score: 0.5851
SBERT Similarity: 0.8359
Skill Overlap Score: 0.0000
Job Description Preview: A Human Resources Executive is responsible for supporting and advising management on proper policies and procedures to deliver an excellent staff experience while assisting with employee relations.
Wh...

Rank 3
Final Combined Score: 0.5774
SBERT Similarity: 0.8248
Skill Overlap Score: 0.0000
Job Description Preview: Description
An Assistant Human Resources Manager is responsible for providing first-class employee relations services to the Human Resources Manager and management team to deliver an excellent staff e...

Rank 4
Final Combined Score: 0.5763
SBERT 

In [35]:
# Write the sentence-transformer model to a local directory.
# NOTE(review): none of the cells visible here fine-tune the model, so this
# appears to save the pre-trained checkpoint as loaded; confirm.
model.save(
    path=r"C:\Users\USER\Documents\GitHub\AI-Powered Resume Analyzer and Job Matching\trained_jobmatching_model",
)