## Data Preprocessing

In [10]:
resumes_df = pd.read_csv('cleaned_resume.csv')
jobs_df = pd.read_csv('cleaned_job.csv')

print("Resume DataFrame columns:", resumes_df.columns.tolist())
print("Job DataFrame columns:", jobs_df.columns.tolist())

resumes_df.columns = [col.replace('\ufeff', '') for col in resumes_df.columns]
jobs_df.columns = [col.replace('\ufeff', '') for col in jobs_df.columns]

# Clean column names by removing leading/trailing spaces
resumes_df.columns = resumes_df.columns.str.strip()
jobs_df.columns = jobs_df.columns.str.strip()

# Verify column names after cleaning
print("Resume DataFrame columns after cleaning:", resumes_df.columns.tolist())
print("Job DataFrame columns after cleaning:", jobs_df.columns.tolist())

Resume DataFrame columns: ['Unnamed: 0', 'skills', 'educational_institution_name', 'degree_names', 'passing_years', 'educational_results', 'result_types', 'major_field_of_studies', 'professional_company_names', 'company_urls', 'start_dates', 'end_dates', 'related_skils_in_job', 'positions', 'locations', 'responsibilities', '\ufeffjob_position_name']
Job DataFrame columns: ['Unnamed: 0', 'job_id', 'skill_abr', 'skill_name', 'industry_id', 'industry_name', 'title', 'description']
Resume DataFrame columns after cleaning: ['Unnamed: 0', 'skills', 'educational_institution_name', 'degree_names', 'passing_years', 'educational_results', 'result_types', 'major_field_of_studies', 'professional_company_names', 'company_urls', 'start_dates', 'end_dates', 'related_skils_in_job', 'positions', 'locations', 'responsibilities', 'job_position_name']
Job DataFrame columns after cleaning: ['Unnamed: 0', 'job_id', 'skill_abr', 'skill_name', 'industry_id', 'industry_name', 'title', 'description']


In [16]:
import pandas as pd
import numpy as np

# Clean data
# 1. Handle missing values
resumes_df = resumes_df.fillna('')
jobs_df = jobs_df.fillna('')

def safe_join(item):
    if isinstance(item, list):
        return ', '.join(item)
    elif isinstance(item, str):
        return item
    elif item is None:
        return ''
    else:
        return str(item)

# 2. Create unified text representation for resumes
resumes_df['resume_text'] = resumes_df.apply(
    lambda row: f"Skills: {safe_join(row['skills'])} \n"
                f"Education: {safe_join(row['educational_institution_name'])} - "
                f"{safe_join(row['degree_names'])} - "
                f"{safe_join(row['major_field_of_studies'])} \n"
                f"Work Experience: {safe_join(row['professional_company_names'])} - "
                f"{safe_join(row['positions'])} \n"
                f"Responsibilities: {row['responsibilities']} \n"
                f"Target Position: {row['job_position_name']}", axis=1)

jobs_df['job_text'] = jobs_df.apply(
    lambda row: f"Job Title: {row['title']} \n"
                f"Required Skills: {safe_join(row['skill_name'])} \n"
                f"Industry: {safe_join(row['industry_name'])} \n"
                f"Job Description: {row['description']}", axis=1)

## Vector Embedding

In [19]:
# Use pre-trained models to convert resumes and job descriptions into vectors:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

resume_embeddings = model.encode(resumes_df['resume_text'].tolist(), show_progress_bar=True)

job_embeddings = model.encode(jobs_df['job_text'].tolist(), show_progress_bar=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/299 [00:00<?, ?it/s]

Batches:   0%|          | 0/3973 [00:00<?, ?it/s]

## Building Vector Database

In [28]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl (13.7 MB)
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
   --------------- ------------------------ 5.2/13.7 MB 29.0 MB/s eta 0:00:01
   ---------------------------------------  13.6/13.7 MB 37.2 MB/s eta 0:00:01
   ---------------------------------------- 13.7/13.7 MB 33.1 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [30]:
# use FAISS as vector databases:
import faiss
import numpy as np

# Convert embeddings to float32 type
resume_embeddings = np.array(resume_embeddings).astype('float32')
job_embeddings = np.array(job_embeddings).astype('float32')

# Create FAISS index
resume_dimension = resume_embeddings.shape[1]
resume_index = faiss.IndexFlatL2(resume_dimension)
resume_index.add(resume_embeddings)

job_dimension = job_embeddings.shape[1]
job_index = faiss.IndexFlatL2(job_dimension)
job_index.add(job_embeddings)

faiss.write_index(resume_index, "resume_index.faiss")
faiss.write_index(job_index, "job_index.faiss")

## Similarity Calculation and Matching

In [123]:
from sklearn.metrics.pairwise import cosine_similarity

def get_safe_value(item, default_value):
    if isinstance(item, list) and len(item) > 0:
        return item[0]
    elif isinstance(item, str) and item:
        return item
    else:
        return default_value

def find_matching_resumes(job_id, top_k=5):
    """Find resumes that best match a specific job"""
    job_vector = job_embeddings[job_id].reshape(1, -1).astype('float32')
    
    distances, indices = resume_index.search(job_vector, top_k)
    
    matches = []
    for i, idx in enumerate(indices[0]):
        resume_vector = resume_embeddings[idx].reshape(1, -1)
        similarity_score = cosine_similarity(job_vector, resume_vector)[0][0]
        
        company = resumes_df.iloc[idx]['professional_company_names']
        position = resumes_df.iloc[idx]['positions']
        
        company_name = get_safe_value(company, "Unknown Company")
        position_name = get_safe_value(position, "Unknown Position")
        
        candidate_identifier = f"{company_name} - {position_name}"
        
        matches.append({
            'resume_id': idx,
            'similarity': similarity_score,
            'resume_text': resumes_df.iloc[idx]['resume_text'],
            'candidate_identifier': candidate_identifier
        })
    
    return matches


def find_matching_jobs(resume_id, top_k=5):
    """Find jobs that best match a specific resume"""
    resume_vector = resume_embeddings[resume_id].reshape(1, -1).astype('float32')
    
    distances, indices = job_index.search(resume_vector, top_k)
    
    matches = []
    for i, idx in enumerate(indices[0]):
        job_vector = job_embeddings[idx].reshape(1, -1)
        similarity_score = cosine_similarity(resume_vector, job_vector)[0][0]
        
        # Use job titles and industries as job identifiers
        title = jobs_df.iloc[idx]['title']
        
        industry = jobs_df.iloc[idx]['industry_name']
        industry_name = get_safe_value(industry, "Unknown Industry")
            
        job_identifier = f"{title} - {industry_name}"
        
        matches.append({
            'job_id': idx,
            'similarity': similarity_score,
            'job_title': title,
            'job_identifier': job_identifier
        })
    
    return matches

## Enhanced Match Result Generation

In [125]:
from openai import OpenAI

client = OpenAI(
    api_key="sk-3778094c3c034b5bbc65218e7f65f6b3",
    base_url="https://api.deepseek.com"
)

def generate_match_explanation(resume_text, job_text, similarity_score):
    """Use LLM to generate match explanations"""
    prompt = f"""
    Resume content:
    {resume_text}
    
    Job description:
    {job_text}
    
    Similarity score: {similarity_score:.2f}
    
    Please analyze the match between this resume and the job, indicating:
    1. Which skills and experiences of the candidate match the job requirements
    2. Key skills or experiences the candidate may be lacking
    3. Suggestions for how the candidate could adjust their resume to better match this position
    4. Provide a match rating from 1-10 with explanation
    """
    
    response = client.chat.completions.create(
        model="deepseek-reasoner",  
        messages=[
            {"role": "system", "content": "You are a professional recruitment consultant who specializes in analyzing resume-job matches."},
            {"role": "user", "content": prompt}
        ],
        temperature=1,
        max_tokens=800
    )
    
    return response.choices[0].message.content

## Building Retrieval Interface (optional)

In [38]:
!pip install fastapi uvicorn

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Using cached uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Using cached starlette-0.46.1-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
Using cached uvicorn-0.34.0-py3-none-any.whl (62 kB)
Using cached starlette-0.46.1-py3-none-any.whl (71 kB)
Installing collected packages: uvicorn, starlette, fastapi
Successfully installed fastapi-0.115.12 starlette-0.46.1 uvicorn-0.34.0


In [97]:
# Create a simple API interface to query matching results:
from fastapi import FastAPI, Query
from typing import List, Optional

app = FastAPI()

@app.get("/match/job/{job_id}")
def match_job(job_id: int, top_k: Optional[int] = 5, explain: Optional[bool] = False):
    """Find resumes matching a specific job"""
    matches = find_matching_resumes(job_id, top_k)
    
    if explain and matches:
        # Add explanation for each match
        for match in matches:
            job_text = jobs_df.iloc[job_id]['job_text']
            resume_text = match['resume_text']
            match['explanation'] = generate_match_explanation(
                resume_text, job_text, match['similarity']
            )
    
    return {
        "job": jobs_df.iloc[job_id]['title'],
        "matches": matches
    }

@app.get("/match/resume/{resume_id}")
def match_resume(resume_id: int, top_k: Optional[int] = 5, explain: Optional[bool] = False):
    """Find jobs matching a specific resume"""
    matches = find_matching_jobs(resume_id, top_k)
    
    if explain and matches:
        # Add explanation for each match
        for match in matches:
            resume_text = resumes_df.iloc[resume_id]['resume_text']
            job_text = match['job_text'] = jobs_df.iloc[match['job_id']]['job_text']
            match['explanation'] = generate_match_explanation(
                resume_text, job_text, match['similarity']
            )
    
    return {
        "candidate": f"Candidate_{resume_id}",
        "matches": matches
    }

In [99]:
# Only used when API services are needed
import uvicorn
import nest_asyncio

nest_asyncio.apply()

print("Start the API server, press Ctrl+C to stop...")
uvicorn.run(app, host="127.0.0.1", port=8000)

Start the API server, press Ctrl+C to stop...


INFO:     Started server process [5900]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [5900]


## System Evaluation

In [127]:
def evaluate_matching_system(test_cases):
    """Evaluate matching system performance"""
    precision_at_k = []
    recall_at_k = []
    
    for test in test_cases:
        job_id = test['job_id']
        relevant_resumes = set(test['relevant_resume_ids'])
        
        # Get system recommended resumes
        matches = find_matching_resumes(job_id, top_k=10)
        recommended_resumes = set([m['resume_id'] for m in matches])
        
        # Calculate precision and recall
        precision = len(relevant_resumes.intersection(recommended_resumes)) / len(recommended_resumes)
        recall = len(relevant_resumes.intersection(recommended_resumes)) / len(relevant_resumes)
        
        precision_at_k.append(precision)
        recall_at_k.append(recall)
    
    return {
        'avg_precision': sum(precision_at_k) / len(precision_at_k),
        'avg_recall': sum(recall_at_k) / len(recall_at_k)
    }

## Test

In [130]:
# Test matching functionality
if __name__ == "__main__":
    sample_job_id = 0
    sample_resume_id = 0
    
    print("===== Testing Job-to-Resume Matching =====")
    matching_resumes = find_matching_resumes(sample_job_id, top_k=10)
    print(f"Matching resumes for job '{jobs_df.iloc[sample_job_id]['title']}':")
    for i, match in enumerate(matching_resumes):
        print(f"{i+1}. Similarity: {match['similarity']:.4f} - {match['candidate_identifier']}")
    
    print("\n===== Testing Resume-to-Job Matching =====")
    matching_jobs = find_matching_jobs(sample_resume_id, top_k=10)
    print(f"Matching jobs for resume #{sample_resume_id}:")
    for i, match in enumerate(matching_jobs):
        print(f"{i+1}. Similarity: {match['similarity']:.4f} - {match['job_identifier']}")
    
    print("\n===== Testing Match Explanation =====")
    if matching_resumes:
        job_text = jobs_df.iloc[sample_job_id]['job_text']
        resume_text = matching_resumes[0]['resume_text']
        similarity = matching_resumes[0]['similarity']
        
        print("Generating match explanation...")
        explanation = generate_match_explanation(resume_text, job_text, similarity)
        print("Match explanation:")
        print(explanation)

===== Testing Job-to-Resume Matching =====
Matching resumes for job 'Marketing Coordinator':
1. Similarity: 0.5708 - ['FreshDirect', 'Marketing Science Associates'] - ['Full Stack Engineer Intern', 'Full Stack Engineer Intern']
2. Similarity: 0.5685 - ['Titli Foundation'] - ['System Developer']
3. Similarity: 0.5761 - ['N/A'] - ['N/A']
4. Similarity: 0.6001 - ['Flipkart'] - ['Intern']
5. Similarity: 0.5826 - ['ASM Management'] - ['Intern']
6. Similarity: 0.5394 - ['Titli Foundation'] - ['System Developer']
7. Similarity: 0.5516 - ['Biswa Pvt Ltd'] - ['Part-Time Analyst']
8. Similarity: 0.5443 - Unknown Company - Unknown Position
9. Similarity: 0.5386 - ['RNT Laboratories'] - ['Intern Trainee']
10. Similarity: 0.5678 - ['DKB Innovations'] - ['Python Intern']

===== Testing Resume-to-Job Matching =====
Matching jobs for resume #0:
1. Similarity: 0.7597 - Big Data Developer - ['Information Services']
2. Similarity: 0.7438 - Data Analyst: 24-00481 - ['IT Services and IT Consulting', 'Softw

## About saving and next loading

In [134]:
# Step 1: Save the Current RAG System
# Save the complete RAG system
import pickle
import faiss
import numpy as np

# 1. Save FAISS indexes
faiss.write_index(resume_index, "resume_index.faiss")
faiss.write_index(job_index, "job_index.faiss")
print("FAISS indexes saved")

# 2. Save embedding vectors
np.save("resume_embeddings.npy", resume_embeddings)
np.save("job_embeddings.npy", job_embeddings)
print("Embedding vectors saved")

# 3. Save dataframes
rag_data = {
    "resumes_df": resumes_df,
    "jobs_df": jobs_df
}

with open("rag_data.pkl", 'wb') as f:
    pickle.dump(rag_data, f)
print("Dataframes saved")

print("RAG system saving completed!")

FAISS indexes saved
Embedding vectors saved
Dataframes saved
RAG system saving completed!


In [1]:
# Step 2: Load the System Next Time
# Load the previously saved RAG system
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# 1. Load dataframes
with open("rag_data.pkl", 'rb') as f:
    rag_data = pickle.load(f)

resumes_df = rag_data["resumes_df"]
jobs_df = rag_data["jobs_df"]
print(f"Loaded {len(resumes_df)} resumes and {len(jobs_df)} jobs")

# 2. Load embedding vectors
resume_embeddings = np.load("resume_embeddings.npy")
job_embeddings = np.load("job_embeddings.npy")
print("Embedding vectors loaded")

# 3. Load FAISS indexes
resume_index = faiss.read_index("resume_index.faiss")
job_index = faiss.read_index("job_index.faiss")
print("FAISS indexes loaded")

# 4. Load embedding model (if you need to create new embeddings)
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Embedding model loaded")

print("RAG system loading completed, ready to use matching functions!")

Loaded 9544 resumes and 127125 jobs
Embedding vectors loaded
FAISS indexes loaded
Embedding model loaded
RAG system loading completed, ready to use matching functions!


In [None]:
# Step 3: Use the Matching Functions
# e.g. Find resumes matching a specific job
job_id = 1  # Replace with the job ID you want to query
matching_resumes = find_matching_resumes(job_id, top_k=5)
print(f"Resumes matching job {job_id}:")
print(matching_resumes)

# e.g. Find jobs matching a specific resume
resume_id = 1  # Replace with the resume ID you want to query
matching_jobs = find_matching_jobs(resume_id, top_k=5)
print(f"Jobs matching resume {resume_id}:")
print(matching_jobs)