In [47]:
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re
import nltk
import os

In [48]:
# Set NLTK data path
nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')
os.environ['NLTK_DATA'] = nltk_data_path

if not os.path.exists(nltk_data_path):
    os.makedirs(nltk_data_path)

try:
    nltk.data.find('tokenizers/punkt', paths=[nltk_data_path])
except LookupError:
    nltk.download('punkt', download_dir=nltk_data_path, quiet=True)
    
try:
    nltk.data.find('corpora/stopwords', paths=[nltk_data_path])
except LookupError:
    nltk.download('stopwords', download_dir=nltk_data_path, quiet=True)
    
try:
    nltk.data.find('corpora/wordnet', paths=[nltk_data_path])
except LookupError:
    nltk.download('wordnet', download_dir=nltk_data_path, quiet=True)

nltk.data.path.append(nltk_data_path)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [49]:
# Initialize models
bi_encoder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

In [50]:
# Initialize preprocessing tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [51]:
# Configuration (as integers)
TITLE_WEIGHT = 2  # Now integer
SKILLS_WEIGHT = 2  # Now integer
DESC_WEIGHT = 1    # Now integer
ALPHA = 0.7
MIN_SIMILARITY = 0.4
RERANK_TOP_N = 10

In [52]:
def preprocess_text(text):
    """Clean and normalize text for better embeddings"""
    if not text or text.strip() == "":
        return ""
    
    # Lowercase
    text = text.lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize and remove stopwords
    try:
        tokens = nltk.word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        
        # Lemmatization
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    except:
        # Fallback to simple splitting
        tokens = text.split()
    
    return " ".join(tokens)

In [53]:
def create_weighted_text(item, item_type):
    """Create weighted text representation based on item type"""
    if item_type == "job":
        title = item.get('title', '')
        description = item.get('description', '')
        skills = item.get('skills', [])
    else:  # service
        title = item.get('title', '')
        description = item.get('description', '')
        skills = item.get('tags', [])  # Using tags for services
    
    # Create weighted representation
    weighted_terms = []
    
    # Add title words multiple times based on weight
    for word in title.split():
        weighted_terms.extend([word] * TITLE_WEIGHT)
    
    # Add skills multiple times based on weight
    for skill in skills:
        weighted_terms.extend([skill] * SKILLS_WEIGHT)
    
    # Add description words
    weighted_terms.extend(description.split())
    
    return " ".join(weighted_terms)

def extract_skills(text):
    """Simple skill extractor"""
    skills = set()
    # Look for capitalized words that look like skills
    for word in re.findall(r'\b[A-Z][a-z]{2,}\b', text):
        if word.lower() not in stop_words and len(word) > 3:
            skills.add(word)
    return list(skills)

In [54]:
def recommend(cv_text, jobs, services, top_n=5):
    # 1. Preprocess CV and extract skills
    cv_processed = preprocess_text(cv_text)
    print(f"Processed CV: {cv_processed[:200]}...")
    cv_skills = extract_skills(cv_text)
    print(f"Extracted CV skills: {cv_skills}")
    
    # 2. Create job representations
    job_texts = []
    job_skill_lists = []
    for job in jobs:
        job_text = create_weighted_text(job, "job")
        job_texts.append(job_text)
        job_skill_lists.append(job.get('skills', []))
    
    # 3. Create service representations
    service_texts = []
    service_skill_lists = []
    for service in services:
        service_text = create_weighted_text(service, "service")
        service_texts.append(service_text)
        service_skill_lists.append(service.get('tags', []))
    
    # 4. Calculate skill match scores
    job_skill_scores = []
    for skills in job_skill_lists:
        if not skills:
            job_skill_scores.append(0)
            continue
        common = set(cv_skills) & set(skills)
        job_skill_scores.append(len(common) / len(skills))
    
    service_skill_scores = []
    for skills in service_skill_lists:
        if not skills:
            service_skill_scores.append(0)
            continue
        common = set(cv_skills) & set(skills)
        service_skill_scores.append(len(common) / len(skills))
    
    # 5. TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    all_texts = [cv_processed] + job_texts + service_texts
    tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)
    cv_tfidf = tfidf_matrix[0]
    job_tfidfs = tfidf_matrix[1:1+len(jobs)]
    service_tfidfs = tfidf_matrix[1+len(jobs):]
    
    # 6. Bi-encoder Embeddings
    cv_embedding = bi_encoder.encode(cv_processed, convert_to_tensor=True)
    job_embeddings = bi_encoder.encode(job_texts, convert_to_tensor=True)
    service_embeddings = bi_encoder.encode(service_texts, convert_to_tensor=True)
    
    # 7. Calculate similarities
    # Semantic similarities
    job_scores_semantic = util.cos_sim(cv_embedding, job_embeddings)[0].cpu().numpy()
    service_scores_semantic = util.cos_sim(cv_embedding, service_embeddings)[0].cpu().numpy()
    
    # Keyword similarities
    job_scores_keyword = np.array([(cv_tfidf @ job_tfidfs[i].T).toarray()[0][0] 
                                 for i in range(len(jobs))])
    service_scores_keyword = np.array([(cv_tfidf @ service_tfidfs[i].T).toarray()[0][0] 
                                     for i in range(len(services))])
    
    # Hybrid scores
    job_scores_hybrid = (ALPHA * job_scores_semantic + 
                         (1 - ALPHA) * job_scores_keyword)
    service_scores_hybrid = (ALPHA * service_scores_semantic + 
                             (1 - ALPHA) * service_scores_keyword)
    
    # Add skill bonus
    job_scores_hybrid += 0.2 * np.array(job_skill_scores)
    service_scores_hybrid += 0.2 * np.array(service_skill_scores)
    
    # 8. Cross-encoder re-ranking (for top candidates)
    def rerank(candidates, texts, scores, top_n):
        # Get top candidates for re-ranking
        top_indices = np.argsort(-scores)[:RERANK_TOP_N]
        pairs = [(cv_processed, texts[i]) for i in top_indices]
        rerank_scores = cross_encoder.predict(pairs)
        
        # Update scores
        updated_scores = scores.copy()
        for idx, score in zip(top_indices, rerank_scores):
            updated_scores[idx] = 0.7 * updated_scores[idx] + 0.3 * score
        
        # Get final top indices
        final_indices = np.argsort(-updated_scores)[:top_n]
        return [candidates[i]['id'] for i in final_indices 
                if updated_scores[i] > MIN_SIMILARITY]
    
    # 9. Get recommendations
    recommended_job_ids = rerank(jobs, job_texts, job_scores_hybrid, top_n)
    recommended_service_ids = rerank(services, service_texts, service_scores_hybrid, top_n)
    
    print(f"Recommended job IDs: {recommended_job_ids}")
    print(f"Recommended service IDs: {recommended_service_ids}")
    
    return {
        "recommendedJobs": recommended_job_ids,
        "recommendedServices": recommended_service_ids
    }

In [55]:
# Sample data
developer_cv = """
Experienced full-stack developer with expertise in React, Node.js, PostgreSQL, and cloud platforms.
Passionate about scalable backend systems and responsive frontend interfaces. 
Worked on SaaS platforms and internal tools for data analysis.
"""

job_posts = [
    {
        "id": "job1",
        "title": "Full Stack Developer",
        "description": "Looking for a React and Node.js developer to build scalable applications.",
        "skills": ["React", "Node.js", "PostgreSQL"]
    },
    {
        "id": "job2",
        "title": "Backend Engineer",
        "description": "Seeking expert in Python and Django for backend API development.",
        "skills": ["Python", "Django", "APIs"]
    },
    {
        "id": "job3",
        "title": "Frontend Developer",
        "description": "We need someone skilled in Vue.js and TailwindCSS for UI work.",
        "skills": ["Vue.js", "TailwindCSS", "UI"]
    },
    {
        "id": "job4",
        "title": "Cloud Infrastructure Engineer",
        "description": "Design and implement cloud solutions on AWS and Azure.",
        "skills": ["AWS", "Azure", "Cloud", "DevOps"]
    }
]

service_posts = [
    {
        "id": "service1",
        "title": "React Mentor",
        "description": "Offering 1:1 React mentorship and live project reviews.",
        "skills": ["React", "Mentorship", "Frontend"]
    },
    {
        "id": "service2",
        "title": "Database Consultant",
        "description": "Helping startups optimize PostgreSQL queries and schema design.",
        "skills": ["PostgreSQL", "Database", "Performance"]
    },
    {
        "id": "service3",
        "title": "DevOps Setup",
        "description": "Set up CI/CD pipelines and Docker containers for fast deployment.",
        "skills": ["DevOps", "CI/CD", "Docker"]
    },
    {
        "id": "service4",
        "title": "Node.js Performance Tuning",
        "description": "Optimize your Node.js backend for high throughput and low latency.",
        "skills": ["Node.js", "Performance", "Backend"]
    }
]

In [56]:
# Run recommendation
print("Starting recommendation...")
results = recommend(developer_cv, job_posts, service_posts, top_n=3)
print("\nFinal Recommendations:")
print(results)

Starting recommendation...
Processed CV: experienced fullstack developer with expertise in react nodejs postgresql and cloud platforms passionate about scalable backend systems and responsive frontend interfaces worked on saas platforms and ...
Extracted CV skills: ['Worked', 'Experienced', 'React', 'Node', 'Passionate']
Recommended job IDs: ['job1']
Recommended service IDs: []

Final Recommendations:
{'recommendedJobs': ['job1'], 'recommendedServices': []}
