In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

# Data lives in a "data" folder next to wherever the notebook kernel was started
DATA_PATH = Path.cwd().joinpath("data")
job_profiles = pd.read_csv(DATA_PATH.joinpath("job_profiles_cleaned.csv"))

# Detect encoded skills
def get_encoded_skill_columns():
    """
    Collect the names of the encoded skill columns in ``job_profiles``.

    A column counts as an encoded skill when its name starts with
    "Skill List_". Names are returned in the frame's column order.
    """
    prefix = "Skill List_"
    encoded_columns = []
    for column_name in job_profiles.columns:
        if column_name.startswith(prefix):
            encoded_columns.append(column_name)
    return encoded_columns

# Computed once when this cell runs; compute_similarity reads this module-level
# list, so re-run this line if job_profiles is reloaded with different columns.
skill_cols = get_encoded_skill_columns()

# Function to compute similarity score
def compute_similarity(user_profile):
    """
    Score every job profile against a user profile with a weighted hybrid metric.

    The final score is
    ``0.5 * RIASEC cosine similarity + 0.3 * education score + 0.2 * skill
    cosine similarity``. When no skills can be compared (none selected, or
    no encoded skill columns exist) the skill term is a neutral 0.5 for
    every job.

    Parameters
    ----------
    user_profile : dict
        Must provide numeric values under "R", "I", "A", "S", "E" and "C".
        May provide "Selected Skills", a collection of skill names that are
        matched against the ``Skill List_*`` column names with the prefix
        stripped. A missing or empty entry means "no skills selected".

    Returns
    -------
    pandas.DataFrame
        A copy of the module-level ``job_profiles`` with the columns
        "education_similarity" and "Hybrid_Similarity_Score" added. The
        module-level frame itself is left untouched, so calls are
        independent of each other.

    Raises
    ------
    KeyError
        If one of the six RIASEC keys is missing from ``user_profile``.
    """
    riasec_keys = ["R", "I", "A", "S", "E", "C"]

    # Score on a copy: writing into the shared frame would leak columns and
    # NaN fills into every other cell that reads job_profiles, and would make
    # results of successive calls alias the same object.
    scored_jobs = job_profiles.copy()

    user_riasec_vector = np.array(
        [user_profile[key] for key in riasec_keys]
    ).reshape(1, -1)
    job_riasec_vectors = scored_jobs[riasec_keys].values
    riasec_similarities = cosine_similarity(user_riasec_vector, job_riasec_vectors).flatten()

    scored_jobs["education_similarity"] = scored_jobs["Normalized_Education_Score"]

    # .get() so a profile without the key falls through to the neutral branch
    # instead of raising.
    selected_skills = user_profile.get("Selected Skills")

    if selected_skills and skill_cols:
        # Missing one-hot entries mean "skill not listed for this job".
        scored_jobs[skill_cols] = scored_jobs[skill_cols].fillna(0)
        user_skills_binary = np.array([
            1 if skill.replace("Skill List_", "") in selected_skills else 0
            for skill in skill_cols
        ])
        job_skills_matrix = scored_jobs[skill_cols].values
        skill_similarity = cosine_similarity([user_skills_binary], job_skills_matrix).flatten()
    else:
        # Nothing to compare: give every job the same neutral skill score.
        skill_similarity = np.full(len(scored_jobs), 0.5)

    final_score = (
        0.5 * riasec_similarities +
        0.3 * scored_jobs["education_similarity"].values +
        0.2 * skill_similarity
    )

    scored_jobs["Hybrid_Similarity_Score"] = final_score

    return scored_jobs


# Generate recommendations
def generate_recommendations(user_profile, top_n=10):
    """
    Rank all job profiles for ``user_profile`` and keep the best ``top_n``.

    Jobs are ordered by "Hybrid_Similarity_Score" (highest first) as computed
    by ``compute_similarity``. Only the title, description, score, most
    common education and required skills columns are returned.
    """
    output_columns = [
        "Title", "Description", "Hybrid_Similarity_Score",
        "Most Common Education", "Required_Skills"
    ]

    ranked_jobs = compute_similarity(user_profile).sort_values(
        "Hybrid_Similarity_Score", ascending=False
    )
    top_matches = ranked_jobs.head(top_n).copy()

    return top_matches[output_columns]



In [5]:
# Example input: one score per RIASEC letter plus a list of skill names.
# NOTE(review): scores look like they are on a 0-1 scale — confirm against
# whatever produces real user profiles.
sample_user_profile = {
    "R": 0.2,
    "I": 0.9,
    "A": 0.4,
    "S": 0.6,
    "E": 0.1,
    "C": 0.3,
    # Compared against the "Skill List_*" column names with the prefix removed.
    "Selected Skills": ["Critical Thinking", "Project Management"]
}

# top_n defaults to 10, so ten rows come back; head() displays the first five.
recommendations = generate_recommendations(sample_user_profile)
recommendations.head()


Unnamed: 0,Title,Description,Hybrid_Similarity_Score,Most Common Education,Required_Skills
416,Genetic Counselors,Assess individual or family risk for a variety...,0.780592,Master's Degree,"{'Speaking', 'Instructing', 'Programming', 'Te..."
256,"Anthropology and Archeology Teachers, Postseco...",Teach courses in anthropology or archeology. I...,0.765102,Doctoral Degree,"{'Speaking', 'Instructing', 'Programming', 'Te..."
201,Clinical Neuropsychologists,Assess and diagnose patients with neurobehavio...,0.758384,Post-Doctoral Training,"{'Speaking', 'Instructing', 'Programming', 'Te..."
260,"Political Science Teachers, Postsecondary","Teach courses in political science, internatio...",0.757086,Doctoral Degree,"{'Speaking', 'Instructing', 'Programming', 'Te..."
369,Audiologists,Assess and treat persons with hearing and rela...,0.753696,Doctoral Degree,"{'Speaking', 'Instructing', 'Programming', 'Te..."
