In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# Load each table together with its embedding matrix
# -----------------------------
# The recommendation function below pairs table rows and embedding rows by
# position, so each .npy file is presumably row-aligned with its CSV
# (TODO confirm against the notebook that produced the embeddings).

# Job postings
jobs = pd.read_csv("combined_jobs_cleaned.csv")
job_embeddings = np.load("job_embeddings.npy")

# Coursera catalogue
courses = pd.read_csv("coursera_cleaned.csv")
course_embeddings = np.load("course_embeddings.npy")

# University programme catalogue
uni_courses = pd.read_csv("uni_courses_aligned.csv")
uni_course_embeddings = np.load("uni_course_embeddings.npy")

# -----------------------------
# Normalise career labels
# -----------------------------
def normalise_career(label):
    """Map a free-text career label onto a canonical career-group key.

    Non-string input (e.g. NaN from a missing CSV cell) yields ``""``.
    Otherwise the label is lower-cased and checked against the keyword rules
    below in order; the first rule with any keyword contained in the label
    wins. A label matching no rule is returned lower-cased but otherwise
    unchanged.
    """
    if not isinstance(label, str):
        return ""

    lowered = label.lower()

    # Order matters: earlier rules take priority when several keywords occur.
    keyword_rules = (
        (("software",), "software_engineering"),
        (("data analyst",), "data_analytics"),
        (("data scientist",), "data_science"),
        (("network",), "network_engineering"),
        (("health", "medical", "nurse"), "healthcare"),
    )
    for keywords, group in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return group

    return lowered

# Derive the canonical career group of every job posting from its raw label.
jobs["career_group"] = jobs["career_label"].map(normalise_career)

# -----------------------------
# Recommendation function
# -----------------------------
def recommend_courses_for_career(career_group, top_n=5):
    """Recommend Coursera courses and university programmes for a career group.

    The embeddings of all jobs whose ``career_group`` equals the requested
    value are averaged into one profile vector, and each catalogue is ranked
    by cosine similarity to that vector.

    Parameters
    ----------
    career_group : str
        Value compared for exact equality against ``jobs["career_group"]``.
    top_n : int, default 5
        Number of highest-scoring rows to keep from each catalogue.

    Returns
    -------
    dict or None
        ``{"Coursera Courses": DataFrame, "University Programmes": DataFrame}``,
        each ordered by descending similarity with a fresh 0-based index.
        ``None`` is returned (after printing a message) when no job belongs to
        ``career_group``.

    Notes
    -----
    Reads the module-level ``jobs``, ``courses``, ``uni_courses`` frames and
    their embedding arrays. Table rows and embedding rows are paired by
    position — assumes each array is row-aligned with its table (TODO confirm).
    """
    # Select matching jobs by row *position* rather than by index label.
    # job_embeddings is a plain ndarray, so index labels only address the right
    # rows while `jobs` still carries its default 0..n-1 index; positions stay
    # correct even after the frame has been filtered, sorted or re-indexed.
    is_match = (jobs["career_group"] == career_group).to_numpy()
    job_positions = np.flatnonzero(is_match)

    if len(job_positions) == 0:
        print(f"No jobs found for career group: {career_group}")
        return None

    # One profile vector representing the whole career group.
    avg_job_embedding = job_embeddings[job_positions].mean(axis=0)

    # Coursera recommendations: rank every course against the profile vector.
    coursera_scores = cosine_similarity(
        [avg_job_embedding], course_embeddings
    )[0]
    # argsort is ascending, so reverse before taking the top_n best matches.
    coursera_top = coursera_scores.argsort()[::-1][:top_n]
    coursera_recs = courses.iloc[coursera_top][
        ["course_title", "course_organization", "course_difficulty", "course_rating"]
    ]

    # University recommendations: same ranking against the programme catalogue.
    uni_scores = cosine_similarity(
        [avg_job_embedding], uni_course_embeddings
    )[0]
    uni_top = uni_scores.argsort()[::-1][:top_n]
    uni_recs = uni_courses.iloc[uni_top][
        ["course_name", "University", "Specialization"]
    ]

    return {
        "Coursera Courses": coursera_recs.reset_index(drop=True),
        "University Programmes": uni_recs.reset_index(drop=True)
    }


In [7]:
# Top-5 recommendations for the two career groups of interest:
# software engineering first, then data analytics.
se_results, da_results = (
    recommend_courses_for_career(group, top_n=5)
    for group in ("software_engineering", "data_analytics")
)

se_results, da_results


({'Coursera Courses':                                         course_title  \
  0             Excel/VBA for Creative Problem Solving   
  1  Java Programming and Software Engineering Fund...   
  2                          Excel Skills for Business   
  3     Excel/VBA for Creative Problem Solving, Part 1   
  4                              What is Data Science?   
  
                course_organization course_difficulty  course_rating  
  0  University of Colorado Boulder          Beginner            4.8  
  1                 Duke University          Beginner            4.6  
  2            Macquarie University          Beginner            4.9  
  3  University of Colorado Boulder          Beginner            4.8  
  4                             IBM          Beginner            4.7  ,
  'University Programmes':                                          course_name University  \
  0                Bachelor of Computer Science (Hons)        UOW   
  1    BA132 - Diploma in Office Techno

In [1]:
# NOTE: this cell needs `se_results` and `da_results` from the recommendation
# cell; running it on a fresh kernel before that cell raises NameError.

def _export_recommendations(career_slug, results):
    """Write one career group's recommendation tables to CSV files.

    Parameters
    ----------
    career_slug : str
        Prefix of the output files, which are named
        ``<career_slug>_coursera_recs.csv`` and
        ``<career_slug>_university_recs.csv``.
    results : dict or None
        Value returned by ``recommend_courses_for_career``. ``None`` (no jobs
        matched the career group) is skipped with a message instead of
        failing on the subscript.
    """
    if results is None:
        print(f"No recommendations to export for: {career_slug}")
        return

    # index=False: the frames carry only a 0-based positional index.
    results["Coursera Courses"].to_csv(
        f"{career_slug}_coursera_recs.csv", index=False
    )
    results["University Programmes"].to_csv(
        f"{career_slug}_university_recs.csv", index=False
    )


_export_recommendations("software_engineering", se_results)
_export_recommendations("data_analytics", da_results)


NameError: name 'se_results' is not defined