# Career Recommender System - Quick Demo

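This notebook demonstrates the career recommendation pipeline on synthetic data. It covers path generation and skill gap analysis (including learning plan generation); the resource recommendation stage is not implemented yet and is listed under next steps at the end.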

## Overview

The system consists of three stages:
1. **Path Generation**: BERT4Rec model generates candidate career paths
2. **Skill Gap Analysis**: Analyzes feasibility based on user skills and ESCO knowledge
3. **Resource Recommendation**: Suggests learning resources for skill gaps

Let's walk through the implemented components with synthetic data.

In [None]:
# Setup and imports
import os
import random
import sys
import traceback
from pathlib import Path

# Make the project's `src` directory importable.
# NOTE(review): `.parent` assumes the kernel's working directory is a direct
# child of the project root (e.g. a `notebooks/` folder) — confirm.
project_root = Path.cwd().parent
src_dir = str(project_root / 'src')
# Guard the append so re-running this cell does not pile up duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

import pandas as pd
import numpy as np
import torch
import json
from typing import List, Dict, Set

# Import our modules
from ingest.download_and_prepare import create_sample_data, DataIngestionPipeline
from ingest.esco_loader import create_esco_loader
from ingest.text_to_esco_mapper import create_text_mapper
from models.bert4rec import BERT4RecConfig, create_bert4rec_model
from models.train_path_model import create_synthetic_data, CareerSequenceDataset
from reasoner.skill_gap import create_skill_gap_analyzer

print("Imports successful!")

## Step 1: Create Synthetic Data

First, let's create some synthetic data to work with.

In [None]:
# Generate the sample ESCO data used by the rest of the demo.
print("Creating sample ESCO data...")
create_sample_data()

# Execute the ingestion pipeline; `run()` is treated as a truthy/falsy
# success flag below.
print("Running data ingestion pipeline...")
pipeline = DataIngestionPipeline()
success = pipeline.run()

status_message = (
    "✅ Data ingestion completed successfully!"
    if success
    else "❌ Data ingestion failed!"
)
print(status_message)

In [None]:
# Check the processed data: list every Parquet file with its row count and
# column names.
processed_dir = Path("../data/processed")

# Sort the matches: glob order depends on the filesystem, and sorting keeps the
# cell output stable across runs and machines.
parquet_files = sorted(processed_dir.glob("*.parquet"))

print("Processed data files:")
if not parquet_files:
    # Without this, a missing or empty directory prints only the header and
    # the failure surfaces later in a less obvious place.
    print(f"  (no .parquet files found in {processed_dir})")

for file in parquet_files:
    df = pd.read_parquet(file)
    print(f"  {file.name}: {len(df)} records")
    print(f"    Columns: {list(df.columns)}")
    print()

## Step 2: Initialize ESCO Knowledge Graph

Load the ESCO knowledge graph for skill analysis.

In [None]:
# Build the ESCO loader from the processed data directory.
print("Initializing ESCO knowledge graph...")
esco_loader = create_esco_loader("../data/processed")

# Smoke-test the three query helpers with IDs from the sample data.
# Each call is printed right after it runs so the output order matches the
# call order.
print("\nTesting ESCO functions:")

# Skills attached to one occupation
job_skills = esco_loader.get_job_skills("occ_001")
print(f"Skills for job 'occ_001': {job_skills}")

# Parent skills of one skill
skill_parents = esco_loader.get_skill_parents("skill_001")
print(f"Parents for skill 'skill_001': {skill_parents}")

# Distance between two skills
distance = esco_loader.get_skill_distance("skill_001", "skill_002")
print(f"Distance between skill_001 and skill_002: {distance}")

## Step 3: Text-to-ESCO Mapping

Test the text mapping functionality.

In [None]:
def _show_matches(matches):
    """Print one indented line per match: title, ESCO id and score (3 decimals)."""
    for match in matches:
        print(f"  {match['title']} (ID: {match['esco_id']}, Score: {match['score']:.3f})")


# Build the text-to-ESCO mapper and try one occupation query and one skill
# query. Any failure (construction or querying) falls back to `text_mapper = None`
# so the remaining cells can still run.
print("Initializing text-to-ESCO mapper...")
try:
    text_mapper = create_text_mapper("../data/processed")

    # Free-text job title -> top 3 occupation matches
    print("\nTesting occupation mapping:")
    job_matches = text_mapper.map_text_to_occupations("software developer", top_k=3)
    _show_matches(job_matches)

    # Free-text skill -> top 3 skill matches
    print("\nTesting skill mapping:")
    skill_matches = text_mapper.map_text_to_skills("programming", top_k=3)
    _show_matches(skill_matches)

except Exception as e:
    print(f"Text mapper initialization failed (likely missing dependencies): {e}")
    print("Continuing with manual mapping...")
    text_mapper = None

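## Step 4: BERT4Rec Model Creation and Inference (Synthetic Data)

Create a small BERT4Rec model sized for synthetic career sequences and run inference with it. Training is skipped in this demo, so the model uses its initial weights and the recommendations it prints are not meaningful; they only show that the inference path works.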

In [None]:
# Seed the RNGs the demo may touch so that Restart & Run All reproduces the
# same sequences and the same initial model weights.
# NOTE(review): which RNG create_synthetic_data draws from is not visible in
# this notebook; seeding all three covers the usual candidates — confirm
# against its implementation.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create synthetic career sequences together with the job -> id vocabulary.
print("Creating synthetic career sequences...")
sequences, job_to_id = create_synthetic_data(num_sequences=100, vocab_size=20)

print(f"Created {len(sequences)} sequences")
print(f"Vocabulary size: {len(job_to_id)}")
print(f"Sample sequence: {sequences[0]}")
# Show only the first 10 vocabulary entries to keep the output short.
print(f"Job vocabulary: {list(job_to_id.keys())[:10]}...")

In [None]:
# Build a deliberately tiny BERT4Rec model: the width and depth are kept small
# for the demo, and the vocabulary size comes from the synthetic job mapping.
print("Creating BERT4Rec model...")
config = BERT4RecConfig(
    vocab_size=len(job_to_id),
    d_model=64,
    n_layers=2,
    n_heads=4,
    max_seq_len=20,
)
model = create_bert4rec_model(config)

# Total number of scalar parameters across all parameter tensors.
param_count = sum(p.numel() for p in model.parameters())
print(f"Model created with {param_count} parameters")

In [None]:
# Test model inference (without training for demo)
print("Testing model inference...")

# Number of recommendations to request; also used in the printed heading so
# the two cannot drift apart.
TOP_K = 5

# Create a sample input sequence: a batch of one holding the first 5 jobs.
sample_sequence = torch.tensor([sequences[0][:5]], dtype=torch.long)
print(f"Input sequence: {sample_sequence}")

# Generate next job recommendations with gradients disabled.
model.eval()
with torch.no_grad():
    recommendations = model.generate_next_jobs(sample_sequence, top_k=TOP_K)

# Invert the vocabulary once instead of rescanning it for every recommendation.
# `setdefault` keeps the first name seen for an id, matching the previous
# "first match wins" lookup.
id_to_job = {}
for name, idx in job_to_id.items():
    id_to_job.setdefault(idx, name)

print(f"\nTop {TOP_K} next job recommendations:")
for rank, (job_id, prob) in enumerate(recommendations[0], start=1):
    # The ids may come back as tensor/NumPy scalars, which do not hash like
    # plain ints; unwrap them before the dictionary lookup.
    lookup_key = job_id.item() if hasattr(job_id, "item") else job_id
    job_name = id_to_job.get(lookup_key, f"job_{job_id}")
    print(f"  {rank}. {job_name} (ID: {job_id}, Prob: {prob:.3f})")

## Step 5: Skill Gap Analysis

Test the skill gap reasoning module.

In [None]:
# Initialize skill gap analyzer
print("Initializing skill gap analyzer...")

# Reset the results first: if this cell fails, later cells must see "nothing
# available" rather than stale objects left over from an earlier execution.
skill_analyzer = None
analyses = []

try:
    skill_analyzer = create_skill_gap_analyzer("../configs/system_config.yaml")

    # Define a sample user with some skills
    user_skills = {'skill_001', 'skill_002'}  # User has Python and Data Analysis
    print(f"User skills: {user_skills}")

    # Define candidate career paths with mock probabilities.
    # Each entry is (list of occupation ids, model probability).
    candidate_paths = [
        (['occ_001', 'occ_002'], 0.8),  # Software Engineer -> Data Scientist
        (['occ_001', 'occ_003'], 0.6),  # Software Engineer -> Marketing Manager
        (['occ_002', 'occ_003'], 0.4),  # Data Scientist -> Marketing Manager
    ]

    print(f"\nCandidate paths: {candidate_paths}")

    # Analyze all paths
    analyses = skill_analyzer.analyze_multiple_paths(user_skills, candidate_paths)

    print("\nPath Analysis Results:")
    for i, analysis in enumerate(analyses):
        print(f"\nPath {i+1}: {' -> '.join(analysis.path)}")
        print(f"  Model Probability: {analysis.model_prob:.3f}")
        print(f"  Feasibility Score: {analysis.feasibility_score:.3f}")
        print(f"  Combined Score: {analysis.combined_score:.3f}")
        print(f"  Total Missing Skills: {analysis.total_missing_skills}")

        # Show per-job gaps
        for job_id, gap in analysis.per_job_gaps.items():
            print(f"    {job_id}: {len(gap.missing_skills)} missing skills, gap score: {gap.gap_score:.3f}")

except Exception as e:
    # Best-effort demo step: report the failure with a full traceback and let
    # the notebook continue.
    print(f"Skill gap analyzer failed: {e}")
    traceback.print_exc()

## Step 6: Generate Learning Plan

Create a learning plan for the best career path.

In [None]:
# Generate learning plan for the best path.
# `globals().get` tolerates the analysis cell never having run (or having
# failed) without relying on `locals()` introspection.
available_analyses = globals().get('analyses')

if available_analyses:
    # Pick the best path by score explicitly rather than trusting list order.
    # `max` returns the first maximal element, so when the list is already
    # sorted best-first this selects the same entry as `analyses[0]`.
    best_path = max(available_analyses, key=lambda analysis: analysis.combined_score)

    print(f"Learning Plan for Best Path: {' -> '.join(best_path.path)}")
    print(f"Combined Score: {best_path.combined_score:.3f}")

    learning_plan = skill_analyzer.get_learning_plan(best_path)

    if learning_plan:
        # The plan is iterated as job id -> ordered skills to learn.
        print("\nRecommended Learning Sequence:")
        for job_id, skills_to_learn in learning_plan.items():
            print(f"\nFor {job_id}:")
            for i, skill_id in enumerate(skills_to_learn, 1):
                print(f"  {i}. {skill_id}")
    else:
        print("\n✅ No additional skills needed - you're ready for this path!")

    # Generate explanation
    explanation = skill_analyzer.explain_path_feasibility(best_path)
    print("\nPath Feasibility Explanation:")
    print(f"  Overall Score: {explanation['overall_score']:.3f}")
    print(f"  Model Confidence: {explanation['model_confidence']:.3f}")
    print(f"  Feasibility: {explanation['feasibility']:.3f}")
    print(f"  Total Missing Skills: {explanation['total_missing_skills']}")

    if explanation['recommendations']:
        print("\nRecommendations:")
        for rec in explanation['recommendations']:
            print(f"  • {rec}")
else:
    print("No path analyses available")

## Step 7: Summary

Let's summarize what we've demonstrated.

In [None]:
# Closing summary, printed one entry per line. Entries that start with "\n"
# open a new section with a blank line above it.
summary_lines = (
    "🎉 Career Recommender System Demo Complete!",
    # What the notebook covered
    "\nWhat we demonstrated:",
    "✅ Data ingestion pipeline with synthetic ESCO data",
    "✅ ESCO knowledge graph loading and querying",
    "✅ Text-to-ESCO mapping (if dependencies available)",
    "✅ BERT4Rec model creation and inference",
    "✅ Skill gap analysis and feasibility scoring",
    "✅ Learning plan generation",
    "✅ Path feasibility explanation",
    # Follow-up work
    "\nNext steps to run with real data:",
    "1. Replace synthetic data with real Karrierewege and ESCO datasets",
    "2. Train BERT4Rec model on real career sequences",
    "3. Build resource recommendation system (Stage 3)",
    "4. Create Streamlit web application",
    "5. Set up FastAPI backend",
    "6. Implement evaluation framework",
    # Where to look for artifacts
    "\n📁 Check the following directories for generated files:",
    "  - data/raw/ - Sample input data",
    "  - data/processed/ - Processed Parquet files",
    "  - configs/ - Configuration files",
)

for line in summary_lines:
    print(line)