In [2]:

# O*NET Career Path Recommender
import os
import glob
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity

print("imports finished")

imports finished


In [3]:
# Step 1: Data Preprocessing
# Load and clean O*NET data files

def load_onet_data():
    """Load O*NET data files and prepare for modeling"""
    base_dir = "db_29_3_text/db_29_3_text"
    
    # Load Work Values
    work_values = pd.read_csv(f"{base_dir}/Work Values.txt", sep='\t', encoding='utf-8')
    print(f"Work Values loaded: {work_values.shape}")
    
    # Load Interests (RIASEC)
    interests = pd.read_csv(f"{base_dir}/Interests.txt", sep='\t', encoding='utf-8')
    print(f"Interests loaded: {interests.shape}")
    
    # Load Work Styles
    work_styles = pd.read_csv(f"{base_dir}/Work Styles.txt", sep='\t', encoding='utf-8')
    print(f"Work Styles loaded: {work_styles.shape}")
    
    # Load Occupation Data
    occupations = pd.read_csv(f"{base_dir}/Occupation Data.txt", sep='\t', encoding='utf-8')
    print(f"Occupations loaded: {occupations.shape}")
    
    return work_values, interests, work_styles, occupations

# Load the data
work_values, interests, work_styles, occupations = load_onet_data()

Work Values loaded: (7866, 7)
Interests loaded: (8307, 7)
Work Styles loaded: (14064, 12)
Occupations loaded: (1016, 3)


In [None]:
# Step 2: Feature Engineering
# Transform raw data into occupation profiles

def process_work_values(work_values):
    """Process work values data into occupation profiles"""
    # Filter for IM scale (Importance)
    wv_im = work_values[work_values['Scale ID'] == 'IM'].copy()
    
    # Pivot to get occupation profiles
    wv_pivot = wv_im.pivot_table(
        index='O*NET-SOC Code',
        columns='Element Name',
        values='Data Value',
        aggfunc='mean'
    ).fillna(0)
    
    # Normalize to 0-1 scale
    wv_pivot = wv_pivot / 7.0  # O*NET uses 1-7 scale
    return wv_pivot

def process_interests(interests):
    """Process RIASEC interests data"""
    # Filter for OI scale (Occupational Interest)
    int_oi = interests[interests['Scale ID'] == 'OI'].copy()
    
    # Pivot to get occupation profiles
    int_pivot = int_oi.pivot_table(
        index='O*NET-SOC Code',
        columns='Element Name',
        values='Data Value',
        aggfunc='mean'
    ).fillna(0)
    
    # Normalize to 0-1 scale
    int_pivot = int_pivot / 7.0  # O*NET uses 1-7 scale
    return int_pivot

def process_work_styles(work_styles):
    """Process work styles data"""
    # Filter for IM scale (Importance)
    ws_im = work_styles[work_styles['Scale ID'] == 'IM'].copy()
    
    # Pivot to get occupation profiles
    ws_pivot = ws_im.pivot_table(
        index='O*NET-SOC Code',
        columns='Element Name',
        values='Data Value',
        aggfunc='mean'
    ).fillna(0)
    
    # Normalize to 0-1 scale
    ws_pivot = ws_pivot / 5.0  # O*NET uses 1-5 scale
    return ws_pivot

print("feature engineering")

Feature engineering functions ready!


In [6]:
# Step 3: Create Occupation Profiles
# Combine all O*NET dimensions into complete occupation profiles

def create_occupation_profiles(work_values, interests, work_styles, occupations):
    """Create complete occupation profiles combining all O*NET dimensions"""
    
    # Process each domain
    wv_profiles = process_work_values(work_values)
    int_profiles = process_interests(interests)
    ws_profiles = process_work_styles(work_styles)
    
    # Get occupation titles
    occ_titles = occupations[['O*NET-SOC Code', 'Title']].drop_duplicates()
    occ_titles.columns = ['soc', 'title']
    occ_titles = occ_titles.set_index('soc')  # Set soc as index
    
    # Merge all profiles
    profiles = wv_profiles.merge(int_profiles, left_index=True, right_index=True, how='outer')
    profiles = profiles.merge(ws_profiles, left_index=True, right_index=True, how='outer')
    profiles = profiles.fillna(0)
    
    # Add occupation titles (now both have soc as index)
    profiles = profiles.merge(occ_titles, left_index=True, right_index=True, how='left')
    
    # Reset index to make soc a column
    profiles = profiles.reset_index()
    profiles = profiles.rename(columns={'index': 'soc'})
    
    print(f"Complete profiles created: {profiles.shape}")
    return profiles

# Create the profiles
profiles = create_occupation_profiles(work_values, interests, work_styles, occupations)
print(f"Total features: {len([col for col in profiles.columns if col not in ['soc', 'title']])}")

Complete profiles created: (923, 24)
Total features: 23


In [9]:
# Step 4: Train Recommendation Model
# Train a model to map user responses to occupation recommendations

def train_recommendation_model(profiles):
    """Train a model to map user responses to occupation recommendations"""
    
    # Separate features and target - ONLY numeric columns
    feature_cols = [col for col in profiles.columns if col not in ['soc', 'title']]
    
    # Double-check: ensure all features are numeric
    numeric_features = []
    for col in feature_cols:
        if profiles[col].dtype in ['int64', 'float64']:
            numeric_features.append(col)
        else:
            print(f"Removing non-numeric column: {col} (type: {profiles[col].dtype})")
    
    print(f"Using {len(numeric_features)} numeric features for training")
    
    X = profiles[numeric_features].values
    y = profiles[numeric_features].values  # Self-supervised learning
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train model
    model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
    model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    print(f"Model R² score: {score:.4f}")
    
    return model, numeric_features

# Train the model
model, feature_cols = train_recommendation_model(profiles)
print("model done")

Removing non-numeric column: O*NET-SOC Code (type: object)
Using 22 numeric features for training
Model R² score: 0.9997
model done


In [10]:
# Step 5: Questionnaire Scoring System
# Convert user responses to O*NET-compatible profiles

def score_questionnaire(responses):
    """Score questionnaire responses to create user profile"""
    
    # Extract responses for each section
    work_values_resp = responses[:6]      # 6 work value questions
    riasec_resp = responses[6:36]        # 30 RIASEC questions (5 per category)
    work_styles_resp = responses[36:52]  # 16 work style questions
    
    # Calculate Work Values scores (normalize 1-5 to 0-1)
    wv_scores = {
        'Achievement': np.mean([work_values_resp[0], work_values_resp[1]]) / 5.0,
        'Working Conditions': np.mean([work_values_resp[2], work_values_resp[3]]) / 5.0,
        'Relationships': np.mean([work_values_resp[4], work_values_resp[5]]) / 5.0
    }
    
    # Calculate RIASEC scores (normalize 1-5 to 0-1)
    riasec_scores = {}
    categories = ['Realistic', 'Investigative', 'Artistic', 'Social', 'Enterprising', 'Conventional']
    for i, category in enumerate(categories):
        start_idx = 6 + (i * 5)
        end_idx = start_idx + 5
        riasec_scores[category] = np.mean(riasec_resp[start_idx:end_idx]) / 5.0
    
    # Calculate Work Styles scores (normalize 1-5 to 0-1)
    ws_scores = {
        'Achievement/Effort': np.mean(work_styles_resp[0:3]) / 5.0,
        'Leadership': work_styles_resp[3] / 5.0,
        'Cooperation': np.mean(work_styles_resp[4:6]) / 5.0,
        'Stress Tolerance': np.mean(work_styles_resp[6:8]) / 5.0,
        'Dependability': np.mean(work_styles_resp[8:10]) / 5.0,
        'Adaptability': np.mean(work_styles_resp[10:12]) / 5.0,
        'Innovation': work_styles_resp[12] / 5.0,
        'Analytical Thinking': work_styles_resp[13] / 5.0,
        'Independence': work_styles_resp[14] / 5.0,
        'Integrity': work_styles_resp[15] / 5.0
    }
    
    # Combine all scores
    user_profile = {**wv_scores, **riasec_scores, **ws_scores}
    
    return user_profile

print("questions")

questions


In [14]:
# Step 6: Career Recommendations Engine
# Generate career recommendations based on user profile

def get_career_recommendations(user_profile, model, feature_cols, profiles, top_k=10):
    """Get top career recommendations based on user profile"""
    
    # First, let's see what columns we actually have
    print(f"Available columns in profiles: {profiles.columns.tolist()}")
    print(f"Looking for columns: soc, title")
    
    # Find the actual column names for SOC and title
    soc_col = None
    title_col = None
    
    for col in profiles.columns:
        if 'soc' in col.lower() or 'code' in col.lower():
            soc_col = col
        if 'title' in col.lower():
            title_col = col
    
    if soc_col is None or title_col is None:
        print(f"Could not find SOC column: {soc_col}")
        print(f"Could not find title column: {title_col}")
        return None
    
    print(f"Using SOC column: {soc_col}")
    print(f"Using title column: {title_col}")
    
    # Convert user profile to feature vector
    user_vector = np.array([user_profile.get(col, 0) for col in feature_cols])
    
    # Predict user's ideal occupation profile
    predicted_profile = model.predict(user_vector.reshape(1, -1))[0]
    
    # Calculate similarity with all occupations
    occupation_features = profiles[feature_cols].values
    similarities = cosine_similarity(occupation_features, predicted_profile.reshape(1, -1)).ravel()
    
    # Get top matches
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    
    # Create recommendations dataframe
    recommendations = profiles.iloc[top_indices][[soc_col, title_col]].copy()
    recommendations['similarity_score'] = similarities[top_indices]
    recommendations = recommendations.reset_index(drop=True)
    
    return recommendations

print("Career recommendations engine ready!")

Career recommendations engine ready!


In [15]:
# Step 7: Test System
# Test the career recommendation system with sample data

# Test with sample responses (all 3s = neutral)
sample_responses = [3] * 52  # 52 total questions
sample_profile = score_questionnaire(sample_responses)
sample_recommendations = get_career_recommendations(sample_profile, model, feature_cols, profiles)

print("\nSample recommendations (neutral profile):")
print(sample_recommendations.head())

print("\nSystem test complete! Career recommender is working.")

Available columns in profiles: ['O*NET-SOC Code', 'Artistic', 'Conventional', 'Enterprising', 'Investigative', 'Realistic', 'Social', 'Achievement/Effort', 'Adaptability/Flexibility', 'Analytical Thinking', 'Attention to Detail', 'Concern for Others', 'Cooperation', 'Dependability', 'Independence', 'Initiative', 'Innovation', 'Integrity', 'Leadership', 'Persistence', 'Self-Control', 'Social Orientation', 'Stress Tolerance', 'title']
Looking for columns: soc, title
Using SOC column: Social Orientation
Using title column: title

Sample recommendations (neutral profile):
   Social Orientation                                              title  \
0                0.65                               Landscape Architects   
1                0.63                  Economics Teachers, Postsecondary   
2                0.68                    Physics Teachers, Postsecondary   
3                0.60                                        Geographers   
4                0.64  Atmospheric, Earth, Ma

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [None]:
# Step 8: Streamlit App Preparation
# Code for the Streamlit interface

streamlit_code = '''
# Save this as app.py
import streamlit as st
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the trained model
with open('career_recommender_model.pkl', 'rb') as f:
    data = pickle.load(f)
    model = data['model']
    feature_cols = data['feature_cols']
    profiles = data['profiles']

st.title("O*NET Career Path Recommender")
st.write("Answer the questions below to get personalized career recommendations!")

# Questionnaire sections
st.header("Work Values Assessment")
st.write("Rate how important each is to you in a job (1=Not Important, 5=Extremely Important)")

# Add your 52 questions here as st.slider widgets
# Then collect responses and call the recommendation functions

st.success("Career recommender system ready for Streamlit deployment!")
'''



Streamlit app code prepared!
Next step: Create app.py with the Streamlit interface


In [17]:
# Step 8: Save Model for Streamlit App
# Save the trained model and data that app.py needs

import pickle

# Save the complete system
save_data = {
    'model': model,
    'feature_cols': feature_cols,
    'profiles': profiles
}

with open('career_recommender_model.pkl', 'wb') as f:
    pickle.dump(save_data, f)

print("✅ Model saved successfully!")
print("📁 File: 'career_recommender_model.pkl'")
print("📊 Model size:", f"{os.path.getsize('career_recommender_model.pkl') / 1024 / 1024:.2f} MB")

# Verify the save worked
try:
    with open('career_recommender_model.pkl', 'rb') as f:
        loaded_data = pickle.load(f)
    
    print("\n✅ Verification successful!")
    print(f"📈 Model loaded: {type(loaded_data['model'])}")
    print(f"🔢 Features: {len(loaded_data['feature_cols'])}")
    print(f"�� Occupations: {len(loaded_data['profiles'])}")
    
except Exception as e:
    print(f"❌ Error loading saved model: {e}")

print("\n�� Your Streamlit app is ready to use!")
print("Run: streamlit run app.py")

✅ Model saved successfully!
📁 File: 'career_recommender_model.pkl'
📊 Model size: 58.95 MB

✅ Verification successful!
📈 Model loaded: <class 'sklearn.multioutput.MultiOutputRegressor'>
🔢 Features: 22
�� Occupations: 923

�� Your Streamlit app is ready to use!
Run: streamlit run app.py
