# Next Step - Career Guidance System: Quickstart Guide

This notebook demonstrates how to use the career guidance system with example scenarios.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set up plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the bare 'seaborn' name, so
# plt.style.use('seaborn') raises on a current install. Use the renamed style
# when it is available and fall back to the legacy name on older Matplotlib.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

## 1. Generate Sample Data

First, let's generate some sample student data to work with.

In [None]:
# Bring the project's sample-data generator into the notebook namespace
from scripts.generate_sample_data import generate_sample_data

# Build the dataset that the analysis and training cells below read from `df`
df = generate_sample_data()

# Preview the leading rows (5 is the pandas default for head();
# presumably `df` is a pandas DataFrame -- verify against the generator)
print("Sample student records:")
df.head(5)

## 2. Example Scenarios

Let's look at some typical student profiles:

In [None]:
# Example student profiles: one tuple per student, fields in column order
profile_columns = [
    'ol_mathematics',
    'ol_science',
    'ol_english',
    'ol_history',
    'al_stream',
    'interests',
    'skills',
]
profile_rows = [
    (95, 92, 88, 85, 'Science', 'Technology, Healthcare', 'Programming, Analysis'),
    (78, 75, 92, 90, 'Commerce', 'Business, Leadership', 'Communication, Analysis'),
    (82, 85, 95, 88, 'Arts', 'Teaching, Arts', 'Communication, Creativity'),
]
example_students = pd.DataFrame(profile_rows, columns=profile_columns)

print("Example student profiles:")
example_students

## 3. Data Analysis

Let's analyze the distribution of grades and career paths in our dataset.

In [None]:
# Grade distributions: one KDE curve per OL subject, all on a shared axis
ol_subjects = ('mathematics', 'science', 'english', 'history')
fig_grades, ax_grades = plt.subplots(figsize=(12, 6))
for subject_name in ol_subjects:
    sns.kdeplot(
        data=df[f'ol_{subject_name}'],
        label=subject_name.title(),
        ax=ax_grades,
    )
ax_grades.set_title('Distribution of OL Grades')
ax_grades.set_xlabel('Score')
ax_grades.set_ylabel('Density')
ax_grades.legend()
plt.show()

# Career path distribution: number of records per career, as a bar chart
fig_careers, ax_careers = plt.subplots(figsize=(10, 6))
career_counts = df['career_path'].value_counts()
career_counts.plot(kind='bar', ax=ax_careers)
ax_careers.set_title('Distribution of Career Paths')
ax_careers.set_xlabel('Career')
ax_careers.set_ylabel('Count')
# Tilt the category labels so longer career names stay readable
plt.setp(ax_careers.get_xticklabels(), rotation=45)
fig_careers.tight_layout()
plt.show()

## 4. Model Training and Evaluation

Now let's train a gradient boosting model on 80% of the data and evaluate it on the held-out 20%.

In [None]:
from src.config.config import Config
from src.data.preprocessing import DataPreprocessor
from src.models.gradient_boosting_model import GradientBoostingModel
from sklearn.model_selection import train_test_split

# Project configuration, shared by the preprocessor and the model
config = Config()

# Features come from the project preprocessor; the target is the
# one-hot (dummy) encoding of the career_path column
preprocessor = DataPreprocessor(config)
X = preprocessor.preprocess_features(df)
career_dummies = pd.get_dummies(df['career_path'])
y = career_dummies.to_numpy()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the split the same on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit the model on the training portion
model = GradientBoostingModel(config.model_config)
model.train(X_train, y_train)

# Score on the held-out portion and list each metric to four decimals
metrics = model.evaluate(X_test, y_test)
print("\nModel Performance Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

## 5. Making Predictions

Let's run the trained model on the example students defined in Section 2 and print the predicted career and the model's confidence for each one.

In [None]:
# Preprocess example students with the same preprocessor used for training
X_examples = preprocessor.preprocess_features(example_students)

# Make predictions
predictions = model.predict(X_examples)
probabilities = model.predict_proba(X_examples)

# Display results.
# Each student must be paired with that student's own probability row.
# The previous loop zipped over `probabilities[0]`, i.e. the first student's
# class probabilities, so student i was shown the probability of class i for
# student 1 instead of their own top probability.
# NOTE(review): assumes predict_proba returns one row of class probabilities
# per input row, shape (n_samples, n_classes) -- confirm against
# GradientBoostingModel.predict_proba.
print("Career Predictions:")
for i, (pred, prob) in enumerate(zip(predictions, np.asarray(probabilities))):
    print(f"\nStudent {i+1}:")
    print(f"Predicted Career: {pred}")
    # Confidence is the highest class probability in this student's row
    print(f"Confidence: {np.max(prob):.2%}")

## 6. Feature Importance

Let's see which factors are most important for career predictions.

In [None]:
# Ask the trained model for its feature-importance scores
importance = model.get_feature_importance()

# Turn the 'weight' mapping into a two-column frame and keep the ten
# highest-scoring features, largest first
importance_df = (
    pd.DataFrame(importance['weight'].items(), columns=['Feature', 'Importance'])
    .sort_values('Importance', ascending=False)
    .head(10)
)

# Horizontal bar chart: one bar per feature, bar length is the importance score
plt.figure(figsize=(12, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Top 10 Most Important Features')
plt.tight_layout()
plt.show()