# Developer Role Classification: Modeling

This notebook contains the modeling process for classifying developer roles based on commit data. We'll implement and evaluate multiple models, starting with baselines and progressing to more sophisticated approaches.

## Objectives:
1. Implement baseline models for developer role classification
2. Develop improved models with optimized hyperparameters
3. Compare model performance across various metrics
4. Select the best model for deployment

In [None]:
# Import necessary libraries
# Standard library
import os
import platform
import random
import sys
import warnings

# Third-party
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost
from sklearn.calibration import calibration_curve
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc, precision_recall_curve
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import check_random_state

# Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Print environment information for reproducibility.
# `sklearn` and `xgboost` are read as top-level module names here, so they must be
# imported as modules; `from sklearn.x import y` alone does not bind `sklearn`.
print(f"Python version: {platform.python_version()}")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")
print(f"XGBoost version: {xgboost.__version__}")

# For reproducibility - seed every global RNG this notebook can reach
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Optional deep-learning frameworks: seed them only when they are installed
try:
    import tensorflow as tf
    tf.random.set_seed(RANDOM_SEED)
    print(f"TensorFlow version: {tf.__version__}")
except ImportError:
    print("TensorFlow not installed")

try:
    import torch
    torch.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED)
    print(f"PyTorch version: {torch.__version__}")
except ImportError:
    print("PyTorch not installed")

# Hash seed for Python processes started from here on (child processes inherit the
# environment). The running kernel's own hash seed was fixed at interpreter start-up
# and is not changed by this assignment.
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)

# scikit-learn has no seed of its own: estimators given random_state=None draw from
# NumPy's global RNG (seeded above), and the models below pass an explicit random_state.
# The previous bare `check_random_state(RANDOM_SEED)` call only built a RandomState
# object and discarded it, so it has been removed.

# NOTE: this silences every warning, including convergence warnings from the models.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('ggplot')
sns.set(style="whitegrid")

## 1. Data Loading and Preparation

First, let's load our processed dataset and prepare it for modeling.

In [None]:
# Load the processed dataset
try:
    data = pd.read_csv('final_dataset.csv')
except FileNotFoundError:
    # Every later cell needs `data`; stopping here gives a clear error instead of a
    # NameError a few lines further down.
    print("Dataset not found. Please run the preprocessing notebook first.")
    raise
print(f"Dataset loaded successfully with {data.shape[0]} rows and {data.shape[1]} columns.")

# Display the first few rows
print("\nFirst 5 rows of the dataset:")
display(data.head())

# Identify features and target
# Exclude non-feature columns (the label itself and the raw/processed text columns)
exclude_cols = ['role', 'commit_message', 'processed_message', 'clean_commit_message']
if 'processed_for_cloud' in data.columns:
    exclude_cols.append('processed_for_cloud')

feature_cols = [col for col in data.columns if col not in exclude_cols]
target_col = 'role'

print(f"\nFeatures to be used ({len(feature_cols)}):")
print(feature_cols)

# Prepare X and y
X = data[feature_cols].values
y = data[target_col].values

# Encode the target variable as integers 0..n_classes-1
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"\nTarget classes: {label_encoder.classes_}")

# Split into train, validation, and test sets (60%, 20%, 20%).
# Both splits are stratified and seeded with the notebook-wide RANDOM_SEED.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_encoded, test_size=0.4, random_state=RANDOM_SEED, stratify=y_encoded
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_SEED, stratify=y_temp
)

# Sanity check: the three parts must account for every row exactly once
assert len(X_train) + len(X_val) + len(X_test) == len(X), "split sizes do not add up"

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Check class distribution in each set
def print_class_distribution(y, set_name, classes=None):
    """Print how many samples of each class appear in ``y``.

    Args:
        y: Array-like of integer-encoded labels (0..n_classes-1).
        set_name: Name of the split, used only in the printed header.
        classes: Class names, where position ``i`` names encoded label ``i``.
            Defaults to the notebook-level ``label_encoder.classes_``.

    A class that does not occur in ``y`` is reported with a count of 0.
    """
    if classes is None:
        classes = label_encoder.classes_

    # bincount puts the count of label i at position i, so each name is paired with
    # its own count; minlength keeps absent classes as explicit zeros.
    counts = np.bincount(np.asarray(y, dtype=int), minlength=len(classes))

    print(f"\nClass distribution in {set_name} set:")
    for cls, count in zip(classes, counts):
        print(f"  {cls}: {count}")

# Report the label balance of each split
for split_labels, split_name in ((y_train, "training"), (y_val, "validation"), (y_test, "test")):
    print_class_distribution(split_labels, split_name)

# Scale the features: statistics are learned from the training split only and then
# applied unchanged to all three splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## 2. Evaluation Functions

Define functions to evaluate model performance using various metrics.

In [None]:
def evaluate_model(model, X, y, label_encoder, set_name=""):
    """Score a fitted classifier on one data split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: Integer-encoded true labels for ``X``.
        label_encoder: Fitted ``LabelEncoder``; its ``classes_`` supply the class names
            and the full set of label ids.
        set_name: Currently unused; kept so call sites can name the split.

    Returns:
        dict with keys ``accuracy``, ``macro_f1``, ``precision``, ``recall``, ``f1``,
        ``support`` (per-class arrays), ``class_report`` (dict), ``confusion_matrix``
        and ``predictions``.
    """
    # Pin the label set to every encoded class so the per-class arrays, the report and
    # the confusion matrix always hold one entry per class, in class order, even when
    # a class is missing from this split. Without it, a missing class shrinks the
    # arrays and makes classification_report reject target_names.
    class_ids = np.arange(len(label_encoder.classes_))

    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)

    # Class-wise precision, recall, F1 and support (undefined ratios count as 0)
    precision, recall, f1, support = precision_recall_fscore_support(
        y, y_pred, labels=class_ids, average=None, zero_division=0
    )

    # Unweighted mean of the per-class F1 scores
    macro_f1 = f1_score(y, y_pred, labels=class_ids, average='macro', zero_division=0)

    # Full report as a dict keyed by class name
    class_report = classification_report(
        y, y_pred, labels=class_ids, target_names=label_encoder.classes_,
        output_dict=True, zero_division=0
    )

    # Rows are true classes, columns are predicted classes
    cm = confusion_matrix(y, y_pred, labels=class_ids)

    return {
        'accuracy': accuracy,
        'macro_f1': macro_f1,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'support': support,
        'class_report': class_report,
        'confusion_matrix': cm,
        'predictions': y_pred
    }

def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw an annotated heatmap of a confusion matrix of integer counts.

    Args:
        cm: Square matrix of counts; rows are labelled as true classes and columns as
            predicted classes.
        classes: Tick labels used on both axes.
        title: Figure title.
        cmap: Matplotlib colormap for the cells.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap=cmap,
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    plt.show()
    
def plot_metrics_comparison(metrics_dict, metric_name):
    """Plot one scalar metric as a bar chart with one bar per model.

    Args:
        metrics_dict: Mapping of model name -> metrics dict (as returned by
            ``evaluate_model``).
        metric_name: Key to read from each metrics dict, e.g. ``'macro_f1'``.

    Raises:
        ValueError: If ``metrics_dict`` is empty.
        KeyError: If a model's metrics dict lacks ``metric_name``.
    """
    # Fail before opening a figure: otherwise an empty input leaves a blank figure
    # behind and dies later inside max() with an unrelated-looking message.
    if not metrics_dict:
        raise ValueError("metrics_dict is empty; there are no models to compare")

    models = list(metrics_dict.keys())
    values = [metrics_dict[model][metric_name] for model in models]

    plt.figure(figsize=(12, 6))
    bars = plt.bar(models, values, color='skyblue')

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.3f}', ha='center', va='bottom')

    plt.title(f'Comparison of {metric_name.capitalize()} Across Models')
    plt.ylabel(metric_name.capitalize())
    plt.xlabel('Model')
    plt.xticks(rotation=45)
    # Headroom above the tallest bar so its value label is not clipped
    plt.ylim(0, max(values) + 0.1)
    plt.tight_layout()
    plt.show()

def log_results(model_name, metrics, classes):
    """Print headline and per-class scores for a model, then plot its confusion matrix.

    Args:
        model_name: Label shown in the report header.
        metrics: Metrics dict providing ``accuracy``, ``macro_f1``, ``precision``,
            ``recall``, ``f1`` and ``confusion_matrix``.
        classes: Class names; position ``i`` is matched with entry ``i`` of the
            per-class arrays.
    """
    print(f"=== {model_name} Performance ===")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Macro F1: {metrics['macro_f1']:.4f}")

    # (printed label, key in `metrics`) for each per-class score, in print order
    per_class_scores = (("Precision", 'precision'), ("Recall", 'recall'), ("F1", 'f1'))

    print("\nPer-class Performance:")
    for idx, cls in enumerate(classes):
        print(f"  {cls}:")
        for label, key in per_class_scores:
            print(f"    {label}: {metrics[key][idx]:.4f}")

    print("\nConfusion Matrix:")
    plot_confusion_matrix(metrics['confusion_matrix'], classes)

## 3. Baseline Models

Let's implement and evaluate baseline models to establish a performance benchmark.

In [None]:
# Validation metrics of every model, keyed by model name
model_results = {}

# Three reference points: random guessing that follows the class priors, always
# predicting the majority class, and an untuned logistic regression
baseline_stratified = DummyClassifier(strategy='stratified', random_state=42)
baseline_most_frequent = DummyClassifier(strategy='most_frequent', random_state=42)
baseline_lr = LogisticRegression(max_iter=1000, random_state=42)

# (title used in the printed report, key used in model_results, estimator)
baseline_runs = [
    ("Baseline (Stratified Random)", "Baseline (Stratified)", baseline_stratified),
    ("Baseline (Most Frequent)", "Baseline (Most Frequent)", baseline_most_frequent),
    ("Baseline (Logistic Regression)", "Baseline (Logistic Regression)", baseline_lr),
]

# Fit on the training split, score on the validation split
for report_title, results_key, estimator in baseline_runs:
    estimator.fit(X_train_scaled, y_train)
    baseline_metrics = evaluate_model(estimator, X_val_scaled, y_val, label_encoder)
    log_results(report_title, baseline_metrics, label_encoder.classes_)
    model_results[results_key] = baseline_metrics

# Compare baseline models
for metric in ('macro_f1', 'accuracy'):
    plot_metrics_comparison(model_results, metric)

## 4. Advanced Models

Now let's implement more sophisticated models with hyperparameter tuning.

In [None]:
# 4.1 Random Forest
print("Training Random Forest model...")

# Search space: forest size, tree depth and the two node-size limits
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold cross-validated search on the training split, scored on macro F1
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

# Keep the refit winner and report what the search found
best_rf = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best macro F1 score: {grid_search.best_score_:.4f}")

# Score the winner on the held-out validation split
rf_metrics = evaluate_model(best_rf, X_val_scaled, y_val, label_encoder)
log_results("Random Forest (Optimized)", rf_metrics, label_encoder.classes_)
model_results["Random Forest"] = rf_metrics

# Rank features by the forest's importances, most important first
feature_importance = (
    pd.DataFrame({'Feature': feature_cols, 'Importance': best_rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)

top_features = feature_importance.head(15)
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=top_features)
plt.title('Top 15 Feature Importances (Random Forest)')
plt.tight_layout()
plt.show()

In [None]:
# 4.2 Gradient Boosting
print("Training Gradient Boosting model...")

# Search space: number of boosting stages, shrinkage, tree depth and split threshold
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5],
    min_samples_split=[2, 5],
)

# 5-fold cross-validated grid search, scored on macro F1, run on all cores
gb = GradientBoostingClassifier(random_state=42)
search_options = dict(cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_search = GridSearchCV(gb, param_grid, **search_options)
grid_search.fit(X_train_scaled, y_train)

# Winner of the search, refit on the whole training split
best_gb = grid_search.best_estimator_

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best macro F1 score: {grid_search.best_score_:.4f}")

# Validation-split performance of the tuned model
gb_metrics = evaluate_model(best_gb, X_val_scaled, y_val, label_encoder)
log_results("Gradient Boosting (Optimized)", gb_metrics, label_encoder.classes_)
model_results["Gradient Boosting"] = gb_metrics

In [None]:
# 4.3 XGBoost
print("Training XGBoost model...")

# Search space: number of trees, shrinkage, tree depth and minimum child weight
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5],
    min_child_weight=[1, 3],
)

# Base estimator; labels are already integer-encoded, hence use_label_encoder=False
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Build the 5-fold macro-F1 grid search and fit it in one step (fit returns the search)
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train_scaled, y_train)

best_xgb = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best macro F1 score: {grid_search.best_score_:.4f}")

# Validation-split performance of the tuned model
xgb_metrics = evaluate_model(best_xgb, X_val_scaled, y_val, label_encoder)
log_results("XGBoost (Optimized)", xgb_metrics, label_encoder.classes_)
model_results["XGBoost"] = xgb_metrics

In [None]:
# 4.4 Support Vector Machine
print("Training SVM model...")

# Define hyperparameter grid.
# One sub-grid per kernel: `gamma` is a coefficient of the rbf kernel and has no
# effect on the linear kernel, so crossing it with 'linear' only repeated identical
# fits. This searches 9 candidates instead of 12 with the same distinct models.
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]

# Set up grid search with cross-validation.
# probability=True is kept so the selected model exposes predict_proba for the
# calibration analysis later in the notebook.
svm = SVC(probability=True, random_state=42)
grid_search = GridSearchCV(
    svm, param_grid, cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

# Fit grid search
grid_search.fit(X_train_scaled, y_train)

# Get best model (refit on the full training split)
best_svm = grid_search.best_estimator_

# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best macro F1 score: {grid_search.best_score_:.4f}")

# Evaluate on validation set
svm_metrics = evaluate_model(best_svm, X_val_scaled, y_val, label_encoder)
log_results("SVM (Optimized)", svm_metrics, label_encoder.classes_)
model_results["SVM"] = svm_metrics

In [None]:
# 4.5 Neural Network (MLP)
print("Training Neural Network model...")

# Define hyperparameter grid.
# `learning_rate` (the schedule: 'constant' / 'adaptive') is deliberately not searched:
# MLPClassifier only uses it with solver='sgd', and this estimator keeps the default
# solver, so both settings trained the same network and doubled the grid
# (24 -> 12 candidates).
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 25)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001]
}

# Set up grid search with cross-validation
mlp = MLPClassifier(max_iter=1000, random_state=42)
grid_search = GridSearchCV(
    mlp, param_grid, cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

# Fit grid search
grid_search.fit(X_train_scaled, y_train)

# Get best model (refit on the full training split)
best_mlp = grid_search.best_estimator_

# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best macro F1 score: {grid_search.best_score_:.4f}")

# Evaluate on validation set
mlp_metrics = evaluate_model(best_mlp, X_val_scaled, y_val, label_encoder)
log_results("Neural Network (Optimized)", mlp_metrics, label_encoder.classes_)
model_results["Neural Network"] = mlp_metrics

## 5. Model Comparison and Selection

Compare all models and select the best performing one.

In [None]:
# Compare all models on the validation split, best macro F1 first
print("Model Comparison (Validation Set):")
results_df = pd.DataFrame({
    'Model': list(model_results.keys()),
    'Accuracy': [model_results[m]['accuracy'] for m in model_results],
    'Macro F1': [model_results[m]['macro_f1'] for m in model_results]
}).sort_values('Macro F1', ascending=False)

display(results_df)

# One bar chart per score column
for score_column, plot_title in (
    ('Macro F1', 'Comparison of Macro F1 Scores Across Models'),
    ('Accuracy', 'Comparison of Accuracy Scores Across Models'),
):
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Model', y=score_column, data=results_df)
    plt.title(plot_title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Identify best model based on macro F1 score (results_df is sorted descending)
best_model_name = results_df.iloc[0]['Model']
print(f"\nBest model based on Macro F1 score: {best_model_name}")

# Map every name stored in model_results to its fitted estimator, baselines included.
# Previously any winning baseline was paired with `baseline_lr`, so the reported name
# and the evaluated/saved model could disagree.
# Lambdas defer the variable lookup, so only the winner's variable has to exist
# (a model cell that was skipped does not break selection).
model_lookup = {
    "Baseline (Stratified)": lambda: baseline_stratified,
    "Baseline (Most Frequent)": lambda: baseline_most_frequent,
    "Baseline (Logistic Regression)": lambda: baseline_lr,
    "Random Forest": lambda: best_rf,
    "Gradient Boosting": lambda: best_gb,
    "XGBoost": lambda: best_xgb,
    "SVM": lambda: best_svm,
    "Neural Network": lambda: best_mlp,
}
if best_model_name not in model_lookup:
    # Fail loudly rather than silently evaluating a different model than the one named
    raise KeyError(f"No fitted estimator registered for model '{best_model_name}'")
best_model = model_lookup[best_model_name]()

## 6. Final Evaluation on Test Set

Evaluate the best model on the test set to get the final performance metrics.

In [None]:
# Score the selected model once on the untouched test split
print(f"Evaluating {best_model_name} on test set...")
test_metrics = evaluate_model(best_model, X_test_scaled, y_test, label_encoder, "test")
log_results(f"{best_model_name} (Test Set)", test_metrics, label_encoder.classes_)

# Put validation and test scores side by side
print("\nValidation vs Test Performance:")
val_performance = model_results[best_model_name]
compared_keys = ['accuracy', 'macro_f1']
comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Macro F1'],
    'Validation': [val_performance[key] for key in compared_keys],
    'Test': [test_metrics[key] for key in compared_keys],
    'Difference': [test_metrics[key] - val_performance[key] for key in compared_keys],
})
display(comparison_df)

# Flag a validation/test macro-F1 gap of more than 0.05 in either direction
f1_gap = abs(val_performance['macro_f1'] - test_metrics['macro_f1'])
if not (f1_gap > 0.05):
    print("✅ Model generalizes well (similar performance on validation and test sets)")
else:
    print("⚠️ Warning: There may be some overfitting (>5% difference between validation and test performance)")

## 7. Model Analysis and Insights

Analyze the best model's behavior and extract insights.

In [None]:
# 7.1 Confusion Matrix Analysis
cm = test_metrics['confusion_matrix']

# Row-normalise so each row shows the fraction of that true class sent to each
# prediction. A class with no true samples keeps an all-zero row instead of turning
# into NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'), row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Plot normalized confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Normalized Confusion Matrix (Test Set)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Collect the off-diagonal cells that actually occurred, as
# (true class, predicted class, count). Zero-count cells are not confusions and are
# left out, so an empty list means the test set was classified perfectly.
class_names = label_encoder.classes_
confusion_pairs = [
    (class_names[i], class_names[j], cm[i, j])
    for i in range(len(cm))
    for j in range(len(cm))
    if i != j and cm[i, j] > 0
]

# Sort by confusion count (descending)
confusion_pairs.sort(key=lambda pair: pair[2], reverse=True)

print("Most confused class pairs:")
if not confusion_pairs:
    print("  (none - no misclassifications on the test set)")
for true_class, pred_class, count in confusion_pairs[:5]:
    print(f"  {true_class} predicted as {pred_class}: {count} instances")

In [None]:
# 7.2 Feature Importance Analysis (for tree-based models)
if best_model_name in ["Random Forest", "Gradient Boosting", "XGBoost"]:
    # All three model types are read through the same attribute; the former
    # per-model if/else had two identical branches.
    importances = best_model.feature_importances_

    # One row per feature, most important first
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    # Plot top features
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
    plt.title(f'Top 15 Feature Importances ({best_model_name})')
    plt.tight_layout()
    plt.show()

    # Print top 10 features and their importance
    print("Top 10 most important features:")
    top_ten = feature_importance.head(10)
    for rank, (feature, importance) in enumerate(
            zip(top_ten['Feature'], top_ten['Importance']), start=1):
        print(f"{rank}. {feature}: {importance:.4f}")

In [None]:
# 7.3 Per-class Performance Analysis
class_report = test_metrics['class_report']

# The report's last three entries are summary rows, not classes
# ('accuracy', 'macro avg', 'weighted avg')
class_names = list(class_report.keys())[:-3]

# Table column -> key inside each class's report entry
column_sources = {
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1-score',
    'Support': 'support',
}
class_performance = pd.DataFrame({
    'Class': class_names,
    **{
        column: [class_report[name][key] for name in class_names]
        for column, key in column_sources.items()
    },
})

# Long format: one row per (class, metric), as needed for a grouped bar chart
class_performance_melted = pd.melt(
    class_performance,
    id_vars=['Class', 'Support'],
    value_vars=['Precision', 'Recall', 'F1-Score'],
    var_name='Metric',
    value_name='Score',
)

plt.figure(figsize=(14, 8))
sns.barplot(x='Class', y='Score', hue='Metric', data=class_performance_melted)
plt.title('Per-class Performance Metrics')
plt.ylim(0, 1.0)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Top row of each ordering gives the strongest / weakest class by F1
by_f1_descending = class_performance.sort_values('F1-Score', ascending=False)
by_f1_ascending = class_performance.sort_values('F1-Score')
best_class = by_f1_descending.iloc[0]
worst_class = by_f1_ascending.iloc[0]

print(f"Best performing class: {best_class['Class']} (F1-Score: {best_class['F1-Score']:.4f})")
print(f"Worst performing class: {worst_class['Class']} (F1-Score: {worst_class['F1-Score']:.4f})")

In [None]:
# 7.4 Error Analysis
# Get misclassified samples (positions within the test split)
y_pred = test_metrics['predictions']
misclassified_indices = np.where(y_test != y_pred)[0]

if len(misclassified_indices) > 0:
    print(f"Number of misclassified samples: {len(misclassified_indices)} out of {len(y_test)} ({len(misclassified_indices)/len(y_test)*100:.2f}%)")

    # train_test_split shuffles, so the test rows are NOT the last rows of `data`.
    # Replay the same two stratified splits on the row positions (same sizes, seed and
    # stratification labels as the data-preparation cell) to recover which rows of
    # `data` ended up in the test split.
    row_positions = np.arange(len(data))
    _, temp_positions = train_test_split(
        row_positions, test_size=0.4, random_state=RANDOM_SEED, stratify=y_encoded
    )
    _, test_indices = train_test_split(
        temp_positions, test_size=0.5, random_state=RANDOM_SEED, stratify=y_temp
    )
    # Guard against the replay drifting out of sync with the original split
    assert np.array_equal(y_encoded[test_indices], y_test), \
        "replayed split does not match the test set; check the split parameters"
    original_indices = test_indices[misclassified_indices]

    # Copy so the label columns are added to an independent frame, not a view of `data`
    misclassified_samples = data.iloc[original_indices].copy()

    # Add true and predicted labels as class names
    misclassified_samples['true_label'] = label_encoder.classes_[y_test[misclassified_indices]]
    misclassified_samples['predicted_label'] = label_encoder.classes_[y_pred[misclassified_indices]]

    # Display sample of misclassified instances
    print("\nSample of misclassified instances:")
    if 'commit_message' in misclassified_samples.columns:
        display(misclassified_samples[['commit_message', 'true_label', 'predicted_label']].head(10))
    else:
        display(misclassified_samples[['true_label', 'predicted_label']].head(10))

    # Count misclassifications by (true, predicted) pair, most frequent first
    misclass_counts = (
        misclassified_samples.groupby(['true_label', 'predicted_label'])
        .size()
        .reset_index(name='Count')
        .rename(columns={'true_label': 'True Label', 'predicted_label': 'Predicted Label'})
        .sort_values('Count', ascending=False)
    )

    print("\nMost common misclassifications:")
    display(misclass_counts.head(10))

## 8. Model Calibration and Robustness

Check how well-calibrated the model probabilities are and assess robustness.

In [None]:
# 8.1 Probability Calibration Analysis
# (calibration_curve and brier_score_loss come from the notebook's import cell)
if hasattr(best_model, "predict_proba"):
    y_prob = best_model.predict_proba(X_test_scaled)

    # One-vs-rest view per class: reliability curve plus Brier score in a single pass.
    # NOTE(review): assumes column i of predict_proba is encoded class i, i.e. that
    # every class was present when the model was fitted - confirm via best_model.classes_.
    plt.figure(figsize=(12, 8))
    brier_scores = []
    for i, class_name in enumerate(label_encoder.classes_):
        y_true_binary = (y_test == i).astype(int)
        y_prob_binary = y_prob[:, i]

        # Calibration curve over 10 probability bins
        prob_true, prob_pred = calibration_curve(y_true_binary, y_prob_binary, n_bins=10)
        plt.plot(prob_pred, prob_true, marker='o', linewidth=1, label=class_name)

        # Brier score for this class (lower is better)
        brier_scores.append(brier_score_loss(y_true_binary, y_prob_binary))

    # Plot perfectly calibrated line
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')

    plt.title('Calibration Curves (Reliability Diagram)')
    plt.xlabel('Mean Predicted Probability')
    plt.ylabel('Fraction of Positives')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

    print("Brier scores by class (lower is better):")
    for cls, score in zip(label_encoder.classes_, brier_scores):
        print(f"  {cls}: {score:.4f}")
    print(f"Average Brier score: {np.mean(brier_scores):.4f}")

    # Check if model is overconfident or underconfident:
    # mean top-class probability versus the accuracy measured on the same test split.
    confidence = np.max(y_prob, axis=1)
    mean_confidence = np.mean(confidence)
    # Reuse the stored test accuracy instead of recomputing it from a `y_pred` variable
    # that only exists if the error-analysis cell was run first.
    accuracy = test_metrics['accuracy']

    print(f"\nMean prediction confidence: {mean_confidence:.4f}")
    print(f"Test accuracy: {accuracy:.4f}")

    if mean_confidence > accuracy + 0.1:
        print("⚠️ Model may be overconfident (mean confidence > accuracy)")
    elif mean_confidence < accuracy - 0.1:
        print("⚠️ Model may be underconfident (mean confidence < accuracy)")
    else:
        print("✅ Model confidence is well-calibrated")

In [None]:
# 8.2 Robustness Analysis
# Simulate noise in the (scaled) features to test robustness
if len(X_test) > 20:
    # Standard deviations of the Gaussian noise added to the scaled features
    noise_levels = [0.0, 0.05, 0.1, 0.2, 0.5]
    noise_results = []

    # Dedicated seeded generator: the noise is the same on every run of this cell,
    # independent of how much of NumPy's global random stream earlier cells consumed.
    noise_rng = np.random.default_rng(RANDOM_SEED)

    for noise in noise_levels:
        # Add noise to a copy so the clean test matrix is never modified
        X_test_noisy = X_test_scaled.copy()
        if noise > 0:
            X_test_noisy += noise_rng.normal(0, noise, X_test_noisy.shape)

        # Evaluate model on noisy data
        noisy_metrics = evaluate_model(best_model, X_test_noisy, y_test, label_encoder)
        noise_results.append({
            'noise_level': noise,
            'accuracy': noisy_metrics['accuracy'],
            'macro_f1': noisy_metrics['macro_f1']
        })

    # Create DataFrame for visualization
    noise_df = pd.DataFrame(noise_results)

    # Plot impact of noise on performance
    plt.figure(figsize=(12, 6))
    plt.plot(noise_df['noise_level'], noise_df['accuracy'], marker='o', label='Accuracy')
    plt.plot(noise_df['noise_level'], noise_df['macro_f1'], marker='s', label='Macro F1')
    plt.title('Model Robustness to Feature Noise')
    plt.xlabel('Noise Level (Standard Deviation)')
    plt.ylabel('Performance')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Relative macro-F1 loss between the clean run (first row) and the noisiest run (last row)
    initial_f1 = noise_df.iloc[0]['macro_f1']
    max_noise_f1 = noise_df.iloc[-1]['macro_f1']

    if initial_f1 > 0:
        degradation = (initial_f1 - max_noise_f1) / initial_f1 * 100

        print(f"Performance degradation at {noise_levels[-1]} noise level: {degradation:.2f}%")

        if degradation > 50:
            print("⚠️ Model is highly sensitive to noise")
        elif degradation > 25:
            print("⚠️ Model shows moderate sensitivity to noise")
        else:
            print("✅ Model is relatively robust to noise")
    else:
        # A relative drop from a zero baseline is undefined (division by zero)
        print("Macro F1 without noise is 0; relative degradation cannot be computed.")

## 9. Model Deployment Preparation

Save the best model and necessary preprocessing components for deployment.

In [None]:
# Test-set scores stored alongside the model for later reference
test_performance = {
    key: test_metrics[key] for key in ('accuracy', 'macro_f1', 'class_report')
}

# Everything needed to reproduce a prediction: the fitted model, the fitted scaler and
# label encoder, and the feature column order used during training
model_package = dict(
    model=best_model,
    scaler=scaler,
    label_encoder=label_encoder,
    feature_columns=feature_cols,
    model_name=best_model_name,
    performance=test_performance,
)

# Persist the bundle as a single file (joblib is imported in the notebook's import cell)
joblib.dump(model_package, 'best_model.bin')
print("Model saved to 'best_model.bin'")

# Create a simple prediction function for demonstration
def predict_role(commit_message, files_changed=1, insertions=10, deletions=5,
                 model_path='best_model.bin'):
    """Demonstrate how to use the saved model package for a single prediction.

    This is a simplified placeholder: only the handful of inputs below are filled in
    and every other feature is left at 0. A real implementation must run the same
    preprocessing / feature extraction as training.

    Args:
        commit_message: Raw commit message; only its length is used.
        files_changed: Number of files touched by the commit.
        insertions: Number of inserted lines.
        deletions: Number of deleted lines.
        model_path: Path of the package written by ``joblib.dump``.

    Returns:
        The predicted role as a class label.
    """
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load packages that you created yourself or otherwise trust.
    package = joblib.load(model_path)
    columns = list(package['feature_columns'])

    # Known inputs keyed by the feature name they are assumed to correspond to.
    # NOTE(review): these column names are assumptions carried over from the original
    # placeholder comments - confirm them against the training dataset.
    known_values = {
        'message_length': len(commit_message),
        'files_changed': files_changed,
        'insertions': insertions,
        'deletions': deletions,
    }

    # Place each value by column NAME rather than by a hard-coded position, which would
    # raise IndexError on short feature lists or silently fill the wrong column.
    features = np.zeros(len(columns))
    unplaced = []
    for name, value in known_values.items():
        if name in columns:
            features[columns.index(name)] = value
        else:
            unplaced.append(name)
    if unplaced:
        print(f"Note: features not found in the model and left unset: {unplaced}")

    # Scale features with the scaler fitted during training
    features_scaled = package['scaler'].transform(features.reshape(1, -1))

    # Make prediction (encoded class id)
    prediction = package['model'].predict(features_scaled)[0]

    # Get class label
    predicted_role = package['label_encoder'].inverse_transform([prediction])[0]

    return predicted_role

# Try the helper on one hand-written commit message (other inputs use their defaults)
sample_message = "Fixed UI layout issues in the dashboard component"
print(f"\nSample prediction for commit message: \"{sample_message}\"")
sample_role = predict_role(sample_message)
print(f"Predicted role: {sample_role}")

## 10. Summary and Conclusions

Let's summarize our modeling process and results.

In [None]:
# Create a summary of the modeling process and results
print("# Developer Role Classification Model Summary\n")

print("## Dataset Overview")
print(f"- Total samples: {len(data)}")
print(f"- Number of features: {len(feature_cols)}")
print(f"- Number of classes: {len(label_encoder.classes_)}")
# Count rows per role in encoder order; plain str/int values keep the printed dict
# readable regardless of how NumPy formats its scalar types.
role_counts = data['role'].value_counts()
class_distribution = {str(cls): int(role_counts.get(cls, 0)) for cls in label_encoder.classes_}
print(f"- Class distribution: {class_distribution}")

print("\n## Model Comparison")
display(results_df)

print("\n## Best Model")
print(f"- Selected model: {best_model_name}")
if best_model_name in ["Random Forest", "Gradient Boosting", "XGBoost"]:
    # `feature_importance` is the ranked table built in the feature-importance cell
    print("- Top 5 features:")
    top_five = feature_importance.head(5)
    for rank, (feature, importance) in enumerate(
            zip(top_five['Feature'], top_five['Importance']), start=1):
        print(f"  {rank}. {feature}: {importance:.4f}")

print("\n## Performance on Test Set")
print(f"- Accuracy: {test_metrics['accuracy']:.4f}")
print(f"- Macro F1 Score: {test_metrics['macro_f1']:.4f}")
print("- Per-class F1 Scores:")
for i, cls in enumerate(label_encoder.classes_):
    print(f"  - {cls}: {test_metrics['f1'][i]:.4f}")

print("\n## Conclusion")
final_macro_f1 = test_metrics['macro_f1']
if final_macro_f1 > 0.9:
    performance_assessment = "excellent"
elif final_macro_f1 > 0.8:
    performance_assessment = "good"
elif final_macro_f1 > 0.7:
    performance_assessment = "moderate"
else:
    performance_assessment = "needs improvement"

print(f"The {best_model_name} model achieved {performance_assessment} performance with a macro F1 score of {final_macro_f1:.4f}.")
# Only claim effectiveness when the score supports it; previously this sentence was
# printed even right after a "needs improvement" assessment.
if final_macro_f1 > 0.7:
    print("The model can effectively classify developer roles based on commit patterns and message content.")
else:
    print("The model does not yet classify developer roles reliably; further feature engineering or more data is needed.")

# Name the most confused pair only if the confusion analysis ran and found at least
# one real confusion (count > 0).
if 'confusion_pairs' in globals() and len(confusion_pairs) > 0 and confusion_pairs[0][2] > 0:
    print(f"\nThe most commonly confused roles are {confusion_pairs[0][0]} and {confusion_pairs[0][1]}.")

print("\nModel saved and ready for deployment.")