# DeepBridge Machine Learning Workflow Tutorial

## Setup and Installation

In [None]:
# Install DeepBridge and dependencies
!pip install deepbridge

## Notebook Walkthrough

In [None]:
# Import required libraries
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# DeepBridge imports
from deepbridge.db_data import DBDataset
from deepbridge.auto_distiller import AutoDistiller
from deepbridge.model_validation import ModelValidation

# Scikit-learn imports for data preparation
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 1. Data Preparation

In [None]:
# Fetch the scikit-learn breast cancer dataset: feature matrix and labels
data = load_breast_cancer()
X = data.data
y = data.target

# Combine features and label into one DataFrame for inspection
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y

# Text summary: first rows, shape, and relative class frequencies
print("Dataset Overview:")
print(df.head())
print("\nDataset Shape:", df.shape)
print("\nTarget Distribution:")
print(df['target'].value_counts(normalize=True))

# One box per feature column (label column excluded), drawn on a single figure
plt.figure(figsize=(15, 10))
df.drop(columns='target').boxplot()
plt.title('Feature Distributions')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## 2. Data Preprocessing and Validation

In [None]:
# Prepare data for DeepBridge: standardize every feature column.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, so test-set statistics influence the scaling. Left unchanged
# because DBDataset consumes the same scaled frame — confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)
df_scaled['target'] = y

# Create DBDataset
dataset = DBDataset(
    data=df_scaled,
    target_column='target',
    synthetic=True  # Generate synthetic data for augmentation
)

# Perform model validation
experiment = ModelValidation(
    experiment_name="breast_cancer_classification"
)

# Split data (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled.drop('target', axis=1), 
    df_scaled['target'], 
    test_size=0.2, 
    random_state=42
)

# Add data to experiment
experiment.add_data(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test
)

# Visualize synthetic data generation
synthetic_data = dataset.synthetic_data
# data.feature_names is a NumPy array, so `feature_names + ['target']` would be
# an element-wise operation instead of list concatenation. Build the column
# list explicitly.
synthetic_columns = list(data.feature_names) + ['target']
synthetic_df = pd.DataFrame(synthetic_data, columns=synthetic_columns)

# Compare original vs synthetic data distributions.
# plt.subplots creates the figure itself; no separate plt.figure() call is
# needed (it would only leave an empty figure behind).
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
df['target'].value_counts().plot(kind='bar', ax=axes[0], title='Original Data Target Distribution')
synthetic_df['target'].value_counts().plot(kind='bar', ax=axes[1], title='Synthetic Data Target Distribution')
plt.tight_layout()
plt.show()

# Print synthetic data quality metrics
print("\nSynthetic Data Quality Metrics:")
print(dataset.synthetic_quality_metrics)

## 3. Automated Model Distillation

In [None]:
# Initialize AutoDistiller
auto_distiller = AutoDistiller(
    dataset=dataset,
    output_dir='./breast_cancer_results',
    n_trials=20,  # Number of hyperparameter optimization trials
    random_state=42
)

# Customize model configurations: every combination of student model type,
# temperature and alpha listed here is a candidate for the experiments.
auto_distiller.customize_config(
    model_types=['gbm', 'xgb', 'random_forest'],
    temperatures=[0.5, 1.0, 2.0],
    alphas=[0.3, 0.5, 0.7]
)

# Run distillation experiments
results_df = auto_distiller.run()

# Generate and print report
report = auto_distiller.generate_report()
print("\nDistillation Experiment Report:")
print(report)

# Visualize model performance.
# DataFrame.boxplot(by=...) opens its own figure unless an Axes is supplied,
# which would leave a preceding plt.figure() blank and ignore its figsize.
# Create the Axes explicitly and hand it to pandas.
fig, ax = plt.subplots(figsize=(15, 10))
results_df.boxplot(column=['test_accuracy'], by='model_type', ax=ax)
ax.set_title('Model Accuracy by Model Type')
fig.suptitle('')  # Remove automatic suptitle
plt.tight_layout()
plt.show()

# Find and save best model (higher accuracy is better, hence minimize=False)
best_model_path = auto_distiller.save_best_model(
    metric='test_accuracy', 
    minimize=False
)
print(f"\nBest model saved to: {best_model_path}")

## 4. Model Performance Visualization

In [None]:
# Metrics compared side by side in the first chart
performance_metrics = [
    'test_accuracy',
    'test_precision',
    'test_recall',
    'test_f1',
    'test_auc_roc',
]

# Reshape to long form: one row per (configuration, metric) pair
melted_df = pd.melt(
    results_df,
    id_vars=['model_type', 'temperature', 'alpha'],
    value_vars=performance_metrics,
    var_name='Metric',
    value_name='Value',
)

# Chart 1: metric distributions grouped by model type
plt.figure(figsize=(15, 10))
sns.boxplot(data=melted_df, x='model_type', y='Value', hue='Metric')
plt.title('Model Performance Across Different Metrics')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Chart 2: accuracy against temperature; colour encodes the model type and
# marker size encodes alpha
plt.figure(figsize=(15, 10))
sns.scatterplot(
    data=results_df,
    x='temperature',
    y='test_accuracy',
    hue='model_type',
    size='alpha',
    palette='viridis',
)
plt.title('Temperature and Alpha Impact on Model Accuracy')
plt.tight_layout()
plt.show()

## 5. Advanced Model Analysis

In [None]:
# Get best model (highest test accuracy)
best_config = auto_distiller.find_best_model(
    metric='test_accuracy', 
    minimize=False
)

# Retrieve trained model for the winning (model type, temperature, alpha)
best_model = auto_distiller.get_trained_model(
    best_config['model_type'], 
    best_config['temperature'], 
    best_config['alpha']
)

# Feature importance for interpretability (only if the model exposes it)
if hasattr(best_model, 'get_feature_importances'):
    feature_importances = best_model.get_feature_importances()

    # presumably a mapping of feature name -> importance; verify against
    # the DeepBridge model API
    feature_imp_df = pd.DataFrame.from_dict(
        feature_importances, 
        orient='index', 
        columns=['Importance']
    ).sort_values('Importance', ascending=False)

    # DataFrame.plot opens a new figure unless an Axes is passed, so create
    # the sized Axes explicitly instead of calling plt.figure() beforehand.
    fig, ax = plt.subplots(figsize=(15, 10))
    feature_imp_df.plot(kind='bar', ax=ax)
    ax.set_title('Feature Importances in Best Model')
    plt.tight_layout()
    plt.show()

# Print best model configuration.
# dict(...) accepts any mapping-like config, and the `default` hook unwraps
# NumPy scalars (via .item()) or falls back to str() so values that the json
# module cannot serialize natively do not raise TypeError.
print("\nBest Model Configuration:")
print(json.dumps(
    dict(best_config),
    indent=2,
    default=lambda value: value.item() if hasattr(value, 'item') else str(value)
))

## Conclusion and Next Steps

This notebook demonstrated:

- Data preprocessing with DeepBridge
- Synthetic data generation
- Automated model distillation
- Performance comparison
- Model interpretation

Recommended Next Steps:

- Experiment with different datasets
- Try various model configurations
- Use synthetic data for data augmentation
- Explore feature engineering techniques