# 🔬 Phenotype Classification Project
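This notebook runs the entire pipeline end to end — loading and preparing the data, training and evaluating the model, explaining it with SHAP, and making predictions — using the modular code in `src/`.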

In [None]:
# Make the project's `src/` directory importable from this notebook.
# NOTE: '../src' is a relative path, so it is resolved against the current
# working directory when the imports below run -- this assumes the notebook
# is executed from a directory that sits next to `src/`.
import sys
sys.path.append('../src')

# Project-local helpers (found via the path entry added above):
#   utils       -> data loading/cleaning/encoding/splitting, model persistence
#   train_model -> training, evaluation and confusion-matrix plotting
#   predict     -> inference helper
from utils import load_data, clean_data, encode_labels, split_data, save_model, load_model
from train_model import train_random_forest, evaluate_model, plot_confusion_matrix
from predict import predict

# Standard data-science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Reaching this line means every import above succeeded.
print('✅ All modules imported successfully!')

## 📂 Load and Prepare Data

In [None]:
# Read the raw dataset from disk and report its dimensions before any cleaning.
df = load_data('../data/dataset.csv')
print(f'Dataset shape: {df.shape}')

# Clean first, then encode: the cleaned frame is fed straight into
# encode_labels together with 'label' (presumably the name of the target
# column -- confirm against src/utils).
df = encode_labels(clean_data(df), 'label')

# Last expression of the cell, so the notebook renders the first rows.
df.head()

## 🎯 Train/Test Split

In [None]:
# Split the prepared frame into train/test features and targets.
# split_data is called with its defaults; the split ratio, seed and target
# column are decided inside src/utils and are not visible from this notebook.
X_train, X_test, y_train, y_test = split_data(df)
# Report the shape of each feature partition as a quick sanity check.
print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

## 🌲 Train Random Forest Model

In [None]:
# Fit the model on the training partition only.
# n_estimators=200 is forwarded to the project helper (presumably the number
# of trees in the forest -- confirm in src/train_model).
model = train_random_forest(X_train, y_train, n_estimators=200)
print('✅ Model training complete!')

## 📊 Evaluate Model Performance

In [None]:
# Evaluate on the held-out test set.
# The value returned by evaluate_model is kept in `preds`; the confusion-matrix
# cell later passes it alongside y_test, so it is presumably the predicted
# labels for X_test (confirm in src/train_model).
preds = evaluate_model(model, X_test, y_test)

## 🔢 Confusion Matrix

In [None]:
# Plot true vs. predicted labels for the test set. A save_path is supplied, so
# the helper presumably also writes the figure to that file.
# NOTE(review): confirm that ../models/ exists or that the helper creates it.
plot_confusion_matrix(y_test, preds, save_path='../models/confusion_matrix.png')

## 💾 Save Trained Model

In [None]:
# Persist the fitted model so it can be reloaded later (load_model is imported
# in the first cell for that purpose).
# NOTE(review): the .pkl extension suggests pickle serialisation -- only load
# such files from trusted sources; confirm that ../models/ exists beforehand.
save_model(model, '../models/model.pkl')

## 🤖 SHAP Explainability Analysis

In [None]:
import shap

# Create SHAP explainer for the fitted tree model and attribute the test set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# The return type of shap_values() depends on the SHAP release and the model:
#   * a list with one (n_samples, n_features) array per class (older releases),
#   * a single 3-D array with the classes on the last axis (SHAP >= 0.45),
#   * a single 2-D array when the model exposes only one output.
# Normalise all three to a list of per-class 2-D arrays so that the class
# count -- not the container type -- drives the message and the plot.
if isinstance(shap_values, list):
    per_class_values = list(shap_values)
elif getattr(shap_values, 'ndim', 2) == 3:
    per_class_values = [
        shap_values[:, :, class_idx]
        for class_idx in range(shap_values.shape[2])
    ]
else:
    per_class_values = [shap_values]

n_classes = len(per_class_values)

# Only more than two classes is a multi-class problem; one or two outputs are
# reported as binary (a two-element list used to be mislabelled multi-class).
if n_classes > 2:
    print(f'Multi-class problem with {n_classes} classes')
else:
    print('Binary classification')

# Plot SHAP summary. A list of per-class arrays produces the stacked
# multi-output summary; a lone array is passed through unchanged. Passing the
# raw 3-D array instead would be read by summary_plot as interaction values.
shap.summary_plot(
    per_class_values if n_classes > 1 else per_class_values[0],
    X_test,
)

## 🔮 Make Predictions on New Data

In [None]:
# Example: run inference on the test set. predict() returns two values, which
# are unpacked here as the predicted labels and the per-sample probabilities.
predictions, probabilities = predict(model, X_test)

# Quick sanity check of the outputs: shapes and the first few predictions.
print(f'Predictions shape: {predictions.shape}')
print(f'First 10 predictions: {predictions[:10]}')
print(f'\nPrediction probabilities shape: {probabilities.shape}')

# For new data, uncomment the lines below.
# NOTE(review): make sure new_df_clean ends up with exactly the feature columns
# the model was trained on (same order as X_train, no label column) -- confirm
# against split_data / predict in src/.
# new_df = pd.read_csv('../data/new_data.csv')
# new_df_clean = clean_data(new_df)
# new_predictions, new_probs = predict(model, new_df_clean)