# 🔬 Phenotype Classification Project (Integrated Notebook)
This notebook integrates the entire pipeline: utils, training, SHAP, and predictions.

In [ ]:
!pip install pandas numpy scikit-learn matplotlib seaborn shap joblib

In [ ]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import joblib

## 📂 Embedded utils.py

In [ ]:
def load_data(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(path)
    return frame

def clean_data(df):
    """Fill missing values in numeric columns with that column's median.

    Only numeric columns contribute a median, so missing values in
    non-numeric columns are left as they are.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with the numeric gaps filled.
    """
    column_medians = df.median(numeric_only=True)
    return df.fillna(value=column_medians)

def encode_labels(df, column='label'):
    """Return a copy of ``df`` with ``column`` replaced by integer category codes.

    The column is converted to a pandas categorical and each value is
    replaced by its category code. Missing values receive the code ``-1``.

    Args:
        df: DataFrame containing ``column``. It is left unmodified.
        column: Name of the column to encode.

    Returns:
        A new DataFrame identical to ``df`` except for the encoded column.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    # Work on a copy so the caller's DataFrame keeps its original labels.
    encoded = df.copy()
    encoded[column] = encoded[column].astype('category').cat.codes
    return encoded

def split_data(df, target='label', *, test_size=0.2, random_state=42):
    """Split ``df`` into stratified train and test feature/target sets.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Name of the column to predict; it is removed from the features.
        test_size: Size of the test split, forwarded to ``train_test_split``.
            Defaults to 0.2, the value that used to be hard-coded.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42, the value that used to be hard-coded.

    Returns:
        ``X_train, X_test, y_train, y_test`` as returned by
        ``train_test_split``.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    # Imported locally, as in the original helper, so the function does not
    # depend on a file-level scikit-learn import.
    from sklearn.model_selection import train_test_split

    features = df.drop(columns=target)
    labels = df[target]
    # Stratify on the labels so both splits keep the class proportions.
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )

def save_model(model, path='model.pkl'):
    """Write ``model`` to ``path`` with ``joblib.dump``.

    Args:
        model: Object to persist (a fitted estimator in this notebook).
        path: Destination file name.
    """
    joblib.dump(value=model, filename=path)

def load_model(path='model.pkl'):
    """Load and return the object stored at ``path`` with ``joblib.load``.

    Args:
        path: File previously written by ``joblib.dump``.

    Returns:
        The deserialized object.
    """
    # NOTE(review): joblib files are pickle-based; only load files from a
    # trusted source.
    restored = joblib.load(path)
    return restored

## 🧼 Load and Prepare Data

In [ ]:
# Read the raw CSV, fill numeric gaps with column medians, then turn the
# 'label' column into integer category codes.
df = encode_labels(clean_data(load_data('dataset.csv')), column='label')
# Preview the first rows as the cell output.
df.head()

## 🏋️ Train/Test Split

In [ ]:
# Hold out a stratified test split, using 'label' as the target column.
X_train, X_test, y_train, y_test = split_data(df, target='label')
# Show the shapes of both feature sets as the cell output.
(X_train.shape, X_test.shape)

## 🌲 Train Random Forest

In [ ]:
# Fit a 200-tree random forest with a fixed seed; fit() returns the estimator
# itself, so construction and training can be chained.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out test split.
preds = rf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

## 🔢 Confusion Matrix

In [ ]:
# Tabulate true versus predicted classes on the test split and draw the
# counts as an annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=preds)
sns.heatmap(data=cm, cmap='Blues', fmt='d', annot=True)
plt.title('Confusion Matrix')
plt.show()

## 💾 Save Model

In [ ]:
# Persist the fitted forest so it can be restored later with load_model().
save_model(model=rf, path='model.pkl')
print('Model saved as model.pkl')

## 🤖 SHAP Explainability

In [ ]:
# Explain the fitted forest's predictions on the test split.
explainer = shap.TreeExplainer(rf, feature_perturbation='tree_path_dependent')
shap_values = explainer.shap_values(X_test)

# Depending on the installed SHAP version, a classifier yields either a list
# with one (samples, features) array per class or a single
# (samples, features, classes) array. Indexing the latter with [0] would pick
# the first sample rather than the first class, so select the class axis
# explicitly.
if isinstance(shap_values, list):
    first_class_shap = shap_values[0]
elif shap_values.ndim == 3:
    first_class_shap = shap_values[:, :, 0]
else:
    # Already a single (samples, features) array.
    first_class_shap = shap_values

# Summarize feature contributions for the first class (code 0).
shap.summary_plot(first_class_shap, X_test)

## 🔮 Predict on New Data

In [ ]:
# Usage hint only: nothing is predicted here because no new data file ships
# with the notebook. The steps name the saved model explicitly and keep the
# return value of clean_data, which does not modify its argument in place.
print(
    'To use: new_df = pd.read_csv("new_data.csv") '
    'then new_df = clean_data(new_df) '
    'then model = load_model("model.pkl") '
    'then model.predict(new_df) '
    '(new_df must hold the same feature columns used for training, without "label")'
)