In [None]:
%pip install pandas
%pip install boto3
%pip install psycopg2
%pip install sqlalchemy
%pip install dotenv


%pip matplotlib
%pip plotly
%pip seaborn 


# Run this cell to install these packages
# !pip install pandas boto3 sqlalchemy python-dotenv

In [None]:
# Import dependencies
import boto3
import os
import pandas as pd
from sqlalchemy import create_engine, text
import psycopg2
from io import StringIO

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import plotly.express as px
import plotly.graph_objects as go


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors



#pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Prediction
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from sklearn.model_selection import GridSearchCV

import shap

sns.set(style="whitegrid")

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Load environment variables from .env file
from dotenv import load_dotenv









# --- 1. Imports and Setup ---
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import shap
import warnings
warnings.filterwarnings("ignore")

from sqlalchemy import create_engine
from dotenv import load_dotenv

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay,
    accuracy_score, f1_score
)

sns.set(style="whitegrid")
%matplotlib inline

In [None]:
# Load environment variables
# Called with no arguments. The DB_* values read from os.environ when the
# database engine is built are expected to be available after this call
# (presumably from a local .env file — confirm for your setup).
load_dotenv()

# <h2 align="center">Data Input</h2>


### Define a Function to Query & Display Results

In [None]:
# Build the SQLAlchemy engine used to query the PostgreSQL database
def get_db_connection():
    """Create a SQLAlchemy engine from the ``DB_*`` environment variables.

    Reads ``DB_USER``, ``DB_PASSWORD``, ``DB_HOST``, ``DB_PORT`` and
    ``DB_NAME`` from ``os.environ`` and assembles a ``postgresql://`` URL.

    The user name and password are percent-encoded before being placed in
    the URL, so credentials containing characters such as ``@``, ``:`` or
    ``/`` do not corrupt the URL.

    Returns:
        The engine object returned by ``create_engine``.

    Raises:
        KeyError: if any of the required environment variables is not set.
    """
    from urllib.parse import quote_plus  # local import: only needed here

    # Looked up in the same order as the URL is written, so the first
    # missing variable is the one reported by the KeyError.
    user = quote_plus(os.environ['DB_USER'])
    password = quote_plus(os.environ['DB_PASSWORD'])
    db_url = (
        f"postgresql://{user}:{password}@"
        f"{os.environ['DB_HOST']}:{os.environ['DB_PORT']}/{os.environ['DB_NAME']}"
    )
    return create_engine(db_url)

# One engine is shared by every query in this cell
engine = get_db_connection()

# Each source table is read in full: the SELECT statement is defined first,
# then executed through the shared engine into its own DataFrame.
query_clinics = "SELECT * FROM clinics;"
clinics_df = pd.read_sql(sql=query_clinics, con=engine)

query_patients = "SELECT * FROM patients;"
patients_df = pd.read_sql(sql=query_patients, con=engine)

query_sessions = "SELECT * FROM sessions;"
sessions_df = pd.read_sql(sql=query_sessions, con=engine)

query_feedback = "SELECT * FROM feedback;"
feedback_df = pd.read_sql(sql=query_feedback, con=engine)

query_dropout_flags = "SELECT * FROM dropout_flags;"
dropout_flags_df = pd.read_sql(sql=query_dropout_flags, con=engine)

query_interventions = "SELECT * FROM interventions;"
interventions_df = pd.read_sql(sql=query_interventions, con=engine)

### Feature Engineering
- Create New features
- Merge relevant datasets

## NOTE:
The EDA showed that the data is quite messy, so we will:
- define the patient data and the session data and merge them together,
- create a pipeline (defining the numerical and categorical columns),
- preprocess the data before doing the clustering,
- and then do the clustering.

## Patient Segmentation:
The goal is to see which cluster each patient belongs to and how the patients in each cluster behave.

### Steps:
- Define the patients (who they are, i.e. their features)
- Define the patients' sessions

- Defining patients/sessions

In [None]:
# Column names, dtypes and non-null counts of the frame loaded from the `clinics` table
clinics_df.info()

In [None]:
# Column names, dtypes and non-null counts of the frame loaded from the `patients` table
patients_df.info()

In [None]:
# Column names, dtypes and non-null counts of the frame loaded from the `sessions` table
sessions_df.info()

In [None]:
# Column names, dtypes and non-null counts of the frame loaded from the `feedback` table
feedback_df.info()

In [None]:
# Column names, dtypes and non-null counts of the frame loaded from the `dropout_flags` table
dropout_flags_df.info()

In [None]:
# Column names, dtypes and non-null counts of the frame loaded from the `interventions` table
interventions_df.info()

In [None]:
# Pairwise correlations between the numeric patient columns, drawn as an annotated heatmap
numeric_patients = patients_df.select_dtypes(include='number')
corr = numeric_patients.corr()

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix — Numeric Features Only', fontsize=16, fontweight='bold')
plt.show()

#### Augment the `patient_session_df` with more datasets.

# Feature Engineering: Merge relevant features ---

In [None]:
# Per-patient mean of each session measurement, one row per patient_id.
# Named aggregation produces the final column names directly.
session_agg = sessions_df.groupby("patient_id").agg(
    avg_duration=("duration", "mean"),
    avg_pain_level=("pain_level", "mean"),
    avg_home_adherence=("home_adherence_pc", "mean"),
    avg_satisfaction=("satisfaction", "mean"),
)

In [None]:
# Feedback rows carry a session_id; attach the owning patient through the
# sessions table, then average the sentiment per patient.
session_to_patient = sessions_df[["session_id", "patient_id"]]
feedback_sessions = pd.merge(feedback_df, session_to_patient, how="left", on="session_id")
feedback_agg = feedback_sessions.groupby("patient_id").agg(
    avg_sentiment=("sentiment", "mean")
)

In [None]:
# Mean of `responded` per patient, exposed as intervention_response_rate
interventions_agg = interventions_df.groupby("patient_id").agg(
    intervention_response_rate=("responded", "mean")
)

In [None]:
# Left-join the clinic attributes onto every patient row via clinic_id.
# Clinic columns whose names clash with patient columns get a '_clinic' suffix.
patients_clinic = pd.merge(
    patients_df,
    clinics_df,
    how="left",
    on="clinic_id",
    suffixes=('', '_clinic'),
)

In [None]:
# Index the patient/clinic frame by patient_id and attach the three
# per-patient aggregate frames in a single join call.
per_patient_aggregates = [session_agg, feedback_agg, interventions_agg]
patients_by_id = patients_clinic.set_index("patient_id")
patient_sel = patients_by_id.join(per_patient_aggregates)

In [None]:
# Attach the dropout flag; the inner join keeps only patients present in both frames
label = dropout_flags_df.set_index("patient_id")["dropout"]
patients_full = patient_sel.join(label, how='inner')

### Feature Selection (RandomForest for importance)

In [None]:
# Candidate features (including engineered ones), grouped by where they come from.
# The concatenation order below is the order used for feature-importance indexing.
patient_features = [
    "age", "gender", "bmi", "smoker", "chronic_cond", "injury_type",
    "referral_source", "insurance_type", "consent",
]
clinic_features = [
    "city", "country", "type", "postcode", "capacity", "staff_count",
    "speciality", "avg_rating",
]
session_features = [
    "avg_duration", "avg_pain_level", "avg_home_adherence", "avg_satisfaction",
]
feedback_intervention_features = ["avg_sentiment", "intervention_response_rate"]

candidate_features = (
    patient_features
    + clinic_features
    + session_features
    + feedback_intervention_features
)

In [None]:
# Keep only the candidates that actually exist in the merged frame
candidate_features = [f for f in candidate_features if f in patients_full.columns]

# Object-dtype candidates are treated as categorical ...
cat_cols = patients_full[candidate_features].select_dtypes(include="object").columns.tolist()
# ... and everything else as numeric. A list comprehension keeps the
# candidate order; a set difference would return the names in an arbitrary
# order that can change between kernel restarts (string hashing is randomised).
num_cols = [f for f in candidate_features if f not in cat_cols]
target_col = "dropout"

In [None]:
# Rank the candidate features with a RandomForest and keep the ten most important.
# NOTE(review): the forest is fitted on every row of patients_full, i.e. before
# the train/test split performed later in the notebook — confirm this is intended.

# Work on a copy so patients_full keeps its original categorical values
patients_fs = patients_full.copy()
for col in cat_cols:
    le = LabelEncoder()
    # Cast to str first so every value in the column is encoded as a string
    col_as_text = patients_fs[col].astype(str)
    patients_fs[col] = le.fit_transform(col_as_text)

le_target = LabelEncoder()
y_fs = le_target.fit_transform(patients_fs[target_col])

rf_fs = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    patients_fs[candidate_features], y_fs
)
importances = rf_fs.feature_importances_

# Indices sorted by descending importance, truncated to the top 10
ranked_idx = np.argsort(importances)[::-1]
top_idx = ranked_idx[:10]
selected_features = [candidate_features[i] for i in top_idx]
print("Top features for dropout prediction:", selected_features)

### Prepare data for modeling

In [None]:
# Data for modeling ---
X = patients_full[selected_features]
y = patients_full[target_col]

# Stratified 80/20 split so both parts keep the same dropout ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20, random_state=42)

# Numeric columns in the order they appear in X
numerics = X.select_dtypes(include='number').columns.tolist()
# Everything else is one-hot encoded. A list comprehension (instead of a set
# difference) keeps the column order stable between kernel restarts, so the
# encoded feature order — and with it the seeded model results and the
# feature names shown in the SHAP plots — is reproducible.
categoricals = [col for col in selected_features if col not in numerics]

# Numeric columns: median imputation followed by standardisation
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
preprocessor = ColumnTransformer([
    ("num", numeric_pipe, numerics),
    ("cat", categorical_pipe, categoricals)
])

### Model Definitions and Hyperparameter Grids

In [None]:
# Candidate classifiers, all seeded identically.
# The dictionary keys are the names used to look up each model's grid in param_grids.
model_seed = 42
models = {
    "Logistic Regression": LogisticRegression(random_state=model_seed, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=model_seed),
    "Decision Tree": DecisionTreeClassifier(random_state=model_seed),
    "XGBoost": XGBClassifier(
        random_state=model_seed,
        eval_metric='logloss',
        use_label_encoder=False,
    ),
    "CatBoost": CatBoostClassifier(random_state=model_seed, verbose=0),
    "Neural Net": MLPClassifier(random_state=model_seed, max_iter=500),
}

In [None]:
# Hyperparameter grids, one per model.
# Every key carries the "model__" prefix so it addresses the pipeline step named "model".
logreg_grid = {
    "model__C": [0.1, 1, 10],
    "model__solver": ["lbfgs", "liblinear"],
}
random_forest_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 5, 10],
}
decision_tree_grid = {
    "model__max_depth": [None, 5, 10],
    "model__min_samples_split": [2, 5, 10],
}
xgboost_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [3, 5, 7],
}
catboost_grid = {
    "model__iterations": [100, 200],
    "model__depth": [4, 6, 8],
}
neural_net_grid = {
    "model__hidden_layer_sizes": [(50,), (100,)],
    "model__alpha": [0.0001, 0.001],
}

# Keyed by the same display names as the `models` dictionary
param_grids = {
    "Logistic Regression": logreg_grid,
    "Random Forest": random_forest_grid,
    "Decision Tree": decision_tree_grid,
    "XGBoost": xgboost_grid,
    "CatBoost": catboost_grid,
    "Neural Net": neural_net_grid,
}

In [None]:
# Model Training & Hyperparameter Tuning
#
# Only the result containers are created here. The grid search itself runs
# once, in the following cell, which repeats this set-up, tunes every model
# and evaluates it. Running the same search in this cell as well would tune
# every model twice and then discard the first set of results.
from collections import OrderedDict

results = OrderedDict()      # model name -> best fitted pipeline
metrics = OrderedDict()      # model name -> evaluation metrics
best_params = OrderedDict()  # model name -> best hyperparameters found

In [None]:
# Model Training & Hyperparameter Tuning
from collections import OrderedDict
from sklearn.metrics import make_scorer

results = OrderedDict()      # model name -> best fitted pipeline
metrics = OrderedDict()      # model name -> {"accuracy", "f1", "roc_auc"} on the test set
best_params = OrderedDict()  # model name -> best hyperparameters found

# Use ONE positive label for both tuning and evaluation. The plain
# scoring='f1' string always scores label 1 as the positive class, while the
# evaluation below may score 'Yes'; building the scorer from the same
# pos_label keeps the grid search and the reported F1 consistent.
pos_label = 'Yes' if 'Yes' in np.unique(y) else 1
f1_scorer = make_scorer(f1_score, pos_label=pos_label)

for name, model in models.items():
    print(f"\n--- {name} ---")
    # Preprocess -> oversample with SMOTE -> classifier, tuned as one unit
    pipe = ImbPipeline([
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", model)
    ])
    # Models without a grid entry are searched over an empty grid
    param_grid = param_grids.get(name, {})
    grid = GridSearchCV(pipe, param_grid, scoring=f1_scorer, cv=3, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    results[name] = best_model
    best_params[name] = grid.best_params_

    # Output best params and metrics immediately after training
    y_pred = best_model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score
    y_proba = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, pos_label=pos_label)
    roc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    metrics[name] = {"accuracy": acc, "f1": f1, "roc_auc": roc}

    print("Best Params:", grid.best_params_)
    print(classification_report(y_test, y_pred))
    if roc is not None:
        print("ROC AUC:", roc)
    print("Accuracy:", acc)
    print("F1 Score:", f1)

In [None]:
# Confusion Matrix
# One 4x4-inch confusion matrix per tuned model, computed on the test set.
for name, best_model in results.items():
    y_pred = best_model.predict(X_test)
    # Hand our own axes to from_predictions: without `ax` it opens a new
    # default-sized figure, leaving a separately created 4x4 figure blank.
    fig, ax = plt.subplots(figsize=(4, 4))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    ax.set_title(f"{name} Confusion Matrix")
    plt.show()

In [None]:
# Persist every tuned pipeline with joblib, one file per model
os.makedirs("models/saved_models", exist_ok=True)

for name, best_model in results.items():
    # File-name stem: the display name, spaces replaced by underscores, lower-cased
    slug = name.replace(' ', '_').lower()
    model_path = f"models/saved_models/{slug}_dropout_model.joblib"
    joblib.dump(best_model, model_path)
    print(f"Saved model → {model_path}")


In [None]:
# SHAP Explanations
# Best-effort: models that SHAP cannot explain are reported and skipped.
for name, best_model in results.items():
    try:
        fitted_preprocessor = best_model.named_steps["preprocessor"]
        clf = best_model.named_steps["model"]

        # Explain the classifier on the encoded test features
        X_test_enc = fitted_preprocessor.transform(X_test)
        explainer = shap.Explainer(clf, X_test_enc)
        shap_values = explainer(X_test_enc)

        plt.figure()
        # show=False keeps the figure open so the title below is drawn on the
        # SHAP plot itself; by default summary_plot shows the figure before
        # returning, and the title would end up on a new, empty figure.
        shap.summary_plot(
            shap_values,
            X_test_enc,
            feature_names=fitted_preprocessor.get_feature_names_out(),
            show=False
        )
        plt.title(f"SHAP Summary for {name}")
        plt.show()
    except Exception as e:
        print(f"SHAP not supported for {name}: {e}")

In [None]:
# Write one SHAP summary figure per model to reports/figures.
# Best-effort: a model that SHAP cannot handle is reported and skipped.
os.makedirs("reports/figures", exist_ok=True)

for name, best_model in results.items():
    try:
        fitted_preprocessor = best_model.named_steps["preprocessor"]
        X_test_enc = fitted_preprocessor.transform(X_test)
        clf = best_model.named_steps["model"]

        explainer = shap.Explainer(clf, X_test_enc)
        shap_values = explainer(X_test_enc)

        # show=False leaves the figure open so it can be titled and saved
        encoded_names = fitted_preprocessor.get_feature_names_out()
        shap.summary_plot(shap_values, X_test_enc, feature_names=encoded_names, show=False)
        plt.title(f"SHAP Summary for {name}")

        slug = name.replace(' ', '_').lower()
        plt.savefig(f"reports/figures/shap_{slug}.png")
        plt.close()
    except Exception as e:
        print(f"SHAP not supported for {name}: {e}")

In [None]:
# Metrics Summary Table
# One line per model; ROC AUC is shown as N/A when it was not computed.
print("\n=== Model Metrics Summary ===")
for name, m in metrics.items():
    if m['roc_auc'] is None:
        roc_text = "N/A"
    else:
        roc_text = f"{m['roc_auc']:.3f}"
    print(f"{name}: Accuracy={m['accuracy']:.3f}, F1={m['f1']:.3f}, ROC_AUC={roc_text}")