In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Custom Transformer to convert boolean columns to integers
class BooleanConverter(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        boolean_columns = X.select_dtypes(include='bool').columns
        X[boolean_columns] = X[boolean_columns].astype(int)
        return X

# Function to load and preprocess the dataset
def load_and_preprocess_data(file_path):
    # Load the dataset
    df = pd.read_csv(file_path)
    
    # Define the mapping for risk categories
    risk_mapping = {
        'In Treatment': 'Low',
        'IIT': 'High',
        'Transfer Out': 'Medium'
    }

    # Transform the 'Status' column based on the risk mapping
    df['Status'] = df['Status'].map(risk_mapping)

    # Drop rows where 'Status' is null (e.g., 'Died')
    df = df[df['Status'].notnull()]

    # Drop irrelevant columns
    df.drop(columns=['Patient_uid', 'EducationLevel', 'Region', 'SiteCode', 'LastVisit', 'VisitDate', 'NextAppointmentDate', 'NextVisit'], inplace=True)

    # Convert boolean columns to integers
    df = BooleanConverter().fit_transform(df)

    # Encode the target variable
    label_encoder = LabelEncoder()
    df['Status'] = label_encoder.fit_transform(df['Status'])

    # Select columns for one-hot encoding
    columns_to_encode = ['Gender', 'Occupation', 'MaritalStatus', 'StartRegimen', 'LastRegimen', 'ArtAdherence', 'PHQ_9_rating']

    # Perform one-hot encoding with boolean dtype
    df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)
    df_encoded = df_encoded.astype(int)

    # Define the desired number of samples for each class
    sampling_strategy = {1: 17547, 0: 12000, 2: 15009}

    # Instantiate the RandomOverSampler with the specified sampling strategy
    ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

    # Resample the dataset
    X = df_encoded.drop(columns=['Status'])
    y = df_encoded['Status']
    X_resampled, y_resampled = ros.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled['Status'] = y_resampled

    return df, df_resampled, label_encoder

In [12]:
# Function to train the model
def train_model(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a RandomForest model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    return model, accuracy, cm, report,X_test,y_test,y_pred


In [13]:
from sklearn.ensemble import RandomForestClassifier
# Load and preprocess the data
file_path = 'dff.csv'  # Replace with your dataset path
df, df_resampled, label_encoder = load_and_preprocess_data(file_path)

# Select features and target variable
target_col = 'Status'
X = df_resampled.drop(columns=[target_col])
y = df_resampled[target_col]

  df_resampled['Status'] = y_resampled


In [14]:
# Train the model
model, accuracy, cm, report,X_test,y_test,y_pred = train_model(X, y)

In [15]:
model

In [16]:
accuracy

0.9809245960502693

In [17]:
report

'              precision    recall  f1-score   support\n\n           0       0.97      1.00      0.99      2382\n           1       1.00      0.95      0.98      3505\n           2       0.97      1.00      0.98      3025\n\n    accuracy                           0.98      8912\n   macro avg       0.98      0.98      0.98      8912\nweighted avg       0.98      0.98      0.98      8912\n'

In [18]:
df.head()

Unnamed: 0,Gender,Occupation,MaritalStatus,AgeLastVisit,AgeARTStart,StartRegimen,LastRegimen,ArtAdherence,Status,TotalVisits,IITPercentage,IITLast3Percentage,Viral Load,PHQ_9_rating
0,Male,OTHER NON-CODED,SINGLE,12.0,5.0,3TC+ABC+EFV,3TC+DTG+TDF,good,2,7.0,0.0,0.0,540.0,Unscreened
2,Female,Farmer,WIDOWED,38.0,31.0,3TC+EFV+TDF,3TC+DTG+TDF,good,1,6.0,16.666667,0.0,0.0,Unscreened
3,Female,Farmer,MARRIED MONOGAMOUS,38.0,29.0,3TC+EFV+TDF,3TC+DTG+TDF,good,1,4.0,0.0,0.0,0.0,Depression Unlikely
4,Female,Trader,UNKNOWN,58.0,51.0,3TC+EFV+TDF,3TC+EFV+TDF,good,1,3.0,0.0,0.0,0.0,Unscreened
5,Male,Farmer,MARRIED MONOGAMOUS,30.0,26.0,3TC+DTG+TDF,3TC+DTG+TDF,good,1,3.0,0.0,0.0,0.0,Unscreened


In [29]:
import shap
from tabulate import tabulate
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Sample a smaller subset of the test set
sampled_X_test = X_test.sample(n=5, random_state=42)
sampled_y_test = y_test[sampled_X_test.index]

# Create a SHAP explainer object
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for the sampled test data
shap_values = explainer.shap_values(sampled_X_test)

# Define function to generate explanations for the sampled data
def generate_explanations_for_sample():
    explanations = []
    for index in range(len(sampled_X_test)):
        prediction = model.predict(sampled_X_test.iloc[index].values.reshape(1, -1))[0]
        explanation = []  # Define explanation list within the loop
        for i, feature_name in enumerate(sampled_X_test.columns):
            feature_value = sampled_X_test.iloc[index, i]
            if feature_value != 0:
                # Ensure prediction is within the valid range
                if 0 <= prediction < shap_values.shape[2]:
                    contribution = shap_values[index, i, prediction]
                    explanation.append(f"{feature_name} (value: {feature_value}) contributed by {contribution:.3f}")
                else:
                    explanation.append(f"{feature_name} (value: {feature_value})")
        explanations.append((index, prediction, explanation))
    return explanations

# Define function to format explanations into tabular style
def format_explanations(explanations, risk_category):
    formatted_explanations = []
    headers = ["\033[94mPatient\033[0m", "\033[94mPrediction\033[0m", "\033[94mExplanation\033[0m"]
    
    for index, prediction, explanation in explanations:
        formatted_explanation = ["Patient " + str(index + 1), label_encoder.inverse_transform([prediction])[0], "\n".join(explanation)]
        formatted_explanations.append(formatted_explanation)
    
    # Define title color
    title_color = ""
    if risk_category == "High":
        title_color = "\033[91m"  # Red color for High risk
    elif risk_category == "Medium":
        title_color = "\033[93m"  # Yellow color for Medium risk
    elif risk_category == "Low":
        title_color = "\033[92m"  # Green color for Low risk
    
    # Print the tabular style output
    print("\n{}{} Risk patients Prediction and Explanations:\033[0m".format(title_color, risk_category))
    print(tabulate(formatted_explanations, headers=headers, tablefmt="grid"))


# Generate explanations for sampled data
all_explanations = generate_explanations_for_sample()

# Iterating over each risk category and format explanations
risk_categories = ['High', 'Medium', 'Low']
for risk_category in risk_categories:
    filtered_explanations = filter_explanations_by_risk(all_explanations, risk_category)
    format_explanations(filtered_explanations, risk_category)



[91mHigh Risk patients Prediction and Explanations:[0m
+-----------+--------------+----------------------------------------------------------+
| [94mPatient[0m   | [94mPrediction[0m   | [94mExplanation[0m                                              |
| Patient 3 | High         | AgeLastVisit (value: 27) contributed by 0.057            |
|           |              | AgeARTStart (value: 19) contributed by 0.039             |
|           |              | TotalVisits (value: 26) contributed by 0.049             |
|           |              | IITPercentage (value: 3) contributed by 0.022            |
|           |              | IITLast3Percentage (value: 33) contributed by 0.247      |
|           |              | Occupation_Student (value: 1) contributed by 0.070       |
|           |              | MaritalStatus_SINGLE (value: 1) contributed by 0.046     |
|           |              | StartRegimen_3TC+EFV+TDF (value: 1) contributed by 0.028 |
|           |              | LastRe



In [27]:
from tabulate import tabulate

# Define function to format explanations into tabular style
def format_explanations(explanations, risk_category):
    formatted_explanations = []
    headers = ["\033[94mPatient\033[0m", "\033[94mPrediction\033[0m", "\033[94mExplanation\033[0m"]
    
    for index, prediction, explanation in explanations:
        formatted_explanation = ["Patient " + str(index + 1), label_encoder.inverse_transform([prediction])[0], "\n".join(explanation)]
        formatted_explanations.append(formatted_explanation)
    
    # Define title color
    title_color = ""
    if risk_category == "High":
        title_color = "\033[91m"  # Red color for High risk
    elif risk_category == "Medium":
        title_color = "\033[93m"  # Yellow color for Medium risk
    elif risk_category == "Low":
        title_color = "\033[92m"  # Green color for Low risk
    
    # Print the tabular style output
    print("\n{}{} Risk patients Prediction and Explanations:\033[0m".format(title_color, risk_category))
    print(tabulate(formatted_explanations, headers=headers, tablefmt="grid"))

# Iterating over each risk category and format explanations
risk_categories = ['High', 'Medium', 'Low']
for risk_category in risk_categories:
    filtered_explanations = filter_explanations_by_risk(all_explanations, risk_category)
    format_explanations(filtered_explanations, risk_category)



[91mHigh Risk patients Prediction and Explanations:[0m
+-----------+--------------+----------------------------------------------------------+
| [94mPatient[0m   | [94mPrediction[0m   | [94mExplanation[0m                                              |
| Patient 3 | High         | AgeLastVisit (value: 27) contributed by 0.057            |
|           |              | AgeARTStart (value: 19) contributed by 0.039             |
|           |              | TotalVisits (value: 26) contributed by 0.049             |
|           |              | IITPercentage (value: 3) contributed by 0.022            |
|           |              | IITLast3Percentage (value: 33) contributed by 0.247      |
|           |              | Occupation_Student (value: 1) contributed by 0.070       |
|           |              | MaritalStatus_SINGLE (value: 1) contributed by 0.046     |
|           |              | StartRegimen_3TC+EFV+TDF (value: 1) contributed by 0.028 |
|           |              | LastRe

In [31]:
df_encoded.columns

NameError: name 'df_encoded' is not defined