# Maternal Health Risk Prediction with Explainability
This notebook improves the prediction of maternal health risks using a Random Forest classifier and integrates SHAP for explainability.

### Key Features:
- **Improved Low-Risk Predictions:** Addressed class imbalance using SMOTE.
- **SHAP Explainability:** Added explanations for "High Risk" and "Mid Risk" predictions.
- **Manual Input:** Allows users to input custom values and predict their risk level.


In [28]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
import joblib

In [29]:
# Load data
df = pd.read_csv('pregnancy_recommendations.csv')

# Separate features (X) and targets (y)
X = df.drop(columns=['RiskLevel', 'Reasons'])  # Exclude 'RiskLevel' and 'Reason' from features
y = df[['RiskLevel', 'Reasons']]  # Multi-output target

# Convert categorical target 'RiskLevel' to numerical values
y['RiskLevel'] = y['RiskLevel'].map({'low risk': 0, 'mid risk': 1, 'high risk': 2})

# Handle missing values by filling them with the mean of each column
X = X.fillna(X.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['RiskLevel'] = y['RiskLevel'].map({'low risk': 0, 'mid risk': 1, 'high risk': 2})


In [30]:
# Oversample the RiskLevel column
smote = SMOTE(random_state=42)
X_resampled, y_resampled_risk = smote.fit_resample(X, y['RiskLevel'])

# Align the Reason column with the oversampled RiskLevel
# Use the indices returned by SMOTE to select the corresponding Reason values
# The resampled data is in the same order as the original data plus the synthetic samples at the end.
# For the synthetic samples, we can assign the most frequent Reason for that RiskLevel, or set as NaN.
original_len = len(y)
y_resampled_reason = list(y['Reasons'])
for i in range(len(y_resampled_risk) - original_len):
	# For each synthetic sample, assign the most frequent Reason for that RiskLevel
	risk_level = y_resampled_risk.iloc[original_len + i]
	most_common_reason = y[y['RiskLevel'] == risk_level]['Reasons'].mode()
	if not most_common_reason.empty:
		y_resampled_reason.append(most_common_reason.iloc[0])
	else:
		y_resampled_reason.append(np.nan)
y_resampled_reason = pd.Series(y_resampled_reason)

y_resampled = pd.DataFrame({'RiskLevel': y_resampled_risk.values, 'Reasons': y_resampled_reason})

# Encode the 'Reason' column
from sklearn.preprocessing import LabelEncoder
le_reason = LabelEncoder()
y_resampled['Reasons'] = le_reason.fit_transform(y_resampled['Reasons'].astype(str))

# Prepare the multi-output target
y_train_multi = np.column_stack([y_resampled['RiskLevel'], y_resampled['Reasons']])

In [31]:
y_resampled_reason = y_resampled_reason.fillna("Unknown Reason")

In [32]:
print(y_resampled.head())
print(y_resampled['Reasons'].value_counts())

   RiskLevel  Reasons
0          2       68
1          0       34
2          0       34
3          0       34
4          0       69
Reasons
34     1059
147     323
105      87
52       67
69       54
       ... 
117       1
8         1
10        1
23        1
144       1
Name: count, Length: 184, dtype: int64


In [33]:
import joblib
joblib.dump(le_reason, 'reason_label_encoder.pkl')

['reason_label_encoder.pkl']

In [34]:
# Create reasons_dict using LabelEncoder's classes_
reasons_dict = {i: reason for i, reason in enumerate(le_reason.classes_)}

In [35]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Separate RiskLevel and Reason for training
y_train_risk = y_train['RiskLevel']
y_train_reason = y_train['Reasons']
y_test_risk = y_test['RiskLevel']
y_test_reason = y_test['Reasons']

In [36]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'Reason' column to numeric labels
le_reason = LabelEncoder()
y_train_reason_encoded = le_reason.fit_transform(y_train['Reasons'].astype(str))

# Prepare the multi-output target as a DataFrame or numpy array
y_train_multi = np.column_stack([y_train['RiskLevel'], y_train_reason_encoded])

# Initialize the base model
base_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Wrap the base model in a MultiOutputClassifier
multi_output_model = MultiOutputClassifier(base_model)

# Train the model
multi_output_model.fit(X_train, y_train_multi)

In [37]:
# Make predictions
y_pred = multi_output_model.predict(X_test)

# Separate predictions for RiskLevel and Reason
y_pred_risk = y_pred[:, 0]
y_pred_reason = y_pred[:, 1]

# Map numerical RiskLevel back to categorical values
risk_map = {0: 'Low Risk', 1: 'Mid Risk', 2: 'High Risk'}
y_pred_risk = [risk_map[risk] for risk in y_pred_risk]

In [38]:
# Example: Define manual_input_df with sample values
# Replace the dictionary below with actual input values as needed
manual_input_data = {
    col: [X_train.iloc[0][col]] for col in X_train.columns  # Use first row as template
}
manual_input_df = pd.DataFrame(manual_input_data)

# Predict RiskLevel and Reason for user input
prediction = multi_output_model.predict(manual_input_df)
probabilities = multi_output_model.predict_proba(manual_input_df)

# Decode RiskLevel
predicted_risk = risk_map[prediction[0][0]]

# Decode Reason
predicted_reason_index = prediction[0][1]
predicted_reason = le_reason.inverse_transform([predicted_reason_index])[0]

# Debugging: Print raw predictions and probabilities
print(f"Predicted RiskLevel: {predicted_risk}")
print(f"Predicted Reason Index: {predicted_reason_index}")
print(f"Decoded Reason: {predicted_reason}")
print(f"RiskLevel Probabilities: {probabilities[0]}")

# Fallback mechanism for mismatched reasons
if predicted_risk == "High Risk" and "low risk" in predicted_reason.lower():
    predicted_reason = "High Risk - Elevated indicators detected. Please consult a healthcare provider."
elif predicted_risk == "Mid Risk" and "low risk" in predicted_reason.lower():
    predicted_reason = "Mid Risk - Some indicators require attention. Maintain regular checkups."

# Display the response
print(f"Final Predicted Risk Level: {predicted_risk}")
print(f"Final Reason for Risk Level: {predicted_reason}")

Predicted RiskLevel: High Risk
Predicted Reason Index: 48
Decoded Reason: 147
RiskLevel Probabilities: [[7.4806809e-04 7.1558816e-04 9.9853635e-01]]
Final Predicted Risk Level: High Risk
Final Reason for Risk Level: 147


In [39]:
# Save the model
joblib.dump(multi_output_model, 'multi_output_model.pkl')

['multi_output_model.pkl']

In [40]:
# Save the unique reasons for later use
import json


unique_reasons = y['Reasons'].unique()
with open('reasons.json', 'w') as f:
    json.dump(list(unique_reasons), f)