In [None]:
#importing of necessary libraries

import pandas as pd  
import numpy as np  
import seaborn as sns 
import pyreadstat 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle
import time as time

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Split the dataframe into the prediction target and the predictor columns.
# NOTE(review): `df` is not created in any visible cell — presumably loaded earlier; confirm.
target_column = 'Cause pph'
y = df[target_column]                   # Target variable
X = df.drop(columns=[target_column])    # Features: every column except the target

In [None]:
# Bar chart of how often each class of the target variable occurs
# Bar colours (the list has four entries)
colors = ['red', 'yellow', 'orange', 'green']

# Distinct class labels and the number of rows carrying each label
unique_classes, class_counts = np.unique(y, return_counts=True)

# Draw one bar per class, with a tick on every class label
plt.figure(figsize=(8, 6))
plt.bar(x=unique_classes, height=class_counts, color=colors)
plt.xticks(unique_classes)
plt.grid(axis='y')

plt.title('Distribution of Target Variable')
plt.xlabel('Target Variable')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Oversample with SMOTE to balance the classes of the target variable.
# NOTE(review): this resamples the full dataset (X, y), i.e. before any train/test split.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X, y=y)

In [None]:
# Bar chart of the class frequencies after SMOTE oversampling
# Bar colours (the list has four entries)
colors = ['red', 'yellow', 'orange', 'green']

# Distinct class labels and the number of rows carrying each label
unique_classes, class_counts = np.unique(y_resampled, return_counts=True)

# Plot the bar graph with different colors
plt.figure(figsize=(8, 6))
plt.bar(unique_classes, class_counts, color=colors)

# Label the plot with the real target name; the previous text referred to
# "Anaemia Level", which is not the target used in this notebook.
plt.xlabel(f'Target Variable ({target_column})')
plt.ylabel('Frequency')
plt.title(f'Distribution of Target Variable ({target_column}) after SMOTE')
plt.grid(axis='y')
plt.xticks(unique_classes)
plt.show()

In [None]:
# Split the ORIGINAL data into training (70%), validation (15%) and test (15%) sets.
# The split has to happen before oversampling: SMOTE builds synthetic rows by
# interpolating between neighbouring samples, so resampling first lets information
# from validation/test rows leak into the training set and inflates the scores.
# stratify= keeps the class proportions (including rare classes) in every subset.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Balance only the training portion; validation and test keep the real class mix.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Display the column labels of the dataframe
df.columns 

In [None]:
# Feature columns used for modelling (the target column 'Cause pph' is not in this list)
feature_columns = ['Was labor augmented', 'Mode of delivery', 'Pretreatment bp systolic', 'Pretreatment bp diastolic', 'Complications in previous pregnancy',
                   'Age', 'Parity of index pregnanacy', 'Episiotomy', 'Experience seizures', 'Week of gestation',
                   'Delayed arrival to health facility', 'Delay in correct diagnosis', 'Weight of newborn at birth', 'Duration of labour in hour',
                   'Woman / baby was referred from another facility', 'Region/Province','Urban/rural']


def _select_features(data, columns):
    """Return `data` as a DataFrame restricted to `columns`, in that order.

    A DataFrame is sliced by label, so a misspelled or missing name raises
    KeyError; ``pd.DataFrame(frame, columns=...)`` would instead add all-NaN
    columns silently. Any other array-like is wrapped and labelled with `columns`.
    """
    if isinstance(data, pd.DataFrame):
        return data.loc[:, columns]
    return pd.DataFrame(data, columns=columns)


# Apply the same column selection and order to every split, so the models that are
# fitted on X_train later see identical feature names in X_val and X_test.
X_train = _select_features(X_train, feature_columns)
X_val = _select_features(X_val, feature_columns)
X_test = _select_features(X_test, feature_columns)

# Mutual information between each feature and the target, labelled by feature name.
# The estimator is randomised, so the seed is fixed to keep the ranking reproducible.
mi = pd.Series(mutual_info_classif(X_train, y_train, random_state=42),
               index=X_train.columns)

# Highest-scoring features first
mi_sorted = mi.sort_values(ascending=False)

# display the sorted Series
mi_sorted

In [None]:
# Bar chart of the sorted mutual-information scores, one 'tab10' colour per feature
plt.figure(figsize=(12, 8))
palettes = sns.color_palette('tab10', n_colors=len(mi_sorted))
mi_sorted.plot.bar(color=palettes)

plt.xticks(rotation=90)
plt.ylabel('Mutual Information Score')
plt.xlabel('Features')
plt.title('Mutual Information Scores of Features')
plt.show()

In [None]:
# Random forest with 200 trees, fitted on the training split
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Validation split: predictions, accuracy and per-class report
y_val_pred = rf_clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(
    f'Validation Accuracy: {val_accuracy:.4f}',
    'Validation Classification Report:',
    val_report,
    sep='\n',
)

# Test split: the same metrics
y_test_pred = rf_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(
    f'Test Accuracy: {test_accuracy:.4f}',
    'Test Classification Report:',
    test_report,
    sep='\n',
)

In [None]:
# Confusion matrices of the random forest (rf_clf) on the validation and test sets.
# labels= pins the row/column order to the classes the model was trained on, so each
# matrix keeps its full shape (and stays aligned with class_names) even when a class
# is missing from a split or from the predictions.
y_val_pred = rf_clf.predict(X_val)
val_cm = confusion_matrix(y_val, y_val_pred, labels=rf_clf.classes_)

y_test_pred = rf_clf.predict(X_test)
test_cm = confusion_matrix(y_test, y_test_pred, labels=rf_clf.classes_)

# Display names for the tick labels.
# NOTE(review): assumes this order matches the sorted encoded class labels — confirm.
class_names = ['Uterine atony', 'Trauma', 'Retained placenta', 'Cloting disorder']

# One panel per split, side by side
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

panels = [
    (axes[0], val_cm, "viridis", 'Confusion Matrix for Validation Set'),
    (axes[1], test_cm, "plasma", 'Confusion Matrix for Test Set'),
]
for panel_ax, panel_cm, panel_cmap, panel_title in panels:
    sns.heatmap(panel_cm, annot=True, fmt='d', cmap=panel_cmap, cbar=False, ax=panel_ax,
                xticklabels=class_names, yticklabels=class_names, annot_kws={"size": 16})
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')
    panel_ax.set_title(panel_title)

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# One-vs-rest ROC curves of the random forest (rf_clf) on the test set.
# Take the class labels from the fitted model: column i of predict_proba belongs to
# rf_clf.classes_[i], so the binarised truth below uses exactly the same order.
classes = list(rf_clf.classes_)

# Predicted class probabilities on the test set
y_score = rf_clf.predict_proba(X_test)

# Binarize the true labels for multi-class ROC (one indicator column per class)
y_test_bin = label_binarize(y_test, classes=classes)

# Compute ROC curve and ROC area for each class
plt.figure(figsize=(10, 7))
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(classes)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves for each class
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green'])
for i, color in zip(range(len(classes)), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve (class {classes[i]}) (AUC = {roc_auc[i]:.3f})')

# Plot random guess line
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')

# Macro-average: interpolate every per-class curve onto the union of all FPR
# points, then average the TPR values with equal weight per class.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(len(classes))]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(len(classes)):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= len(classes)
macro_auc = auc(all_fpr, mean_tpr)

plt.plot(all_fpr, mean_tpr, color='navy', linestyle='-', linewidth=2,
         label=f'Macro-average ROC curve (AUC = {macro_auc:.3f})')

# Set labels and title (named after the actual target column)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curves for Random Forest ({target_column} = {len(classes)} Classes)')
plt.legend(loc='lower right')

# Show the plot
plt.show()

In [None]:
# Decision tree fitted on the training split
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Validation split: predictions, accuracy and per-class report
y_val_pred = dt.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(
    f'Validation Accuracy: {val_accuracy:.4f}',
    'Validation Classification Report:',
    val_report,
    sep='\n',
)

# Test split: the same metrics
y_test_pred = dt.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(
    f'Test Accuracy: {test_accuracy:.4f}',
    'Test Classification Report:',
    test_report,
    sep='\n',
)

In [None]:
# Confusion matrices of the decision tree (dt) on the validation and test sets.
# Predictions are recomputed from `dt` here instead of reusing y_val_pred / y_test_pred
# from the kernel: those names are overwritten by every model cell, so reusing them
# plots another model's results whenever the cells are run out of order.
y_val_pred = dt.predict(X_val)
y_test_pred = dt.predict(X_test)

# labels= keeps the full matrix shape (aligned with class_names) even when a class
# is missing from a split or from the predictions.
val_cm = confusion_matrix(y_val, y_val_pred, labels=dt.classes_)
test_cm = confusion_matrix(y_test, y_test_pred, labels=dt.classes_)

# Set up the matplotlib figure and subplots
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# class_names comes from the random-forest confusion-matrix cell
panels = [
    (axes[0], val_cm, "inferno", 'Confusion Matrix for Validation Set of decision tree Classifier'),
    (axes[1], test_cm, "magma", 'Confusion Matrix for Test Set of decision tree Classifier'),
]
for panel_ax, panel_cm, panel_cmap, panel_title in panels:
    sns.heatmap(panel_cm, annot=True, fmt='d', cmap=panel_cmap, cbar=False, ax=panel_ax,
                xticklabels=class_names, yticklabels=class_names, annot_kws={"size": 16})
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')
    panel_ax.set_title(panel_title)

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# One-vs-rest ROC curves of the decision tree (dt) on the test set.
# Class labels come from the fitted model: column i of predict_proba belongs to
# dt.classes_[i]. Using them here also removes the dependency on the `classes`
# variable left behind by the random-forest ROC cell.
dt_classes = list(dt.classes_)

# Predict probabilities for each class on the test set
y_score = dt.predict_proba(X_test)

# Binarize the true labels once (one indicator column per class); this does not
# change between loop iterations.
y_test_bin = label_binarize(y_test, classes=dt_classes)

# Compute ROC curve and ROC area for each class
plt.figure(figsize=(10, 7))
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(dt_classes)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves for each class
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green'])
for i, color in zip(range(len(dt_classes)), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve (class {dt_classes[i]}) (AUC = {roc_auc[i]:.2f})')

# Plot random guess line
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')

# Macro-average: interpolate every per-class curve onto the union of all FPR
# points, then average the TPR values with equal weight per class.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(len(dt_classes))]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(len(dt_classes)):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= len(dt_classes)
macro_auc = auc(all_fpr, mean_tpr)

plt.plot(all_fpr, mean_tpr, color='navy', linestyle='-', linewidth=2,
         label=f'Macro-average ROC curve (AUC = {macro_auc:.2f})')

# Set labels and title (named after the actual target column)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curves for Decision Tree ({target_column} = {len(dt_classes)} Classes)')
plt.legend(loc='lower right')

# Show the plot
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression fitted on the training split
# NOTE(review): no feature scaling is applied in any visible cell — confirm that is intended.
logreg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Validation split: predictions, accuracy and per-class report
y_val_pred = logreg.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(
    f'Logistic Regression - Validation Accuracy: {val_accuracy:.4f}',
    'Logistic Regression - Validation Classification Report:',
    val_report,
    sep='\n',
)

# Test split: the same metrics
y_test_pred = logreg.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(
    f'Logistic Regression - Test Accuracy: {test_accuracy:.4f}',
    'Logistic Regression - Test Classification Report:',
    test_report,
    sep='\n',
)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the training split
nb = GaussianNB().fit(X_train, y_train)

# Validation split: predictions, accuracy and per-class report
y_val_pred = nb.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(
    f'Naive Bayes - Validation Accuracy: {val_accuracy:.4f}',
    'Naive Bayes - Validation Classification Report:',
    val_report,
    sep='\n',
)

# Test split: the same metrics
y_test_pred = nb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(
    f'Naive Bayes - Test Accuracy: {test_accuracy:.4f}',
    'Naive Bayes - Test Classification Report:',
    test_report,
    sep='\n',
)


In [None]:
from sklearn.svm import SVC

# Support vector classifier (default settings) fitted on the training split
# NOTE(review): no feature scaling is applied in any visible cell — confirm that is intended.
svm = SVC(random_state=42).fit(X_train, y_train)

# Validation split: predictions, accuracy and per-class report
y_val_pred = svm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(
    f'SVM - Validation Accuracy: {val_accuracy:.4f}',
    'SVM - Validation Classification Report:',
    val_report,
    sep='\n',
)

# Test split: the same metrics
y_test_pred = svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(
    f'SVM - Test Accuracy: {test_accuracy:.4f}',
    'SVM - Test Classification Report:',
    test_report,
    sep='\n',
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with the default number of trees, fitted on the training split
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Validation split: predictions, accuracy and per-class report
y_val_pred = rf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(
    f'Random Forest - Validation Accuracy: {val_accuracy:.4f}',
    'Random Forest - Validation Classification Report:',
    val_report,
    sep='\n',
)

# Test split: the same metrics
y_test_pred = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(
    f'Random Forest - Test Accuracy: {test_accuracy:.4f}',
    'Random Forest - Test Classification Report:',
    test_report,
    sep='\n',
)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (default settings) fitted on the training split
# NOTE(review): no feature scaling is applied in any visible cell — confirm that is intended.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Validation split: predictions, accuracy and per-class report
y_val_pred = knn.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(
    f'KNN - Validation Accuracy: {val_accuracy:.4f}',
    'KNN - Validation Classification Report:',
    val_report,
    sep='\n',
)

# Test split: the same metrics
y_test_pred = knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(
    f'KNN - Test Accuracy: {test_accuracy:.4f}',
    'KNN - Test Classification Report:',
    test_report,
    sep='\n',
)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score

# Fresh, unfitted instance of every classifier that takes part in the comparison
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# One score list per metric; entries are appended in the iteration order of `models`
performance_metrics = {
    metric_label: [] for metric_label in ('Accuracy', 'Precision', 'Recall', 'F1 Score')
}

# Fit each model on the training split and score it on the test split
for name, model in models.items():
    y_test_pred = model.fit(X_train, y_train).predict(X_test)

    # Precision, recall and F1 are weighted averages over the classes
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')

    for metric, score in zip(performance_metrics, (accuracy, precision, recall, f1)):
        performance_metrics[metric].append(score)

# Rows = metrics, columns = models
metrics_array = np.array(list(performance_metrics.values()))

# Names and sizes that drive the grouped bar layout
model_names = list(models)
metric_names = list(performance_metrics)
n_metrics, n_models = len(metric_names), len(model_names)

fig, ax = plt.subplots(figsize=(12, 8))

# Each model gets a group of n_metrics adjacent bars of this width
bar_width = 0.2
indices = np.arange(n_models)

for i, (metric, scores) in enumerate(zip(metric_names, metrics_array)):
    ax.bar(indices + i * bar_width, scores, width=bar_width, label=metric)

# Centre the tick of every model under its group of bars
ax.set_xticks(indices + bar_width * (n_metrics - 1) / 2)
ax.set_xticklabels(model_names, rotation=45)
ax.set_title('Grouped Bar Plot of Classification Performances')
ax.set_xlabel('Classifiers')
ax.set_ylabel('Scores')
ax.legend()

# Show the plot
plt.tight_layout()
plt.show()