# 1. Generate Random Data
To do this we need to:
1. Get the list of all healthcare facilities
2. Generate random coded answers for every survey question, using `faker` only to produce placeholder text for the open-ended question

In [None]:
# Load the list of healthcare facilities from the Excel workbook

import pandas as pd

clinics_path = 'data/clinics.xlsx'
clinics = pd.read_excel(clinics_path)

# Preview the first few facilities
clinics.head()

: 

In [None]:
# Generate some random feedback data.
# Every coded answer is a uniformly random integer; follow-up questions are
# only answered when the parent question's answer calls for them, otherwise
# they are left as None.
from faker import Faker
import random
import numpy as np
import pandas as pd

fake = Faker()

# Constants for readability
FEMALE = 1
MALE = 2
YES = 1
NO = 2

num_responses_per_clinic = 100
responses = []

# Look the facility number up by column name instead of by tuple position:
# the rest of the notebook addresses this column as 'Facility Number', and a
# positional lookup silently picks the wrong column if the sheet layout changes.
for clinic_number in clinics['Facility Number']:
    for _ in range(num_responses_per_clinic):
        gender = random.choice([FEMALE, MALE])
        received_all_services_needed = random.randint(1, 4)
        vital_signs_measured = random.randint(1, 3)
        able_to_get_all_tests_needed = random.randint(1, 4)
        all_medicines_needed_available = random.randint(1, 4)
        overall_satisfaction = random.randint(1, 4)

        response = {
            'age': random.randint(1, 100),
            'gender': gender,
            # Only female respondents can be marked pregnant (30% chance)
            'pregnant': np.random.choice([YES, NO], p=[0.3, 0.7]) if gender == FEMALE else NO,
            'Facility Number': clinic_number,
            'service_seeked': random.randint(1, 6),
            'asked_for_consent_before_exam': random.randint(1, 3),
            'received_all_services_needed': received_all_services_needed,
            # Follow-up reason, only for answers 2 or 3 to the parent question
            'why_not_received_all_services_needed': random.randint(1, 5)
                if received_all_services_needed in [2, 3] else None,
            'satisfied_attended_on_time': random.randint(1, 4),
            'vital_signs_measured': vital_signs_measured,
            # Waiting time is only recorded when vital_signs_measured == 1
            'how_long_wait_to_have_vitals_measured': random.randint(1, 5)
                if vital_signs_measured == 1 else None,
            'how_long_wait_for_care': random.randint(1, 6),
            'comfortable_sitting_places_while_waiting': random.randint(1, 2),
            'practitioner_clearly_explained': random.randint(1, 2),
            'able_to_get_all_tests_needed': able_to_get_all_tests_needed,
            'why_not_get_all_tests_needed': random.randint(1, 4)
                if able_to_get_all_tests_needed in [2, 3] else None,
            'all_medicines_needed_available': all_medicines_needed_available,
            'why_not_get_all_medicine': random.randint(1, 4)
                if all_medicines_needed_available in [2, 3] else None,
            'informed_how_to_take_medicine': random.randint(1, 2),
            'satisfied_with_privacy': random.randint(1, 4),
            'satisfied_with_cleanliness': random.randint(1, 4),
            'addressed_treated_politely_respectfully': random.randint(1, 4),
            'faced_issue_did_someone_listen': random.randint(1, 4),
            'how_did_you_pay': random.randint(1, 4),
            'overall_satisfaction': overall_satisfaction,
            # Exactly one of satisfied_most / satisfied_least is filled in,
            # depending on which half of the satisfaction scale was chosen
            'satisfied_most': random.randint(1, 6)
                if overall_satisfaction in [1, 2] else None,
            'satisfied_least': random.randint(1, 6)
                if overall_satisfaction in [3, 4] else None,
            'other_feedback': random.randint(1, 2),
            # Placeholder text for the open-ended question
            'feedback': fake.sentence()
        }

        responses.append(response)

# Convert to DataFrame if needed
df = pd.DataFrame(responses)
df.head()

As you can see, the `feedback` column is essentially meaningless. The hope would be to get some Swahili text and do sentiment analysis, but for now, we can disregard that column.

# 2. ML Experiments

## K-Means

Before we do any experiments, we need to preprocess the data.

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Start from a copy of the facility table so `clinics` itself is left untouched
df = clinics.copy()

# Remove the columns that are not used as clustering features
columns_to_drop = [
    "ID", "Facility Number", "Official Phone Number", "PostalAddress",
    "OfficialEmail", "Website", "Date Opened", "MTUHA", "CTC_ID", "msd_id",
]
df = df.drop(columns=columns_to_drop)

# Coordinates: anything that does not parse as a number (e.g. 'Not Set')
# becomes NaN, and the resulting gaps are filled with the column mean
coordinate_columns = ['Latitude', 'Longitude']
df[coordinate_columns] = df[coordinate_columns].apply(pd.to_numeric, errors='coerce')
imputer = SimpleImputer(strategy="mean")
df[coordinate_columns] = imputer.fit_transform(df[coordinate_columns])

# Categoricals: label missing entries "Unknown" and force everything to str
categorical_columns = [
    'Facility Name', 'Common Name', 'Region', 'District', 'Council',
    'Ward', 'Ownership', 'Operating Status',
]
df[categorical_columns] = df[categorical_columns].fillna("Unknown").astype(str)

# One-hot encode the categoricals; the first level of each is dropped
# to avoid a redundant column
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_columns = encoder.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_columns),
)

# Feature matrix = coordinates + one-hot indicators, side by side
final_df = pd.concat([df[coordinate_columns], encoded_df], axis=1)

# Standardize every feature before running K-Means
scaler = StandardScaler()
final_df = pd.DataFrame(scaler.fit_transform(final_df), columns=final_df.columns)

final_df.head() # Ready for clustering

Now, we can perform the $k$-means algorithm. We first find the best $k$-value, then re-run the algorithm with that $k$.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Best cluster count seen so far and its silhouette score
best_k, best_score = 0, -1

# The silhouette score needs at least two clusters, so sweep k = 2..10
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, init='k-means++').fit(final_df)
    cluster_labels = kmeans.labels_
    score = silhouette_score(final_df, cluster_labels)

    print(f"K={k}, Silhouette Score={score:.4f}")

    # Keep the first k that strictly beats the current best
    if score > best_score:
        best_k, best_score = k, score

print(f"Best K: {best_k} with Silhouette Score: {best_score:.4f}")

In [None]:
# Run k-means with the best k-value and attach the labels to the facility table

clinics_clustered = clinics.copy()

# Use the same random_state as the silhouette sweep so this fit reproduces
# the clustering that was actually scored when best_k was chosen
kmeans = KMeans(n_clusters=best_k, random_state=42, init='k-means++')
cluster_labels = kmeans.fit_predict(final_df)

# One label per row of final_df, which was built row-for-row from clinics
clinics_clustered['cluster'] = cluster_labels
clinics_clustered

This doesn't seem to be all that helpful. What about our fake reports? Can we cluster them in some useful way?

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# `responses` is the list of per-response dicts built earlier; turn it into a DataFrame
responses_df = pd.DataFrame(responses)

# Drop the 'feedback' column as it is not numeric
responses_df_no_feedback_str = responses_df.drop(columns=['feedback'])

# Ensure all data is numeric
responses_df_no_feedback_str = responses_df_no_feedback_str.apply(pd.to_numeric, errors='coerce')

# Handle missing values (unanswered follow-up questions) with the column mean.
# From here on the variable holds a NumPy array, not a DataFrame.
imputer = SimpleImputer(strategy="mean")
responses_df_no_feedback_str = imputer.fit_transform(responses_df_no_feedback_str)

# Standardize the features
scaler = StandardScaler()
responses_df_no_feedback_str = scaler.fit_transform(responses_df_no_feedback_str)

# Find the best k-value for K-Means
best_k = 0
best_score = -1

for k in range(2, 11):  # Silhouette is only valid for K >= 2
    kmeans = KMeans(n_clusters=k, random_state=42, init='k-means++')
    cluster_labels = kmeans.fit_predict(responses_df_no_feedback_str)
    score = silhouette_score(responses_df_no_feedback_str, cluster_labels)

    print(f"K={k}, Silhouette Score={score:.4f}")

    if score > best_score:
        best_k = k
        best_score = score

print(f"Best K: {best_k} with Silhouette Score: {best_score:.4f}")

# Run K-Means with the best k-value.
# The seed matches the sweep above so the final labels are reproducible and
# correspond to the clustering whose silhouette score was reported.
responses_clustered = responses_df.copy()
kmeans = KMeans(n_clusters=best_k, random_state=42, init='k-means++')
cluster_labels = kmeans.fit_predict(responses_df_no_feedback_str)
responses_clustered['cluster'] = cluster_labels

print(responses_clustered)

In [None]:
# Run k-means on the survey responses with the best k-value

responses_clustered = responses_df.copy()

# Use the same random_state as the silhouette sweep so re-running this cell
# always yields the clustering that was scored when best_k was chosen
kmeans = KMeans(n_clusters=best_k, random_state=42, init='k-means++')
cluster_labels = kmeans.fit_predict(responses_df_no_feedback_str)

responses_clustered['cluster'] = cluster_labels
responses_clustered

## Anomaly Detection
Fit an Isolation Forest on the merged facility and survey-response table to flag outlying rows.

In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Merge the review data with the healthcare facility data.
# The review table is rebuilt from `responses` rather than taken from `df`:
# when the notebook is run top to bottom, `df` has been reassigned to the
# preprocessed facility table, which no longer has a 'Facility Number' column.
reviews_df = pd.DataFrame(responses)
final_df = pd.merge(clinics, reviews_df, on='Facility Number', how='left')

# Drop non-numeric columns
non_numeric_columns = ['Facility Name', 'Common Name', 'Region', 'District', 'Council', 'Ward', 'Ownership', 'Operating Status']
final_df = final_df.drop(columns=non_numeric_columns)

# Convert 'Not Set' and other non-numeric values to NaN
final_df = final_df.apply(pd.to_numeric, errors='coerce')

# Columns that are entirely NaN after coercion (free text such as 'feedback')
# carry no information, and the mean imputer would discard them anyway.
# Dropping them explicitly keeps the column labels aligned with the imputed array.
final_df = final_df.dropna(axis=1, how='all')

# Handle missing values
imputer = SimpleImputer(strategy="mean")
final_df_imputed = imputer.fit_transform(final_df)

# Create a new DataFrame with the imputed values and the original column names
final_df = pd.DataFrame(final_df_imputed, columns=final_df.columns)

# Display the merged data
print(final_df.head())

# Train the Isolation Forest model; roughly 5% of rows will be flagged
iso_forest = IsolationForest(contamination=0.05, random_state=42)
anomaly_labels = iso_forest.fit_predict(final_df)

# Add anomaly labels to the DataFrame (-1 = anomaly, 1 = normal)
final_df['anomaly'] = anomaly_labels

# Plot the first two columns, coloured by anomaly label
plt.figure(figsize=(10, 7))
plt.scatter(final_df.iloc[:, 0], final_df.iloc[:, 1], c=final_df['anomaly'], cmap='coolwarm', alpha=0.5)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Anomaly Detection using Isolation Forest')
plt.colorbar(label='Anomaly')
plt.show()

# Print all the rows that are anomalies
anomalies = final_df[final_df['anomaly'] == -1]
anomalies

: 

## Feature Importance

In [22]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Build the survey table from the list of response dicts, leaving out the
# free-text 'feedback' answer and the 'Facility Number' column
df = pd.DataFrame(responses).drop(columns=['feedback', 'Facility Number'])

# Coerce every remaining column to a numeric dtype (unparseable values -> NaN)
df = df.apply(pd.to_numeric, errors='coerce')

# Fill the gaps with column means, then restore the column labels
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df)
df = pd.DataFrame(df_imputed, columns=df.columns)

# Target is overall satisfaction; it is excluded from the features together
# with 'satisfied_most', 'satisfied_least' and 'age'
target_column = 'overall_satisfaction'
excluded_features = [target_column, 'satisfied_most', 'satisfied_least', 'age']
y = df[target_column]
X = df.drop(columns=excluded_features)

# Hold out 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Pair each feature with its importance, highest first
importances = rf.feature_importances_
feature_names = X.columns
feature_importances = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Display the feature importances
feature_importances

Unnamed: 0,feature,importance
2,service_seeked,0.071835
9,how_long_wait_for_care,0.070882
6,satisfied_attended_on_time,0.057416
18,satisfied_with_cleanliness,0.055838
17,satisfied_with_privacy,0.055681
21,how_did_you_pay,0.054208
19,addressed_treated_politely_respectfully,0.052702
20,faced_issue_did_someone_listen,0.051619
5,why_not_received_all_services_needed,0.04819
13,why_not_get_all_tests_needed,0.045473
