# Healthcare Appointments (No-Show) Analysis

This notebook demonstrates a simple end-to-end data analysis workflow:
- Load and inspect data
- Clean and prepare features
- Explore trends (EDA)
- Generate practical insights for decision-making
- Fit a simple baseline model (logistic regression) to predict no-shows

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Display settings: show up to 50 columns and allow 120 characters of width
# so wide frames are easier to inspect in the cells below.
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 120)

# Location of the appointments CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
DATA_PATH = "../data/appointments.csv"

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Every later cell depends on `df`; fail here with an actionable message
    # instead of a bare errno followed by NameErrors further down.
    raise FileNotFoundError(
        f"Appointments data not found at {DATA_PATH!r} (relative to the "
        "kernel's working directory). Place the CSV there or update DATA_PATH."
    ) from exc

# Preview the first rows to confirm the file parsed as expected.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/appointments.csv'

In [None]:
# Summarise column dtypes and non-null counts to spot missing values or
# mistyped columns before cleaning.
df.info()

In [None]:
# Basic cleaning / feature engineering
# Parse the appointment date column into pandas datetimes.
df['AppointmentDay'] = df['AppointmentDay'].pipe(pd.to_datetime)

# Binary target: 1 when the trimmed, lower-cased label equals "yes",
# 0 for every other value (including missing labels).
df['NoShowFlag'] = (
    df['NoShow']
    .str.strip()
    .str.lower()
    .eq('yes')
    .astype(int)
)

# Quick checks: number of missing values per column
df.isna().sum()

## Overall No-Show Rate

In [None]:
# Mean of the 0/1 flag is the fraction of appointments marked as no-shows.
no_show_rate = df['NoShowFlag'].mean()
print(f"Overall no-show rate: {no_show_rate:.1%}")

# Raw label counts for cross-checking the rate above. The flag is 1 only for
# "yes" labels, so a missing label is counted as 0; dropna=False keeps such
# rows visible here, whereas value_counts() hides them by default.
df['NoShow'].value_counts(dropna=False)

## No-Show Rate by Key Factors

In [None]:
def rate_by(col, data=None):
    """Return the no-show rate (%) for each group of ``col``, highest first.

    Parameters
    ----------
    col : str or list of str
        Column name(s) to group by.
    data : pandas.DataFrame, optional
        Frame containing ``col`` and a ``NoShowFlag`` column. Defaults to the
        notebook-level ``df`` so existing ``rate_by(col)`` calls keep working.

    Returns
    -------
    pandas.Series
        Mean of ``NoShowFlag`` per group, multiplied by 100 and rounded to
        one decimal place, sorted in descending order.
    """
    # Resolve the global lazily so an explicit frame never touches `df`.
    frame = df if data is None else data
    rates = frame.groupby(col)['NoShowFlag'].mean().sort_values(ascending=False)
    return (rates * 100).round(1)

# Columns to break the no-show rate down by
key_factors = ['Gender', 'Scholarship', 'Hypertension', 'Diabetes', 'Alcoholism', 'SMS_Received']

for factor in key_factors:
    # Header line, then the per-group rates on the following lines
    print(f"\nNo-show rate by {factor} (%):", rate_by(factor), sep="\n")

## Visualisations

In [None]:
# No-show by SMS received: mean of the 0/1 flag per SMS_Received value
sms_rates = df.groupby('SMS_Received')['NoShowFlag'].mean()

fig, ax = plt.subplots()
# rot=0 keeps the x tick labels horizontal
sms_rates.plot(kind='bar', ax=ax, rot=0)
ax.set_ylabel('No-show rate')
ax.set_title('No-show rate by SMS reminder')
plt.show()

In [None]:
# Age distribution by No-show
# Both groups share one set of bin edges computed from the full Age column.
# With an integer `bins`, each histogram derives its own edges from its own
# subset's min/max, so the overlaid bars would cover different age ranges.
age_bin_edges = np.histogram_bin_edges(df['Age'].dropna(), bins=8)
show_ages = df.loc[df['NoShowFlag'] == 0, 'Age'].dropna()
no_show_ages = df.loc[df['NoShowFlag'] == 1, 'Age'].dropna()

fig, ax = plt.subplots()
# Labels are attached to each histogram so the legend cannot drift out of
# sync with the plotting order.
ax.hist(show_ages, bins=age_bin_edges, alpha=0.7, label='Show')
ax.hist(no_show_ages, bins=age_bin_edges, alpha=0.7, label='No-show')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.set_title('Age distribution: Show vs No-show')
ax.legend()
plt.show()

## Simple Insight Summary

Below are simple, recruiter-friendly insights from the dataset.

In [None]:
# Index label of the Hypertension group with the highest no-show rate
top_factor = rate_by('Hypertension').idxmax()

# (metric, formatted value) pairs, in display order
insights = [
    ("Overall no-show rate", f"{no_show_rate:.1%}"),
    ("No-show rate (SMS=0)", f"{df.loc[df['SMS_Received'] == 0, 'NoShowFlag'].mean():.1%}"),
    ("No-show rate (SMS=1)", f"{df.loc[df['SMS_Received'] == 1, 'NoShowFlag'].mean():.1%}"),
    ("Hypertension group with higher no-show", str(top_factor)),
]

pd.DataFrame(insights, columns=['Metric', 'Value'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Features (simple baseline)
features = ["Age","Scholarship","Hypertension","Diabetes","Alcoholism","SMS_Received"]
X = df[features].copy()
y = df["NoShowFlag"].copy()

# Split: hold out 30% for evaluation. Stratifying on y keeps the no-show
# share the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict + evaluate
y_pred = model.predict(X_test)

# Reference point for the accuracy figure: the score obtained by always
# predicting the most frequent class in the test split.
majority_baseline = y_test.value_counts(normalize=True).max()

print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Majority-class baseline accuracy:", round(majority_baseline, 3))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# zero_division=0 reports 0 (without a warning) for any class the model
# never predicts, instead of emitting UndefinedMetricWarning.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)


In [None]:
# One fitted coefficient per feature (first row of model.coef_), ascending.
# NOTE(review): the features are not standardised before fitting, so weights
# of features on different scales (presumably Age vs. the 0/1 flags) are not
# directly comparable in magnitude - confirm before reading this as a ranking.
importance = pd.Series(model.coef_[0], index=features).sort_values()

fig, ax = plt.subplots()
importance.plot.barh(ax=ax)
ax.set_title("Baseline model feature weights (Logistic Regression)")
ax.set_xlabel("Weight")
plt.show()

importance


## Recommendations (Example)

- Strengthen reminder workflows (SMS/phone) for higher-risk groups.
- Monitor no-show patterns by age and chronic conditions to improve scheduling.
- Improve data quality checks (e.g., consistent patient demographics and contact fields).