# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load Data

In [2]:
# Read the CSV at data/processed_data.csv (path is relative to the working directory) into df
df = pd.read_csv('data/processed_data.csv')

# Exploratory Data Analysis (EDA)

In [3]:
# Preview the first rows to sanity-check the load
df.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Date of Admission,Hospital,Insurance Provider,Billing Amount,Admission Type,Discharge Date,Medication,Test Results,target
0,30,Male,B-,Cancer,2024-01-31,Sons and Miller,Blue Cross,18856.281306,Urgent,2024-02-02,Paracetamol,Normal,0
1,62,Male,A+,Obesity,2019-08-20,Kim Inc,Medicare,33643.327287,Emergency,2019-08-26,Ibuprofen,Inconclusive,1
2,76,Female,A-,Obesity,2022-09-22,Cook PLC,Aetna,27955.096079,Emergency,2022-10-07,Aspirin,Normal,1
3,28,Female,O+,Diabetes,2020-11-18,"Hernandez Rogers and Vang,",Medicare,37909.78241,Elective,2020-12-18,Ibuprofen,Abnormal,0
4,43,Female,AB+,Cancer,2022-09-19,White-White,Aetna,14238.317814,Urgent,2022-10-09,Penicillin,Abnormal,0


In [4]:
# Dimensions of the frame as (rows, columns)
df.shape

(55500, 13)

In [5]:
# List the column labels available for analysis
df.columns

Index(['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission',
       'Hospital', 'Insurance Provider', 'Billing Amount', 'Admission Type',
       'Discharge Date', 'Medication', 'Test Results', 'target'],
      dtype='object')

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Age,Billing Amount,target
count,55500.0,55500.0,55500.0
mean,51.539459,25539.316097,0.329171
std,19.602454,14211.454431,0.469916
min,13.0,-2008.49214,0.0
25%,35.0,13241.224652,0.0
50%,52.0,25538.069376,0.0
75%,68.0,37820.508436,1.0
max,89.0,52764.276736,1.0


In [7]:
# Summary statistics (count, unique, top, freq) for the object-dtype columns.
# Note: the two date columns are still plain strings at this point, so they are summarised here too.
df.describe(include='object')

Unnamed: 0,Gender,Blood Type,Medical Condition,Date of Admission,Hospital,Insurance Provider,Admission Type,Discharge Date,Medication,Test Results
count,55500,55500,55500,55500,55500,55500,55500,55500,55500,55500
unique,2,8,6,1827,39876,5,3,1856,5,3
top,Male,A-,Arthritis,2024-03-16,LLC Smith,Cigna,Elective,2020-03-15,Lipitor,Abnormal
freq,27774,6969,9308,50,44,11249,18655,53,11140,18627


# Feature Engineering

In [8]:
# Tier every record by its own billing amount; the tier is stored in 'Hospital Category'
billing = df['Billing Amount']

# Bin edges: minimum, 33rd percentile, 67th percentile, maximum
p33 = billing.quantile(0.33)
p67 = billing.quantile(0.67)
bins = [billing.min(), p33, p67, billing.max()]

# One label per interval, ordered from cheapest to most expensive
labels = ['Low-cost', 'Medium-cost', 'High-cost']

# include_lowest=True keeps the row holding the minimum billing amount inside the first bin
df['Hospital Category'] = pd.cut(
    billing,
    bins=bins,
    labels=labels,
    include_lowest=True,
)


In [9]:
# Display the first ten rows of the hospital, billing amount and derived category columns.
# A bare last expression (instead of print) gives the notebook's rich table rendering,
# consistent with the other inspection cells.
df[['Hospital', 'Billing Amount', 'Hospital Category']].head(10)

                      Hospital  Billing Amount Hospital Category
0              Sons and Miller    18856.281306       Medium-cost
1                      Kim Inc    33643.327287       Medium-cost
2                     Cook PLC    27955.096079       Medium-cost
3   Hernandez Rogers and Vang,    37909.782410         High-cost
4                  White-White    14238.317814          Low-cost
5               Nunez-Humphrey    48145.110951         High-cost
6              Group Middleton    19580.872345       Medium-cost
7  Powell Robinson and Valdez,    45820.462722         High-cost
8                Sons Rich and    50119.222792         High-cost
9               Padilla-Walker    19784.631062       Medium-cost


In [10]:
# Inspect the column dtypes after adding 'Hospital Category'
df.dtypes

Age                      int64
Gender                  object
Blood Type              object
Medical Condition       object
Date of Admission       object
Hospital                object
Insurance Provider      object
Billing Amount         float64
Admission Type          object
Discharge Date          object
Medication              object
Test Results            object
target                   int64
Hospital Category     category
dtype: object

In [11]:
# Length of Stay

# Parse both date columns into datetimes
for date_col in ("Date of Admission", "Discharge Date"):
    df[date_col] = pd.to_datetime(df[date_col])

# Whole days between admission and discharge
stay_duration = df["Discharge Date"] - df["Date of Admission"]
df["Length of Stay"] = stay_duration.dt.days

In [12]:
# Force 'Age' to a numeric dtype; errors='coerce' turns any unparseable value into NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Define a Function to Create Visualizations

In [17]:
def create_visualization(data, output_dir='visualizations'):
    """Render the summary charts for ``data`` and save each one as a PNG.

    Five figures are produced: admissions per insurance provider split by
    'Hospital Category', the share of each admission type, billing amount
    per insurance provider, the emergency-admission rate per provider, and
    the age distribution. Every figure is closed right after it is saved.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to plot. Must contain the columns 'Insurance Provider',
        'Hospital Category', 'Admission Type', 'Billing Amount' and 'Age'.
    output_dir : str, optional
        Directory the PNG files are written to; it is created when missing.
        Defaults to 'visualizations'.
    """
    # Guarantee the target directory exists so savefig cannot fail on a missing folder.
    os.makedirs(output_dir, exist_ok=True)

    def _save_and_close(filename):
        # Write the current figure into output_dir, then release it.
        plt.savefig(os.path.join(output_dir, filename))
        plt.close()

    # Distribution of Hospital Admissions by Insurance Provider
    plt.figure(figsize=(12, 6))
    sns.countplot(data=data, x='Insurance Provider', hue='Hospital Category', palette='BuGn')
    plt.title("Distribution of Hospital Admissions by Insurance Provider Category")
    plt.xlabel("Insurance Provider")
    plt.ylabel("Number of Admissions")
    _save_and_close('Hospital Admissions by Insurance Provider.png')

    # Admission Type Breakdown (Emergency vs. Elective vs. Urgent)
    # Start from an explicit new figure so the pie never lands on a figure left open elsewhere.
    plt.figure()
    data['Admission Type'].value_counts().plot.pie(
        autopct='%1.1f%%', colors=['red', 'blue', 'green'], startangle=90
    )
    plt.title("Proportion of Admission Types")
    plt.ylabel("")  # drop the automatic series-name label
    _save_and_close('Proportion of Admission Types.png')

    # Health care cost among insurance providers
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=data, x='Insurance Provider', y='Billing Amount')
    plt.title("Healthcare Costs Across Insurance Providers")
    plt.xlabel("Insurance Provider")
    plt.ylabel("Billing Amount (Naira)")
    _save_and_close('Healthcare Costs Across Insurance Providers.png')

    # Emergency Rate by provider
    # The bar height is the mean of a boolean flag, i.e. the share of emergency admissions.
    plt.figure()
    is_emergency = data["Admission Type"] == 'Emergency'
    sns.barplot(x=data["Insurance Provider"], y=is_emergency)
    plt.xticks(rotation=90)
    plt.ylabel("Emergency Admission Rate")  # otherwise the axis is auto-labelled "Admission Type"
    plt.title("Emergency Admission Rate by Insurance Provider")
    _save_and_close('Emergency Rate by provider.png')

    # Age Distribution of patients
    plt.figure()
    sns.histplot(data['Age'], bins=20, kde=True, color='blue')
    plt.title(" Age Distribution of Patients")
    plt.xlabel("Age")
    plt.ylabel("Count")
    _save_and_close('Age Distribution.png')

# Create the visualizations directory if it doesn't exist.
# exist_ok=True does this in one call and avoids the check-then-create race.
os.makedirs('visualizations', exist_ok=True)

In [18]:
# Build every chart from the prepared frame; the PNG files are written under visualizations/
create_visualization(data=df)