In [8]:
import pandas as pd
import statistics
import numpy as np

# Declaration
# Descriptive statistics (extremes, central tendency, spread) for each numeric column
data = pd.read_csv("modified_healthcare_dataset.csv")

target_col = ['Age','Billing Amount','Length of Stay']

for dt in target_col:
    # Coerce to a numeric dtype; values that cannot be parsed become NaN
    data[dt] = pd.to_numeric(data[dt], errors='coerce')

    # Drop NaN from this column's values only. Dropping whole rows from
    # `data` here would silently shrink the sample used for every column
    # processed later in the loop.
    values = data[dt].dropna()

    if values.empty:
        # statistics.mode() raises on empty input; report and move on
        print(f"No numeric values found for {dt.title()}\n")
        continue

    max_val = values.max()
    min_val = values.min()
    range_val = max_val - min_val
    mean = round(values.mean(),2)
    mode = round(statistics.mode(values),2)
    # Sample statistics (ddof=1). The standard deviation is computed from the
    # data itself rather than from the already-rounded variance.
    var = round(values.var(ddof=1),2)
    std = round(values.std(ddof=1),2)

    print(f"Max of {dt.title()} : {max_val}")
    print(f"Min of {dt.title()} : {min_val}")
    print(f"Range of {dt.title()} : {range_val}")
    print(f"Mean of {dt.title()} : {mean}")
    print(f"Mode of {dt.title()} : {mode}")
    print(f"Variance of {dt.title()} : {var}")
    print(f"Standard Deviation of {dt.title()} : {std}\n")

Max of Age : 90
Min of Age : 5
Range of Age : 85
Mean of Age : 48.0
Mode of Age : 60
Variance of Age : 445.46
Standard Deviance of Age : 21.11

Max of Billing Amount : 99997.79797710298
Min of Billing Amount : 500.22098943266377
Range of Billing Amount : 99497.57698767031
Mean of Billing Amount : 21835.04
Mode of Billing Amount : 2212.27
Variance of Billing Amount : 555752976.28
Standard Deviance of Billing Amount : 23574.41

Max of Length Of Stay : 89
Min of Length Of Stay : 1
Range of Length Of Stay : 88
Mean of Length Of Stay : 17.84
Mode of Length Of Stay : 3
Variance of Length Of Stay : 410.97
Standard Deviance of Length Of Stay : 20.27



In [7]:
import pandas as pd
import statistics
import numpy as np

# Declaration
# Descriptive statistics per Medical Condition: patient count plus the
# average Length of Stay, Billing Amount, and Age.
data = pd.read_csv("modified_healthcare_dataset.csv")

avg_columns = ["Length of Stay","Billing Amount","Age"]

# Coerce the averaged columns to numeric; unparseable entries become NaN,
# which mean() skips.
data[avg_columns] = data[avg_columns].apply(pd.to_numeric, errors='coerce')

# One named aggregation per output column: count of non-null names, then a
# mean for each column listed in avg_columns.
aggregations = {'Total Patient': ('Name', 'count')}
aggregations.update({f'Average {col}': (col, 'mean') for col in avg_columns})

# A single groupby pass replaces filtering the frame once per condition.
# sort=False keeps groups in order of first appearance; rows whose
# Medical Condition is missing are left out instead of yielding an empty group.
res_df = (
    data.groupby('Medical Condition', sort=False)
    .agg(**aggregations)
    .round(2)
    .reset_index()
)

# Sort
res_df = res_df.sort_values(by='Total Patient', ascending=False)
res_df

Unnamed: 0,Medical Condition,Total Patient,Average Length of Stay,Average Billing Amount,Average Age
1,Flu,7046,2.5,2744.15,27.38
6,Diabetes,7005,8.06,12503.19,54.96
7,Obesity,6994,5.97,10055.56,45.01
2,Cancer,6940,36.54,64537.09,64.92
3,Asthma,6908,3.5,5025.35,24.93
4,Heart Disease,6900,26.86,44913.43,64.92
5,Alzheimer’s,6861,54.42,32543.54,74.97
0,Infections,6846,5.52,2747.57,27.23


In [9]:
import pandas as pd
import statistics
import numpy as np

# Display Option
# Applied one by one: no column limit, no wrapped frame repr, width 0
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('display.width', 0),
):
    pd.set_option(option_name, option_value)

# Declaration
# Top 10 patients by Length of Stay
data = pd.read_csv("modified_healthcare_dataset.csv")

# Turn unparseable entries into NaN so they are removed by the drop below
for numeric_col in ('Length of Stay', 'Billing Amount'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

# Keep only rows that have a Length of Stay, a Name, and a Billing Amount
required_cols = ['Length of Stay', 'Name', 'Billing Amount']
data = data.dropna(subset=required_cols)

# Sort & Limit: longest stays first, first ten rows
sorted_by_stay = data.sort_values(by='Length of Stay', ascending=False)
top_10_stays = sorted_by_stay.head(10)

shown_cols = ['Length of Stay', 'Name', 'Billing Amount','Medical Condition','Doctor','Hospital']
result = top_10_stays[shown_cols]
print(result)

       Length of Stay             Name  Billing Amount Medical Condition           Doctor                                     Hospital
8451               89  Arthur Perez Md    42855.501068       Alzheimer’s     Gregory Shaw             Loyola University Medical Center
39442              89    Travis Ortega    16264.012102       Alzheimer’s  Charles Donovan             Loyola University Medical Center
30369              89      James Davis    15757.597681       Alzheimer’s   Rhonda Meadows             Loyola University Medical Center
51211              89     Douglas Hill    25392.164044       Alzheimer’s   Vickie Stanley                            UChicago Medicine
3820               89  Nicholas Ibarra    32711.663340       Alzheimer’s    Paige Griffin  UI Health (University of Illinois Hospital)
8198               89   Amanda Alvarez    43311.238440       Alzheimer’s      Cheryl Shah                            UChicago Medicine
48418              89     Michael Lamb    17305.134448 

In [19]:
import pandas as pd
import statistics
import numpy as np

# Display Option
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.width', 0)

# Declaration
# Top 10 patients by Billing Amount
data = pd.read_csv("modified_healthcare_dataset.csv")

# Unparseable billing values become NaN so they are removed by the drop below
data['Billing Amount'] = pd.to_numeric(data['Billing Amount'], errors='coerce')

# Drop rows with missing Name or Billing Amount
data = data.dropna(subset=['Name', 'Billing Amount'])

# Sort & Limit: highest bills first, first ten rows
top_10_billing = data.sort_values(by='Billing Amount', ascending=False).head(10)

result = top_10_billing[['Name','Billing Amount','Medical Condition','Length of Stay','Hospital','Insurance Provider']]
print(result)

                   Name  Billing Amount Medical Condition  Length of Stay                                     Hospital Insurance Provider
53225     Jeffrey Patel    99997.797977            Cancer              18                            UChicago Medicine           Medicare
27943       Carol Lopez    99966.951967            Cancer              20               Northwestern Memorial Hospital              Aetna
346      Melinda Tanner    99957.519728            Cancer              16                            UChicago Medicine   UnitedHealthcare
100       Marcus Zamora    99951.563141            Cancer              22               Northwestern Memorial Hospital   UnitedHealthcare
33114  Kristin Jennings    99944.752623            Cancer              32             Loyola University Medical Center              Cigna
45946    Douglas Wright    99933.578902            Cancer              52             Loyola University Medical Center   UnitedHealthcare
43085        Tyler Long    99925.2

In [5]:
import pandas as pd
import statistics
import numpy as np

# Display Option
display_options = {
    'display.max_columns': None,
    'display.expand_frame_repr': False,
    'display.width': 0,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Declaration
# Top 10 oldest patients
data = pd.read_csv("modified_healthcare_dataset.csv")

# Unparseable ages become NaN so they are removed by the drop below
numeric_age = pd.to_numeric(data['Age'], errors='coerce')
data['Age'] = numeric_age

# Keep only rows that have both a Name and an Age
data = data.dropna(subset=['Name', 'Age'])

# Sort & Limit: oldest first, first ten rows
by_age_desc = data.sort_values(by='Age', ascending=False)
top_10_age = by_age_desc.head(10)

shown_cols = ['Name','Age','Medical Condition','Length of Stay','Hospital']
result = top_10_age[shown_cols]
print(result)

                     Name  Age Medical Condition  Length of Stay                                     Hospital
16899         Mary Martin   90       Alzheimer’s              29  UI Health (University of Illinois Hospital)
4501         Jeanne Jones   90       Alzheimer’s              36                            UChicago Medicine
26315    Christina Miller   90       Alzheimer’s              27                            UChicago Medicine
8798       Timothy Wilson   90       Alzheimer’s              27  UI Health (University of Illinois Hospital)
40184       Jordan Nelson   90       Alzheimer’s              81                            UChicago Medicine
19905    Alexandra Thomas   90       Alzheimer’s              26  UI Health (University of Illinois Hospital)
21982          John Smith   90       Alzheimer’s              78             Loyola University Medical Center
13027  Dr. Christie Boyer   90       Alzheimer’s              28             Loyola University Medical Center
18072    W