In [None]:
import pandas as pd

In [None]:
import os

In [None]:
import numpy as np

# Step 1: Data Cleaning and Exploration

In [None]:
# Read the raw inpatient-discharge export; the path is relative to the
# notebook's working directory.
Inpatient_Discharges_Data = pd.read_csv(
    filepath_or_buffer='Copy of Copy of Inpatient_Discharges_Data.csv'
)

# Preview the first five rows as the cell's output.
Inpatient_Discharges_Data.head(n=5)

In [None]:
# Build the working frame without the facility identifier, which is not used
# for Tableau. `drop` returns a new frame, so the raw frame stays untouched.
clean_data = Inpatient_Discharges_Data.drop(columns=['Permanent Facility Id'])

# Show the first five rows to confirm the column is gone.
clean_data.head(n=5)

In [None]:
# Blank out the "U" code in the Gender column.
# NOTE: affected rows end up holding an empty string (not NaN), so they still
# form their own group in any later groupby on "Gender".
# regex=False forces a literal match, independent of the default that the
# installed pandas version uses for `regex`.
clean_data["Gender"] = clean_data["Gender"].str.replace("U", "", regex=False)

# Export the cleaned frame. index=False keeps the pandas row index out of the
# file, so the CSV contains only the real data columns.
clean_data.to_csv('clean_data.csv', index=False)

# Preview the first 50 rows.
clean_data.head(50)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Count the null entries in the "Length of Stay" column and report the total.
missing_values = clean_data["Length of Stay"].isna().sum()
print("Number of missing values in 'Length of Stay':", missing_values)

In [None]:
# Box plot of "Length of Stay" to surface potential outliers.
plt.figure(figsize=(8, 6))
# Pass the column as x= explicitly: the meaning of seaborn's first positional
# argument differs between releases, so a keyword keeps the plot unambiguous
# (one horizontal box over the column's values).
sns.boxplot(x=clean_data["Length of Stay"])
plt.title("Box Plot of Length of Stay")
plt.show()

In [None]:
# Compute mean, median and standard deviation of "Length of Stay" in a single
# aggregation and unpack the three results in that order.
mean_length_of_stay, median_length_of_stay, std_dev_length_of_stay = (
    clean_data["Length of Stay"].agg(["mean", "median", "std"])
)

# Report each statistic on its own line.
print("Mean Length of Stay:", mean_length_of_stay)
print("Median Length of Stay:", median_length_of_stay)
print("Standard Deviation of Length of Stay:", std_dev_length_of_stay)

In [None]:
# Collect the previously computed summary statistics, keyed by display label.
statistics = {
    "Mean": mean_length_of_stay,
    "Median": median_length_of_stay,
    "Std Dev": std_dev_length_of_stay
}

# The chart is saved to disk below, and savefig does not create missing
# directories, so make sure the output folder exists before plotting.
os.makedirs("segment_visualisations", exist_ok=True)

# Bar chart with one bar per statistic. The dict views are materialised as
# lists so matplotlib receives plain sequences.
plt.figure(figsize=(8, 6))
plt.bar(list(statistics.keys()), list(statistics.values()), color=['blue', 'green', 'orange'])
plt.title("Basic Statistics of Length of Stay")
plt.ylabel("Value")
plt.savefig("segment_visualisations/length_of_stay.png")
plt.show()

In [None]:
pip install tabulate


In [None]:
from tabulate import tabulate

# Pair each statistic's label with its value; every pair becomes one table row.
statistics = [
    [label, value]
    for label, value in zip(
        ("Mean", "Median", "Std Dev"),
        (mean_length_of_stay, median_length_of_stay, std_dev_length_of_stay),
    )
]

# Render the rows as a grid-style text table and print it.
table = tabulate(statistics, headers=["Statistic", "Value"], tablefmt="grid")
print(table)


In [None]:
# Histogram of "Length of Stay": 20 bins, black bar edges.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(clean_data["Length of Stay"], bins=20, edgecolor='k')
ax.set_title("Histogram of Length of Stay")
ax.set_xlabel("Length of Stay")
ax.set_ylabel("Frequency")
plt.show()

# How to read the two plots: the histogram shows how the values are spread
# across different ranges, while the box plot summarises the central tendency,
# the spread, and any potential outliers of the distribution.

# Step 2: Segmentation and Grouping

In [None]:
# Attributes to segment by. Each attribute is grouped on its own, giving the
# average length of stay for every category of that attribute.
segments = ["Age Group", "Gender", "Race", "Type of Admission", "Patient Disposition"]

for segment in segments:
    # Mean length of stay per category of the current attribute.
    avg_length_of_stay = clean_data.groupby(segment)["Length of Stay"].mean()

    print(f"\nSegment: {segment}")
    print(avg_length_of_stay)

Based on the segment analysis, identify any trends or patterns in the data. For example, do certain age groups or admission types have longer stays on average? 

In [None]:
# Visually compare the average length of stay across the segments to spot
# trends or differences.

# Create the output folder for the charts. exist_ok=True replaces the separate
# existence check and is a no-op when the folder is already there.
os.makedirs("segment_visualisations", exist_ok=True)

for segment in segments:
    # Mean length of stay per category of the current segment.
    avg_length_of_stay = clean_data.groupby(segment)["Length of Stay"].mean()

    # One bar per category, drawn on an explicit figure/axes pair so the
    # figure can be saved and released by handle.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=avg_length_of_stay.index, y=avg_length_of_stay.values, ax=ax)
    ax.set_title(f"Average Length of Stay by {segment}")
    ax.set_xlabel(segment)
    ax.set_ylabel("Average Length of Stay")
    # Category labels can be long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    fig.tight_layout()

    # Save the chart; the file name is derived from the segment label.
    fig.savefig(f"segment_visualisations/{segment}_avg_length_of_stay.png")

    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)

# Step 3: Insights Generation

Explore the impact of patient disposition on length of stay. Are there certain dispositions that tend to result in longer stays?

In [None]:
# Distribution of length of stay for each patient disposition, one box per
# disposition, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="Patient Disposition", y="Length of Stay", data=clean_data, ax=ax)
ax.set_title("Impact of Patient Disposition on Length of Stay")
ax.set_xlabel("Patient Disposition")
ax.set_ylabel("Length of Stay")
# Tilt the disposition labels on the x axis.
plt.xticks(rotation=45)
fig.tight_layout()

# Write the chart to the visualisations folder, then display it.
fig.savefig("segment_visualisations/patient_disposition_length_of_stay.png")

plt.show()

In [None]:
# Overall sum of "Length of Stay" across every row.
total_length_of_stay = clean_data["Length of Stay"].sum()

# Sum of "Length of Stay" within each Gender value.
gender_grouped = clean_data.groupby("Gender")["Length of Stay"].sum()

# Each gender's share of the overall total, expressed as a percentage.
percentage_by_gender = gender_grouped.div(total_length_of_stay).mul(100)

# Pie chart of the shares, labelled by gender value, with one-decimal
# percentages printed on the wedges.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    percentage_by_gender,
    labels=percentage_by_gender.index,
    autopct="%1.1f%%",
    startangle=140,
)
ax.set_title("Percentage of Length of Stay by Gender")
fig.savefig("segment_visualisations/percentage_by_gender.png")

plt.show()

In [None]:
# Mean length of stay per patient disposition (a Series indexed by disposition).
avg_length_by_disposition = clean_data.groupby("Patient Disposition")["Length of Stay"].mean()

# Move the disposition labels out of the index into a regular column and name
# the value column, yielding a two-column frame.
avg_length_df = avg_length_by_disposition.reset_index(name="Average Length of Stay")

# Print the frame as plain text.
print(avg_length_df)

In [None]:
# Display up to the first 20 rows of the per-disposition averages.
avg_length_df.iloc[:20]