In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Columns kept from the raw file; every other column is discarded.
relevant_cols = [
    "poverty_perc", "median_household_income", "education_less_than_high_school_percent",
    "percent_high_blood_pressure", "percent_coronary_heart_disease", "percent_stroke",
    "percent_high_cholesterol", "percent_inactive", "percent_smoking",
    "number_of_hospitals", "number_of_primary_care_physicians", "percent_no_heath_insurance",
    "urban_rural_status",
]

# Percentage-valued columns; only touched when ``percent_to_decimal=True``.
percent_cols = [
    "poverty_perc", "education_less_than_high_school_percent", "percent_inactive",
    "percent_smoking", "percent_no_heath_insurance", "percent_high_blood_pressure",
    "percent_coronary_heart_disease", "percent_stroke", "percent_high_cholesterol",
]

# Columns coerced to a numeric dtype during cleaning.
num_cols = ["median_household_income", "number_of_hospitals", "number_of_primary_care_physicians"]

# Columns min-max scaled before saving.
scale_cols = ["median_household_income", "number_of_hospitals", "number_of_primary_care_physicians"]


def clean_health_data(frame, *, percent_to_decimal=False):
    """Return a cleaned copy of ``frame``; the input is left unmodified.

    Steps, in order:

    1. Coerce ``num_cols`` to numeric (unparseable values become NaN).
    2. Drop columns in which fewer than 70% of the rows are non-missing.
    3. Fill remaining gaps in numeric columns with the column median.
    4. Optionally rescale ``percent_cols`` from 0-100 to 0-1.
    5. Encode ``urban_rural_status`` (Rural = 0, Urban = 1).
    6. Remove duplicate rows.

    Coercion and the sparse-column drop run *before* imputation: imputing
    first would hide how sparse a numeric column really is, and coercing
    afterwards would reintroduce NaN that never gets filled.

    Columns that are absent (or were dropped in step 2) are skipped by the
    later steps instead of raising ``KeyError``.

    Args:
        frame: DataFrame holding some or all of ``relevant_cols``.
        percent_to_decimal: When True, each numeric column in
            ``percent_cols`` whose maximum exceeds 1 is divided by 100.
            Off by default so the saved values keep their original scale.

    Returns:
        A new, cleaned DataFrame. The original index labels of the
        surviving rows are preserved.
    """
    cleaned = frame.copy()

    # Step 1: coerce first so values that fail to parse get imputed below.
    numeric_present = [col for col in num_cols if col in cleaned.columns]
    if numeric_present:
        cleaned[numeric_present] = cleaned[numeric_present].apply(
            pd.to_numeric, errors="coerce"
        )

    # Step 2: drop columns with more than 30% missing data. ``thresh`` is
    # the minimum number of non-missing values a column needs to be kept.
    min_present = int(np.ceil(len(cleaned) * 0.7))
    cleaned = cleaned.dropna(axis=1, thresh=min_present)

    # Step 3: fill numeric columns with their median.
    cleaned = cleaned.fillna(cleaned.median(numeric_only=True))

    # Step 4 (opt-in): convert 0-100 percentages to 0-1 decimals.
    if percent_to_decimal:
        for col in percent_cols:
            if (
                col in cleaned.columns
                and pd.api.types.is_numeric_dtype(cleaned[col])
                and cleaned[col].max() > 1
            ):
                cleaned[col] = cleaned[col] / 100

    # Step 5: encode categorical variable (Urban = 1, Rural = 0).
    if "urban_rural_status" in cleaned.columns:
        status = cleaned["urban_rural_status"]
        encoded = status.map({"Rural": 0, "Urban": 1})
        # ``map`` yields NaN for categories missing from the dict; report
        # them rather than letting them vanish silently.
        unmapped = status[encoded.isna() & status.notna()].unique()
        if len(unmapped):
            print(
                "Warning: urban_rural_status values with no encoding were set to NaN:",
                sorted(str(value) for value in unmapped),
            )
        cleaned["urban_rural_status"] = encoded

    # Step 6: remove duplicates.
    return cleaned.drop_duplicates()


if __name__ == "__main__":
    # Load data and keep only the relevant columns. ``.copy()`` makes the
    # selection an independent frame rather than a view of the raw data.
    df = pd.read_csv("national_health_data_2024.csv")
    df = df[relevant_cols].copy()

    # Report missing values before any cleaning is applied.
    print("Missing Values:\n", df.isnull().sum())

    df = clean_health_data(df)

    # Normalize selected columns for better visualization. Columns dropped
    # during cleaning are skipped, as is an empty frame (nothing to fit).
    scaler = MinMaxScaler()
    present_scale_cols = [col for col in scale_cols if col in df.columns]
    if present_scale_cols and not df.empty:
        df[present_scale_cols] = scaler.fit_transform(df[present_scale_cols])

    # Save the cleaned dataset.
    df.to_csv("cleaned_data.csv", index=False)

    print("Data preprocessing complete! Cleaned data saved as 'cleaned_data.csv'.")


Missing Values:
 poverty_perc                               0
median_household_income                    0
education_less_than_high_school_percent    0
percent_high_blood_pressure                0
percent_coronary_heart_disease             0
percent_stroke                             0
percent_high_cholesterol                   0
percent_inactive                           0
percent_smoking                            0
number_of_hospitals                        0
number_of_primary_care_physicians          0
percent_no_heath_insurance                 0
urban_rural_status                         0
dtype: int64
Data preprocessing complete! Cleaned data saved as 'cleaned_data.csv'.
