In [26]:
import pandas as pd
import os
import shutil
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
import warnings
warnings.filterwarnings('ignore')

import kagglehub

# Download the latest version of the dataset. The returned path is used below
# as the directory that contains the dataset files.
path = kagglehub.dataset_download("abhilashjash/covid-19-simulated-dataset-by-abhilash-jash")
print("Path to dataset files:", path)

# Locate the CSV inside the download directory. The expected filename is tried
# first; if it is missing, fall back to the only CSV present in the directory
# rather than failing inside pd.read_csv with a less helpful message.
raw_data_path = os.path.join(path, "covid19_simulated_dataset_by_Abhilash_Jash.csv")
if not os.path.isfile(raw_data_path):
    csv_candidates = sorted(
        entry for entry in os.listdir(path) if entry.lower().endswith(".csv")
    )
    if len(csv_candidates) != 1:
        # Zero or several CSVs: refuse to guess which one is the dataset.
        raise FileNotFoundError(
            f"Expected dataset file not found at {raw_data_path}; "
            f"CSV files present in {path}: {csv_candidates}"
        )
    raw_data_path = os.path.join(path, csv_candidates[0])

# Not used in this cell; presumably the label column for later modelling
# cells -- confirm against the cells that consume it.
target_col = "Is_Covid_True"

# Load the RAW data and report its size and the total number of missing cells
# (summed over every column).
print("\n=== Loading RAW data ===")
raw_data = pd.read_csv(raw_data_path)
print(f"Raw data shape: {raw_data.shape}")
print(f"Missing values:\n{raw_data.isnull().sum().sum()} total")


Path to dataset files: C:\Users\akoni\.cache\kagglehub\datasets\abhilashjash\covid-19-simulated-dataset-by-abhilash-jash\versions\1

=== Loading RAW data ===
Raw data shape: (1000000, 30)
Missing values:
781128 total


In [27]:
# === DATA CLEANING SECTION ===
# Builds the cleaned frame `data` from `raw_data` and writes it to two CSV
# files (project root and dashboard public folder).
print("\n=== Cleaning Data ===")

# Drop rows with missing values. dropna() returns a new frame, so `raw_data`
# is left untouched and this cell gives the same result when re-run.
data = raw_data.dropna()
print(f"After dropping NaN: {data.shape}")

# Replace boolean values with 0/1 integers (False first, then True, as before).
data = data.replace(False, 0).replace(True, 1)

# Create gender dummy variables: one 0/1 indicator column per listed label.
# Rows whose Gender matches none of the labels get 0 in all three columns.
gender_flags = (
    ("Is_Male", "Male"),
    ("Is_Female", "Female"),
    ("Gender_Other", "Other"),
)
for flag_column, gender_label in gender_flags:
    data[flag_column] = (data["Gender"] == gender_label).astype(int)

# Drop unnecessary columns: the two identifier columns, and the raw Gender
# column now that the indicator columns above replace it.
data = data.drop(columns=["Patient_ID", "Gender", "Name"])

print(f"Final cleaned data shape: {data.shape}")
print(f"Columns: {list(data.columns)}")

# === SAVE CLEANED DATA TO BOTH LOCATIONS ===
print("\n=== Saving cleaned data ===")

# Location 1: Project root
root_input_path = "./input.csv"
# Location 2: Dashboard public folder
dashboard_input_path = "./health-dashboard/public/input.csv"

for output_path in (root_input_path, dashboard_input_path):
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    data.to_csv(output_path, index=False)
    print(f"✓ Saved to: {output_path}")

print("\n=== Data cleaning complete! ===")
print(f"Both input.csv files now contain {len(data)} cleaned records")



=== Cleaning Data ===
After dropping NaN: (449882, 30)
Final cleaned data shape: (449882, 30)
Columns: ['Age', 'Blood_Pressure', 'Heart_Rate', 'Fever', 'Cough', 'Sore_Throat', 'Fatigue', 'Headache', 'Breathlessness', 'Loss_of_Smell_Taste', 'Diarrhea', 'Chest_Pain', 'Body_Ache', 'Runny_Nose', 'Vomiting', 'Diabetes', 'Hypertension', 'Asthma', 'Obesity', 'Smoking', 'Vaccinated', 'Travel_History', 'Contact_with_Positive', 'Comorbidity_Count', 'ICU_Admission', 'Hospitalization_Days', 'Is_Covid_True', 'Is_Male', 'Is_Female', 'Gender_Other']

=== Saving cleaned data ===
✓ Saved to: ./input.csv
✓ Saved to: ./health-dashboard/public/input.csv

=== Data cleaning complete! ===
Both input.csv files now contain 449882 cleaned records


In [28]:
# === NOW USE THE CLEANED DATA FOR THE REST OF THE ANALYSIS ===
# Re-read the cleaned CSV from the project-root location so that `data` from
# here on is exactly what was written to disk.
data_csv_location = root_input_path
data = pd.read_csv(data_csv_location)

# Display the first 2 rows of the dataset to get a quick look at the data
print("\n=== Cleaned Data Preview ===")
print(data.head(2))

# Summary statistics for the numerical columns (count, mean, std, min,
# quartiles, max). Computed once and reused for both the printout and the
# export below instead of calling describe() twice on the full frame.
summary_stats = data.describe()
print("\n=== Summary Statistics ===")
print(summary_stats)

# Export metadata (transposed so each original column becomes one row).
metadata_path = './health-dashboard/public/metadata.csv'
# to_csv does not create missing parent directories; ensure the folder exists.
os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
summary_stats.T.to_csv(metadata_path)
print(f"\n✓ Metadata exported to: {metadata_path}")

# Print concise information about the DataFrame: column names, non-null counts, and data types
print("\n=== Data Info ===")
data.info()



=== Cleaned Data Preview ===
    Age  Blood_Pressure  Heart_Rate  Fever  Cough  Sore_Throat  Fatigue  \
0  93.0           118.0        66.0      0      0            0        0   
1  15.0           153.0        79.0      0      0            0        0   

   Headache  Breathlessness  Loss_of_Smell_Taste  ...  Vaccinated  \
0         1               0                    1  ...           1   
1         1               1                    0  ...           1   

   Travel_History  Contact_with_Positive  Comorbidity_Count  ICU_Admission  \
0               0                      1                0.0              0   
1               0                      0                4.0              0   

   Hospitalization_Days  Is_Covid_True  Is_Male  Is_Female  Gender_Other  
0                   7.0              1        0          0             1  
1                  22.0              1        0          0             1  

[2 rows x 30 columns]

=== Summary Statistics ===
                 Age  Blo