In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Read the source table from disk into a DataFrame
df = pd.read_csv("filtered_dataset_condition3500.csv")

# Parse 'birthDate' into datetime values so date arithmetic works on it
df['birthDate'] = pd.to_datetime(df['birthDate'])

# Fixed reference date used for the age calculation (July 3, 2025)
current_date = datetime(2025, 7, 3)

# Age in years: whole days elapsed since birth divided by 365.25,
# rounded to two decimal places
df['age'] = ((current_date - df['birthDate']).dt.days / 365.25).round(2)

# Store 'gender' as a pandas categorical (categories inferred, unordered)
df['gender'] = df['gender'].astype(pd.CategoricalDtype())

# Observation columns are those whose name begins with 'obs_'
obs_columns = [name for name in df if name.startswith('obs_')]

# For each observation column: coerce to numeric (unparseable entries become
# NaN), then replace missing values with that column's median. A column with
# no numeric values has a NaN median and therefore stays NaN.
for col in obs_columns:
    numeric = pd.to_numeric(df[col], errors='coerce')
    df[col] = numeric.fillna(numeric.median())

# Allergy and condition columns are selected by their name prefixes
allergy_columns = [name for name in df if name.startswith('allergy_')]
condition_columns = [name for name in df if name.startswith('condition_')]

# For each of these columns: coerce to numeric (unparseable entries become
# NaN), treat missing values as 0, and store the result as integers
for col in [*allergy_columns, *condition_columns]:
    as_number = pd.to_numeric(df[col], errors='coerce')
    filled = as_number.fillna(0)
    df[col] = filled.astype(int)

# Write the cleaned table to a new CSV file, omitting the row index
df.to_csv(path_or_buf="sanchityelonayi.csv", index=False)

# Report completion to the user
print("Dataset cleaned and saved as 'sanchityelonayi.csv'.")

Dataset cleaned and saved as 'sanchityelonayi.csv'.
