In [None]:
# Predictive Risk Assessment Model - Data Preprocessing

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Stage 1: read the raw CSV, inspect it, and impute the two columns that are
# filled below. Leaves the cleaned frame in `data` for the following stages.
# ---------------------------------------------------------------------------
print("[INFO] Loading dataset...")
file_path = "../data/synthetic_health_data.csv"
data = pd.read_csv(file_path)

# Quick look at the leading rows (`display` is the notebook rich renderer).
print("Dataset Preview:")
preview_rows = data.head()
display(preview_rows)

# Column dtypes / non-null counts; DataFrame.info() prints on its own.
print("\n[INFO] Dataset Information:")
data.info()

# Per-column count of missing cells before any imputation.
print("\n[INFO] Missing Values:")
missing_values = data.isna().sum()
print(missing_values)

# Impute: BloodPressure with its mean, Cholesterol with its median.
# NOTE(review): both statistics are computed over the full dataset, i.e.
# before any train/test split happens later in this script.
print("\n[INFO] Handling Missing Values...")
fill_values = {
    'BloodPressure': data['BloodPressure'].mean(),
    'Cholesterol': data['Cholesterol'].median(),
}
for column_name, fill_value in fill_values.items():
    data[column_name] = data[column_name].fillna(fill_value)

# Re-count missing cells to confirm the imputation took effect.
print("\n[INFO] Missing Values After Handling:")
remaining_missing = data.isna().sum()
print(remaining_missing)

# Descriptive statistics of the numeric columns after imputation.
print("\n[INFO] Dataset Summary Statistics:")
summary_stats = data.describe()
display(summary_stats)

# Exploratory Data Analysis (EDA)
# Draws three diagnostic figures from `data`: the class balance of the
# 'HighRisk' target, the distribution of 'Age', and a correlation heatmap of
# the numeric columns. Each figure is rendered immediately with plt.show().
print("\n[INFO] Performing Exploratory Data Analysis...")

# 1. Distribution of HighRisk
plt.figure(figsize=(6, 4))
sns.countplot(x='HighRisk', data=data, palette='coolwarm')
plt.title("Distribution of HighRisk Members")
plt.xlabel("High Risk (1 = Yes, 0 = No)")
plt.ylabel("Count")
plt.show()

# 2. Distribution of Age
plt.figure(figsize=(6, 4))
sns.histplot(data['Age'], bins=15, kde=True, color='blue')
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

# 3. Correlation Heatmap
# Correlation is only defined for numeric columns. At this point in the
# script 'Gender' has not been label-encoded yet and 'Name' is still present
# (both presumably text), and pandas >= 2.0 raises a ValueError when
# DataFrame.corr() meets non-numeric data. Restricting the frame to numeric
# dtypes first works on both old and new pandas versions.
plt.figure(figsize=(10, 6))
numeric_columns = data.select_dtypes(include="number")
correlation_matrix = numeric_columns.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

# ---------------------------------------------------------------------------
# Final preprocessing stage: encode 'Gender', separate features from the
# 'HighRisk' target, split into train/test sets, standardise the numerical
# features, and write the four resulting tables to ../data/.
#
# The scaler is fitted on the TRAINING rows only and then applied to the test
# rows. Fitting it on the full dataset before splitting (as was done
# previously) lets test-set mean/variance leak into the training features and
# makes later evaluation optimistic. The split itself is unchanged (same
# test_size and random_state), so the same rows land in each partition.
# ---------------------------------------------------------------------------

# Encode categorical variables
print("\n[INFO] Encoding Categorical Variables...")
encoder = LabelEncoder()
data['Gender'] = encoder.fit_transform(data['Gender'])

# Feature-target split
print("\n[INFO] Splitting Features and Target...")
X = data.drop(columns=['HighRisk', 'MemberID', 'Name'])  # Drop non-feature columns
y = data['HighRisk']

# Split data into training and testing sets
print("\n[INFO] Splitting Data into Training and Testing Sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames and do not trigger pandas chained-assignment warnings.
X_train = X_train.copy()
X_test = X_test.copy()

# Feature scaling (fit on train, apply to both partitions)
print("\n[INFO] Scaling Numerical Features...")
scaler = StandardScaler()
numerical_features = ['Age', 'BloodPressure', 'Cholesterol']
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Save the processed data for model training
print("\n[INFO] Saving Processed Data...")
X_train.to_csv("../data/X_train.csv", index=False)
X_test.to_csv("../data/X_test.csv", index=False)
y_train.to_csv("../data/y_train.csv", index=False)
y_test.to_csv("../data/y_test.csv", index=False)

print("\n[INFO] Data Preprocessing Completed Successfully!")