In [1]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

# Build a synthetic weekly CBC (complete blood count) table, rewrite a random
# subset of rows as "high-risk", label them, and save everything to Excel.

N_ROWS = 1250              # number of synthetic weekly records
HIGH_RISK_FRACTION = 0.15  # share of rows rewritten as high-risk
SEED = 42                  # fixed seed so re-running the cell reproduces the file

# Generate weekly timestamps starting from 2003
start_date = datetime(2003, 1, 1)
timestamps = [start_date + timedelta(weeks=i) for i in range(N_ROWS)]

# (low, high) bounds of the uniform distribution each CBC column is drawn from
ranges = {
    'WBC': (4.0, 10.0),    # Normal range
    'LYMp': (20.0, 50.0),
    'MIDp': (2.0, 15.0),
    'NEUTp': (30.0, 70.0),
    'LYMn': (1.0, 5.0),
    'MIDn': (0.1, 2.0),
    'NEUTn': (1.5, 10.0),
    'RBC': (2.5, 4.5),     # Lower due to chronic anemia
    'HGB': (6.0, 11.0),    # Normal range for sickle cell patients
    'HCT': (18.0, 33.0),
    'MCV': (70.0, 100.0),
    'MCH': (20.0, 30.0),
    'MCHC': (28.0, 35.0),
    'RDWSD': (35.0, 50.0),
    'RDWCV': (15.0, 25.0),
    'PLT': (150.0, 450.0),
    'MPV': (7.0, 12.0),
    'PDW': (10.0, 18.0),
    'PCT': (0.1, 0.5),
    'PLCR': (15.0, 40.0)
}

# Generate synthetic data
np.random.seed(SEED)  # For reproducibility
synthetic_data = {
    'Timestamp': timestamps,
    'ID': range(1, N_ROWS + 1)  # Unique ID for each row
}

# One independent uniform draw per row for every column, rounded to 2 decimals.
for column, (low, high) in ranges.items():
    synthetic_data[column] = np.round(np.random.uniform(low, high, N_ROWS), 2)

# Convert to DataFrame
synthetic_cbc_df = pd.DataFrame(synthetic_data)

# Add High-Risk Patients (15% of the data, rounded down)
num_high_risk = int(HIGH_RISK_FRACTION * len(synthetic_cbc_df))
high_risk_indices = np.random.choice(synthetic_cbc_df.index, num_high_risk, replace=False)

# Modify selected rows to be "High-Risk".
# Each high-risk row gets its OWN HGB value, drawn from either the low band
# (4.0-5.0) or the high band (11.1-13.0) with equal probability. Passing two
# scalar uniform() results to np.random.choice would evaluate each uniform()
# only once, so every high-risk row would share one of just two HGB values.
low_hgb = np.round(np.random.uniform(4.0, 5.0, num_high_risk), 2)
# A draw >= 4.995 rounds to 5.0, which would fail the strict `HGB < 5` rule
# used for risk_status below; cap it so every injected row is labelled.
low_hgb = np.minimum(low_hgb, 4.99)
high_hgb = np.round(np.random.uniform(11.1, 13.0, num_high_risk), 2)
use_low_band = np.random.random(num_high_risk) < 0.5
synthetic_cbc_df.loc[high_risk_indices, 'HGB'] = np.where(use_low_band, low_hgb, high_hgb)
synthetic_cbc_df.loc[high_risk_indices, 'WBC'] = np.round(np.random.uniform(10.1, 14.0, num_high_risk), 2)

# Define risk status for classification model:
# 1 when (HGB > 11 or HGB < 5) and WBC > 10, otherwise 0. Untouched rows keep
# HGB within [6, 11] and WBC within [4, 10], so only the injected rows match.
abnormal_hgb = (synthetic_cbc_df['HGB'] > 11) | (synthetic_cbc_df['HGB'] < 5)
elevated_wbc = synthetic_cbc_df['WBC'] > 10
synthetic_cbc_df['risk_status'] = np.where(abnormal_hgb & elevated_wbc, 1, 0)

# Save the updated synthetic dataset
synthetic_file_path = "synthetic_sickle_cbc_updated.xlsx"
synthetic_cbc_df.to_excel(synthetic_file_path, index=False)

print("Updated dataset saved with high-risk patients included.")



Updated dataset saved with high-risk patients included.


In [2]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

# Build the same synthetic weekly CBC table as a standalone cell, rewrite a
# random subset of rows as "high-risk" (no risk_status column is added), and
# save it to 'synthetic_sickle_cbc_upd.xlsx'.

N_ROWS = 1250              # number of synthetic weekly records
HIGH_RISK_FRACTION = 0.15  # share of rows rewritten as high-risk
SEED = 42                  # fixed seed so re-running the cell reproduces the file

# Generate weekly timestamps starting from 2003
start_date = datetime(2003, 1, 1)
timestamps = [start_date + timedelta(weeks=i) for i in range(N_ROWS)]

# (low, high) bounds of the uniform distribution each CBC column is drawn from
ranges = {
    'WBC': (4.0, 10.0),    # Normal range
    'LYMp': (20.0, 50.0),
    'MIDp': (2.0, 15.0),
    'NEUTp': (30.0, 70.0),
    'LYMn': (1.0, 5.0),
    'MIDn': (0.1, 2.0),
    'NEUTn': (1.5, 10.0),
    'RBC': (2.5, 4.5),
    'HGB': (6.0, 11.0),  # Normal range for sickle cell patients
    'HCT': (18.0, 33.0),
    'MCV': (70.0, 100.0),
    'MCH': (20.0, 30.0),
    'MCHC': (28.0, 35.0),
    'RDWSD': (35.0, 50.0),
    'RDWCV': (15.0, 25.0),
    'PLT': (150.0, 450.0),
    'MPV': (7.0, 12.0),
    'PDW': (10.0, 18.0),
    'PCT': (0.1, 0.5),
    'PLCR': (15.0, 40.0)
}

# Generate synthetic data
np.random.seed(SEED)
synthetic_data = {
    'Timestamp': timestamps,
    'ID': range(1, N_ROWS + 1)
}

# One independent uniform draw per row for every column, rounded to 2 decimals.
for column, (low, high) in ranges.items():
    synthetic_data[column] = np.round(np.random.uniform(low, high, N_ROWS), 2)

# Convert to DataFrame
synthetic_cbc_df = pd.DataFrame(synthetic_data)

# Modify selected rows to be "High-Risk" **without adding risk_status**
num_high_risk = int(HIGH_RISK_FRACTION * len(synthetic_cbc_df))  # 15% of rows, rounded down
high_risk_indices = np.random.choice(synthetic_cbc_df.index, num_high_risk, replace=False)

# Ensure high-risk patients meet the criteria: (HGB < 6 or HGB > 11) and WBC > 10.
# Each high-risk row gets its OWN HGB value, drawn from either the low band
# (4.0-6.0) or the high band (11.1-13.0) with equal probability. Passing two
# scalar uniform() results to np.random.choice would evaluate each uniform()
# only once, so every high-risk row would share one of just two HGB values.
low_hgb = np.round(np.random.uniform(4.0, 6.0, num_high_risk), 2)
# A draw >= 5.995 rounds to 6.0, which would not satisfy the strict HGB < 6
# criterion; cap it so every injected row stays inside the low band.
low_hgb = np.minimum(low_hgb, 5.99)
high_hgb = np.round(np.random.uniform(11.1, 13.0, num_high_risk), 2)
use_low_band = np.random.random(num_high_risk) < 0.5
synthetic_cbc_df.loc[high_risk_indices, 'HGB'] = np.where(use_low_band, low_hgb, high_hgb)
synthetic_cbc_df.loc[high_risk_indices, 'WBC'] = np.round(np.random.uniform(10.1, 14.0, num_high_risk), 2)

# Save the dataset locally
synthetic_cbc_df.to_excel("synthetic_sickle_cbc_upd.xlsx", index=False)

print("Dataset successfully saved as 'synthetic_sickle_cbc_upd.xlsx' with high-risk patients included.")


Dataset successfully saved as 'synthetic_sickle_cbc_upd.xlsx' with high-risk patients included.
