In [1]:
# Data Cleaning and Preprocessing - Asthma Dataset
# File: notebooks/exploratory/02_data_preprocessing.ipynb

# =============================================================================
# IMPORTS AND SETUP
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, 
    LabelEncoder, OneHotEncoder, OrdinalEncoder
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)

print("🧹 DATA CLEANING AND PREPROCESSING")
print("=" * 60)

🧹 DATA CLEANING AND PREPROCESSING


In [17]:
# =============================================================================
# 1. LOAD ORIGINAL DATA (CORRECTLY)
# =============================================================================

# Load original data
df_original = pd.read_csv('../../data/raw/asthma_disease_data.csv')
print(f"✅ Original data loaded: {df_original.shape}")

# Display first few rows to understand structure
print(f"\n📊 Original Data Structure:")
print(f"Shape: {df_original.shape}")
display(df_original.head())

print(f"\n📋 Column Info:")
for col in df_original.columns:
    dtype = df_original[col].dtype
    unique_count = df_original[col].nunique()
    sample_vals = df_original[col].unique()[:3] if unique_count <= 10 else f"Range: {df_original[col].min()}-{df_original[col].max()}" if dtype in ['int64', 'float64'] else "Many values"
    print(f"  {col}: {dtype}, {unique_count} unique, {sample_vals}")

✅ Original data loaded: (2392, 29)

📊 Original Data Structure:
Shape: (2392, 29)


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,PollutionExposure,PollenExposure,DustExposure,PetAllergy,FamilyHistoryAsthma,HistoryOfAllergies,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,1,1,0,0,0,0,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,0,0,1,0,0,0,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,0,1,1,0,1,0,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,7.571845,3.965316,0,0,0,0,1,0,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,3.049807,8.260605,0,0,0,0,1,0,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid



📋 Column Info:
  PatientID: int64, 2392 unique, Range: 5034-7425
  Age: int64, 75 unique, Range: 5-79
  Gender: int64, 2 unique, [0 1]
  Ethnicity: int64, 4 unique, [1 2 0]
  EducationLevel: int64, 4 unique, [0 2 1]
  BMI: float64, 2392 unique, Range: 15.031803385194396-39.985610652758176
  Smoking: int64, 2 unique, [0 1]
  PhysicalActivity: float64, 2392 unique, Range: 0.0017403282894756-9.995809378574393
  DietQuality: float64, 2392 unique, Range: 0.0030308474623308-9.99990370116923
  SleepQuality: float64, 2392 unique, Range: 4.001436511239669-9.996235376613528
  PollutionExposure: float64, 2392 unique, Range: 0.0010215049107153-9.998964331499586
  PollenExposure: float64, 2392 unique, Range: 0.0006590280421692-9.999554769488762
  DustExposure: float64, 2392 unique, Range: 0.0024338328540329-9.999707848844984
  PetAllergy: int64, 2 unique, [1 0]
  FamilyHistoryAsthma: int64, 2 unique, [1 0]
  HistoryOfAllergies: int64, 2 unique, [0 1]
  Eczema: int64, 2 unique, [0 1]
  HayFever: in

In [18]:
# =============================================================================
# 2. IDENTIFY COLUMNS 
# =============================================================================

print(f"\n" + "="*60)
print("🏷️  CORRECT COLUMN IDENTIFICATION")
print("="*60)

# Columns that should NEVER be scaled
ID_COLUMNS = ['PatientID']  # These should stay as-is

# Identify target variable
TARGET_CANDIDATES = ['Diagnosis', 'Asthma', 'HasAsthma']
target_column = None

for col in df_original.columns:
    if col in TARGET_CANDIDATES or 'diagnosis' in col.lower() or 'asthma' in col.lower():
        target_column = col
        break

if target_column:
    print(f"🎯 Target variable identified: {target_column}")
    print(f"   Values: {df_original[target_column].value_counts().to_dict()}")
else:
    print(f"❓ Target variable not clearly identified. Looking at binary columns:")
    for col in df_original.columns:
        if df_original[col].nunique() == 2 and col not in ID_COLUMNS:
            print(f"   Potential target: {col} -> {df_original[col].unique()}")

# Identify feature types
numerical_features = []
categorical_features = []

for col in df_original.columns:
    if col in ID_COLUMNS:
        continue  # Skip ID columns
    elif col == target_column:
        continue  # Skip target column
    elif df_original[col].dtype in ['int64', 'float64']:
        # Check if it's actually categorical (like binary 0/1)
        if df_original[col].nunique() <= 10 and set(df_original[col].dropna().unique()).issubset({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}):
            categorical_features.append(col)
        else:
            numerical_features.append(col)
    else:
        categorical_features.append(col)

print(f"\n📊 Feature Classification:")
print(f"   ID columns (don't process): {ID_COLUMNS}")
print(f"   Target column: {target_column}")
print(f"   Numerical features ({len(numerical_features)}): {numerical_features[:5]}{'...' if len(numerical_features) > 5 else ''}")
print(f"   Categorical features ({len(categorical_features)}): {categorical_features[:5]}{'...' if len(categorical_features) > 5 else ''}")



🏷️  CORRECT COLUMN IDENTIFICATION
🎯 Target variable identified: FamilyHistoryAsthma
   Values: {0: 1672, 1: 720}

📊 Feature Classification:
   ID columns (don't process): ['PatientID']
   Target column: FamilyHistoryAsthma
   Numerical features (10): ['Age', 'BMI', 'PhysicalActivity', 'DietQuality', 'SleepQuality']...
   Categorical features (17): ['Gender', 'Ethnicity', 'EducationLevel', 'Smoking', 'PetAllergy']...


In [19]:
# =============================================================================
# 3. DATA CLEANING (WITHOUT DESTROYING STRUCTURE)
# =============================================================================

print(f"\n" + "="*60)
print("🧹 PROPER DATA CLEANING")
print("="*60)

# Work with a copy
df_clean = df_original.copy()

# Check missing values
missing_summary = df_clean.isnull().sum()
if missing_summary.sum() > 0:
    print(f"❗ Missing values found:")
    for col, missing_count in missing_summary[missing_summary > 0].items():
        print(f"   {col}: {missing_count} ({missing_count/len(df_clean)*100:.1f}%)")
    
    # Handle missing values appropriately
    for col in missing_summary[missing_summary > 0].index:
        if col in numerical_features:
            # Use median for numerical features
            median_val = df_clean[col].median()
            df_clean[col].fillna(median_val, inplace=True)
            print(f"   Filled {col} with median: {median_val}")
        elif col in categorical_features:
            # Use mode for categorical features
            mode_val = df_clean[col].mode()[0] if len(df_clean[col].mode()) > 0 else 'Unknown'
            df_clean[col].fillna(mode_val, inplace=True)
            print(f"   Filled {col} with mode: {mode_val}")
else:
    print(f"✅ No missing values found")

# Remove duplicates
duplicates = df_clean.duplicated().sum()
if duplicates > 0:
    df_clean = df_clean.drop_duplicates()
    print(f"🗑️  Removed {duplicates} duplicate rows")
else:
    print(f"✅ No duplicates found")

print(f"📊 Clean dataset shape: {df_clean.shape}")



🧹 PROPER DATA CLEANING
✅ No missing values found
✅ No duplicates found
📊 Clean dataset shape: (2392, 29)


In [20]:
# =============================================================================
# 4. PROPER FEATURE PREPROCESSING
# =============================================================================

print(f"\n" + "="*60)
print("⚙️  PROPER FEATURE PREPROCESSING")
print("="*60)

# Create final dataframe
df_processed = df_clean.copy()

# Keep ID columns as-is
print(f"✅ Keeping ID columns unchanged: {ID_COLUMNS}")

# Handle categorical variables
print(f"\n🏷️  Encoding Categorical Variables:")

for col in categorical_features:
    unique_count = df_processed[col].nunique()
    
    if unique_count == 2:
        # Binary encoding (0/1)
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        print(f"   {col}: Binary encoded (0/1)")
        
    elif unique_count <= 5:
        # One-hot encoding for low cardinality
        dummies = pd.get_dummies(df_processed[col], prefix=col, drop_first=True)
        df_processed = pd.concat([df_processed.drop(col, axis=1), dummies], axis=1)
        print(f"   {col}: One-hot encoded ({unique_count-1} new columns)")
        
    else:
        # Label encoding for high cardinality
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        print(f"   {col}: Label encoded ({unique_count} categories)")

# Handle numerical variables (SCALE ONLY THESE)
print(f"\n📊 Scaling Numerical Variables:")

if len(numerical_features) > 0:
    # Only scale numerical features, NOT ID or categorical
    scaler = StandardScaler()
    df_processed[numerical_features] = scaler.fit_transform(df_processed[numerical_features])
    
    print(f"   ✅ Scaled {len(numerical_features)} numerical features")
    print(f"   Numerical features scaled: {numerical_features}")
    
    # Show scaling results
    print(f"\n   📈 Scaling Results (first 3 features):")
    for col in numerical_features[:3]:
        mean_val = df_processed[col].mean()
        std_val = df_processed[col].std()
        print(f"     {col}: mean={mean_val:.3f}, std={std_val:.3f}")

else:
    print(f"   ℹ️  No numerical features to scale")



⚙️  PROPER FEATURE PREPROCESSING
✅ Keeping ID columns unchanged: ['PatientID']

🏷️  Encoding Categorical Variables:
   Gender: Binary encoded (0/1)
   Ethnicity: One-hot encoded (3 new columns)
   EducationLevel: One-hot encoded (3 new columns)
   Smoking: Binary encoded (0/1)
   PetAllergy: Binary encoded (0/1)
   HistoryOfAllergies: Binary encoded (0/1)
   Eczema: Binary encoded (0/1)
   HayFever: Binary encoded (0/1)
   GastroesophagealReflux: Binary encoded (0/1)
   Wheezing: Binary encoded (0/1)
   ShortnessOfBreath: Binary encoded (0/1)
   ChestTightness: Binary encoded (0/1)
   Coughing: Binary encoded (0/1)
   NighttimeSymptoms: Binary encoded (0/1)
   ExerciseInduced: Binary encoded (0/1)
   Diagnosis: Binary encoded (0/1)
   DoctorInCharge: One-hot encoded (0 new columns)

📊 Scaling Numerical Variables:
   ✅ Scaled 10 numerical features
   Numerical features scaled: ['Age', 'BMI', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'PollutionExposure', 'PollenExposure', 'Dust

In [21]:
# =============================================================================
# 5. VERIFY PREPROCESSING RESULTS
# =============================================================================

print(f"\n" + "="*60)
print("✅ PREPROCESSING VERIFICATION")
print("="*60)

print(f"📊 Final Dataset:")
print(f"   Shape: {df_processed.shape}")
print(f"   Columns: {len(df_processed.columns)}")

# Check PatientID is preserved correctly
if 'PatientID' in df_processed.columns:
    print(f"\n🆔 PatientID Verification:")
    print(f"   Original PatientID range: {df_original['PatientID'].min()} to {df_original['PatientID'].max()}")
    print(f"   Processed PatientID range: {df_processed['PatientID'].min()} to {df_processed['PatientID'].max()}")
    print(f"   First 5 PatientIDs: {df_processed['PatientID'].head().tolist()}")
    
    if df_processed['PatientID'].equals(df_original['PatientID']):
        print(f"   ✅ PatientID preserved correctly!")
    else:
        print(f"   ❌ PatientID was modified!")

# Check target variable
if target_column and target_column in df_processed.columns:
    print(f"\n🎯 Target Variable Verification:")
    orig_dist = df_original[target_column].value_counts()
    proc_dist = df_processed[target_column].value_counts()
    
    print(f"   Original distribution: {orig_dist.to_dict()}")
    print(f"   Processed distribution: {proc_dist.to_dict()}")
    
    if orig_dist.equals(proc_dist):
        print(f"   ✅ Target variable preserved correctly!")
    else:
        print(f"   ❌ Target variable was modified!")

# Show data types
print(f"\n📋 Final Data Types:")
dtype_counts = df_processed.dtypes.value_counts()
for dtype, count in dtype_counts.items():
    print(f"   {dtype}: {count} columns")

# Show sample of final data
print(f"\n👀 Final Data Sample:")
display(df_processed.head())



✅ PREPROCESSING VERIFICATION
📊 Final Dataset:
   Shape: (2392, 32)
   Columns: 32

🆔 PatientID Verification:
   Original PatientID range: 5034 to 7425
   Processed PatientID range: 5034 to 7425
   First 5 PatientIDs: [5034, 5035, 5036, 5037, 5038]
   ✅ PatientID preserved correctly!

🎯 Target Variable Verification:
   Original distribution: {0: 1672, 1: 720}
   Processed distribution: {0: 1672, 1: 720}
   ✅ Target variable preserved correctly!

📋 Final Data Types:
   int64: 16 columns
   float64: 10 columns
   bool: 6 columns

👀 Final Data Sample:


Unnamed: 0,PatientID,Age,Gender,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,PollutionExposure,PollenExposure,DustExposure,PetAllergy,FamilyHistoryAsthma,HistoryOfAllergies,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,Ethnicity_1,Ethnicity_2,Ethnicity_3,EducationLevel_1,EducationLevel_2,EducationLevel_3
0,5034,0.96574,0,-1.582769,0,-1.432099,0.160113,0.971063,0.809355,-0.780866,-1.401921,1,1,0,0,0,0,-1.368934,0.920608,0,0,1,0,0,1,0,True,False,False,False,False,False
1,5035,-0.747054,1,-0.6233,0,0.291269,0.453069,-1.076746,-1.036866,0.810184,0.560684,0,0,1,0,0,0,-0.407132,-1.564256,1,0,0,1,1,1,0,False,True,False,False,True,False
2,5036,0.687989,0,-1.229074,0,0.58133,1.434458,-0.102976,-1.210374,-1.267434,0.162295,0,1,1,0,1,0,-0.987146,0.983019,1,1,1,0,1,1,0,False,True,False,True,False,False
3,5037,-0.09897,1,1.565307,0,-1.256398,0.276233,-1.59688,-1.509757,0.849659,-0.355611,0,0,0,0,1,0,0.561114,-1.105641,1,0,1,1,1,0,0,False,True,False,True,False,False
4,5038,0.873156,0,-1.105686,0,-0.154081,-0.651625,1.504976,-1.373822,-0.713717,1.146977,0,0,0,0,1,0,1.070095,-0.516586,1,1,1,0,0,1,0,False,False,False,False,False,True


In [23]:
# =============================================================================
# 6. SAVE CORRECTLY PROCESSED DATA
# =============================================================================

print(f"\n" + "="*60)
print("💾 SAVING CORRECTLY PROCESSED DATA")
print("="*60)

# Create output directory
import os
output_dir = '../../data/processed/'
os.makedirs(output_dir, exist_ok=True)

# Save the correctly processed data
output_file = f'{output_dir}asthma_data_processed.csv'
df_processed.to_csv(output_file, index=False)
print(f"✅ Saved correctly processed data: {output_file}")

# Save preprocessing summary
preprocessing_summary = {
    'original_shape': df_original.shape,
    'processed_shape': df_processed.shape,
    'id_columns_preserved': ID_COLUMNS,
    'target_column': target_column,
    'numerical_features_scaled': numerical_features,
    'categorical_features_encoded': categorical_features,
    'missing_values_filled': missing_summary[missing_summary > 0].to_dict(),
    'duplicates_removed': duplicates if 'duplicates' in locals() else 0
}

import json
with open(f'{output_dir}preprocessing_summary.json', 'w') as f:
    json.dump(preprocessing_summary, f, indent=2, default=str)

print(f"✅ Saved preprocessing summary: {output_dir}preprocessing_summary.json")


💾 SAVING CORRECTLY PROCESSED DATA
✅ Saved correctly processed data: ../../data/processed/asthma_data_processed.csv
✅ Saved preprocessing summary: ../../data/processed/preprocessing_summary.json
