<h2 style="text-align:center;">Data Preprocessing</h2>

1. [Handling Datetime Features](#date-time)
2. [Handling Name Column](#handled-names)
3. [Handling Negative Values](#negative-values)
4. [Handling Missing Values](#missing-values)
5. [Encoding](#encoding)
6. [Feature Engineering](#feature-engineering)
7. [Scaling](#scaling)
8. [Preprocessing Pipeline](#pipeline)

<h2>Imports</h2>

In [2]:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import os
import sys
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler




<h2>Loading Dataset</h2>

In [3]:

# Detect the project root by walking up from the current working directory
# until a directory that contains a 'src' folder is found.
current_dir = os.getcwd()
while not os.path.isdir(os.path.join(current_dir, 'src')):
    parent_dir = os.path.dirname(current_dir)
    # At the filesystem root, dirname() returns its argument unchanged. The
    # root itself has already been tested by the loop condition at this point.
    if parent_dir == current_dir:
        raise FileNotFoundError("Could not find 'src' directory in any parent folders.")
    current_dir = parent_dir

# Set project root, make it the working directory, and add it to sys.path
# so that the `src` package can be imported by the cells below.
PROJECT_ROOT = current_dir
print(f"Setting project root: {PROJECT_ROOT}")
os.chdir(PROJECT_ROOT)
if PROJECT_ROOT not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.insert(0, PROJECT_ROOT)


from src.data import loader, preprocessor
from src.visualization import exploration_visualized


Setting project root: c:\Users\HP\Desktop\Healthcare_test_results_classification-


In [4]:

# Reuse the project root detected above instead of a machine-specific
# absolute path, so the notebook runs unchanged on any checkout.
project_root = PROJECT_ROOT
data_path = os.path.join(project_root, 'data', 'raw')

# Load the raw train/test CSV files through the project's loader module.
train_df, test_df = loader.load_data(
    train_path=os.path.join(data_path, 'train data.csv'),
    test_path=os.path.join(data_path, 'test data.csv')
)

train_df.head()


Unnamed: 0,ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,1,Bobby JacksOn,27,Female,O-,Asthma,06/06/2022,Mark Hartman Jr.,Sons and Miller,Cigna,2625.980554,379,Elective,18/08/2022,Ibuprofen,Normal
1,2,LesLie TErRy,68,Female,O-,Cancer,19/11/2021,Angela Contreras,White-White,Cigna,1471.387317,113,Elective,20/11/2021,Ibuprofen,Inconclusive
2,3,DaNnY sMitH,21,Female,A+,Hypertension,05/03/2022,David Ruiz,Group Middleton,Medicare,5131.488104,154,Emergency,16/05/2022,Paracetamol,Normal
3,4,andrEw waTtS,91,Male,AB-,Diabetes,06/04/2020,Jenny Griffith,Morris-Arellano,Blue Cross,8972.793157,293,Urgent,26/04/2020,Ibuprofen,Abnormal
4,5,adrIENNE bEll,52,Female,A+,Diabetes,31/12/2022,Cynthia Scott,Williams-Davis,Blue Cross,2015.522684,265,Emergency,11/02/2023,Penicillin,Abnormal


<h2 id="handled-names">Handling Names Column</h2>

In [None]:
import re
import pandas as pd

def cleaned_names(df, name_column):
    """
    Normalize the capitalization and spacing of a column of person names.

    Every whitespace-separated word is lower-cased and then capitalized.
    Hyphenated words are capitalized on each side of every hyphen
    ("mary-ann" -> "Mary-Ann"). A word containing exactly one apostrophe gets
    an upper-cased prefix and a capitalized suffix ("o'brien" -> "O'Brien").
    Leading, trailing and repeated whitespace is collapsed to single spaces.
    Missing or blank values become the empty string.

    Args:
        df: pandas DataFrame. The column is overwritten in place.
        name_column: string, name of the column containing messy names

    Returns:
        The same DataFrame object, with cleaned names in ``name_column``.
    """
    def _fix_word(word):
        # The hyphen rule takes precedence over the apostrophe rule.
        if '-' in word:
            return '-'.join(piece.capitalize() for piece in word.split('-'))
        if "'" in word and len(word.split("'")) == 2:
            prefix, suffix = word.split("'")
            return prefix.upper() + "'" + suffix.capitalize()
        return word.capitalize()

    def _fix_name(value):
        if pd.isna(value):
            return ""
        # str.split() with no argument already strips the ends and collapses
        # runs of whitespace; a blank value yields no words and therefore "".
        return ' '.join(_fix_word(word) for word in str(value).lower().split())

    # A single vectorized pass replaces the per-row iloc loop, which built a
    # full row Series for every record.
    df[name_column] = df[name_column].map(_fix_name)
    return df

# cleaned_df = cleaned_names(train_df, 'Name')
# cleaned_df

Unnamed: 0,ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,1,Bobby Jackson,27,Female,O-,Asthma,06/06/2022,Mark Hartman Jr.,Sons and Miller,Cigna,2625.980554,379,Elective,18/08/2022,Ibuprofen,Normal
1,2,Leslie Terry,68,Female,O-,Cancer,19/11/2021,Angela Contreras,White-White,Cigna,1471.387317,113,Elective,20/11/2021,Ibuprofen,Inconclusive
2,3,Danny Smith,21,Female,A+,Hypertension,05/03/2022,David Ruiz,Group Middleton,Medicare,5131.488104,154,Emergency,16/05/2022,Paracetamol,Normal
3,4,Andrew Watts,91,Male,AB-,Diabetes,06/04/2020,Jenny Griffith,Morris-Arellano,Blue Cross,8972.793157,293,Urgent,26/04/2020,Ibuprofen,Abnormal
4,5,Adrienne Bell,52,Female,A+,Diabetes,31/12/2022,Cynthia Scott,Williams-Davis,Blue Cross,2015.522684,265,Emergency,11/02/2023,Penicillin,Abnormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996,Joseph Paul,42,Female,B-,Asthma,24/09/2021,Donna Martinez MD,Houston PLC,Cigna,5569.504046,119,Urgent,14/10/2021,Paracetamol,Inconclusive
49996,49997,Bradley Daniel,43,Male,A-,Asthma,21/09/2020,John Duncan,"Powers Miller, and Flores",Cigna,5190.988559,470,Urgent,28/09/2020,Aspirin,Abnormal
49997,49998,Lisa Simpson,72,Female,O+,Hypertension,19/08/2019,Timothy Baker,Schaefer-Porter,Blue Cross,181.636485,425,Urgent,10/10/2019,Aspirin,Abnormal
49998,49999,Roger Farrell,10,Female,AB-,Cancer,01/05/2024,Rose Zuniga,Nunez-Humphrey,Aetna,1398.821577,205,Urgent,06/06/2024,Penicillin,Abnormal


<h2 id="date-time">Handling DateTime Datatype</h2>

In [18]:
def handle_date_features(df, date_format="%d/%m/%Y"):
    """
    Detect text columns that consistently hold dates and convert them to datetime.

    A column is treated as date-like when it is a text (object/string) column,
    has at least one non-null value, and every non-null value parses with
    ``date_format``. Columns that fail to parse are left unchanged.

    Args:
        df: pandas DataFrame. It is not modified; a copy is returned.
        date_format: strftime-style format every value must match
            (defaults to day/month/year, e.g. "06/06/2022").

    Returns:
        A copy of ``df`` with the detected columns converted to datetime.
    """
    df = df.copy()
    date_like_columns = []

    for col in df.columns:
        column = df[col]
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        # An all-null column parses "successfully" into NaT only, so skip it
        # rather than reporting it as a date column.
        if not is_text or not column.notna().any():
            continue

        try:
            # Strict parse: a single non-matching value rejects the column.
            converted = pd.to_datetime(column, format=date_format, errors='raise')
        except (ValueError, TypeError, OverflowError):
            continue  # Not a consistently date-formatted column

        # Reuse the strict parse result instead of parsing the column twice.
        df[col] = converted
        date_like_columns.append(col)

    if not date_like_columns:
        print("ℹ️ No date-like columns were found and converted.")
    else:
        print(f"\n📅 Detected and converted date columns: {date_like_columns}")

    return df



# Convert the date-like text columns of the raw training frame.
# handle_date_features works on a copy, so train_df keeps its original text dates.
fixed_datatypes=handle_date_features(train_df)
# preprocessor.save_processed_df(fixed_datatypes,"processed_train_data.csv",output_dir="data/processed")
# fixed_datatypes.head()



📅 Detected and converted date columns: ['Date of Admission', 'Discharge Date']


<h2 id="negative-values">Handling Negative Values</h2>

In [7]:
import pandas as pd

def check_negative_values(df):
    """
    Print a report of how many negative values each numeric column contains.

    Only columns selected by ``select_dtypes(include=['number'])`` are
    inspected. Nothing is returned and ``df`` is not modified.
    """
    per_column = (
        (name, (df[name] < 0).sum())
        for name in df.select_dtypes(include=['number']).columns
    )
    # Keep only the columns that actually contain negatives, in column order.
    flagged = {name: total for name, total in per_column if total > 0}

    if not flagged:
        print("No negative values found in numerical columns.")
        return

    print("Negative values found in columns:")
    for name in flagged:
        print(f" - {name}: {flagged[name]} negative values")


def replace_negatives_with_nan(df):
    """
    Replace every negative value in the numeric columns with a missing value.

    Args:
        df: pandas DataFrame. Its numeric columns are overwritten in place.

    Returns:
        The same DataFrame object. Columns that contained negatives hold the
        missing-value marker of their dtype in those positions; integer
        columns backed by NumPy become float so they can hold NaN.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns

    for col in numeric_cols:
        negative = df[col] < 0
        if negative.any():
            # Series.mask keeps the column numeric, unlike assigning pd.NA
            # through .loc, which is an incompatible-dtype setitem for
            # NumPy-backed int/float columns.
            df[col] = df[col].mask(negative)

    return df


# # Assume df is your DataFrame
# check_negative_values(train_df)  # Check first

# df = replace_negatives_with_nan(train_df)  # Replace negatives with NaN

# check_negative_values(train_df)  # Check again to verify
# df.to_csv('cleaned_dataset.csv', index=False)


<h2 id="missing-values">Handling Missing Values</h2>

In [8]:
def handle_missing_values(df):
    """
    Fill missing values column by column and print a summary of what was done.

    Numeric columns (any integer or float dtype, excluding booleans) are
    filled with their median. Every other column is filled with its most
    frequent value. A column with no usable median or mode is left as is and
    reported in the summary.

    Args:
        df: pandas DataFrame. Columns with missing values are overwritten in place.

    Returns:
        The same DataFrame object with missing values filled where possible.
    """
    filled_info = []

    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue

        # Check the dtype family rather than the exact names 'float64' /
        # 'int64', so float32, int32 and nullable numerics are median-filled too.
        is_numeric = (pd.api.types.is_numeric_dtype(column)
                      and not pd.api.types.is_bool_dtype(column))

        if is_numeric:
            median_val = column.median()
            if pd.isna(median_val):  # column holds no values at all
                filled_info.append(f"Could not fill column '{col}' — no valid median found.")
                continue
            df[col] = column.fillna(median_val)
            filled_info.append(f"Filled numerical column '{col}' with median: {median_val}")
        else:
            modes = column.mode().dropna()
            if modes.empty:
                filled_info.append(f"Could not fill column '{col}' — no valid mode found.")
                continue
            # mode() returns its values sorted; ties resolve to the first one.
            mode_val = modes.iloc[0]
            df[col] = column.fillna(mode_val)
            filled_info.append(f"Filled categorical column '{col}' with mode: {mode_val}")

    # Print summary
    if filled_info:
        print("\n✅Missing values handled:\n" + "\n".join(filled_info))
    else:
        print("❌No missing values found.")

    return df

# handled_missing=handle_missing_values(fixed_datatypes)
# preprocessor.save_processed_df(handled_missing,"processed_train_data.csv",output_dir="data/processed")


<h2 id="encoding">Encoding</h2>

In [9]:
def encoding_features(df, max_unique_threshold=20, high_cardinality_threshold=50):
    """
    Encode the non-numeric columns of the dataset as numbers.

    Steps, in order:
      1. Drop the identifier-like columns 'ID', 'Name' and 'Room Number' if present.
      2. Replace each datetime column with '<col>_days' (whole days since 1970-01-01).
      3. One-hot encode 'Gender' (first category dropped).
      4. Map 'Test Results' to integer class ids
         (Normal=0, Abnormal=1, Inconclusive=2); any other label becomes NaN.
      5. Encode the remaining object/category columns by cardinality:
         one-hot up to ``max_unique_threshold`` distinct values, target-mean
         encoding up to ``high_cardinality_threshold``, frequency encoding above.

    Args:
        df: pandas DataFrame. It is not modified; a copy is returned.
        max_unique_threshold: largest number of distinct values that is still
            one-hot encoded.
        high_cardinality_threshold: largest number of distinct values that is
            still target encoded; columns above it are frequency encoded.

    Returns:
        The encoded copy of ``df``.
    """
    df = df.copy()

    # Drop irrelevant columns
    drop_cols = ['ID', 'Name', 'Room Number']
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])

    # Convert datetime columns to numeric features
    epoch = pd.Timestamp("1970-01-01")
    one_day = pd.Timedelta(days=1)
    for date_col in df.select_dtypes(include=['datetime64']).columns:
        # Timedelta floor-division is independent of the column's time unit
        # (ns/us/s) and yields NaN for NaT instead of a garbage integer.
        df[f'{date_col}_days'] = (df[date_col] - epoch) // one_day
        df = df.drop(columns=[date_col])
        print(f"✅ Converted '{date_col}' to integer days")

    # Handle binary categorical column: Gender
    if 'Gender' in df.columns:
        df = pd.get_dummies(df, columns=['Gender'], drop_first=True)
        print("✅ One-hot encoded 'Gender'")

    # Map the target labels to integer class ids
    if 'Test Results' in df.columns:
        df['Test Results'] = df['Test Results'].map({
            'Normal': 0, 'Abnormal': 1, 'Inconclusive': 2
        })
        print("🎯 Label encoded target column 'Test Results'")

    # Detect categorical columns
    categorical_cols = [
        col for col in df.select_dtypes(include=['object', 'category']).columns
        if col != 'Test Results'
    ]

    # Separate based on cardinality. Each column lands in exactly one bucket
    # even if the two thresholds overlap, with one-hot taking precedence.
    low_cardinality_cols, medium_cardinality_cols, high_cardinality_cols = [], [], []
    for col in categorical_cols:
        n_unique = df[col].nunique()
        if n_unique <= max_unique_threshold:
            low_cardinality_cols.append(col)
        elif n_unique <= high_cardinality_threshold:
            medium_cardinality_cols.append(col)
        else:
            high_cardinality_cols.append(col)

    # One-hot encode low-cardinality columns
    if low_cardinality_cols:
        df = pd.get_dummies(df, columns=low_cardinality_cols, drop_first=True)
        for col in low_cardinality_cols:
            print(f"✅ One-hot encoded '{col}'")

    # Target encoding for medium-cardinality columns (if target column exists).
    # Without a target these columns are left unencoded.
    # NOTE(review): the mean is taken over the integer class ids of the same
    # frame that gets encoded, so the feature sees the label it will be used
    # to predict — consider fitting the mapping on training folds only.
    if 'Test Results' in df.columns and medium_cardinality_cols:
        for col in medium_cardinality_cols:
            # Calculate mean of target for each category
            encoding_map = df.groupby(col)['Test Results'].mean().to_dict()
            df[f"{col}_target_encoded"] = df[col].map(encoding_map)
            df = df.drop(columns=[col])
            print(f"✅ Target encoded '{col}'")

    # Frequency encoding for high-cardinality columns: each category is
    # replaced by its share of the rows.
    for col in high_cardinality_cols:
        freq_map = df[col].value_counts(normalize=True).to_dict()
        df[f"{col}_freq_encoded"] = df[col].map(freq_map)
        df = df.drop(columns=[col])
        print(f"✅ Frequency encoded '{col}'")

    print(f"\n📐 Encoded shape: {df.shape}")
    return df



# encoded_df = encoding_features(train_df)
# print("Encoded shape:", encoded_df.shape)
# encoded_df=preprocessor.encoding_features(handled_missing)

# encoded_df = preprocessor.encoding_features(encoded_df)
# preprocessor.save_processed_df(encoded_df, "processed_train_data.csv", output_dir="data/processed")

# encoded_df.head()


<h2 id="feature-engineering">Feature Engineering</h2>

In [None]:
def calculate_los(df, admission_col='Date of Admission', discharge_col='Discharge Date',
                 dayfirst=True, fix_errors=True, verbose=True, max_los_days=365):
    """
    Calculate Length of Stay (LOS) in days between admission and discharge dates.

    Args:
        df: pandas DataFrame. It is modified in place: both date columns are
            converted to datetime and a 'LOS_days' column is added.
        admission_col: name of the admission date column.
        discharge_col: name of the discharge date column.
        dayfirst: passed to ``pd.to_datetime``; parse ambiguous dates as day/month.
        fix_errors: when True, rows whose discharge precedes admission get
            their two dates swapped, and stays longer than ``max_los_days``
            are capped to that value.
        verbose: when True, print parsing statistics and a sample of the result.
        max_los_days: upper bound applied to 'LOS_days' when ``fix_errors`` is
            True; pass None to disable the cap.

    Returns:
        The same DataFrame object. 'LOS_days' is NaN wherever either date
        could not be parsed.
    """
    # Parse dates with error handling: unparseable values become NaT
    df[admission_col] = pd.to_datetime(df[admission_col], dayfirst=dayfirst, errors='coerce')
    df[discharge_col] = pd.to_datetime(df[discharge_col], dayfirst=dayfirst, errors='coerce')

    # Calculate LOS
    df['LOS_days'] = (df[discharge_col] - df[admission_col]).dt.days

    if verbose:
        # Show parsing success rate
        n_missing = int(df['LOS_days'].isna().sum())
        print(f"Successfully parsed {len(df) - n_missing}/{len(df)} rows ({n_missing} failed)")

        # Show LOS statistics
        if n_missing < len(df):
            print("\nLOS statistics (before fixes):")
            print(df['LOS_days'].describe())

    # Fix common issues
    if fix_errors:
        # Fix negative LOS (swap dates)
        neg_mask = df['LOS_days'] < 0
        if neg_mask.any():
            if verbose:
                print(f"\nFound {neg_mask.sum()} negative LOS values (swapping dates)")

            # Build both swapped columns before assigning either one, so the
            # second does not read an already-updated first.
            swapped_admission = df[admission_col].where(~neg_mask, df[discharge_col])
            swapped_discharge = df[discharge_col].where(~neg_mask, df[admission_col])
            df[admission_col] = swapped_admission
            df[discharge_col] = swapped_discharge

            # Recalculate LOS after fixes
            df['LOS_days'] = (df[discharge_col] - df[admission_col]).dt.days

        # Cap unrealistic LOS. The cap is applied regardless of `verbose`;
        # only the message is optional.
        if max_los_days is not None:
            long_stays = df['LOS_days'] > max_los_days
            if long_stays.any():
                if verbose:
                    print(f"Found {long_stays.sum()} stays > {max_los_days} days (capping at {max_los_days})")
                df['LOS_days'] = df['LOS_days'].clip(upper=max_los_days)

    if verbose:
        print("\nSample of results:")
        print(df[[admission_col, discharge_col, 'LOS_days']].head())

    return df

# calculate_los(fixed_datatypes)

Successfully parsed 50000/50000 rows (0 failed)

LOS statistics (before fixes):
count    50000.000000
mean        42.652000
std         25.853069
min         -1.000000
25%         21.000000
50%         42.000000
75%         63.000000
max        100.000000
Name: LOS_days, dtype: float64

Found 203 negative LOS values (swapping dates)

Sample of results:
  Date of Admission Discharge Date  LOS_days
0        2022-06-06     2022-08-18        73
1        2021-11-19     2021-11-20         1
2        2022-03-05     2022-05-16        72
3        2020-04-06     2020-04-26        20
4        2022-12-31     2023-02-11        42


Unnamed: 0,ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results,LOS_days
0,1,Bobby Jackson,27,Female,O-,Asthma,2022-06-06,Mark Hartman Jr.,Sons and Miller,Cigna,2625.980554,379,Elective,2022-08-18,Ibuprofen,Normal,73
1,2,Leslie Terry,68,Female,O-,Cancer,2021-11-19,Angela Contreras,White-White,Cigna,1471.387317,113,Elective,2021-11-20,Ibuprofen,Inconclusive,1
2,3,Danny Smith,21,Female,A+,Hypertension,2022-03-05,David Ruiz,Group Middleton,Medicare,5131.488104,154,Emergency,2022-05-16,Paracetamol,Normal,72
3,4,Andrew Watts,91,Male,AB-,Diabetes,2020-04-06,Jenny Griffith,Morris-Arellano,Blue Cross,8972.793157,293,Urgent,2020-04-26,Ibuprofen,Abnormal,20
4,5,Adrienne Bell,52,Female,A+,Diabetes,2022-12-31,Cynthia Scott,Williams-Davis,Blue Cross,2015.522684,265,Emergency,2023-02-11,Penicillin,Abnormal,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996,Joseph Paul,42,Female,B-,Asthma,2021-09-24,Donna Martinez MD,Houston PLC,Cigna,5569.504046,119,Urgent,2021-10-14,Paracetamol,Inconclusive,20
49996,49997,Bradley Daniel,43,Male,A-,Asthma,2020-09-21,John Duncan,"Powers Miller, and Flores",Cigna,5190.988559,470,Urgent,2020-09-28,Aspirin,Abnormal,7
49997,49998,Lisa Simpson,72,Female,O+,Hypertension,2019-08-19,Timothy Baker,Schaefer-Porter,Blue Cross,181.636485,425,Urgent,2019-10-10,Aspirin,Abnormal,52
49998,49999,Roger Farrell,10,Female,AB-,Cancer,2024-05-01,Rose Zuniga,Nunez-Humphrey,Aetna,1398.821577,205,Urgent,2024-06-06,Penicillin,Abnormal,36


<h2 id="scaling">Scaling (Standardization)</h2>


Scale-sensitive methods (PCA, Logistic Regression, SVM, MLP) work best on standardized features, so we use `StandardScaler`.

In [10]:
def scale_numerical_features(df: pd.DataFrame, numeric_cols=None) -> pd.DataFrame:
    """
    Standardize selected numeric columns with scikit-learn's StandardScaler.

    Args:
        df: pandas DataFrame. It is not modified; a copy is returned.
        numeric_cols: column names to scale. Defaults to
            ['Age', 'Billing Amount'] when omitted.

    Returns:
        A copy of ``df`` whose ``numeric_cols`` have been replaced by their
        standardized values.

    Raises:
        KeyError: if any requested column is not present in ``df``.
    """
    df = df.copy()

    # Define known numerical features
    if numeric_cols is None:
        numeric_cols = ['Age', 'Billing Amount']
    numeric_cols = list(numeric_cols)

    # Fail early with an explicit message naming the missing columns.
    missing_cols = [col for col in numeric_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Columns not found for scaling: {missing_cols}")

    # NOTE(review): the scaler is fitted on the frame it transforms. When this
    # is applied to a test set, the training statistics are not reused.
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    print(f"\n✅ Scaled numerical columns: {numeric_cols}")

    print(f"\n📐 Scaled shape: {df.shape}")

    return df

# scaled_df=scale_numerical_features(encoded_df)

#  preprocessor.save_processed_df(scaled_df,"processed_train_data.csv",output_dir="data/processed")
#  scaled_df.head()

<h2 id="pipeline">Preprocessing Pipeline</h2>

In [16]:
import os

def process_and_save_data(train_df, outputpath="data/processed/", filename="processed_train_data.csv"):
    """
    Run the full preprocessing pipeline on a raw frame and save the result.

    The steps are, in order: name cleaning, date conversion, negative values
    to missing, missing-value imputation, encoding, and scaling.

    Args:
        train_df: raw pandas DataFrame. It is not modified.
        outputpath: directory for the processed file; created if missing.
        filename: name of the CSV file passed to the project's save helper.

    Returns:
        The fully processed DataFrame.
    """
    os.makedirs(outputpath, exist_ok=True)

    # Work on a copy: cleaned_names overwrites the column of the frame it is
    # given, which would otherwise alter the caller's raw data.
    working_df = train_df.copy()

    # Step-by-step preprocessing
    handled_names = cleaned_names(working_df, 'Name')
    handled_dates = handle_date_features(handled_names)
    handled_negatives = replace_negatives_with_nan(handled_dates)
    handled_missing_values = handle_missing_values(handled_negatives)
    encoded_data = encoding_features(handled_missing_values)
    scaled_data = scale_numerical_features(encoded_data)

    # Save and return
    preprocessor.save_processed_df(scaled_data, filename, output_dir=outputpath)
    # os.path.join keeps the reported path valid when outputpath has no
    # trailing separator.
    # NOTE(review): assumes save_processed_df writes to output_dir/filename — confirm.
    print(f"✅ Processed data saved to {os.path.join(outputpath, filename)}")

    return scaled_data

# Run the full preprocessing pipeline on the raw training data and preview the first rows of the result
processed_train_df = process_and_save_data(train_df, outputpath="data/processed/")
processed_train_df.head()



📅 Detected and converted date columns: ['Date of Admission', 'Discharge Date']

✅Missing values handled:
Filled categorical column 'Blood Type' with mode: B-
Filled categorical column 'Doctor' with mode: Angela Contreras
Filled categorical column 'Hospital' with mode: Houston PLC
Filled categorical column 'Insurance Provider' with mode: Blue Cross
Filled numerical column 'Billing Amount' with median: 5340.178472
Filled categorical column 'Admission Type' with mode: Urgent
✅ Converted 'Date of Admission' to integer days
✅ Converted 'Discharge Date' to integer days
✅ One-hot encoded 'Gender'
🎯 Label encoded target column 'Test Results'
✅ One-hot encoded 'Blood Type'
✅ One-hot encoded 'Medical Condition'
✅ One-hot encoded 'Insurance Provider'
✅ One-hot encoded 'Admission Type'
✅ One-hot encoded 'Medication'
✅ Frequency encoded 'Doctor'
✅ Frequency encoded 'Hospital'

📐 Encoded shape: (50000, 30)

✅ Scaled numerical columns: ['Age', 'Billing Amount']

📐 Scaled shape: (50000, 30)
✅ Process

Unnamed: 0,Age,Billing Amount,Test Results,Date of Admission_days,Discharge Date_days,Gender_Male,Blood Type_A-,Blood Type_AB+,Blood Type_AB-,Blood Type_B+,...,Insurance Provider_Medicare,Insurance Provider_UnitedHealthcare,Admission Type_Emergency,Admission Type_Urgent,Medication_Ibuprofen,Medication_Lipitor,Medication_Paracetamol,Medication_Penicillin,Doctor_freq_encoded,Hospital_freq_encoded
0,-0.7726,-0.877583,0,19149,19222,False,False,False,False,False,...,False,False,False,False,True,False,False,False,0.01056,0.027
1,0.906636,-1.239534,2,18950,18951,False,False,False,False,False,...,False,False,False,False,True,False,False,False,0.02778,0.04216
2,-1.018342,-0.092137,0,19056,19128,False,False,False,False,False,...,True,False,True,False,False,False,True,False,0.00698,0.03122
3,1.848646,1.112066,1,18358,18378,True,False,False,True,False,...,False,False,False,True,True,False,False,False,0.00132,0.02846
4,0.251324,-1.068955,1,19357,19399,False,False,False,False,False,...,False,False,True,False,False,False,False,True,0.0072,0.027


<h2>Old Pipeline</h2>

In [12]:

# class DateFeatureHandler(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None): return self
#     def transform(self, X): return preprocessor.handle_date_features(X)

# class MissingValueImputer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None): return self
#     def transform(self, X): return preprocessor.handle_missing_values(X)

# class FeatureEncoder(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None): return self
#     def transform(self, X): return preprocessor.encoding_features(X)


# class FeatureScaler(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None): return self
#     def transform(self, X): return preprocessor.scale_numerical_features(X)


# preprocessing_pipeline = Pipeline([
#     ('imputer', MissingValueImputer()),
#     ('date_handler', DateFeatureHandler()),
#     ('encoder', FeatureEncoder()),
#     ('scaler', FeatureScaler())
# ])

# processed_df = preprocessing_pipeline.fit_transform(train_df)

# preprocessor.save_processed_df(processed_df, filename="preprocessed_train_data.csv")
