<h2 style="text-align:center;">Data Preprocessing</h2>

<h2>Imports</h2>

In [156]:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import os
import sys
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler




<h2>🔃Loading Dataset</h2>

In [157]:

# Detect the project root by walking up from the current working directory
# until a folder that contains a 'src' directory is found.
current_dir = os.getcwd()
while not os.path.isdir(os.path.join(current_dir, 'src')):
    parent_dir = os.path.dirname(current_dir)
    # dirname() of the filesystem root returns the root itself. Testing this
    # *before* moving up guarantees the root directory was also checked for
    # 'src' by the loop condition instead of being skipped.
    if parent_dir == current_dir:
        raise FileNotFoundError("Could not find 'src' directory in any parent folders.")
    current_dir = parent_dir

# Set project root, make it the working directory, and add it to sys.path so
# that the project-local `src` package below can be imported.
PROJECT_ROOT = current_dir
print(f"Setting project root: {PROJECT_ROOT}")
os.chdir(PROJECT_ROOT)
sys.path.insert(0, PROJECT_ROOT)


from src.data import loader, preprocessor
from src.visualization import exploration_visualized


Setting project root: c:\Users\HP\Desktop\Healthcare_test_results_classification-


In [158]:

# Reuse PROJECT_ROOT (computed by the root-detection cell) instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
project_root = PROJECT_ROOT
data_path = os.path.join(project_root, 'data', 'raw')

# Load the raw train/test splits through the project-local loader module.
train_df, test_df = loader.load_data(
    train_path=os.path.join(data_path, 'train data.csv'),
    test_path=os.path.join(data_path, 'test data.csv')
)

train_df.head()


Unnamed: 0,ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,1,Bobby JacksOn,27,Female,O-,Asthma,06/06/2022,Mark Hartman Jr.,Sons and Miller,Cigna,2625.980554,379,Elective,18/08/2022,Ibuprofen,Normal
1,2,LesLie TErRy,68,Female,O-,Cancer,19/11/2021,Angela Contreras,White-White,Cigna,1471.387317,113,Elective,20/11/2021,Ibuprofen,Inconclusive
2,3,DaNnY sMitH,21,Female,A+,Hypertension,05/03/2022,David Ruiz,Group Middleton,Medicare,5131.488104,154,Emergency,16/05/2022,Paracetamol,Normal
3,4,andrEw waTtS,91,Male,AB-,Diabetes,06/04/2020,Jenny Griffith,Morris-Arellano,Blue Cross,8972.793157,293,Urgent,26/04/2020,Ibuprofen,Abnormal
4,5,adrIENNE bEll,52,Female,A+,Diabetes,31/12/2022,Cynthia Scott,Williams-Davis,Blue Cross,2015.522684,265,Emergency,11/02/2023,Penicillin,Abnormal


<h2>Handling DateTime Datatype</h2>

In [159]:
def handle_date_features(df):
    """Convert object columns that consistently hold ``DD/MM/YYYY`` dates to datetime.

    Every object-dtype column is probed by parsing it with the fixed format
    ``%d/%m/%Y``. A column is converted only if *all* of its non-null values
    parse; otherwise it is left untouched. A short summary of the converted
    columns is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the detected date columns converted to datetime.
    """
    df = df.copy()
    date_like_columns = []

    for col in df.columns:
        if df[col].dtype != 'object':
            continue

        # A column without a single non-null value would parse "successfully"
        # to all-NaT and be mislabelled as a date column, so skip it.
        if not df[col].notna().any():
            continue

        try:
            # Strict parse: any value that does not match the format raises,
            # which marks the column as "not a date column".
            converted = pd.to_datetime(df[col], format="%d/%m/%Y", errors='raise')
        except (ValueError, TypeError):
            continue  # Not a consistently date-formatted column

        # Reuse the strict parse result instead of parsing the column again.
        df[col] = converted
        date_like_columns.append(col)

    if not date_like_columns:
        print("ℹ️ No date-like columns were found and converted.")
    else:
        print(f"\n📅 Detected and converted date columns: {date_like_columns}")

    return df



# fixed_datatypes=preprocessor.handle_date_features(train_df)
# preprocessor.save_processed_df(fixed_datatypes,"processed_train_data.csv",output_dir="data/processed")
# fixed_datatypes.head()


<h2>Handling Missing Values</h2>

In [161]:
def handle_missing_values(df):
    """Impute missing values column by column and print a summary.

    * Floating-point columns are filled with the column median.
    * All other columns that contain nulls are filled with the first mode.
    * Columns for which no median/mode exists (every value is missing) are
      left as they are and reported as not filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with missing values imputed where possible.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()
    filled_info = []

    for col in df.columns:
        if not df[col].isnull().any():
            continue

        # NumPy integer columns cannot hold NaN, so in practice only float
        # columns (of any width, e.g. float32) reach the median branch.
        # Nullable integer columns keep using the mode because a fractional
        # median could not be stored in them.
        if pd.api.types.is_float_dtype(df[col]):
            median_val = df[col].median()
            if pd.isna(median_val):
                # Every value is missing: there is nothing to impute from.
                filled_info.append(f"Could not fill column '{col}' — no valid median found.")
            else:
                df[col] = df[col].fillna(median_val)
                filled_info.append(f"Filled numerical column '{col}' with median: {median_val}")
        else:
            modes = df[col].mode().dropna()
            if modes.empty:
                filled_info.append(f"Could not fill column '{col}' — no valid mode found.")
            else:
                # mode() returns its values sorted; take the first on ties.
                mode_val = modes.iloc[0]
                df[col] = df[col].fillna(mode_val)
                filled_info.append(f"Filled categorical column '{col}' with mode: {mode_val}")

    # Print summary
    if filled_info:
        print("\n✅Missing values handled:\n" + "\n".join(filled_info))
    else:
        print("❌No missing values found.")

    return df

# handled_missing=preprocessor.handle_missing_values(fixed_datatypes)
# preprocessor.save_processed_df(handled_missing,"processed_train_data.csv",output_dir="data/processed")


<h2>Handling Negative Values</h2>

In [163]:

def Handle_Negative_Values(df: pd.DataFrame) -> pd.DataFrame:
    """Replace negative entries in numeric columns with the column median.

    For each numeric column that contains negative values, the median of the
    remaining (non-negative) values is computed and written over the negative
    entries. A message is printed for every column that is changed.

    If a column has no non-negative values, no median exists; the column is
    left unchanged and a warning is printed, rather than overwriting the
    negatives with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with negative numeric values replaced.
    """
    df = df.copy()
    numeric_cols = df.select_dtypes(include='number').columns

    for col in numeric_cols:
        # Identify negative values
        neg_mask = df[col] < 0
        if not neg_mask.any():
            continue

        # Compute median of non-negative values
        median_val = df.loc[~neg_mask, col].median()
        if pd.isna(median_val):
            # Nothing to derive a replacement from; writing NaN here would
            # silently introduce missing values.
            print(f"\n⚠️ Skipped '{col}': no non-negative values available to compute a median.")
            continue

        # A fractional median cannot be stored in a NumPy integer column;
        # upcast explicitly instead of relying on implicit upcasting during
        # assignment.
        col_dtype = df[col].dtype
        if (
            isinstance(col_dtype, np.dtype)
            and col_dtype.kind == "i"
            and not float(median_val).is_integer()
        ):
            df[col] = df[col].astype("float64")

        # Replace negative values with median
        df.loc[neg_mask, col] = median_val

        print(f"\n🔄 Replaced {neg_mask.sum()} negative values in '{col}' with median: {median_val}")

    return df



<h2>Encoding</h2>

In [164]:
def encoding_features(df: pd.DataFrame, max_unique_threshold=50) -> pd.DataFrame:
    """Encode the categorical columns of the dataset into numeric form.

    Steps, in order:

    1. Drop the identifier-like columns ``ID``, ``Name`` and ``Room Number``
       when they are present.
    2. One-hot encode ``Gender`` (first level dropped).
    3. Map the target ``Test Results`` to integer class codes
       (Normal=0, Abnormal=1, Inconclusive=2).
    4. One-hot encode the remaining object columns with at most
       ``max_unique_threshold`` distinct values (first level dropped).
    5. Frequency encode the remaining object columns with more distinct
       values, replacing each value by its number of occurrences.

    Progress messages and the final shape are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the function works on a copy.
    max_unique_threshold : int, default 50
        Largest number of distinct values for which a column is still
        one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        The encoded copy of ``df``.
    """
    encoded = df.copy()

    # Remove identifier-like columns that are present in this frame.
    identifier_cols = ('ID', 'Name', 'Room Number')
    present_identifiers = [name for name in identifier_cols if name in encoded.columns]
    encoded = encoded.drop(columns=present_identifiers)

    # Binary categorical column.
    if 'Gender' in encoded.columns:
        encoded = pd.get_dummies(encoded, columns=['Gender'], drop_first=True)
        print("\n✅ One-hot encoded 'Gender'.")

    # Target column: fixed label -> class-code mapping.
    if 'Test Results' in encoded.columns:
        target_codes = {'Normal': 0, 'Abnormal': 1, 'Inconclusive': 2}
        encoded['Test Results'] = encoded['Test Results'].map(target_codes)
        print("🎯 Label encoded target column 'Test Results'.")

    # Split the remaining object columns by cardinality in a single pass.
    one_hot_cols = []
    frequency_cols = []
    for name in encoded.select_dtypes(include='object').columns.tolist():
        if name == 'Test Results':
            continue
        if encoded[name].nunique() <= max_unique_threshold:
            one_hot_cols.append(name)
        else:
            frequency_cols.append(name)

    # Low cardinality -> one-hot.
    if one_hot_cols:
        encoded = pd.get_dummies(encoded, columns=one_hot_cols, drop_first=True)
        for name in one_hot_cols:
            print(f"✅ One-hot encoded '{name}'.")

    # High cardinality -> occurrence counts.
    for name in frequency_cols:
        occurrence_counts = encoded[name].value_counts()
        encoded[name] = encoded[name].map(occurrence_counts)
        print(f"✅ Frequency encoded '{name}'.")

    print(f"\n📐 Encoded shape: {encoded.shape}")
    return encoded



# encoded_df = preprocessor.encode_students_dataset(train_df)
# print("Encoded shape:", encoded_df.shape)
# encoded_df=preprocessor.encoding_features(handled_missing)

# encoded_df = preprocessor.encoding_features(encoded_df)
# preprocessor.save_processed_df(encoded_df, "processed_train_data.csv", output_dir="data/processed")

# encoded_df.head()


<h2>Scaling (Standardization)</h2>


PCA, Logistic Regression, SVM, and MLP are sensitive to feature scale → use StandardScaler before fitting them.

In [166]:
def scale_numerical_features(df: pd.DataFrame, numeric_cols=None) -> pd.DataFrame:
    """Standardize selected numeric columns with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the function works on a copy.
    numeric_cols : iterable of str, optional
        Columns to scale. Defaults to ``['Age', 'Billing Amount']``, the
        columns this function always scaled before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which the selected columns were replaced by the
        output of ``StandardScaler.fit_transform``.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.

    Notes
    -----
    The scaler is fitted inside this function and not returned, so the fitted
    statistics cannot be reused to transform another frame (e.g. a test
    split) with the same parameters.
    """
    df = df.copy()

    # Define known numerical features (default) or use the caller's selection.
    if numeric_cols is None:
        numeric_cols = ['Age', 'Billing Amount']
    else:
        numeric_cols = list(numeric_cols)

    # Fail early with an explicit message instead of an indexing error.
    missing_cols = [col for col in numeric_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Columns not found for scaling: {missing_cols}")

    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    print(f"\n✅ Scaled numerical columns: {numeric_cols}")

    print(f"\n📐 Scaled shape: {df.shape}")

    return df

#  scaled_df=preprocessor.scale_numerical_features(encoded_df)

#  preprocessor.save_processed_df(scaled_df,"processed_train_data.csv",output_dir="data/processed")
#  scaled_df.head()

<h2>Preprocessing Pipeline</h2>

In [168]:
import os

def process_and_save_data(train_df, outputpath="data/processed/"):
    """Run the full preprocessing chain on a frame and save the result.

    The steps are applied in this order: date conversion, missing-value
    imputation, negative-value replacement, categorical encoding and numeric
    scaling. The result is handed to ``preprocessor.save_processed_df`` under
    the file name ``processed_train_data.csv``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw input frame.
    outputpath : str, default "data/processed/"
        Output directory; created if it does not exist. A trailing slash is
        optional.

    Returns
    -------
    pandas.DataFrame
        The fully preprocessed frame.
    """
    output_filename = "processed_train_data.csv"
    os.makedirs(outputpath, exist_ok=True)

    # Step-by-step preprocessing
    handled_dates = handle_date_features(train_df)
    handled_missing_values = handle_missing_values(handled_dates)
    handled_negatives = Handle_Negative_Values(handled_missing_values)
    encoded_data = encoding_features(handled_negatives)
    scaled_data = scale_numerical_features(encoded_data)

    # Save and return
    preprocessor.save_processed_df(scaled_data, output_filename, output_dir=outputpath)
    # Build the reported path with os.path.join so the message is also correct
    # when `outputpath` is given without a trailing separator.
    print(f"✅ Processed data saved to {os.path.join(outputpath, output_filename)}")

    return scaled_data

# Run the preprocessing pipeline on the training frame, writing the result
# under data/processed/, then preview the first rows of the processed frame.
processed_train_df = process_and_save_data(train_df, outputpath="data/processed/")
processed_train_df.head()



📅 Detected and converted date columns: ['Date of Admission', 'Discharge Date']

✅Missing values handled:
Filled categorical column 'Blood Type' with mode: B-
Filled categorical column 'Doctor' with mode: Angela Contreras
Filled categorical column 'Hospital' with mode: Houston PLC
Filled categorical column 'Insurance Provider' with mode: Blue Cross
Filled numerical column 'Billing Amount' with median: 5313.5078885
Filled categorical column 'Admission Type' with mode: Urgent

🔄 Replaced 247 negative values in 'Billing Amount' with median: 5313.5078885

✅ One-hot encoded 'Gender'.
🎯 Label encoded target column 'Test Results'.
✅ One-hot encoded 'Blood Type'.
✅ One-hot encoded 'Medical Condition'.
✅ One-hot encoded 'Insurance Provider'.
✅ One-hot encoded 'Admission Type'.
✅ One-hot encoded 'Medication'.
✅ Frequency encoded 'Doctor'.
✅ Frequency encoded 'Hospital'.

📐 Encoded shape: (50000, 30)

✅ Scaled numerical columns: ['Age', 'Billing Amount']

📐 Scaled shape: (50000, 30)
✅ Saved proce

Unnamed: 0,Age,Date of Admission,Doctor,Hospital,Billing Amount,Discharge Date,Test Results,Gender_Male,Blood Type_A-,Blood Type_AB+,...,Insurance Provider_Blue Cross,Insurance Provider_Cigna,Insurance Provider_Medicare,Insurance Provider_UnitedHealthcare,Admission Type_Emergency,Admission Type_Urgent,Medication_Ibuprofen,Medication_Lipitor,Medication_Paracetamol,Medication_Penicillin
0,-0.7726,2022-06-06,528,1350,-0.877417,2022-08-18,0,False,False,False,...,False,True,False,False,False,False,True,False,False,False
1,0.906636,2021-11-19,1389,2108,-1.239366,2021-11-20,2,False,False,False,...,False,True,False,False,False,False,True,False,False,False
2,-1.018342,2022-03-05,349,1561,-0.091974,2022-05-16,0,False,False,False,...,False,False,True,False,True,False,False,False,True,False
3,1.848646,2020-04-06,66,1423,1.112222,2020-04-26,1,True,False,False,...,True,False,False,False,False,True,True,False,False,False
4,0.251324,2022-12-31,360,1350,-1.068787,2023-02-11,1,False,False,False,...,True,False,False,False,True,False,False,False,False,True


<h2>Old Pipeline</h2>

In [None]:

# class DateFeatureHandler(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None): return self
#     def transform(self, X): return preprocessor.handle_date_features(X)

# class MissingValueImputer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None): return self
#     def transform(self, X): return preprocessor.handle_missing_values(X)

# class FeatureEncoder(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None): return self
#     def transform(self, X): return preprocessor.encoding_features(X)


# class FeatureScaler(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None): return self
#     def transform(self, X): return preprocessor.scale_numerical_features(X)


# preprocessing_pipeline = Pipeline([
#     ('imputer', MissingValueImputer()),
#     ('date_handler', DateFeatureHandler()),
#     ('encoder', FeatureEncoder()),
#     ('scaler', FeatureScaler())
# ])

# processed_df = preprocessing_pipeline.fit_transform(train_df)

# preprocessor.save_processed_df(processed_df, filename="preprocessed_train_data.csv")
