### Step 1: Importing dependencies

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
import os

### Step 2: Loading the dataset

In [2]:
# Read the raw patient records (path is relative to the notebook's working directory)
# and preview the first rows.
raw_csv_path = '../dataset/patient_data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,C,Age,History,Patient,TakeMedication,Severity,BreathShortness,VisualChanges,NoseBleeding,Whendiagnoused,Systolic,Diastolic,ControlledDiet,Stages
0,Male,18-34,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
1,Female,18-34,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
2,Male,35-50,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
3,Female,35-50,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
4,Male,51-64,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)


### Step 3: Renaming the column 'C' to 'Gender'

In [3]:
# Give the cryptically named 'C' column a descriptive label.
# Rebinding the result (instead of inplace=True) avoids mutating the frame that
# earlier cells already displayed; re-running this cell is a harmless no-op.
df = df.rename(columns={'C': 'Gender'})
df.head()

Unnamed: 0,Gender,Age,History,Patient,TakeMedication,Severity,BreathShortness,VisualChanges,NoseBleeding,Whendiagnoused,Systolic,Diastolic,ControlledDiet,Stages
0,Male,18-34,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
1,Female,18-34,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
2,Male,35-50,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
3,Female,35-50,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
4,Male,51-64,Yes,No,No,Mild,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)


### Step 4: Check for Missing Values

In [4]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

Gender             0
Age                0
History            0
Patient            0
TakeMedication     0
Severity           0
BreathShortness    0
VisualChanges      0
NoseBleeding       0
Whendiagnoused     0
Systolic           0
Diastolic          0
ControlledDiet     0
Stages             0
dtype: int64

In [5]:
# Summary statistics; every column holds strings here, so the table reports
# count / unique / top / freq rather than numeric aggregates.
df.describe()

Unnamed: 0,Gender,Age,History,Patient,TakeMedication,Severity,BreathShortness,VisualChanges,NoseBleeding,Whendiagnoused,Systolic,Diastolic,ControlledDiet,Stages
count,1825,1825,1825,1825,1825,1825,1825,1825,1825,1825,1825,1825,1825,1825
unique,2,4,2,2,3,3,2,2,3,3,5,5,2,6
top,Female,51-64,Yes,No,No,Moderate,No,No,No,<1 Year,111 - 120,81 - 90,No,HYPERTENSION (Stage-1)
freq,913,475,1657,984,744,697,976,940,984,625,1008,708,984,648


### Step 5: Inspect Data Types
Confirming that all columns are currently `object` (string) type, indicating they need encoding.



In [6]:
# Print a heading, then each column's dtype and non-null count.
print("DataFrame Info (Data Types and Non-Null Counts):")
df.info()

DataFrame Info (Data Types and Non-Null Counts):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1825 entries, 0 to 1824
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gender           1825 non-null   object
 1   Age              1825 non-null   object
 2   History          1825 non-null   object
 3   Patient          1825 non-null   object
 4   TakeMedication   1825 non-null   object
 5   Severity         1825 non-null   object
 6   BreathShortness  1825 non-null   object
 7   VisualChanges    1825 non-null   object
 8   NoseBleeding     1825 non-null   object
 9   Whendiagnoused   1825 non-null   object
 10  Systolic         1825 non-null   object
 11  Diastolic        1825 non-null   object
 12  ControlledDiet   1825 non-null   object
 13  Stages           1825 non-null   object
dtypes: object(14)
memory usage: 199.7+ KB


Discover Unique Values for Categorical Columns

We will use `df['column_name'].unique()` to get all distinct values.

In [7]:
# Show the column labels (after renaming 'C' to 'Gender').
df.columns

Index(['Gender', 'Age', 'History', 'Patient', 'TakeMedication', 'Severity',
       'BreathShortness', 'VisualChanges', 'NoseBleeding', 'Whendiagnoused',
       'Systolic', 'Diastolic', 'ControlledDiet', 'Stages'],
      dtype='object')

In [8]:
# Walk over (label, Series) pairs and report each column's dtype and distinct values,
# which exposes typos and stray whitespace in the categorical data.
for column_name, column_values in df.items():
    print(f"--- Unique Values for '{column_name}' ---")
    print(f"Column: {column_name}, Data Type: {column_values.dtype}, Unique Values: {column_values.unique()}")
    print("-" * 40)

--- Unique Values for 'Gender' ---
Column: Gender, Data Type: object, Unique Values: ['Male' 'Female']
----------------------------------------
--- Unique Values for 'Age' ---
Column: Age, Data Type: object, Unique Values: ['18-34' '35-50' '51-64' '65+']
----------------------------------------
--- Unique Values for 'History' ---
Column: History, Data Type: object, Unique Values: ['Yes' 'No']
----------------------------------------
--- Unique Values for 'Patient' ---
Column: Patient, Data Type: object, Unique Values: ['No' 'Yes']
----------------------------------------
--- Unique Values for 'TakeMedication' ---
Column: TakeMedication, Data Type: object, Unique Values: ['No' 'Yes ' 'Yes']
----------------------------------------
--- Unique Values for 'Severity' ---
Column: Severity, Data Type: object, Unique Values: ['Mild' 'Sever' 'Moderate']
----------------------------------------
--- Unique Values for 'BreathShortness' ---
Column: BreathShortness, Data Type: object, Unique Values:

### Step 6: Data Cleaning - Resolving Typos and Inconsistencies
This is to ensure that each unique concept has a single, consistent representation.

In [9]:
print("--- Starting Data Cleaning for Inconsistencies ---")

# 1. Trim leading/trailing whitespace in every object (string) column so that
#    variants such as 'Yes ' and 'Yes' collapse into a single value.
text_columns = df.select_dtypes(include=['object']).columns
df[text_columns] = df[text_columns].apply(lambda values: values.str.strip())
print("1. Applied .str.strip() to all object columns to handle whitespace issues.")

# 2. Fix known misspellings / punctuation slips, one exact-match mapping per column.

# Severity: the truncated 'Sever' becomes 'Severe'.
severity_fixes = {'Sever': 'Severe'}
df['Severity'] = df['Severity'].replace(severity_fixes)
print("2. Corrected 'Sever' to 'Severe' in 'Severity' column.")

# Stages: drop the trailing period and complete the truncated 'CRISI'.
stage_fixes = {
    'HYPERTENSION (Stage-2).': 'HYPERTENSION (Stage-2)',
    'HYPERTENSIVE CRISI': 'HYPERTENSIVE CRISIS',
}
df['Stages'] = df['Stages'].replace(stage_fixes)
print("3. Corrected typos and punctuation in 'Stages' column.")

# Systolic: normalise the one range written without a space before the dash.
systolic_fixes = {'121- 130': '121 - 130'}
df['Systolic'] = df['Systolic'].replace(systolic_fixes)
print("4. Corrected inconsistent spacing in 'Systolic' column.")

# Re-print the distinct values so the effect of the cleaning can be checked by eye.
print("\n--- Re-checking Unique Values After Cleaning ---")
for column_name, column_values in df.items():
    print(f"--- Unique Values for '{column_name}' ---")
    print(f"Column: {column_name}, Data Type: {column_values.dtype}, Unique Values: {column_values.unique()}")
    print("-" * 40)

--- Starting Data Cleaning for Inconsistencies ---
1. Applied .str.strip() to all object columns to handle whitespace issues.
2. Corrected 'Sever' to 'Severe' in 'Severity' column.
3. Corrected typos and punctuation in 'Stages' column.
4. Corrected inconsistent spacing in 'Systolic' column.

--- Re-checking Unique Values After Cleaning ---
--- Unique Values for 'Gender' ---
Column: Gender, Data Type: object, Unique Values: ['Male' 'Female']
----------------------------------------
--- Unique Values for 'Age' ---
Column: Age, Data Type: object, Unique Values: ['18-34' '35-50' '51-64' '65+']
----------------------------------------
--- Unique Values for 'History' ---
Column: History, Data Type: object, Unique Values: ['Yes' 'No']
----------------------------------------
--- Unique Values for 'Patient' ---
Column: Patient, Data Type: object, Unique Values: ['No' 'Yes']
----------------------------------------
--- Unique Values for 'TakeMedication' ---
Column: TakeMedication, Data Type: ob

### Step 7: Handle Categorical Data
Transforming string-based categorical features into numerical formats suitable for machine learning models. We'll differentiate between Nominal and Ordinal categories.

Define Ordinal Columns and their specific, ORDERED categories

The order reflects a logical, typically increasing, severity or quantity.

In [10]:
# Ordered category lists keyed by column name. The position of a value in its
# list is the integer the ordinal encoder will assign to it.
# NOTE(review): 'Systolic' places '100+' below '111 - 120' while 'Diastolic' places
# '100+' above '91 - 100' — confirm both orders against the data dictionary.
ordinal_cols_and_order = dict(
    Age=['18-34', '35-50', '51-64', '65+'],
    Severity=['Mild', 'Moderate', 'Severe'],
    Whendiagnoused=['<1 Year', '1 - 5 Years', '>5 Years'],
    Systolic=['100+', '111 - 120', '121 - 130', '130+'],
    Diastolic=['70 - 80', '81 - 90', '91 - 100', '100+', '130+'],
    Stages=['NORMAL', 'HYPERTENSION (Stage-1)', 'HYPERTENSION (Stage-2)', 'HYPERTENSIVE CRISIS'],
)

# Column names only, in the same (insertion) order as the mapping above.
ordinal_features = [*ordinal_cols_and_order]

Define Nominal Columns (no inherent order, just a list of column names)

These are features such as 'Yes'/'No' answers, which have no natural order and can be converted to binary values.

In [11]:
# Columns with no inherent order; these are one-hot encoded later.
nominal_features = [
    'Gender', 'History', 'Patient', 'TakeMedication',
    'BreathShortness', 'VisualChanges', 'NoseBleeding', 'ControlledDiet',
]

# Sanity check: every DataFrame column must be classified as ordinal or nominal,
# and no classified name may be absent from the DataFrame.
all_expected_columns = set(df.columns)
identified_features = set(ordinal_features) | set(nominal_features)

unclassified_columns = all_expected_columns - identified_features
unknown_columns = identified_features - all_expected_columns

if unclassified_columns or unknown_columns:
    print("\nWARNING: Mismatch in column identification!")
    print(f"Columns in DataFrame but not identified: {unclassified_columns}")
    print(f"Columns identified but not in DataFrame: {unknown_columns}")
else:
    print("\nAll DataFrame columns successfully identified as either ordinal or nominal features.")


All DataFrame columns successfully identified as either ordinal or nominal features.


Create and Apply ColumnTransformer for Encoding

This step applies `OrdinalEncoder` to ordinal features and `OneHotEncoder` to nominal features,
creating a fully numerical DataFrame.

In [12]:
# Build one transformer that encodes every column of the cleaned frame:
#   * 'ord' - OrdinalEncoder using the explicit category order from
#     ordinal_cols_and_order (a value missing from its list raises an error).
#   * 'ohe' - OneHotEncoder for the nominal columns; a category unseen during
#     fit is encoded as all zeros instead of raising.
# NOTE(review): 'Stages' is encoded here together with the input features. If it
# is the prediction target, the pickled preprocessor will demand a 'Stages'
# column at inference time - confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        # Apply OrdinalEncoder to ordinal features with their defined categories
        ('ord', OrdinalEncoder(categories=[ordinal_cols_and_order[col] for col in ordinal_features]), ordinal_features),
        # Apply OneHotEncoder to nominal features
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_features)
    ],
    remainder='passthrough',  # Keep any columns not explicitly transformed (should be none here as all are identified)
    # Report plain names ('Age', 'Gender_Male') rather than prefixed ones
    # ('ord__Age', 'ohe__Gender_Male') so the output columns keep their familiar labels.
    verbose_feature_names_out=False
)

# The output of ColumnTransformer is a NumPy array.
df_processed_array = preprocessor.fit_transform(df)

# Per-transformer names, kept for any later cell that refers to them.
# Ordinal features retain their original names.
ordinal_transformed_names = ordinal_features
# One-Hot Encoded features get new names (e.g., 'Gender_Male', 'Gender_Female')
ohe_transformed_names = preprocessor.named_transformers_['ohe'].get_feature_names_out(nominal_features)

# Ask the fitted transformer for the full list of output names instead of
# concatenating the two lists by hand: this also covers any column passed
# through by `remainder`, so the labels always line up with the array columns.
all_transformed_column_names = list(preprocessor.get_feature_names_out())

# Create the new DataFrame with processed data and correct column names
df_processed = pd.DataFrame(df_processed_array, columns=all_transformed_column_names)


Verify Processed DataFrame

In [13]:
# Check the dtypes and non-null counts of the encoded frame.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1825 entries, 0 to 1824
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  1825 non-null   float64
 1   Severity             1825 non-null   float64
 2   Whendiagnoused       1825 non-null   float64
 3   Systolic             1825 non-null   float64
 4   Diastolic            1825 non-null   float64
 5   Stages               1825 non-null   float64
 6   Gender_Female        1825 non-null   float64
 7   Gender_Male          1825 non-null   float64
 8   History_No           1825 non-null   float64
 9   History_Yes          1825 non-null   float64
 10  Patient_No           1825 non-null   float64
 11  Patient_Yes          1825 non-null   float64
 12  TakeMedication_No    1825 non-null   float64
 13  TakeMedication_Yes   1825 non-null   float64
 14  BreathShortness_No   1825 non-null   float64
 15  BreathShortness_Yes  1825 non-null   f

In [14]:
# Preview the first rows of the encoded frame.
df_processed.head()

Unnamed: 0,Age,Severity,Whendiagnoused,Systolic,Diastolic,Stages,Gender_Female,Gender_Male,History_No,History_Yes,...,TakeMedication_No,TakeMedication_Yes,BreathShortness_No,BreathShortness_Yes,VisualChanges_No,VisualChanges_Yes,NoseBleeding_No,NoseBleeding_Yes,ControlledDiet_No,ControlledDiet_Yes
0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,2.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [15]:
# Spot-check a few encoded columns: ordinal ones should hold integer-valued codes,
# one-hot ones should hold only 0 and 1.
print("\nUnique values of a few processed ordinal columns (should now be numerical):")
for ordinal_column in ('Age', 'Severity', 'Stages'):
    print(f"{ordinal_column} unique values: {df_processed[ordinal_column].unique()}")

print("\nUnique values of a few processed nominal (one-hot encoded) columns (should be 0 or 1):")
for indicator_column in ('Gender_Male', 'History_Yes'):
    print(f"{indicator_column} unique values: {df_processed[indicator_column].unique()}")


Unique values of a few processed ordinal columns (should now be numerical):
Age unique values: [0. 1. 2. 3.]
Severity unique values: [0. 2. 1.]
Stages unique values: [1. 2. 3. 0.]

Unique values of a few processed nominal (one-hot encoded) columns (should be 0 or 1):
Gender_Male unique values: [1. 0.]
History_Yes unique values: [1. 0.]


### Step 8: Saving the cleaned, encoded dataset and the fitted preprocessor

In [None]:
# Destination files for the encoded table and the fitted transformer.
output_csv_path = '../dataset/numerical_processed_patient_data.csv'
preprocessor_path = '../models/fitted_preprocessor.pkl'

# Make sure both target folders exist before anything is written.
for target_dir in ('../dataset', '../models'):
    os.makedirs(target_dir, exist_ok=True)

# 1. Write the encoded DataFrame as CSV (row index omitted).
df_processed.to_csv(output_csv_path, index=False)
print(f"Processed DataFrame saved to: {output_csv_path}")

# 2. Serialise the fitted preprocessor with pickle so the same encoding can be reused.
with open(preprocessor_path, 'wb') as pickle_file:
    pickle.dump(preprocessor, pickle_file)
print(f"Fitted preprocessor saved to: {preprocessor_path}")


Processed DataFrame saved to: ../dataset/numerical_processed_patient_data.csv
Fitted preprocessor saved to: ../models/fitted_preprocessor.pkl
