# 02 Data Preprocessing and Cleaning

## 2.1. Handle Missing Values


In [3]:
import pandas as pd

In [4]:
# Read the raw multiclass malaria/typhoid CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='C:\\Users\\555555\\Malaria-Typhoid-ML-Diagnosis\\data\\malaria_typhoid_multiclass.csv',
)
# Show the first rows as the cell output
df.head(n=5)

Unnamed: 0,age,sex,fever,cold,rigor,fatigue,headache,bitter_tongue,vomiting,diarrhea,convulsion,anemia,jaundice,cocacola_urine,hypoglycemia,prostration,hyperpyrexia,diagnosis
0,52,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,both
1,15,1,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,none
2,61,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,both
3,21,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,none
4,24,0,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,malaria


In [6]:
# Check missing values: per-column null counts, printing only columns that have any
missing = df.isnull().sum()
print("Missing values:\n", missing[missing > 0])
# Define symptoms as all binary feature columns except age, sex, diagnosis
symptoms = [
    'fever', 'cold', 'rigor', 'fatigue', 'headache', 'bitter_tongue',
    'vomiting', 'diarrhea', 'convulsion', 'anemia', 'jaundice',
    'cocacola_urine', 'hypoglycemia', 'prostration', 'hyperpyrexia'
]
# Impute missing numerics with median, binaries with mode if any (should be none in synthetic data).
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary column
# object, raises a FutureWarning on pandas 2.x and no longer updates `df` once
# copy-on-write is enabled.
if df['age'].isnull().any():
    df['age'] = df['age'].fillna(df['age'].median())
# `sex` is binary as well, so it follows the same mode-imputation policy as the
# symptom flags; the `symptoms` list itself is left unchanged.
for col in ['sex', *symptoms]:
    if df[col].isnull().any():
        col_mode = df[col].mode()
        # An entirely-null column has no mode; leave it untouched rather than fail.
        if not col_mode.empty:
            df[col] = df[col].fillna(col_mode.iloc[0])

Missing values:
 Series([], dtype: int64)


## 2.2. Encode Categorical Variables

In [7]:
from sklearn.preprocessing import LabelEncoder

# Turn the string diagnosis labels into integer class codes
le = LabelEncoder()
df['diagnosis_encoded'] = le.fit_transform(df['diagnosis'])

# Report which integer code was assigned to each original label
class_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print("Class mapping:", class_mapping)

Class mapping: {'both': np.int64(0), 'malaria': np.int64(1), 'none': np.int64(2), 'typhoid': np.int64(3)}


## 2.3. Outlier Handling

In [9]:
import numpy as np

# Constrain ages to the closed range [0, 99]; values outside it are set to the nearest bound
df['age'] = df['age'].clip(lower=0, upper=99)

## 2.4. Feature Engineering

In [10]:
# Age bins: integer codes 0-4 for the ranges [0, 12], (12, 18], (18, 40], (40, 60], (60, 100].
# include_lowest=True closes the first interval on the left; without it pd.cut
# treats the first bin as (0, 12], so an age of exactly 0 would get NaN.
df['age_bin'] = pd.cut(
    df['age'],
    bins=[0, 12, 18, 40, 60, 100],
    labels=False,
    include_lowest=True,
)

# Example interaction: fever and headache (1 only when both flags are 1)
df['fever_headache'] = df['fever'] * df['headache']

## 2.5. Train-Test Split


In [11]:
from sklearn.model_selection import train_test_split

# Every column except the raw and the encoded target is used as a model input
target_columns = ['diagnosis', 'diagnosis_encoded']
features = df.columns[~df.columns.isin(target_columns)].tolist()
X = df[features]
y = df['diagnosis_encoded']

# 80/20 hold-out split, stratified on the encoded target, with a fixed seed
split_options = dict(test_size=0.20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split_name, split_frame in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} shape:", split_frame.shape)

Train shape: (1600, 19)
Test shape: (400, 19)


## 2.6. Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

# Standardise `age`: the scaler is fitted on the training rows only and the
# fitted transform is then applied to the test rows.
# NOTE: this cell overwrites `age` in X_train / X_test, so running it twice
# would scale values that are already scaled.
scaler = StandardScaler()
# fit_transform / transform return (n_samples, 1) arrays; ravel() flattens them
# to the 1-D shape a single column expects. `.assign` builds new frames instead
# of writing into the objects returned by train_test_split, which avoids
# pandas' SettingWithCopyWarning.
X_train = X_train.assign(age=scaler.fit_transform(X_train[['age']]).ravel())
X_test = X_test.assign(age=scaler.transform(X_test[['age']]).ravel())

In [16]:
# Save processed train/test data for use in the next notebook
from pathlib import Path

# Single definition of the output folder (same location as before).
processed_dir = Path(r'C:\Users\555555\Malaria-Typhoid-ML-Diagnosis\data\processed')
# to_csv does not create missing folders, so make sure the target exists first.
processed_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object to persist; each is written as '<stem>_cleaned.csv' without the index.
processed_outputs = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for stem, data in processed_outputs.items():
    data.to_csv(processed_dir / f'{stem}_cleaned.csv', index=False)