In [345]:
import pandas as pd
import numpy as np

In [346]:
healthy_ranges= {
    "Glucose": (70, 99),           # mg/dL (Fasting)
    "Cholesterol": (125, 199),     # mg/dL (Total)
    "Hemoglobin": (12.0, 17.5),    # g/dL
    "Platelets": (150000, 450000), # per microliter of blood
    "White Blood Cells": (4000, 11000), # per cubic millimeter of blood
    "Red Blood Cells": (4.0, 6.0), # million cells per microliter of blood
    "Hematocrit": (36, 52),        # percentage
    "Mean Corpuscular Volume": (80, 100), # femtoliters
    "Mean Corpuscular Hemoglobin": (27, 33), # picograms
    "Mean Corpuscular Hemoglobin Concentration": (32, 36), # grams per deciliter
    "Insulin": (2, 20),            # microU/mL (Fasting)
    "BMI": (18.5, 24.9),           # kg/m^2
    "Systolic Blood Pressure": (90, 119), # mmHg
    "Diastolic Blood Pressure": (60, 79), # mmHg
    "Triglycerides": (50, 150),    # mg/dL
    "HbA1c": (4.0, 5.6),           # percentage
    "LDL Cholesterol": (70, 99),   # mg/dL
    "HDL Cholesterol": (40, 90),   # mg/dL (Set high max as higher is better)
    "ALT": (10, 40),               # U/L
    "AST": (10, 40),               # U/L
    "Heart Rate": (60, 100),       # beats per minute
    "Creatinine": (0.6, 1.2),      # mg/dL
    "Troponin": (0, 0.04),         # ng/mL
    "C-reactive Protein": (0, 3)   # mg/L
}

dataset_ranges ={ "Glucose": (70, 140),  # mg/dL
"Cholesterol": (125, 200),  # mg/dL
"Hemoglobin": (13.5, 17.5),  # g/dL
"Platelets": (150000, 450000),  # per microliter of blood
"White Blood Cells": (4000, 11000),  # per cubic millimeter of blood
"Red Blood Cells": (4.2, 5.4),  # million cells per microliter of blood
"Hematocrit": (38, 52),  # percentage
"Mean Corpuscular Volume": (80, 100),  # femtoliters
"Mean Corpuscular Hemoglobin": (27, 33),  # picograms
"Mean Corpuscular Hemoglobin Concentration": (32, 36),  # grams per deciliter
"Insulin": (5, 25),  # microU/mL
"BMI": (18.5, 24.9),  # kg/m^2
"Systolic Blood Pressure": (90, 120),  # mmHg
"Diastolic Blood Pressure": (60, 80),  # mmHg
"Triglycerides": (50, 150),  # mg/dL
"HbA1c": (4, 6),  # percentage
"LDL Cholesterol": (70, 130),  # mg/dL
"HDL Cholesterol": (40, 60),  # mg/dL
"ALT": (10, 40),  # U/L
"AST": (10, 40),  # U/L
"Heart Rate": (60, 100),  # beats per minute
"Creatinine": (0.6, 1.2),  # mg/dL
"Troponin": (0, 0.04),  # ng/mL
"C-reactive Protein": (0, 3),  # mg/L
}

dataset_max = {key: value[1] for key, value in dataset_ranges.items()}
dataset_min = {key: value[0] for key, value in dataset_ranges.items()}

In [347]:
df = pd.read_csv("blood_samples_dataset_test.csv")
df.shape

(486, 25)

In [348]:
def gaussian_between(min_val, max_val, mean_val, std_ratio=6):
    std = (max_val - min_val) / std_ratio  # smaller std = tighter around mean
    val = np.random.normal(mean_val, std)
    return np.clip(val, min_val, max_val)

def generate_raw_healthy_row(healthy_ranges):
    row = {}
    for feature, (low, high) in healthy_ranges.items():
        row[feature] = np.random.uniform(low, high)
        #row[feature] = gaussian_between(low, high, (low + high) / 2)
    return row

def scale_row(raw_row, dataset_min, dataset_max):
    scaled = {}
    for feature, value in raw_row.items():
        scaled[feature] = (value - dataset_min[feature]) / (dataset_max[feature] - dataset_min[feature])
    return scaled

def generate_scaled_healthy_samples(n, healthy_ranges, dataset_min, dataset_max):
    samples = []

    for _ in range(n):
        raw_row = generate_raw_healthy_row(healthy_ranges)
        scaled_row = scale_row(raw_row, dataset_min, dataset_max)
        scaled_row["Disease"] = "Healthy"   # Add label
        samples.append(scaled_row)

    return pd.DataFrame(samples)

healthy_samples_df = generate_scaled_healthy_samples(30, healthy_ranges, dataset_min, dataset_max)
healthy_samples_df.head()

Unnamed: 0,Glucose,Cholesterol,Hemoglobin,Platelets,White Blood Cells,Red Blood Cells,Hematocrit,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration,...,HbA1c,LDL Cholesterol,HDL Cholesterol,ALT,AST,Heart Rate,Creatinine,Troponin,C-reactive Protein,Disease
0,0.073468,0.043506,0.133216,0.251976,0.939406,1.457814,0.627063,0.114293,0.886821,0.08292,...,0.733827,0.022342,2.481677,0.174941,0.673322,0.244904,0.766768,0.602777,0.490922,Healthy
1,0.221248,0.065765,0.098065,0.945224,0.635852,-0.107301,0.863773,0.052304,0.614714,0.109635,...,0.676544,0.413954,1.782734,0.61237,0.871191,0.203913,0.182985,0.588618,0.088579,Healthy
2,0.034908,0.900529,-0.263312,0.549239,0.123144,1.392,0.083678,0.769987,0.110873,0.970511,...,0.588843,0.410737,0.907685,0.599445,0.461463,0.682233,0.237975,0.083069,0.591293,Healthy
3,0.363209,0.956371,0.099294,0.986197,0.16384,0.595349,0.014418,0.341817,0.163856,0.760036,...,0.117785,0.190182,1.467158,0.643317,0.07061,0.303878,0.115838,0.374427,0.043513,Healthy
4,0.29798,0.176961,-0.073548,0.28819,0.829541,0.827469,0.16632,0.334886,0.652298,0.609033,...,0.390654,0.451597,0.431927,0.469283,0.123819,0.64626,0.732692,0.828583,0.832474,Healthy


In [349]:
df_clean = df[df['Disease'] != 'Healthy']
df_clean = pd.concat([df_clean, healthy_samples_df], ignore_index=True)
df_clean.shape

(511, 25)

In [352]:
df_clean["Disease"].value_counts()

Disease
Diabetes    294
Anemia       84
Thalasse     48
Heart Di     39
Healthy      30
Thromboc     16
Name: count, dtype: int64

In [353]:
x = df_clean.drop("Disease", axis=1)
y = df_clean["Disease"]

x = (x-x.min())/(x.max()-x.min())

classes = ['Anemia', 'Diabetes', 'Healthy', 'Heart Di', 'Thalasse', 'Thromboc']
encodings = [0, 1, 2, 3, 4, 5]

encode_dict = dict(zip(classes, encodings))
decode_dict = {v: k for k, v in encode_dict.items()}

y = y.map(encode_dict)
#print(encoder.classes_, encoder.transform(encoder.classes_))

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(x_train, y_train)

In [354]:
from xgboost import XGBClassifier

model = XGBClassifier(
    objective='multi:softprob',
    num_class=6,
    eval_metric='mlogloss',
    tree_method='hist',
    max_depth=8,
    learning_rate=0.03,
    n_estimators=600,
    gamma=0,
    min_child_weight=1,
    alpha=0,
    lambd=1,
)

model.set_params(objective='multi:softprob', 
                 params={'use_focal_loss': True})

weights = np.ones(len(y_res))

increased = 5
# increase weight for specific classes
weights[y_res == 0] = 0.5
weights[y_res == 1] = 0.5
weights[y_res == 2] = 0.5
weights[y_res == 3] = 5
weights[y_res == 4] = 5
weights[y_res == 5] = 5

model.fit(X_res, y_res, sample_weight=weights)

Parameters: { "lambd", "params" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [355]:
probs = model.predict_proba(x_test)
threshold = 0.2   # more sensitive

pred_threshold = []

y_pred = []
for p in probs:
    if p.max() < threshold and p.argmax() == 2:  # if max prob is less than threshold or predicted as Healthy
        sorted_indices = np.argsort(p, axis=1)
        descending_indices = sorted_indices[:, ::-1]
        second_highest_indices = descending_indices[:, 1]
        y_pred.append(second_highest_indices)  # fallback
    else:
        y_pred.append(p.argmax())




from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy = accuracy_score(y_test, y_pred)
recall = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9126
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86        10
           1       0.97      0.98      0.97        59
           2       0.75      0.60      0.67        10
           3       1.00      0.71      0.83         7
           4       0.86      0.92      0.89        13
           5       0.80      1.00      0.89         4

    accuracy                           0.91       103
   macro avg       0.87      0.85      0.85       103
weighted avg       0.91      0.91      0.91       103

Confusion Matrix:
[[ 9  0  0  0  0  1]
 [ 0 58  1  0  0  0]
 [ 2  1  6  0  1  0]
 [ 0  1  0  5  1  0]
 [ 0  0  1  0 12  0]
 [ 0  0  0  0  0  4]]


In [356]:
import joblib
joblib.dump(model, "model_four_pm.pkl")

['model_four_pm.pkl']