BLOOD TEST ANALYZER MODEL


In [None]:
#Import Statements

import warnings

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer



# Define the feature columns, grouped by the preprocessing they receive,
# then read the blood-test data from a CSV file in the working directory.

categorical_columns = ['Sex']
numeric_columns = [
    'Age',
    'Hemoglobin',
    'Hematocrit',
    'WBC',
    'Platelets',
    'MCH',
    'MCV',
]

file_path_btest = 'btest_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path_btest)


# Seed shared by every stochastic step of the blood-test pipeline so that
# re-running the notebook reproduces the same fitted (and later saved) model.
RANDOM_STATE = 42


# Set up a column transformer to process the numerical and categorical columns

pre_processor = ColumnTransformer(transformers=[
    # Numeric features are mapped onto their empirical quantiles.
    ('num', QuantileTransformer(n_quantiles=250, random_state=RANDOM_STATE), numeric_columns),
    # handle_unknown='ignore' matches the symptoms pre-processor: a category that
    # was not seen while fitting is encoded as all zeros instead of raising.
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns)
])


# Create a Pipeline: preprocessing followed by one random forest per target column

pipe = Pipeline([
    ('pre_processor', pre_processor),
    ('model', MultiOutputClassifier(RandomForestClassifier(random_state=RANDOM_STATE)))
])


# Create parameters for the grid
#
# Each key starts with 'model__estimator__': 'model' is the pipeline step and
# 'estimator' is the forest wrapped by the multi-output classifier in that step.

param_grid = dict(
    model__estimator__n_estimators=[100, 200],
    model__estimator__max_depth=[10, 20, None],
    model__estimator__min_samples_split=[2, 5],
    model__estimator__min_samples_leaf=[1, 2],
)


# Create a grid

def multioutput_weighted_f1(y_true, y_pred):
    """Return the mean of the per-target weighted F1 scores.

    Parameters
    ----------
    y_true : DataFrame or 2-D array-like
        True labels, one column per target.
    y_pred : DataFrame or 2-D array-like
        Predicted labels, with columns in the same order as ``y_true``.

    Returns
    -------
    float
        Weighted F1 computed separately for every column, then averaged.
    """
    # Compare columns by position: predictions arrive without column names.
    true_frame = pd.DataFrame(y_true).reset_index(drop=True)
    pred_frame = pd.DataFrame(y_pred)
    per_target = [
        f1_score(true_frame.iloc[:, i], pred_frame.iloc[:, i], average='weighted')
        for i in range(true_frame.shape[1])
    ]
    return sum(per_target) / len(per_target)


# The built-in 'f1_weighted' scorer does not accept multi-output targets: with
# it every fold score is NaN and the search cannot rank the candidates. The
# custom scorer above evaluates each target column separately instead.
grid1 = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=make_scorer(multioutput_weighted_f1),
    cv=5
)


# Fit data and predict

X = df[['Age','Sex','Hemoglobin','Hematocrit','WBC','Platelets','MCH','MCV',]]
y = df[['Hemoglobin_Status','Hematocrit_Status','WBC_Status','Platelets_Status','MCH_Status','MCV_Status','Condition']]

# NOTE: this silences UserWarnings for the rest of the kernel session,
# including scikit-learn's "Scoring failed" warning.
warnings.filterwarnings("ignore", category=UserWarning)

grid1.fit(X,y)

# Because warnings are silenced, fail loudly if scoring produced no usable
# value; otherwise best_estimator_ would be an arbitrary candidate.
if pd.isna(grid1.best_score_):
    raise RuntimeError(
        "Grid search produced no finite score; check the scorer and the target columns."
    )

# In-sample predictions: X is the same data the search was fitted on.
prediction = grid1.predict(X)

EFFICIENCY GRAPH BLOOD TEST ANALYZER MODEL

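This cell plots, for each target column, the proportion of samples in each class for the true labels next to the proportion for the model's predictions. The predictions are made on the same data the model was trained on, so closely matching bars are a sanity check rather than evidence of accuracy on unseen data.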

In [None]:
# Creating a graph to compare, per target column, the class proportions of
# the true labels with those of the predictions

# Wrap the prediction array so each target can be addressed by its column name
pred_df = pd.DataFrame(prediction, columns=y.columns)

# Width of one bar; the "Predicted" bar is shifted right by this amount
width = 0.35

for col in y.columns:
    fig, ax = plt.subplots(figsize=(8, 4))

    # Share of samples per class (values sum to 1 within each series)
    real_counts = y[col].value_counts(normalize=True)
    pred_counts = pred_df[col].value_counts(normalize=True)

    # Union of classes so a class missing from one side is drawn as a zero bar
    labels = sorted(set(y[col]) | set(pred_df[col]))

    real_vals = [real_counts.get(label, 0) for label in labels]
    pred_vals = [pred_counts.get(label, 0) for label in labels]

    positions = range(len(labels))

    ax.bar(positions, real_vals, width=width, label='Real', align='center')
    ax.bar([p + width for p in positions], pred_vals, width=width, label='Predicted', align='center')

    # Centre each tick between the two bars of its class
    ax.set_xticks([p + width/2 for p in positions])
    ax.set_xticklabels(labels)
    ax.set_title(f'Comparison Real vs Predicted - {col}')
    ax.set_ylabel('Proportion of samples')
    ax.legend()
    fig.tight_layout()
    plt.show()
    # Release the figure explicitly so figures do not accumulate across iterations
    plt.close(fig)


SYMPTOMS ANALYZER MODEL

In [None]:
# Import statements
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
import pandas as pd
import warnings


# Load dataset
file_path_symp = 'symptoms_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path_symp)


# Clean dataset: remove every double-quote character from the column names
df.columns = [name.replace('"', '') for name in df.columns]


# Define numeric, categorical and symptom columns:
# 'Age' is the only numeric feature, 'Disease' is the target, and every
# remaining column is treated as categorical.
numeric_column = ['Age']
categorical_columns = df.columns.drop(['Age', 'Disease']).to_list()


# Seed shared by every stochastic step of the symptoms pipeline so that
# re-running the notebook reproduces the same fitted (and later saved) model.
RANDOM_STATE = 42


# Create a pre_processor
pre_processor = ColumnTransformer(transformers=[
    # 'Age' is mapped onto its empirical quantiles.
    ('num', QuantileTransformer(n_quantiles=250, random_state=RANDOM_STATE), numeric_column),
    # Categories not seen while fitting are encoded as all zeros instead of raising.
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns)
])


# Create a pipeline: preprocessing followed by a single random forest
pipe = Pipeline([
    ('pre_processor', pre_processor),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])


# Define parameter grid
# The 'model__' prefix addresses the random forest in the pipeline's 'model' step.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[10, 20, None],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
)


# Define X and y: 'Disease' is the label, every other column is a feature.
# These assignments rebind X and y, which the blood-test cell also defines.
y = df['Disease']
X = df.drop(columns='Disease')


# Create grid search: 5-fold cross-validation scored by weighted F1
grid2 = GridSearchCV(pipe, param_grid, scoring='f1_weighted', cv=5)


# Ignore warnings and fit model
# The filter is global: UserWarnings stay silenced for the rest of the session.
warnings.filterwarnings(action="ignore", category=UserWarning)

grid2.fit(X, y)

# In-sample predictions: X is the same data the search was fitted on.
prediction = grid2.predict(X)


EFFICIENCY GRAPH SYMPTOMS ANALYZER MODEL

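This cell plots the true and predicted disease for every sample whose true disease is among the 10 most frequent. The predictions are made on the same data the model was trained on, so overlapping markers are a sanity check rather than evidence of accuracy on unseen data.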

In [None]:
# Import statements

import matplotlib.pyplot as plt


# Create a graph to compare values from the 10 most common diseases

true_labels = pd.Series(y)
predicted_labels = pd.Series(prediction)

# Keep only the samples whose true disease is one of the ten most frequent
top_diseases = true_labels.value_counts().head(10).index
mask = true_labels.isin(top_diseases)

# Renumber from zero so both series share the same x positions
filtered_y = true_labels[mask].reset_index(drop=True)
filtered_pred = predicted_labels[mask].reset_index(drop=True)

fig, ax = plt.subplots(figsize=(12, 6))

# Markers only (no connecting line): one 'o' per true label, one 'x' per prediction
ax.plot(filtered_y.index, filtered_y, label='Real', marker='o', linestyle='', alpha=0.7)
ax.plot(filtered_pred.index, filtered_pred, label='Predicted', marker='x', linestyle='', alpha=0.7)

ax.set_title('Real vs Predicted Diseases (Top 10 Most Frequent)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Disease')
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()




SAVING TRAINED MODELS

In [None]:
#Saving trained models

import joblib

# Persist only the refitted best pipeline of each search, not the search object.
# Each file is written to the current working directory.
joblib.dump(value=grid2.best_estimator_, filename='symptoms_model.pkl')
joblib.dump(value=grid1.best_estimator_, filename='btest_model.pkl')

['btest_model.pkl']