# White Blood Cell Image Classification
### By [Anthony Medina](https://www.linkedin.com/in/anthony-medina-math/)

# Modeling Notebook
1. Notebook Objectives
2. Imports
3. Final Pre-Building Checks
4. Model 1 Neural Network
5. Model 2 Random Forest
6. Model 3 Gradient Boosting Machine
7. Model results analysis
8. Model Choice
9. Next Steps

### 1. Notebook Objectives

This notebook will house the model building, evaluation of each model, and picking the model with best Recall score.

### 2. Imports

In [74]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import os
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import make_scorer, recall_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier




In [3]:
df.dtypes

cell_name      object
image_array    object
dtype: object

In [9]:
# I added this block because importing my clean data was a nightmare.
# New Array that will contain the final values I need to save for modeling.
#import cv2

column_names = ['cell_name', 'image_array']

# Create a blank DataFrame with column names
df = pd.DataFrame(columns=column_names)

# Populating the data frame from the 4 different types of images
cell_names = ['neutrophil', 'monocyte', 'lymphocyte', 'eosinophil']

images = []
labels = []
for index, cell_name in enumerate(cell_names):
    print(index, cell_name)
    directory_path = '../raw_data/organized_data_set/images/' + cell_name
    for filename in os.listdir(directory_path):
        if os.path.isfile(os.path.join(directory_path, filename)):
            file_path = os.path.abspath(os.path.join(directory_path, filename))
            image = mpimg.imread(file_path) # First it's an image
            first_array = np.array(image) # Then it's an array
            float_array = first_array.astype('float32') # Now it's an array of floats
            rescaled_array = float_array / 255.0 # Rescaling the float
            new_entry = {"cell_name": cell_name, 'image_array': rescaled_array}
            df.loc[len(df)] = new_entry
            # images.append(image)
            images.append(rescaled_array)
            labels.append(cell_name)

                
X = np.array(images)
y = np.array(labels)

0 neutrophil
1 monocyte
2 lymphocyte
3 eosinophil


# GET RID OF THIS
# This is the small version of the data 

In [182]:
# I added this block because importing my clean data was a nightmare.
# New Array that will contain the final values I need to save for modeling.
#import cv2

column_names = ['cell_name', 'image_array']

# Create a blank DataFrame with column names
df = pd.DataFrame(columns=column_names)

# Populating the data frame from the 4 different types of images
cell_names = ['neutrophil', 'monocyte', 'lymphocyte', 'eosinophil']

images = []
labels = []
for index, cell_name in enumerate(cell_names):
    print(index, cell_name)
    directory_path = '../raw_data/organized_data_set/images/' + cell_name
    count = 0
    for filename in os.listdir(directory_path):
        if count < 150:
            count +=1
            if os.path.isfile(os.path.join(directory_path, filename)):
                file_path = os.path.abspath(os.path.join(directory_path, filename))
    #            print(file_path)
                image = mpimg.imread(file_path) # First it's an image
    #            image = cv2.imread(file_path)
                first_array = np.array(image) # Then it's an array
    #            reshaped_array = first_array.flatten() # Now it's a flat array
                float_array = first_array.astype('float32') # Now it's an array of floats
                rescaled_array = float_array / 255.0 # Rescaling the float
                new_entry = {"cell_name": cell_name, 'image_array': rescaled_array}
                df.loc[len(df)] = new_entry
                # images.append(image)
                images.append(rescaled_array)
                labels.append(cell_name)
                
                
X = np.array(images)
y = np.array(labels)

0 neutrophil
1 monocyte
2 lymphocyte
3 eosinophil


In [183]:
print(len(X), len(y))

600 600


### 3. Data Split

In [184]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

In [185]:
print(len(y))

600


In [208]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [209]:
print(len(y_encoded), len(set(y_encoded)) )

600 4


In [210]:
print(X_train[12].shape)

(240, 320, 3)


# 1. Efficient Net

In [217]:
# Efficient Net
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D

# Assuming you have image data and labels (X and y)

# Load the pre-trained EfficientNetB0 model
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(240, 320, 3))

# Build your custom head on top of the base model
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu'),
    Dense(4, activation='softmax')  # Assuming 4 classes
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model = create_model()
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

recall = recall_score(y_test, y_pred_classes, average='macro')
print(f'Recall: {recall}')

confusion_mtx = confusion_matrix(y_test, y_pred_classes)

print(confusion_mtx)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Recall: 0.3136998059411853
[[13  8  4  5]
 [ 8 12  3  5]
 [ 8  9  7  5]
 [14 10  4  5]]


### 4. Model 1 Neural Network

In [218]:
def create_model():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(240, 320, 3)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(len(set(y_encoded)), activation='softmax'))
    model.compile(optimizer='adam', loss= 'sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

model = create_model()
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

recall = recall_score(y_test, y_pred_classes, average='macro')
print(f'Recall: {recall}')

confusion_mtx = confusion_matrix(y_test, y_pred_classes)

print(confusion_mtx)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Recall: 0.32344939543215406
[[ 8  6 11  5]
 [ 8 11  5  4]
 [ 3  7 14  5]
 [ 8  7 13  5]]


### Best Neural Network

# Flattening the images for random forest and svm

In [83]:
X.shape

(200, 240, 320, 3)

In [204]:
num_samples, height, width, channels = X.shape
X_flat = X.reshape(num_samples, -1)

# flattened_images is now a 2D array with shape (num_samples, height * width) or (num_samples, height * width * channels) for color images

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_flat, y_encoded, test_size=0.2, random_state=42)

In [205]:
X_flat.shape

(600, 230400)

### 5. Model 2 Random Forest

In [198]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import confusion_matrix

# Assuming you have features X and corresponding labels y

# Create the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform cross-validation and get predicted labels
y_pred_cv = cross_val_predict(rf_classifier, X_flat, y_encoded, cv=5)

# Calculate and print the confusion matrix
recall = recall_score(y_encoded, y_pred_cv, average='macro')
print(f'Recall: {recall}')

conf_matrix = confusion_matrix(y_encoded, y_pred_cv)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.38666666666666666
Confusion Matrix:
[[55 34 22 39]
 [37 71 17 25]
 [24 18 78 30]
 [36 35 51 28]]


In [96]:
"""# Assuming you have X_train, y_train for your data
# You should also have X_test and y_test for evaluation

# Define the Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Define hyperparameters and their possible values for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [4, 8, 10]
}

# Create a GridSearchCV object
recall_scorer = make_scorer(recall_score, average='macro')
grid_search = GridSearchCV(rf_classifier, param_grid, cv=3, scoring=recall_scorer)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_




# Calculate confusion matrix
y_pred = best_estimator.predict(X_flat)
conf_matrix = confusion_matrix(y_encoded, y_pred)

# Print the confusion matrix
print('Confusion Matrix:')
print(conf_matrix)

# Evaluate the model
recall = best_estimator.score(X_test, y_test)
print(f'Best Parameters: {best_params}')
print(f'Recall: {recall}')"""

Confusion Matrix:
[[41  3  0  6]
 [ 3 44  2  1]
 [ 7  2 39  2]
 [ 1  1  1 47]]
Best Parameters: {'max_depth': None, 'min_samples_split': 8, 'n_estimators': 50}
Recall: 0.275


In [97]:
"""# Calculate confusion matrix
y_pred = best_estimator.predict(X_flat)
conf_matrix = confusion_matrix(y_encoded, y_pred)

# Print the confusion matrix
print('Confusion Matrix:')
print(conf_matrix)

# Evaluate the model
recall = best_estimator.score(X_test, y_test)
print(f'Best Parameters: {best_params}')
print(f'Recall: {recall}')"""

Confusion Matrix:
[[41  3  0  6]
 [ 3 44  2  1]
 [ 7  2 39  2]
 [ 1  1  1 47]]
Best Parameters: {'max_depth': None, 'min_samples_split': 8, 'n_estimators': 50}
Recall: 0.275


### Best Random Forest

### 6. Model 3 Gradient Boosting Machine

In [206]:
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = gb_classifier.predict(X_test)

recall = recall_score(y_encoded, y_pred, average='macro')
print(f'Recall: {recall}')

conf_matrix = confusion_matrix(y_encoded, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

KeyboardInterrupt: 

In [194]:
"""from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Assuming you have X_train, y_train for your data
# You should also have X_test and y_test for evaluation

# Define the Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier()

# Define hyperparameters and their possible values for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10]
}

# Create a GridSearchCV object

recall_scorer = make_scorer(recall_score, average='macro')
grid_search = GridSearchCV(gb_classifier, param_grid, cv=3, scoring=recall_scorer)



# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_



# Calculate confusion matrix
y_pred = best_estimator.predict(X_flat)
conf_matrix = confusion_matrix(y_encoded, y_pred)

# Print the confusion matrix
print('Confusion Matrix:')
print(conf_matrix)

# Evaluate the model
recall = best_estimator.score(X_test, y_test)
print(f'Best Parameters: {best_params}')
print(f'Recall: {recall}')"""

"from sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.model_selection import GridSearchCV\n\n# Assuming you have X_train, y_train for your data\n# You should also have X_test and y_test for evaluation\n\n# Define the Gradient Boosting Classifier\ngb_classifier = GradientBoostingClassifier()\n\n# Define hyperparameters and their possible values for grid search\nparam_grid = {\n    'n_estimators': [50, 100, 200],\n    'learning_rate': [0.01, 0.1, 0.2],\n    'max_depth': [3, 4, 5],\n    'min_samples_split': [2, 5, 10]\n}\n\n# Create a GridSearchCV object\n\nrecall_scorer = make_scorer(recall_score, average='macro')\ngrid_search = GridSearchCV(gb_classifier, param_grid, cv=3, scoring=recall_scorer)\n\n\n\n# Fit the grid search to the training data\ngrid_search.fit(X_train, y_train)\n\n# Get the best parameters and best estimator\nbest_params = grid_search.best_params_\nbest_estimator = grid_search.best_estimator_\n\n\n\n# Calculate confusion matrix\ny_pred = best_estimator

# Without flattening

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [None]:
"""from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Assuming you have X_train, y_train for your data
# You should also have X_test and y_test for evaluation

# Define the Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier()

# Define hyperparameters and their possible values for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10]
}

# Create a GridSearchCV object

recall_scorer = make_scorer(recall_score, average='macro')
grid_search = GridSearchCV(gb_classifier, param_grid, cv=3, scoring=recall_scorer)



# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_



# Calculate confusion matrix
y_pred = best_estimator.predict(X_flat)
conf_matrix = confusion_matrix(y_encoded, y_pred)

# Print the confusion matrix
print('Confusion Matrix:')
print(conf_matrix)

# Evaluate the model
recall = best_estimator.score(X_test, y_test)
print(f'Best Parameters: {best_params}')
print(f'Recall: {recall}')"""

### Best Gradient Boosting Machine

In [None]:
"""# Using Grid Search for optimizing the Neural Network
def create_model(learning_rate=0.001, optimizer='adam', batch_size=32):
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(240, 320, 3)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(len(set(y_encoded)), activation='softmax'))  # Number of classes
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


# Create a KerasClassifier based on your model
model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=32, verbose=0)

# Define the hyperparameters to search through
param_grid = {
    'learning_rate' : [0.001, 0.01],
    'batch_size': [16, 32],
    'epochs' : [5, 10],
    'optimizer': ['adam', 'rmsprop']
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='recall_macro')

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_




# Calculate confusion matrix
y_pred = best_estimator.predict(X)
conf_matrix = confusion_matrix(y_encoded, y_pred)

# Print the confusion matrix
print('Confusion Matrix:')
print(conf_matrix)

# Evaluate the model
recall = best_estimator.score(X_test, y_test)
print(f'Best Parameters: {best_params}')
print(f'Recall: {recall}')"""

In [None]:
"""y_pred = model.predict(X)
# Compute the confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)"""