# White Blood Cell Image Classification
### By [Anthony Medina](https://www.linkedin.com/in/anthony-medina-math/)

# Modeling Notebook
1. Notebook Objectives
2. Imports
3. Final Pre-Building Checks
4. Model 1 Neural Network
5. Model 2 Random Forest
6. Model 3 Gradient Boosting Machine
7. Model results analysis
8. Model Choice
9. Next Steps

### 1. Notebook Objectives

This notebook will house the model building, evaluation of each model, and picking the model with best Recall score.

### 2. Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import os
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, recall_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

2023-09-05 17:53:18.878533: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_csv('../cleaned_data/cleaned_data.csv')
df.head()

Unnamed: 0,cell_name,image_array
0,neutrophil,[[[0.01176471 0. 0. ]\n [0.031...
1,neutrophil,[[[0. 0. 0. ]\n [0. ...
2,neutrophil,[[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]\n .....
3,neutrophil,[[[0. 0. 0. ]\n [0. ...
4,neutrophil,[[[0. 0. 0. ]\n [0. ...


In [3]:
df.dtypes

cell_name      object
image_array    object
dtype: object

In [39]:
# df['image_array'] = df['image_array'].apply(lambda x: np.fromstring(x[1:-1], sep='\n'))

In [2]:
# I added this block because importing my clean data was a nightmare.
# New Array that will contain the final values I need to save for modeling.

column_names = ['cell_name', 'image_array']

# Create a blank DataFrame with column names
df = pd.DataFrame(columns=column_names)

# Populating the data frame from the 4 different types of images
cell_names = ['neutrophil', 'monocyte', 'lymphocyte', 'eosinophil']

for index, cell_name in enumerate(cell_names):
    print(index, cell_name)
    directory_path = '../raw_data/organized_data_set/images/' + cell_name
    for filename in os.listdir(directory_path):
        if os.path.isfile(os.path.join(directory_path, filename)):
            file_path = os.path.abspath(os.path.join(directory_path, filename))
            image = mpimg.imread(file_path) # First it's an image
            first_array = np.array(image) # Then it's an array
#            reshaped_array = first_array.flatten() # Now it's a flat array
            float_array = first_array.astype('float32') # Now it's an array of floats
            rescaled_array = float_array / 255.0 # Rescaling the float
            new_entry = {"cell_name": cell_name, 'image_array': rescaled_array}
            df.loc[len(df)] = new_entry

0 neutrophil
1 monocyte
2 lymphocyte
3 eosinophil


In [8]:
df.to_parquet('../cleaned_data/df.csv')

ArrowInvalid: ('Can only convert 1-dimensional array values', 'Conversion failed for column image_array with type object')

In [6]:
df.dtypes

cell_name      object
image_array    object
dtype: object

In [6]:
df.head()

Unnamed: 0,cell_name,image_array
0,neutrophil,"[[[0.011764706, 0.0, 0.0], [0.03137255, 0.0156..."
1,neutrophil,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,..."
2,neutrophil,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,..."
3,neutrophil,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,..."
4,neutrophil,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,..."


### 3. Data Split

In [3]:
X = np.array(df['image_array'].tolist())
y = np.array(df['cell_name'])
recall_list = []
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 4. Model 1 Neural Network

In [4]:
def create_model():
    model = Sequential([
        # Add convolutional 2D layer
#        Conv2D(input_shape=(num_features,), filters = 4, kernel_size, strides=(1, 1), padding="valid", data_format=None, dilation_rate=(1, 1), activation='relu',)
        # Maxpool Layer
#        MaxPooling2D(pool_size=(2, 2), strides=None, padding="valid", data_format=None, **kwargs)
        # Drop out
        
        # Flattening Layer
        
        Dense(64, activation='relu', input_shape=(240, 320, 3)),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define constants
num_features = X_train.shape[1]
num_classes = 4

# Create a KerasClassifier based on the create_model function
model = KerasClassifier(build_fn=create_model, epochs = 10, batch_size = 32, verbose=0)

# Define the parameter grid for grid search
param_grid = {
    'epochs': [10, 20, 50],
    'batch_size': [32, 64],
}

# Define the scoring function (recall)
scoring = make_scorer(recall_score, average='macro')

# Initialize Kfold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, scoring=scoring, cv=kfold)

# Fit the grid search to your training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator from grid search
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on your test data
y_pred = best_model.predict(X_test)
recall = recall_score(y_test, y_pred, average='macro')


y_pred2 = best_model.predict(X)
cm = confusion_matrix(y, y_pred2)

# Display the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


print("Best Parameters:", best_params)
print("Test Recall:", recall)
recall_list.append(recall)


  model = KerasClassifier(build_fn=create_model, epochs = 10, batch_size = 32, verbose=0)
2023-09-05 17:58:40.891629: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/keras/wrappers/scikit_learn.py", line 248, in fit
    return super().fit(x, y, **kwargs)
  File "/opt/anaconda3/lib/python3.9/site-packages/keras/wrappers/scikit_learn.py", line 175, in fit
    history = self.model.fit(x, y, **fit_args)
  File "/opt/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/var/folders/b0/x4p60r3n3dz3lbgp1_bgg05h0000gn/T/__autograph_generated_filel7oxgjhf.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
ValueError: in user code:

    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 4) and (None, 240, 320, 4) are incompatible



### Best Neural Network

### 5. Model 2 Random Forest

In [5]:
# Define the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the scoring function (recall)
scoring = make_scorer(recall_score, average='macro')

# Initialize KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(rf_classifier, param_grid, cv=kfold, scoring=scoring)

# Fit the grid search to your training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator from grid search
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Evaluate the best model on your test data
y_pred = best_rf.predict(X_test)
recall = recall_score(y_test, y_pred, average='macro')



y_pred2 = best_rf.predict(X)
cm = confusion_matrix(y, y_pred2)

# Display the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


print("Best Parameters:", best_params)
print("Test Recall:", recall)
recall_list.append(recall)

KeyboardInterrupt: 

In [None]:
### Best Random Forest

### 6. Model 3 Gradient Boosting Machine

In [10]:
# Define the Gradient Boosting classifier
gbm_classifier = GradientBoostingClassifier(random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Define the scoring function (recall)
scoring = make_scorer(recall_score, average='macro')

# Initialize Kfold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(gbm_classifier, param_grid, scoring=scoring, cv=kfold)

# Fit the grid search to your training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator from grid search
best_params = grid_search.best_params_
best_gbm = grid_search.best_estimator_

# Evaluate the best model on your test data
y_pred = best_gbm.predict(X_test)
recall = recall_score(y_test, y_pred, average='macro')


y_pred2 = best_gbm.predict(X)
cm = confusion_matrix(y, y_pred2)

# Display the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


print("Best Parameters:", best_params)
print("Test Recall:", recall)
recall_list.append(recall)

ValueError: 
All the 1215 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
972 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 429, in fit
    X, y = self._validate_data(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 940, in check_array
    raise ValueError(
ValueError: Found array with 0 feature(s) (shape=(7959, 0)) while a minimum of 1 is required by GradientBoostingClassifier.

--------------------------------------------------------------------------------
243 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 429, in fit
    X, y = self._validate_data(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 940, in check_array
    raise ValueError(
ValueError: Found array with 0 feature(s) (shape=(7960, 0)) while a minimum of 1 is required by GradientBoostingClassifier.


In [None]:
### Best Gradient Boosting Machine

### 7. Model results analysis

In [None]:
print('All Model Scores', recall_list)

### 8. Model Choice

In [None]:
max_index = recall_list.index(max(recall_list))
print('The model with the best score was model number', max_index + 1, '.')

## The best model

### 9. Next Steps


Create a final model using the best parameters from the random forest model