# White Blood Cell Image Classification
### By [Anthony Medina](https://www.linkedin.com/in/anthony-medina-math/)

# Modeling Notebook
1. Notebook Objectives
2. Imports
3. Final Pre-Building Checks
4. Model 1 Neural Network
5. Model 2 Random Forest
6. Model 3 Gradient Boosting Machine
7. Model results analysis
8. Model Choice
9. Export Model
10. Next Steps

### 1. Notebook Objectives

This notebook will house the model building, evaluation of each model, and picking the model with best Recall score.

### 2. Imports

In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
dtype_mapping = {
    'cell_key': int,
    'image_array': list
}
df = pd.read_csv('../cleaned_data/cleaned_data.csv')
df.head()

Unnamed: 0,cell_key,image_array
0,0,[0.01176471 0. 0. ... 0. ...
1,0,[0. 0. 0. ... 0.760784...
2,0,[0. 0. 0. ... 0. 0. 0.]
3,0,[0. 0. 0. ... 0.764705...
4,0,[0. 0. 0. ... 0. 0. 0.]


In [32]:
df.dtypes

cell_key        int64
image_array    object
dtype: object

In [61]:
df["image_array"] = np.array(df["image_array"])

In [62]:
df.dtypes

cell_key        int64
image_array    object
dtype: object

### 3. Data Split

In [58]:
X = df['image_array']
y = df['cell_key']
recall_list = []
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
X.shape

(12437,)

In [71]:
X.reshape(12437,1)

AttributeError: 'Series' object has no attribute 'reshape'

### 4. Model 1 Neural Network

In [56]:
# Build the neural network
model1 = Sequential()
model1.add(Dense(64, activation='relu', input_dim=X_train.shape[0]))
model1.add(Dense(32, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

# Compile the model
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
model1.fit(X_train, y_train, validation_split=0.1, epochs=50, batch_size=32, callbacks=[early_stopping])

# Make predictions on the test set
y_pred = model1.predict(X_test)
y_pred_binary = np.round(y_pred)

# Calculate the recall score
recall1 = recall_score(y_test, y_pred_binary)
recall_list.append(recall1)
print("Recall:", recall1)

Epoch 1/50


ValueError: in user code:

    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/anaconda3/lib/python3.9/site-packages/keras/engine/input_spec.py", line 277, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "sequential_5" "                 f"(type Sequential).
    
    Input 0 of layer "dense_6" is incompatible with the layer: expected axis -1 of input shape to have value 9949, but received input with shape (None, 1)
    
    Call arguments received by layer "sequential_5" "                 f"(type Sequential):
      • inputs=tf.Tensor(shape=(None, 1), dtype=string)
      • training=True
      • mask=None


### 5. Model 2 Random Forest

In [None]:


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV with recall as the scoring metric
grid_search = GridSearchCV(rf_classifier, param_grid, scoring='recall', cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator from grid search
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Calculate recall on the test set
recall = recall_score(y_test, y_pred, average='macro')
recall_list.append(recall)
print("Best Parameters:", best_params)
print("Test Recall:", recall)

### 6. Model 3 Gradient Boosting Machine

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize the Gradient Boosting classifier
gbm_classifier = GradientBoostingClassifier(random_state=42)

# Initialize GridSearchCV with recall as the scoring metric
grid_search = GridSearchCV(gbm_classifier, param_grid, scoring='recall', cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator from grid search
best_params = grid_search.best_params_
best_gbm = grid_search.best_estimator_

# Predict on the test set
y_pred = best_gbm.predict(X_test)

# Calculate recall on the test set
recall = recall_score(y_test, y_pred, average='macro')
recall_list.append(recall)
print("Best Parameters:", best_params)
print("Test Recall:", recall)

### 7. Model results analysis

In [None]:
print('All Model Scores', recall_list)

### 8. Model Choice

In [None]:
max_index = recall_list.index(max(recall_list))
print('The model with the best score was model number', max_index + 1, '.')

### 9. Export Model

### 10. Next Steps