# Modeling

In [128]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [129]:
import os

# os.chdir allows you to change directories, like cd in the Terminal
os.chdir("/content/drive/MyDrive/Colab Notebooks/gi-disease-detection")
     

## Loading and Cleaning Datasets

In [3]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os

In [4]:
SOURCE = "local"
BATCH_SIZE = 1
RAW_DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/gi-disease-detection/raw_data"
IMAGE_RESCALE_RATIO = 1. / 255
IMAGE_TARGET_WIDTH = 224
IMAGE_TARGET_HEIGHT = 224

In [214]:
def train_val_test_generator2(source = "local"):
    """
    Generate the train, validation, and test batches.
    """
    def load_images(path):
        """
        Enter a path to load images from.
        """
        datagen = ImageDataGenerator(rescale = IMAGE_RESCALE_RATIO)
        images = datagen.flow_from_directory(path,
                                             target_size = (IMAGE_TARGET_WIDTH, IMAGE_TARGET_HEIGHT),
                                             color_mode = "rgb",
                                             batch_size = BATCH_SIZE,
                                             class_mode = "categorical")

        return images

    if source == "local":
        train_directory = os.path.join(RAW_DATA_PATH, "train")
        val_directory = os.path.join(RAW_DATA_PATH, "val")
        test_directory = os.path.join(RAW_DATA_PATH, "test")

    #this doesn't work right now
    if source == "cloud":
        credentials = service_account.Credentials.from_service_account_file(GOOGLE_APPLICATION_CREDENTIALS)
        client = storage.Client(project = GCLOUD_PROJECT_ID, credentials = credentials)
        bucket = client.get_bucket(BUCKET_NAME)

        train_directory = f"gs://{BUCKET_NAME}/train"
        val_directory = f"gs://{BUCKET_NAME}/val"
        test_directory = f"gs://{BUCKET_NAME}/test"

    train_dataset = load_images(train_directory)
    val_dataset = load_images(val_directory)
    test_dataset = load_images(test_directory)

    return train_dataset, val_dataset, test_dataset

In [215]:
train_dataset, val_dataset, test_dataset = train_val_test_generator2(source = SOURCE)

Found 3200 images belonging to 4 classes.
Found 2000 images belonging to 4 classes.
Found 800 images belonging to 4 classes.


## Modeling

In [16]:
from tensorflow.keras import models
from tensorflow.keras import Sequential, layers, regularizers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import metrics

In [17]:
reg_l1_l2 = regularizers.l1_l2(l1 = 0.005, l2 = 0.0005)

In [18]:
es = EarlyStopping(restore_best_weights = True)

In [216]:
model = Sequential()
model.add(layers.Conv2D(128, kernel_size = (4, 4), input_shape = (224, 224, 3), activation = "relu", padding = "same"))
model.add(layers.Conv2D(128, kernel_size = (4, 4), activation = "relu"))
model.add(layers.MaxPool2D(pool_size = (3, 3)))
model.add(layers.Conv2D(64, kernel_size = (3, 3), activation = "relu"))
model.add(layers.Conv2D(64, kernel_size = (3, 3), activation = "relu"))
model.add(layers.MaxPool2D(pool_size = (3, 3)))
model.add(layers.Conv2D(64, kernel_size = (3, 3), activation = "relu"))
model.add(layers.Conv2D(64, kernel_size = (3, 3), activation = "relu"))
model.add(layers.MaxPool2D(pool_size = (2, 2)))
model.add(layers.Dropout(rate = 0.2))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation = "relu"))
model.add(layers.Dense(4, activation = "softmax"))

In [217]:
model.summary()

Model: "sequential_43"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_169 (Conv2D)         (None, 224, 224, 128)     6272      
                                                                 
 conv2d_170 (Conv2D)         (None, 221, 221, 128)     262272    
                                                                 
 max_pooling2d_121 (MaxPooli  (None, 73, 73, 128)      0         
 ng2D)                                                           
                                                                 
 conv2d_171 (Conv2D)         (None, 71, 71, 64)        73792     
                                                                 
 conv2d_172 (Conv2D)         (None, 69, 69, 64)        36928     
                                                                 
 max_pooling2d_122 (MaxPooli  (None, 23, 23, 64)       0         
 ng2D)                                               

In [218]:
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = [metrics.Accuracy(), metrics.Recall()])

In [221]:
model.fit(train_dataset, validation_data = val_dataset, batch_size = 32, epochs = 10)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: ignored

In [None]:
model.evaluate(test_dataset)



[1.173465371131897, 0.06031249836087227, 0.7699999809265137]

## Better Cleaning? Including X and y

In [5]:
import tqdm
import numpy as np

In [114]:
def train_val_test_generator(source = SOURCE):
    """
    Generates X_train, y_train, X_val, y_val, X_test, y_test.
    """
    def load_images(path):
        """
        Enter a path to load images from.
        """
        datagen = ImageDataGenerator(rescale = float(IMAGE_RESCALE_RATIO))
        images = datagen.flow_from_directory(path,
                                             target_size = (int(IMAGE_TARGET_WIDTH), int(IMAGE_TARGET_HEIGHT)),
                                             color_mode = "rgb",
                                             batch_size = int(BATCH_SIZE),
                                             class_mode = "categorical")

        return images

    def convert_to_numpy(dataset):
        """
        Converts DirectoryIterator dataset to numpy.array before cleaning images.
        """
        X = np.concatenate([dataset.next()[0] for i in range(dataset.__len__())])
        y = np.concatenate([dataset.next()[1] for i in range(dataset.__len__())])

        return X, y

    if source == "local":
        train_directory = os.path.join(RAW_DATA_PATH, "train")
        val_directory = os.path.join(RAW_DATA_PATH, "val")
        test_directory = os.path.join(RAW_DATA_PATH, "test")

    #this doesn't work right now
    if source == "cloud":
        credentials = service_account.Credentials.from_service_account_file(GOOGLE_APPLICATION_CREDENTIALS)
        client = storage.Client(project = GCLOUD_PROJECT_ID, credentials = credentials)
        bucket = client.get_bucket(BUCKET_NAME)

        train_directory = f"gs://{BUCKET_NAME}/train"
        val_directory = f"gs://{BUCKET_NAME}/val"
        test_directory = f"gs://{BUCKET_NAME}/test"

    X_train, y_train = convert_to_numpy(load_images(train_directory))
    X_val, y_val = convert_to_numpy(load_images(val_directory))
    X_test, y_test = convert_to_numpy(load_images(test_directory))

    return X_train, y_train, X_val, y_val, X_test, y_test

In [115]:
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_generator(source = SOURCE)

Found 3200 images belonging to 4 classes.
Found 2000 images belonging to 4 classes.
Found 800 images belonging to 4 classes.


In [130]:
print(f"X_train type: {type(X_train)}")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train type: <class 'numpy.ndarray'>
X_train shape: (3200, 224, 224, 3)
y_train shape: (3200, 4)
X_val shape: (2000, 224, 224, 3)
y_val shape: (2000, 4)
X_test shape: (800, 224, 224, 3)
y_test shape: (800, 4)


## Redoing the Original Model

In [207]:
model_2 = Sequential()
model_2.add(layers.Conv2D(128, kernel_size = (3, 3), input_shape = (224, 224, 3), activation = "relu", padding = "same"))
model_2.add(layers.MaxPool2D(pool_size = (3, 3)))

model_2.add(layers.Conv2D(64, kernel_size = (2, 2), activation = "relu"))
model_2.add(layers.MaxPool2D(pool_size = (2, 2)))

model_2.add(layers.Conv2D(32, kernel_size = (2, 2), activation = "relu"))
model_2.add(layers.MaxPool2D(pool_size = (2, 2)))
model_2.add(layers.Dropout(rate = 0.2))

model_2.add(layers.Flatten())
model_2.add(layers.Dense(32, activation = "relu"))
model_2.add(layers.Dense(4, activation = "softmax"))

In [208]:
model_2.summary()

Model: "sequential_42"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_166 (Conv2D)         (None, 224, 224, 128)     3584      
                                                                 
 max_pooling2d_118 (MaxPooli  (None, 74, 74, 128)      0         
 ng2D)                                                           
                                                                 
 conv2d_167 (Conv2D)         (None, 73, 73, 64)        32832     
                                                                 
 max_pooling2d_119 (MaxPooli  (None, 36, 36, 64)       0         
 ng2D)                                                           
                                                                 
 conv2d_168 (Conv2D)         (None, 35, 35, 32)        8224      
                                                                 
 max_pooling2d_120 (MaxPooli  (None, 17, 17, 32)     

In [209]:
model_2.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = [metrics.Accuracy(), metrics.Recall()])

In [210]:
model_2.fit(X_train, y_train, validation_data = (X_val, y_val), batch_size = 32, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f96647da1f0>

In [211]:
model_2.evaluate(X_test, y_test)



[5.799577236175537, 0.0015625000232830644, 0.17374999821186066]

In [212]:
y_pred = model_2.predict(X_test)



In [213]:
y_pred[0]

array([0.31772488, 0.43536004, 0.24618122, 0.00073391], dtype=float32)

In [176]:
y_train[0]

array([0., 0., 0., 1.], dtype=float32)

## Better Model?

In [148]:
# Create instance of to model
f_model = Sequential()

# Add Convolution layers + Pooling and Dropout layers to limit overfitting.
f_model.add(layers.Conv2D(64, kernel_size=(3,3), input_shape=(224, 224, 3), activation='relu', padding='same'))
f_model.add(layers.MaxPool2D(pool_size=(3,3)))
f_model.add(layers.Dropout(0.3))
f_model.add(layers.Conv2D(32, kernel_size=(2,2), activation='relu'))
f_model.add(layers.MaxPool2D(pool_size=(2,2)))
f_model.add(layers.Conv2D(16, kernel_size=(2,2), activation='relu'))
f_model.add(layers.MaxPool2D(pool_size=(2,2)))

# Flatten and Dense layers
f_model.add(layers.Flatten())
f_model.add(layers.Dense(15, activation='relu'))
f_model.add(layers.Dropout(0.3))
f_model.add(layers.Dense(5, activation='relu'))

# Output layer
f_model.add(layers.Dense(4, activation='softmax'))

In [149]:
f_model.summary()

Model: "sequential_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_125 (Conv2D)         (None, 224, 224, 64)      1792      
                                                                 
 max_pooling2d_84 (MaxPoolin  (None, 74, 74, 64)       0         
 g2D)                                                            
                                                                 
 dropout_27 (Dropout)        (None, 74, 74, 64)        0         
                                                                 
 conv2d_126 (Conv2D)         (None, 73, 73, 32)        8224      
                                                                 
 max_pooling2d_85 (MaxPoolin  (None, 36, 36, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_127 (Conv2D)         (None, 35, 35, 16)      

In [150]:
f_model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = [metrics.Accuracy(), metrics.Recall()])

In [151]:
f_model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f97426215b0>

In [None]:
model_2.evaluate(test_dataset)



[0.4698968827724457, 0.0, 0.7300000190734863]