### Imports

In [14]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split

In [15]:
print(tf.config.list_physical_devices('GPU'))

[]


## Creating annotations
First we need to transform our Yolo labels dataset into a csv file

In [16]:
# Iterate over each label file in the labels folder
def create_csv_annotations(images_folder, labels_folder, name):
    annotations = []
    image_width = 224
    image_height = 224

    for label_file in os.listdir(labels_folder):
        if label_file.endswith('.txt'):
            with open(os.path.join(labels_folder, label_file), 'r') as f:
                lines = f.readlines()
            
            image_name = os.path.splitext(label_file)[0] + '.jpg'
            image_path = os.path.join(images_folder, image_name)
            
            for line in lines:
                class_label, x_center, y_center, width, height = map(float, line.split())
                x_min = (x_center - width / 2)
                y_min = (y_center - height / 2)
                x_max = (x_center + width / 2)
                y_max = (y_center + height / 2)
                
                annotations.append([image_path, x_min, y_min, x_max, y_max, image_width, image_height, class_label])

        # Here we create a DataFrame from annotations list and then we convert the df into a csv file
        df = pd.DataFrame(annotations, columns=['img_path', 'xmin', 'ymin', 'xmax', 'ymax', 'width', 'height', 'label'])
        df.to_csv(name, index=False)

In [17]:
create_csv_annotations('images/train', 'labels/train', 'annotations_train.csv')

In [18]:
create_csv_annotations('images/test', 'labels/test', 'annotations_test.csv')

In [19]:
create_csv_annotations('images/val', 'labels/val', 'annotations_val.csv')

### Transforming the csv annotations to arrays required by TensorFlow

In [20]:
# Load annotations from CSV
train_annotations = pd.read_csv('annotations_train.csv')
val_annotations = pd.read_csv('annotations_val.csv')
test_annotations = pd.read_csv('annotations_test.csv')

#train_annotations, val_annotations = train_test_split(annotations, test_size=0.2, random_state=42)

# The 224 x 224 input is required because -> this is because vast majority of pretrained TF models
# were trained using that input
input_shape = (224, 224, 3)  # height, width, depth (this is the # of color channels RGB = 3)

# Function to preprocess image and annotations
# the annotations are still not in the format required for TF
def preprocess_data(annotation):
    image = load_img(annotation['img_path'], target_size=(input_shape[0], input_shape[1]))
    image_array = img_to_array(image)
    image_array /= 255.0
    bbox = [annotation['xmin'], annotation['ymin'], annotation['xmax'], annotation['ymax']]
    label = annotation['label']
    return image_array, bbox, label

train_data = train_annotations.apply(preprocess_data, axis=1)
val_data = val_annotations.apply(preprocess_data, axis=1)
test_data = test_annotations.apply(preprocess_data, axis=1)

# Convert preprocessed data into arrays -> this is the format needed for TF
X_train, y_train_bbox, y_train_label = zip(*train_data)
X_val, y_val_bbox, y_val_label = zip(*val_data)
X_test, y_test_bbox, y_test_label = zip(*test_data)

# Convert lists to numpy arrays
X_train = tf.convert_to_tensor(X_train)
y_train_bbox = tf.convert_to_tensor(y_train_bbox)
y_train_label = tf.convert_to_tensor(y_train_label)
X_val = tf.convert_to_tensor(X_val)
y_val_bbox = tf.convert_to_tensor(y_val_bbox)
y_val_label = tf.convert_to_tensor(y_val_label)
X_test = tf.convert_to_tensor(X_test)
y_test_bbox = tf.convert_to_tensor(y_test_bbox)
y_test_label = tf.convert_to_tensor(y_test_label)


Here we are checking that the shapes of the images[0] and labels are the same

In [21]:
len(X_train)

2776

In [33]:
X_train.shape

TensorShape([2776, 224, 224, 3])

In [31]:
y_train_label

<tf.Tensor: shape=(2776,), dtype=float32, numpy=array([3., 3., 3., ..., 2., 1., 1.], dtype=float32)>

In [22]:
len(X_val)

424

In [23]:
len(X_test)

859

In [37]:
y_test_label.shape

TensorShape([859])

## Concepts to wrap your head around the next models

[Tensor flow intro -> Why sequential](https://towardsdatascience.com/a-comprehensive-introduction-to-tensorflows-sequential-api-and-model-for-deep-learning-c5e31aee49fa#:~:text=The%20sequential%20model%20allows%20us,for%20building%20deep%20learning%20models.): Here we have a more in depth explanation of what we are actually doing when adding layers to the model.Sequential and what that means. All of the following models work based on that cause we are training neural networks.

[Input and output shapes for CNN](https://towardsdatascience.com/understanding-input-and-output-shapes-in-convolution-network-keras-f143923d56ca): This can help you understand why the value behind our input_shape and a bit more about the CNNs

[Types of Convolutions in Deep Learning](https://towardsdatascience.com/types-of-convolutions-in-deep-learning-717013397f4d): Here we have some types of convolutions used by some of the next models. For example, MobileNetV2 uses depthwise separable convolutions, and those are explained here.

[But What is a convolution?](https://www.youtube.com/watch?v=KuXjwB4LzSA):
In case you really want to understand the basics and what a convolution really is.

## First model

[Basics of the R-CNN model](https://towardsdatascience.com/object-detection-explained-r-cnn-a6c813937a76)

In [None]:
# This is the RCNN model, this is just base model for testing
def create_rcnn_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),#batch_input_shape (if you wanted to give the batch_size)
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(), # Here we are basically changing the 4D output of the CNN to 2D so that we can use Dense
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax', name='classifier_output')
    ])
    return model

# Number of classes
num_classes = train_annotations['label'].nunique()

# Create an instance of the R-CNN model
rcnn_model = create_rcnn_model(input_shape, num_classes)

# Compile the model with appropriate losses and metrics
rcnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
rcnn_model.fit(X_train, y_train_label, validation_data=(X_test, y_test_label), epochs=10)

## RCNN With MobileNetV2

[What is MobileNetV2](https://towardsdatascience.com/mobilenetv2-inverted-residuals-and-linear-bottlenecks-8a4362f4ffd5)

In [24]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.optimizers import Adam

In [38]:
def create_rcnn_MovileNetV2(input_shape, num_classes):
    
    base_model = MobileNetV2(input_shape=input_shape, include_top=False, weights='imagenet')
    
    for layer in base_model.layers[-20:]:
        layer.trainable = True
    
    # Additional convolutional layers with reduced kernel size
    conv_layers = models.Sequential([ #Without the padding / strides I get dimensionality errors :(
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),  # Add padding to maintain spatial dimensions
        layers.MaxPooling2D((2, 2), strides=(1, 1)),  # Reduce the pooling stride
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),  # Add padding to maintain spatial dimensions
        layers.MaxPooling2D((2, 2), strides=(1, 1)),  # Reduce the pooling stride
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),  # Add padding to maintain spatial dimensions
    ])
    
    # R-CNN top layers
    top_layers = models.Sequential([
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax', name='classifier_output')
    ])
    
    # Combine the base ResNet model, additional convolutional layers, and top layers
    model = models.Sequential([
        base_model,
        conv_layers,
        top_layers
    ])
    
    return model

# Number of classes
num_classes = train_annotations['label'].nunique()

input_shape = (224, 224, 3)  # 224 is the one used by ResNet
rcnn_resnet_model = create_rcnn_MovileNetV2(input_shape, num_classes)

rcnn_resnet_model.compile(optimizer=Adam(lr=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

rcnn_resnet_model.fit(X_train, y_train_label, validation_data=(X_test, y_test_label), epochs=15)




Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x140f2773400>

In [40]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

y_pred = np.argmax(rcnn_resnet_model.predict(X_val), axis=1)

y_val_label = np.array(y_val_label, dtype=int)
y_pred = np.array(y_pred, dtype=int)

target_names = ['Vehiculos', 'Construcciones', 'Vias', 'Rios', 'Mineria']  # Get unique class labels
print(classification_report(y_val_label, y_pred, target_names=target_names))

conf_matrix = confusion_matrix(y_val_label, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

                precision    recall  f1-score   support

     Vehiculos       0.00      0.00      0.00         3
Construcciones       0.80      0.93      0.86       254
          Vias       0.73      0.39      0.51        98
          Rios       0.67      0.83      0.74        60
       Mineria       0.00      0.00      0.00         9

      accuracy                           0.77       424
     macro avg       0.44      0.43      0.42       424
  weighted avg       0.74      0.77      0.74       424

Confusion Matrix:
[[  0   0   0   3   0]
 [  0 237   5  12   0]
 [  0  55  38   5   0]
 [  0   3   7  50   0]
 [  0   2   2   5   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Resnet152
This one is the deepest variant of ResNet providing 152 layers

In [25]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet152V2
from tensorflow.keras import layers, models

In [41]:
def create_resnet152_model(input_shape, num_classes):
    # Load pre-trained ResNet152V2 model without the top classification layer
    base_model = ResNet152V2(weights='imagenet', include_top=False, input_shape=input_shape)

    # Freeze the weights of the pre-trained layers
    for layer in base_model.layers:
        layer.trainable = False

    # Add custom classification head
    top_layers = models.Sequential([
        layers.GlobalAveragePooling2D(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])

    # Combine the base model with custom classification head
    model = models.Sequential([
        base_model,
        top_layers
    ])

    return model

input_shape = (224, 224, 3)  # Input shape for ResNet152V2
num_classes = train_annotations['label'].nunique()

# Create the ResNet152 model
resnet152_model = create_resnet152_model(input_shape, num_classes)

# Compile the model
resnet152_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

resnet152_model.fit(X_train, y_train_label, validation_data=(X_test, y_test_label), epochs=10, batch_size=32)

# Print model summary
resnet152_model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet152v2 (Functional)    (None, 7, 7, 2048)        58331648  
                                                                 
 sequential_11 (Sequential)  (None, 5)                 525829    
                                                                 
Total params: 58857477 (224.52 MB)
Trainable params: 525829 (2.01 MB)
Non-trainable params: 58331648 (222.52 MB)
_________________________________________________________________


In [42]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

y_pred = np.argmax(resnet152_model.predict(X_val), axis=1)

y_val_label = np.array(y_val_label, dtype=int)
y_pred = np.array(y_pred, dtype=int)

target_names = ['Vehiculos', 'Construcciones', 'Vias', 'Rios', 'Mineria']  # Get unique class labels
print(classification_report(y_val_label, y_pred, target_names=target_names))

conf_matrix = confusion_matrix(y_val_label, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

                precision    recall  f1-score   support

     Vehiculos       0.00      0.00      0.00         3
Construcciones       0.82      1.00      0.90       254
          Vias       0.86      0.51      0.64        98
          Rios       0.98      0.85      0.91        60
       Mineria       0.40      0.22      0.29         9

      accuracy                           0.84       424
     macro avg       0.61      0.52      0.55       424
  weighted avg       0.84      0.84      0.82       424

Confusion Matrix:
[[  0   3   0   0   0]
 [  0 253   0   0   1]
 [  0  46  50   1   1]
 [  0   0   8  51   1]
 [  0   7   0   0   2]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## EfficientNet

In [47]:
from efficientnet.tfkeras import EfficientNetB6

In [55]:
def create_efficientnet_b6(input_shape, num_classes):
    # Load pre-trained EfficientNet-B6 model without the top classification layer
    base_model = EfficientNetB6(weights='imagenet', include_top=False, input_shape=input_shape)

    for layer in base_model.layers:
        layer.trainable = False

    #  Custom classification head
    top_layers = models.Sequential([
        layers.GlobalAveragePooling2D(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])

    # Here we combine the base model with our custom classification head
    model = models.Sequential([
        base_model,
        top_layers
    ])

    return model

input_shape = (224, 224, 3)  # Input shape for EfficientNet-B6
num_classes = train_annotations['label'].nunique() 

# Create the EfficientNet-B6 model
efficientnet_b6_model = create_efficientnet_b6(input_shape, num_classes)

# Compile the model - sparse_cc instead of cc is needed otherwise I get ValueError: Shapes (None, 1) and (None, 5) are incompatible
efficientnet_b6_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

efficientnet_b6_model.fit(X_train, y_train_label, validation_data=(X_test, y_test_label), epochs=10, batch_size=32)


efficientnet_b6_model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnet-b6 (Functiona  (None, 7, 7, 2304)        40960136  
 l)                                                              
                                                                 
 sequential_21 (Sequential)  (None, 5)                 591365    
                                                                 
Total params: 41551501 (158.51 MB)
Trainable params: 591365 (2.26 MB)
Non-trainable params: 40960136 (156.25 MB)
_________________________________________________________________


In [58]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

y_pred = np.argmax(efficientnet_b6_model.predict(X_val), axis=1)

y_val_label = np.array(y_val_label, dtype=int)
y_pred = np.array(y_pred, dtype=int)

target_names = ['Vehiculos', 'Construcciones', 'Vias', 'Rios', 'Mineria']  # Get unique class labels
print(classification_report(y_val_label, y_pred, target_names=target_names))

conf_matrix = confusion_matrix(y_val_label, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

                precision    recall  f1-score   support

     Vehiculos       0.40      0.67      0.50         3
Construcciones       0.82      0.98      0.89       254
          Vias       0.84      0.50      0.63        98
          Rios       0.93      0.90      0.92        60
       Mineria       0.50      0.11      0.18         9

      accuracy                           0.83       424
     macro avg       0.70      0.63      0.62       424
  weighted avg       0.83      0.83      0.82       424

Confusion Matrix:
[[  2   1   0   0   0]
 [  2 248   3   0   1]
 [  0  45  49   4   0]
 [  0   1   5  54   0]
 [  1   6   1   0   1]]


## DenseNet

In [56]:
from tensorflow.keras import layers, models

In [57]:
def create_densenet_264(input_shape, num_classes):
    base_model = tf.keras.applications.DenseNet201(input_shape=input_shape, include_top=False, weights='imagenet')

    for layer in base_model.layers:
        layer.trainable = False

    # Add custom classification head
    top_layers = models.Sequential([
        layers.GlobalAveragePooling2D(),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])

    model = models.Sequential([
        base_model,
        top_layers
    ])

    return model

input_shape = (224, 224, 3) 
num_classes = train_annotations['label'].nunique()

# Create the DenseNet-264 model
densenet_264_model = create_densenet_264(input_shape, num_classes)

# Compile the model
densenet_264_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

densenet_264_model.fit(X_train, y_train_label, validation_data=(X_test, y_test_label), epochs=10, batch_size=32)

densenet_264_model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 densenet201 (Functional)    (None, 7, 7, 1920)        18321984  
                                                                 
 sequential_23 (Sequential)  (None, 5)                 986117    
                                                                 
Total params: 19308101 (73.65 MB)
Trainable params: 986117 (3.76 MB)
Non-trainable params: 18321984 (69.89 MB)
_________________________________________________________________


In [59]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

y_pred = np.argmax(densenet_264_model.predict(X_val), axis=1)

y_val_label = np.array(y_val_label, dtype=int)
y_pred = np.array(y_pred, dtype=int)

target_names = ['Vehiculos', 'Construcciones', 'Vias', 'Rios', 'Mineria']  # Get unique class labels
print(classification_report(y_val_label, y_pred, target_names=target_names))

conf_matrix = confusion_matrix(y_val_label, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

                precision    recall  f1-score   support

     Vehiculos       0.00      0.00      0.00         3
Construcciones       0.80      0.98      0.88       254
          Vias       0.89      0.41      0.56        98
          Rios       0.93      0.90      0.92        60
       Mineria       0.55      0.67      0.60         9

      accuracy                           0.82       424
     macro avg       0.63      0.59      0.59       424
  weighted avg       0.83      0.82      0.80       424

Confusion Matrix:
[[  0   3   0   0   0]
 [  0 249   0   0   5]
 [  0  54  40   4   0]
 [  0   1   5  54   0]
 [  0   3   0   0   6]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score

In [None]:
y_pred_reshaped = y_pred.reshape(-1, 1) 
conf_scores = np.max(y_pred_reshaped, axis=1)  

sorted_indices = np.argsort(conf_scores)[::-1]
sorted_conf_scores = conf_scores[sorted_indices]

precisions = []
recalls = []

for class_label in range(5):
    class_precisions = []
    class_recalls = []
    for threshold in sorted_conf_scores:
        threshold_mask = conf_scores >= threshold
        y_pred_threshold = y_pred_reshaped[threshold_mask]
        y_true_threshold = y_val_label[threshold_mask]
        precision = precision_score(y_true_threshold == class_label, y_pred_threshold == class_label)
        recall = recall_score(y_true_threshold == class_label, y_pred_threshold == class_label)
        class_precisions.append(precision)
        class_recalls.append(recall)
    
    precisions.append(class_precisions)
    recalls.append(class_recalls)

plt.figure(figsize=(10, 6))
for class_label in range(5):
    plt.plot(recalls[class_label], precisions[class_label], label=f'Class {class_label}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for Each Class')
plt.legend()
plt.grid(True)
plt.show()