<a href="https://colab.research.google.com/github/DrAdamDev/ETL-pipeline-for-UK-Employment-data/blob/main/Rock_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keras-tuner
!pip install tensorflow-addons

from google.colab import drive
drive.mount('/content/drive')

# Set the output directory path
output_dir = '/content/'

# Get the free space in bytes
free_space = os.statvfs(output_dir).f_frsize * os.statvfs(output_dir).f_bavail

# Convert to GB
free_space_gb = free_space / (1024 ** 3)

print(f"Free space in {output_dir}: {free_space_gb:.2f} GB")

In [None]:
# Importing necessary modules for the project
import os
import cv2
import uuid
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.image as image
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
from keras_tuner import HyperModel
from sklearn.utils import class_weight
from keras_tuner.tuners import Hyperband
from keras.optimizers import SGD
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
from keras_tuner.engine.hyperparameters import HyperParameters
from keras.applications import MobileNetV2, ResNet50
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.imagenet_utils import preprocess_input
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import BatchNormalization, Concatenate
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

In [None]:
def remove_augmented_images(directories):
    """Delete every file whose name contains '_aug' under the given directories.

    Each directory is walked recursively with ``os.walk``; files whose name
    does not contain '_aug' are left untouched.

    Args:
        directories: Iterable of directory paths to clean.
    """
    for top_dir in directories:
        for folder, _subdirs, filenames in os.walk(top_dir):
            # Collect the matches first, then delete them one by one.
            augmented_names = [name for name in filenames if '_aug' in name]
            for name in augmented_names:
                os.remove(os.path.join(folder, name))

def read_image_data(directories):
    """Collect ``(path, label)`` pairs for every readable JPEG under ``directories``.

    Each directory is walked recursively. A file is considered when its name
    ends with '.jpg' or '.jpeg' (case-sensitive). The label is the name of the
    folder that directly contains the file.

    Files that OpenCV cannot decode are reported, recorded as problem images
    and deleted from disk, so later pipeline stages never see them.

    Args:
        directories: Iterable of root directory paths to scan.

    Returns:
        List of ``(path, label)`` tuples for the images that could be read.
    """
    data = []
    problem_images = []
    for top_dir in directories:
        for root, _dirs, files in os.walk(top_dir):
            for file in files:
                # str.endswith accepts a tuple of suffixes.
                if not file.endswith(('.jpg', '.jpeg')):
                    continue
                path = os.path.join(root, file)

                # cv2.imread signals an undecodable file by returning None.
                # Only OpenCV errors are caught here: a bare `except` would
                # also trap KeyboardInterrupt and delete a perfectly good
                # image when the user interrupts the scan.
                try:
                    readable = cv2.imread(path) is not None
                except cv2.error:
                    readable = False

                if not readable:
                    print(f'Error reading image: {path}')
                    problem_images.append(path)
                    os.remove(path)
                    continue

                data.append((path, os.path.basename(os.path.dirname(path))))
                print(f'Files read: {len(data)}')
    print(f'Problem images: {problem_images}')
    return data

def augment_data(data):
    """Write five randomly transformed copies of every image next to the original.

    For each ``(path, label)`` pair the image is loaded, transformed five
    times with freshly drawn random parameters, and each result is saved as
    ``<original stem>_aug<j>.jpg``. The returned list contains the original
    entry followed by its five augmented entries, all with the same label.

    Args:
        data: List of ``(path, label)`` tuples.

    Returns:
        List of ``(path, label)`` tuples covering originals and augmented copies.
    """
    if not data:
        return []

    # Only `apply_transform` is used below, so a single generator can serve
    # every image instead of being rebuilt for each augmented copy.
    datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

    augmented_data = []
    total = len(data)
    for i, (path, label) in enumerate(data, start=1):
        print(f'Augmenting image {i}/{total}: {path}') # output progress
        img = cv2.imread(path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        augmented_data.append((path, label))
        for j in range(5):
            # NOTE(review): Keras appears to interpret `tx`/`ty` as pixel
            # shifts and `shear` as degrees, in which case +/-0.2 is nearly a
            # no-op -- confirm whether fractional values were intended.
            transform_parameters = {
                'theta': np.random.uniform(-40, 40),
                'tx': np.random.uniform(-0.2, 0.2),
                'ty': np.random.uniform(-0.2, 0.2),
                'shear': np.random.uniform(-0.2, 0.2),
                'zx': np.random.uniform(0.8, 1.2),
                'zy': np.random.uniform(0.8, 1.2),
                'flip_horizontal': np.random.random() < 0.5,
                'flip_vertical': np.random.random() < 0.5,
                'brightness': np.random.uniform(0.5, 1.2),
            }

            img_transformed = datagen.apply_transform(img, transform_parameters)
            img_path = os.path.splitext(path)[0] + '_aug' + str(j) + '.jpg'
            # OpenCV writes BGR, so convert back before saving.
            cv2.imwrite(img_path, cv2.cvtColor(img_transformed, cv2.COLOR_RGB2BGR))
            augmented_data.append((img_path, label))
    return augmented_data

In [None]:
def split_data(data):
    """Split ``(path, label)`` pairs into training and validation DataFrames.

    Entries are first grouped by source image: the extension is stripped and,
    when the remaining path contains '_aug', everything from the last '_aug'
    onward is dropped. Whole groups are then assigned to one side of the
    split (10% of the groups go to validation, ``random_state=42``), so an
    original and its augmented copies always land in the same set.

    Args:
        data: Iterable of ``(path, label)`` tuples.

    Returns:
        Tuple ``(train_df, val_df)`` of DataFrames with columns 'path' and 'label'.
    """
    groups = {}
    for path, label in data:
        stem = os.path.splitext(path)[0]
        marker = stem.rfind('_aug')
        if marker != -1:
            stem = stem[:marker]
        groups.setdefault(stem, []).append((path, label))

    train_keys, val_keys = train_test_split(list(groups), test_size=0.1, random_state=42)

    # Flatten the selected groups back into flat lists of (path, label).
    train_data = [entry for key in train_keys for entry in groups[key]]
    val_data = [entry for key in val_keys for entry in groups[key]]

    # Shuffle the training and validation sets
    random.shuffle(train_data)
    random.shuffle(val_data)

    columns = ['path', 'label']
    train_df = pd.DataFrame(train_data, columns=columns)
    val_df = pd.DataFrame(val_data, columns=columns)

    return train_df, val_df

def create_generators(train_df, val_df, image_size, batch_size):
    """Create Keras image generators for the training and validation frames.

    Both generators apply ``preprocess_input``, resize images to
    ``image_size`` x ``image_size``, emit one-hot ('categorical') labels and
    use the same fixed class order.

    Args:
        train_df: DataFrame with 'path' and 'label' columns for training.
        val_df: DataFrame with 'path' and 'label' columns for validation.
        image_size: Side length, in pixels, of the square target size.
        batch_size: Number of images per batch.

    Returns:
        Tuple ``(train_generator, val_generator)``.
    """
    class_names = ['Basalt', 'Granite', 'Marble', 'Quartzite', 'Coal', 'Limestone', 'Sandstone']

    def _flow(frame):
        # One generator per frame, configured identically for both splits.
        datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
        return datagen.flow_from_dataframe(
            frame,
            x_col='path',
            y_col='label',
            target_size=(image_size, image_size),
            batch_size=batch_size,
            class_mode='categorical',
            classes=list(class_names),
        )

    train_generator = _flow(train_df)
    val_generator = _flow(val_df)
    return train_generator, val_generator

In [None]:
def calculate_class_weights(train_generator):
    """Compute 'balanced' class weights keyed by class index.

    Weights are computed by scikit-learn from ``train_generator.classes``.
    Each weight is stored under the class index it was computed for, rather
    than under its position in the result, so the mapping stays correct even
    when some class has no sample in the training split.

    Args:
        train_generator: Object exposing ``classes`` (integer label per
            sample). If it also exposes a ``class_indices`` mapping, every
            index in it that has no training sample gets a neutral weight
            of 1.0 so the dictionary covers the full index range.

    Returns:
        Dict mapping class index (int) to its weight.
    """
    labels = np.asarray(train_generator.classes)
    present_classes = np.unique(labels)
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=present_classes,
        y=labels
    )

    class_weights_dict = {
        int(class_index): weight
        for class_index, weight in zip(present_classes, class_weights)
    }

    # Classes absent from the training split never contribute to the loss,
    # so the value is irrelevant; 1.0 just keeps the key range contiguous.
    class_indices = getattr(train_generator, 'class_indices', None)
    if class_indices:
        for class_index in class_indices.values():
            class_weights_dict.setdefault(int(class_index), 1.0)

    return class_weights_dict

In [None]:
# Hypermodel class for input into Hyperband tuner
class MyHyperModel(HyperModel):
    """Transfer-learning hypermodel searched by the Hyperband tuner.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, input_shape, num_classes):
        """Store the fixed model dimensions.

        Args:
            input_shape: Shape passed to the pretrained backbone.
            num_classes: Number of units in the softmax output layer.
        """
        # Let the HyperModel base class initialise its own state.
        super().__init__()
        # Input shape handed to the backbone constructor.
        self.input_shape = input_shape
        # Width of the final softmax layer.
        self.num_classes = num_classes

    def build(self, hp):
        """Build and compile a model for one hyperparameter sample.

        Args:
            hp: KerasTuner hyperparameter container used to declare and
                sample the search space.

        Returns:
            A compiled Keras ``Model`` using categorical cross-entropy loss
            and the accuracy metric.
        """
        # Define hyperparameters
        learning_rate = hp.Choice('learning_rate', values=[1e-4, 1e-5])
        optimizer_name = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop'])
        base_model_name = hp.Choice('base_model_name', values=['MobileNetV2', 'ResNet50'])
        num_conv_blocks_to_freeze = hp.Int('num_conv_blocks_to_freeze', min_value=1, max_value=5, step=1)
        num_fc_layers = hp.Int('num_fc_layers', min_value=1, max_value=3, step=1)
        num_fc_neurons = hp.Choice('num_fc_neurons', values=[64, 128, 256])
        dropout_rate = hp.Float('dropout_rate', min_value=0.0, max_value=0.5, step=0.1)
        # These two are registered but not consumed by build(); they stay
        # declared so code that reads them from the best hyperparameters
        # keeps working.
        hp.Choice('batch_size', values=[32, 64, 128])
        hp.Choice('num_epochs', values=[10, 20, 30, 40, 50])

        # Define base model
        if base_model_name == 'MobileNetV2':
            base_model = MobileNetV2(input_shape=self.input_shape, include_top=False, weights='imagenet')
        else:
            base_model = ResNet50(input_shape=self.input_shape, include_top=False, weights='imagenet')

        # Freeze every backbone layer except the last
        # `num_conv_blocks_to_freeze` layers (the count is in layers, despite
        # the hyperparameter's name).
        for i, layer in enumerate(base_model.layers):
            if i < len(base_model.layers) - num_conv_blocks_to_freeze:
                layer.trainable = False

        # Add fully connected layers
        x = base_model.output
        x = Flatten()(x)
        for _ in range(num_fc_layers):
            x = Dense(num_fc_neurons, activation='relu')(x)
            x = Dropout(dropout_rate)(x)
        predictions = Dense(self.num_classes, activation='softmax')(x)

        # Build and compile the model; any name other than 'adam' or 'sgd'
        # selects RMSprop.
        if optimizer_name == 'adam':
            optimizer = Adam(learning_rate=learning_rate)
        elif optimizer_name == 'sgd':
            optimizer = SGD(learning_rate=learning_rate)
        else:
            optimizer = RMSprop(learning_rate=learning_rate)

        model = Model(inputs=base_model.input, outputs=predictions)
        model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

        return model

In [None]:
# Function for building the model manually with custom hyperparameters
def build_model(input_shape, num_classes, learning_rate, optimizer, base_model_name, num_conv_blocks_to_freeze, num_fc_layers, num_fc_neurons, dropout_rate):
    """Build and compile a transfer-learning classifier from explicit settings.

    Args:
        input_shape: Shape passed to the pretrained backbone.
        num_classes: Number of units in the softmax output layer.
        learning_rate: Learning rate given to the optimizer.
        optimizer: 'adam' or 'sgd'; any other value selects RMSprop.
        base_model_name: 'MobileNetV2'; any other value selects ResNet50.
        num_conv_blocks_to_freeze: Number of trailing backbone layers left
            trainable; all earlier layers are frozen.
        num_fc_layers: Number of Dense + Dropout pairs in the head.
        num_fc_neurons: Units in each Dense layer of the head.
        dropout_rate: Dropout rate applied after each Dense layer.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy, accuracy metric).
    """
    # Pick the ImageNet-pretrained backbone without its classification top.
    backbone_cls = MobileNetV2 if base_model_name == 'MobileNetV2' else ResNet50
    base_model = backbone_cls(input_shape=input_shape, include_top=False, weights='imagenet')

    # Freeze everything before the last `num_conv_blocks_to_freeze` layers.
    backbone_layers = base_model.layers
    first_trainable = max(len(backbone_layers) - num_conv_blocks_to_freeze, 0)
    for frozen_layer in backbone_layers[:first_trainable]:
        frozen_layer.trainable = False

    # Stack the fully connected head on the flattened backbone output.
    features = Flatten()(base_model.output)
    for _ in range(num_fc_layers):
        features = Dense(num_fc_neurons, activation='relu')(features)
        features = Dropout(dropout_rate)(features)
    predictions = Dense(num_classes, activation='softmax')(features)

    # Resolve the optimizer name to an optimizer instance.
    optimizer_cls = Adam if optimizer == 'adam' else SGD if optimizer == 'sgd' else RMSprop
    compiled_optimizer = optimizer_cls(learning_rate=learning_rate)

    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(optimizer=compiled_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
def define_early_stop(monitor, patience):
    """Create the early-stopping callback.

    Args:
        monitor: Name of the metric the callback watches.
        patience: Value forwarded as the callback's ``patience``.

    Returns:
        A configured ``EarlyStopping`` instance.
    """
    early_stopping = EarlyStopping(patience=patience, monitor=monitor)
    return early_stopping

def set_steps_per_epoch(train_generator):
    """Return the number of full batches per epoch, never less than one.

    Args:
        train_generator: Object exposing ``n`` (sample count) and
            ``batch_size``.

    Returns:
        ``n // batch_size``, clamped to a minimum of 1 so that a dataset
        smaller than one batch still yields a usable step count instead of 0.
    """
    full_batches = train_generator.n // train_generator.batch_size
    return max(1, full_batches)

def set_hyperband_search(input_shape, num_classes):
    """Create the Hyperband tuner for ``MyHyperModel``.

    Args:
        input_shape: Input shape forwarded to the hypermodel.
        num_classes: Number of output classes forwarded to the hypermodel.

    Returns:
        A ``Hyperband`` tuner that optimises 'val_accuracy' and stores its
        results under 'my_dir/my_project'.
    """
    tuner_settings = {
        'max_epochs': 50,
        'factor': 3,
        'hyperband_iterations': 2,
        'objective': 'val_accuracy',
        'directory': 'my_dir',
        'project_name': 'my_project',
        'seed': 36,
    }
    hypermodel = MyHyperModel(input_shape=input_shape, num_classes=num_classes)
    return Hyperband(hypermodel, **tuner_settings)
    
def search_for_hyperparameters(tuner, train_generator, val_generator, steps_per_epoch_train, class_weights_dict, early_stop=None):
    """Search for the optimal hyperparameters.

    Args:
        tuner: Tuner object exposing ``search``.
        train_generator: Training data passed to ``tuner.search``.
        val_generator: Validation data passed to ``tuner.search``.
        steps_per_epoch_train: Number of training steps per epoch.
        class_weights_dict: Mapping of class index to loss weight.
        early_stop: Optional early-stopping callback. When omitted, one is
            created that monitors 'val_loss' with a patience of 5.
    """
    if early_stop is None:
        # define_early_stop requires both arguments; calling it bare raised
        # TypeError before the search could start.
        early_stop = define_early_stop(monitor='val_loss', patience=5)

    tuner.search(train_generator,
                 steps_per_epoch=steps_per_epoch_train,
                 validation_data=val_generator,
                 epochs=50,
                 callbacks=[early_stop],
                 class_weight=class_weights_dict,
                 verbose=1)


def print_optimal_hyperparameters(tuner):
    """Print the optimal hyperparameters.

    Reports every hyperparameter declared by the hypermodel, including the
    optimizer, so the best configuration can be reproduced by hand.

    Args:
        tuner: Tuner object exposing ``get_best_hyperparameters``.
    """
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    report = (
        ('Learning rate', 'learning_rate'),
        ('Optimizer', 'optimizer'),
        ('Base model name', 'base_model_name'),
        ('Number of convolutional blocks to freeze', 'num_conv_blocks_to_freeze'),
        ('Number of fully connected layers', 'num_fc_layers'),
        ('Number of neurons in the fully connected layers', 'num_fc_neurons'),
        ('Dropout rate', 'dropout_rate'),
        ('Batch size', 'batch_size'),
        ('Number of epochs', 'num_epochs'),
    )
    for label, key in report:
        print(f'{label}: {best_hps.get(key)}')

def get_best_model(tuner):
    """Return the single best model found by the search.

    Args:
        tuner: Tuner object exposing ``get_best_models``.

    Returns:
        The first model of the one-element list returned by the tuner.
    """
    top_models = tuner.get_best_models(num_models=1)
    return top_models[0]

In [None]:
def train_best_model(best_model, train_generator, val_generator, early_stop, steps_per_epoch_train, class_weights_dict, epochs=9):
    """Fit ``best_model`` on the training generator and return the history.

    Args:
        best_model: Model object exposing ``fit``.
        train_generator: Training data.
        val_generator: Validation data.
        early_stop: Callback passed to ``fit``.
        steps_per_epoch_train: Number of training steps per epoch.
        class_weights_dict: Mapping of class index to loss weight.
        epochs: Maximum number of epochs to train for. Defaults to 9, the
            value that was previously hard-coded.

    Returns:
        Whatever ``best_model.fit`` returns (the training history).
    """
    history = best_model.fit(
        train_generator,
        epochs=epochs,
        validation_data=val_generator,
        steps_per_epoch=steps_per_epoch_train,
        callbacks=[early_stop],
        class_weight=class_weights_dict
    )

    return history


def plot_training_history(history):
    """Plot training and validation loss and accuracy side by side.

    Args:
        history: Object whose ``history`` attribute maps 'loss', 'val_loss',
            'accuracy' and 'val_accuracy' to per-epoch values.
    """
    # (train key, validation key, train label, validation label, y-axis label)
    panels = (
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss'),
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy'),
    )

    plt.figure(figsize=(12, 4))
    for position, (train_key, val_key, train_label, val_label, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.show()

In [None]:
# Main script: prepares the data pipeline objects (generators, class weights,
# early-stopping callback, steps per epoch) that the later cells consume as
# globals. The tuner-related steps are left commented out here.
if __name__ == '__main__':
    
    # Report how many GPUs TensorFlow can see.
    print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

    # Set paths for data (one root directory per rock family)
    directories = ['/content/drive/MyDrive/RockData/Dataset/Igneous',
                   '/content/drive/MyDrive/RockData/Dataset/Metamorphic',
                   '/content/drive/MyDrive/RockData/Dataset/Sedimentary']

    # Set image size and batch size
    image_size = 224
    batch_size = 64

    # Optional: delete all augmented images in the directories
    #remove_augmented_images(directories)

    # Optional: perform data augmentation and write the resulting images next
    # to the originals. NOTE: `data` is only defined by read_image_data()
    # below, so this call cannot be enabled at this position as written.
    #augment_data(data)

    # Collect (path, label) pairs for every readable image
    data = read_image_data(directories)    

    # Split the data into train and validation sets
    train_df, val_df = split_data(data)

    # Set up the generators from the two DataFrames
    train_generator, val_generator = create_generators(train_df, val_df, image_size, batch_size)

    # Calculate class weights
    class_weights_dict = calculate_class_weights(train_generator)

    # Training setup shared by the cells that follow.

    # Define early stopping criteria
    early_stop = define_early_stop(monitor='val_loss', patience=5)

    # Calculate the number of steps per epoch
    steps_per_epoch_train = set_steps_per_epoch(train_generator)

    # Set up the hyperparameter search
    #tuner = set_hyperband_search(input_shape=(224, 224, 3), num_classes=7)

    # Search for the optimal hyperparameters
    #search_for_hyperparameters(tuner, train_generator, val_generator, steps_per_epoch_train, class_weights_dict)

    # Print the optimal hyperparameters
    #print_optimal_hyperparameters(tuner)

    # Get the best model from the search
    #best_model = get_best_model(tuner)

    # Train the best model found during hyperparameter search
    #history = train_best_model(best_model, train_generator, val_generator, early_stop, steps_per_epoch_train, class_weights_dict)

    # Plot the training history
    #plot_training_history(history)

In [None]:
# Set up the hyperparameter search
hypermodel = MyHyperModel(input_shape=(224, 224, 3), num_classes=7)
tuner = Hyperband(
    hypermodel,
    max_epochs=50,
    factor=3,
    hyperband_iterations=2,
    objective='val_accuracy',
    directory='my_dir',
    project_name='my_project',
    seed=36
)

# Search for the optimal hyperparameters, reusing the generators, callback,
# step count and class weights prepared by the main script cell.
tuner.search(
    train_generator,
    steps_per_epoch=steps_per_epoch_train,
    validation_data=val_generator,
    epochs=50,
    callbacks=[early_stop],
    class_weight=class_weights_dict,
    verbose=1
)

# Print the optimal hyperparameters (the optimizer is included so the best
# configuration can be rebuilt by hand)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f'Learning rate: {best_hps.get("learning_rate")}')
print(f'Optimizer: {best_hps.get("optimizer")}')
print(f'Base model name: {best_hps.get("base_model_name")}')
print(f'Number of convolutional blocks to freeze: {best_hps.get("num_conv_blocks_to_freeze")}')
print(f'Number of fully connected layers: {best_hps.get("num_fc_layers")}')
print(f'Number of neurons in the fully connected layers: {best_hps.get("num_fc_neurons")}')
print(f'Dropout rate: {best_hps.get("dropout_rate")}')
print(f'Batch size: {best_hps.get("batch_size")}')
print(f'Number of epochs: {best_hps.get("num_epochs")}')

# Get the best model from the search
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
# MANUAL MODEL BUILD

# Current Best Configuration with leaking
# (kept for reference only: every value is overridden by the block below)
learning_rate = 1e-5
optimizer = 'adam'
base_model_name = 'ResNet50'
num_conv_blocks_to_freeze = 4
num_fc_layers = 3
num_fc_neurons = 256
dropout_rate = 0
batch_size = 64
num_epochs = 10

# Current Best Configuration without leaking (this is the one actually used)
learning_rate = 1e-4
optimizer = 'adam'
base_model_name = 'ResNet50'
num_conv_blocks_to_freeze = 1
num_fc_layers = 1
num_fc_neurons = 128
dropout_rate = 0.4
# NOTE: batch_size and num_epochs are recorded here but not applied below:
# the generators were already created, and fit() uses a fixed epoch count.
batch_size = 32
num_epochs = 10

# Build model based on manually selected hyperparameters.
# build_model already compiles the model with an optimizer instance that
# carries `learning_rate`. Compiling again with the bare string 'adam' would
# replace it with a default-configured optimizer and silently drop the
# learning rate chosen above, so no second compile is done here.
model = build_model(input_shape=(224, 224, 3),
                    num_classes=7,
                    learning_rate=learning_rate,
                    optimizer=optimizer,
                    base_model_name=base_model_name,
                    num_conv_blocks_to_freeze=num_conv_blocks_to_freeze,
                    num_fc_layers=num_fc_layers,
                    num_fc_neurons=num_fc_neurons,
                    dropout_rate=dropout_rate)

history = model.fit(
          train_generator,
          epochs=9,
          validation_data=val_generator,
          steps_per_epoch=steps_per_epoch_train,
          callbacks=[early_stop],
          class_weight=class_weights_dict
)

# Evaluate model on the validation data
val_loss, val_acc = model.evaluate(val_generator)
print(f"Validation loss: {val_loss:.3f}")
print(f"Validation accuracy: {val_acc:.3f}")

In [None]:
# CREATE TFLITE MODEL AND SAVE BOTH .h5 and tflite MODELS

# Destination of the converted model on the mounted Drive.
tflite_output_path = '/content/drive/MyDrive/RockData/my_dir/best_model.tflite'

# Saving the Keras model itself is currently disabled:
#best_model.save('/content/drive/MyDrive/RockData/my_dir/best_model.h5')

# Convert the tuner's best Keras model into a TensorFlow Lite flatbuffer.
converter = tf.lite.TFLiteConverter.from_keras_model(best_model)
tflite_model = converter.convert()

# Write the serialized model bytes to disk.
with open(tflite_output_path, 'wb') as tflite_file:
    tflite_file.write(tflite_model)

In [None]:
# Evaluate the saved .h5 model on the validation data

# Location of the saved Keras model on the mounted Drive.
h5_model_path = '/content/drive/MyDrive/RockData/my_dir/my_model.h5'

# Restore the model from disk.
loaded_model = tf.keras.models.load_model(h5_model_path)

# Score it on the validation generator and report both metrics.
h5_metrics = loaded_model.evaluate(val_generator)
val_loss, val_acc = h5_metrics
print(f"Validation loss: {val_loss:.3f}")
print(f"Validation accuracy: {val_acc:.3f}")

In [None]:
# Evaluate the saved tflite model on the validation data

# The .tflite file is the flatbuffer written by TFLiteConverter, not a Keras
# model, so it is run through the TFLite interpreter rather than
# keras.models.load_model. `loaded_model` is deliberately left untouched so
# the .h5 prediction cell keeps working.
tflite_eval_interpreter = tf.lite.Interpreter(
    model_path='/content/drive/MyDrive/RockData/my_dir/best_model.tflite')
tflite_eval_interpreter.allocate_tensors()
tflite_eval_input = tflite_eval_interpreter.get_input_details()[0]
tflite_eval_output = tflite_eval_interpreter.get_output_details()[0]

# Feed the validation images one at a time and accumulate categorical
# cross-entropy and accuracy by hand.
# NOTE(review): assumes the converted model takes a single image per call
# and that val_generator supports len() and integer indexing -- confirm.
total_loss = 0.0
num_correct = 0
num_samples = 0
for batch_index in range(len(val_generator)):
    batch_images, batch_labels = val_generator[batch_index]
    for sample, one_hot in zip(batch_images, batch_labels):
        model_input = np.expand_dims(sample, axis=0).astype(tflite_eval_input['dtype'])
        tflite_eval_interpreter.set_tensor(tflite_eval_input['index'], model_input)
        tflite_eval_interpreter.invoke()
        probabilities = tflite_eval_interpreter.get_tensor(tflite_eval_output['index'])[0]
        # Clip before the log so a zero probability cannot produce -inf.
        total_loss += -np.sum(one_hot * np.log(np.clip(probabilities, 1e-7, 1.0)))
        num_correct += int(np.argmax(probabilities) == np.argmax(one_hot))
        num_samples += 1

if num_samples == 0:
    raise ValueError('Validation generator produced no samples')

val_loss = total_loss / num_samples
val_acc = num_correct / num_samples
print(f"Validation loss: {val_loss:.3f}")
print(f"Validation accuracy: {val_acc:.3f}")

In [None]:
# TEST THE .h5 MODEL

# Set path to image
img_path = '/content/drive/MyDrive/RockData/my_dir/new_images/sandstone_sample_1.jpg'

# Read in image; cv2.imread returns None instead of raising when the file is
# missing or cannot be decoded, so fail here with a clear message.
img = cv2.imread(img_path)
if img is None:
    raise FileNotFoundError(f'Could not read image: {img_path}')

# Convert BGR to RGB color space
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Resize image to target size
target_size = (224, 224)
img = cv2.resize(img, target_size)

# Preprocess image
img = preprocess_input(img)

# Add batch dimension
img = np.expand_dims(img, axis=0)

# Get prediction on the image
prediction = loaded_model.predict(img)

# Print the predicted class and probability (index 0 selects the single
# image in the batch)
class_names = ['Basalt', 'Granite', 'Marble', 'Quartzite', 'Coal', 'Limestone', 'Sandstone']
predicted_class = np.argmax(prediction[0])
class_prob = prediction[0][predicted_class]
print(f'Predicted class: {class_names[predicted_class]}, probability: {class_prob}')

In [None]:
# TEST THE TFLITE MODEL

# Load the TFLite model from the same file the conversion cell writes
# (best_model.tflite).
interpreter = tf.lite.Interpreter(model_path="/content/drive/MyDrive/RockData/my_dir/best_model.tflite")
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Load and preprocess the image. The variable is not called `image` so the
# `matplotlib.image` module alias imported under that name is not overwritten.
tflite_image_path = "/content/drive/MyDrive/RockData/my_dir/new_images/sandstone_sample_1.jpg"
tflite_image = cv2.imread(tflite_image_path)
if tflite_image is None:
    # cv2.imread returns None for a missing or undecodable file.
    raise FileNotFoundError(f"Could not read image: {tflite_image_path}")
tflite_image = cv2.cvtColor(tflite_image, cv2.COLOR_BGR2RGB)
tflite_image = cv2.resize(tflite_image, (224, 224))
tflite_image = np.expand_dims(tflite_image, axis=0)
tflite_image = preprocess_input(tflite_image)
# Cast to the dtype the interpreter reports for its input tensor.
tflite_image = tflite_image.astype(input_details[0]["dtype"])

# Make a prediction on the image
interpreter.set_tensor(input_details[0]["index"], tflite_image)
interpreter.invoke()
output_data = interpreter.get_tensor(output_details[0]["index"])

# Print the predicted class and probability
class_names = ['Basalt', 'Granite', 'Marble', 'Quartzite', 'Coal', 'Limestone', 'Sandstone']
predicted_class = np.argmax(output_data[0])
class_prob = output_data[0][predicted_class]
print(f"Predicted class: {class_names[predicted_class]}, probability: {class_prob}")