# E2E Pipeline RGB Basic
Trying something different:
- Input satellite images
- Do feature extraction without fine tuning
- Linear regression on top of features extracted

In [1]:
import glob
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import PIL
import PIL.Image
import tensorflow as tf
from keras import Sequential
from keras.src.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.src.losses import MeanSquaredError
from keras.src.metrics import RootMeanSquaredError
from keras.src.optimizers import Adam
from keras.src.utils import plot_model
from sklearn.model_selection import StratifiedShuffleSplit

# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.options.mode.chained_assignment = None

# Enable GPU    

In [2]:
# Report the TensorFlow build in use and whether the first GPU is visible to it.
print('TensorFlow version: {}'.format(tf.__version__))
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at {}'.format(device_name))
else:
    print('GPU device not found - On for CPU time!')

# Understand images

In [3]:
# Scan every tile once, record its dimensions, and derive the smallest
# width/height found in the training directory.
path = '../outputs/tiled-satellite-images-rgb'

training_img = glob.glob(f'{path}/*.png')
print('There are {} images in the training directory'.format(len(training_img)))

img_sz = {'width': list(),
          'height': list()} # store image attributes for further analysis

for im in training_img:
    # The context manager closes the file handle as soon as the size is read,
    # so scanning a large directory does not leak open files.
    with PIL.Image.open(im) as img:
        w, h = img.size
    img_sz['width'].append(w)
    img_sz['height'].append(h)

# True minimum over all tiles (previously capped at the 1000 px start value).
# default=1000 keeps the old fallback when the directory holds no images.
width = min(img_sz['width'], default=1000)
height = min(img_sz['height'], default=1000)

IMG_WIDTH = width
IMG_HEIGHT = height
IMG_CHANNELS = 3

print('Min training image width: {} px'.format(IMG_WIDTH))
print('Min training image height: {} px'.format(IMG_HEIGHT))

Reading and displaying images

### Build the training dataset

In [27]:
# Split the clustered data into training/eval sets (80/20, stratified on the
# QoL score) and export each set as a header-less "filename,score" CSV.
data_path = '../outputs/clustered-data-gauteng.csv'
working_dir = '../outputs/hack/working'
SEED = 42  # fixed seed so the train/eval split is identical on every run

data = pd.read_csv(data_path, dtype={"image":"string"})

# Use stratified sampling
# NOTE(review): the score column itself is used as the stratification label,
# so every distinct score value must occur often enough to be split - confirm
# this holds for new data.
sssplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
print(data.shape)
print(data['QoLIndex_Data_Driven'].shape)
train_index, test_index = next(sssplit.split(data, data['QoLIndex_Data_Driven']))
# .copy() yields independent frames, so adding the 'filename' column below
# never writes into a slice of `data`.
training_set = data.iloc[train_index].copy()
eval_set = data.iloc[test_index].copy()

# Visually check the distribution of QoLIndex_Data_Driven score in training and test sets
training_set['QoLIndex_Data_Driven'].hist(label='Training set')
eval_set['QoLIndex_Data_Driven'].hist(label='Eval set')
plt.title('QoLIndex_Data_Driven score distribution in training and test set')
plt.xlabel('QoLIndex_Data_Driven score')
plt.ylabel('Count')
plt.legend(loc='upper right')
plt.show()

# Export training and test sets as .csv files
# Create the output directory first so to_csv does not fail on a fresh checkout.
os.makedirs(working_dir, exist_ok=True)
training_set['filename'] = training_set['image'].apply(lambda x: f'../outputs/tiled-satellite-images-rgb/{x}.png')
training_set[['filename', 'QoLIndex_Data_Driven']].to_csv(f'{working_dir}/training_set.csv', header=False, index=False)
eval_set['filename'] = eval_set['image'].apply(lambda x: f'../outputs/tiled-satellite-images-rgb/{x}.png')
eval_set[['filename', 'QoLIndex_Data_Driven']].to_csv(f'{working_dir}/eval_set.csv', header=False, index=False)

### Some functions

In [28]:
def read_and_decode(filename, reshape_dims):
    """Load a PNG file as a float32 image tensor resized to `reshape_dims`.

    Args:
        filename: Path of the PNG file to read.
        reshape_dims: Target size passed to `tf.image.resize`
            (called with `[height, width]` elsewhere in this notebook).

    Returns:
        The decoded image with `IMG_CHANNELS` channels, converted to
        float32 and resized.
    """
    # Raw file contents as a byte string tensor
    raw_bytes = tf.io.read_file(filename)
    # Decode the bytes into an image tensor with IMG_CHANNELS channels
    pixels = tf.image.decode_png(raw_bytes, channels=IMG_CHANNELS)
    # Convert the integer pixels to float32 (scaled into the [0, 1] range)
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    # Bring the image to the requested size
    resized = tf.image.resize(pixels, reshape_dims)
    return resized

def show_image(filename):
    """Draw the image stored at `filename` on the current matplotlib axes, without axis ticks."""
    pixels = read_and_decode(filename, [IMG_HEIGHT, IMG_WIDTH]).numpy()
    plt.imshow(pixels)
    plt.axis('off')

def decode_csv(csv_row):
    """Turn one `filename,score` CSV line into an `(image, score)` training pair.

    Args:
        csv_row: A single line of the header-less CSV: image path, then
            the numeric target value.

    Returns:
        A tuple `(image, service_index)`: the decoded image resized to
        `[IMG_HEIGHT, IMG_WIDTH]` and the target as a float32 scalar.
    """
    # Empty typed tensors mark both columns as required and fix their dtypes,
    # so the score is parsed as float32 by decode_csv itself instead of being
    # read as a string and coerced afterwards.
    record_defaults = [tf.constant([], dtype=tf.string),
                       tf.constant([], dtype=tf.float32)]
    filename, service_index = tf.io.decode_csv(csv_row, record_defaults)
    image = read_and_decode(filename, [IMG_HEIGHT, IMG_WIDTH])
    return image, service_index

## Import training & eval datasets

In [29]:
# Fix the working image size for the model and preview one random tile.
IMG_WIDTH = 256
IMG_HEIGHT = 256
IMG_CHANNELS = 3

path = '../outputs/tiled-satellite-images-rgb'
training_img = glob.glob(f'{path}/*.png')
if not training_img:
    raise FileNotFoundError(f'No .png images found in {path}')
# randint's upper bound is exclusive, so len(training_img) already makes the
# last image selectable and also works when there is only one image.
rand_idx = np.random.randint(0, len(training_img))
rand_img = training_img[rand_idx]

show_image(rand_img)

In [30]:
BATCH_SIZE = 256


def _csv_to_batched_dataset(csv_path):
    """Stream `csv_path` line by line, decode each row with `decode_csv`, and batch by BATCH_SIZE."""
    lines = tf.data.TextLineDataset(csv_path)
    decoded = lines.map(decode_csv)
    return decoded.batch(BATCH_SIZE)


# Same pipeline for both splits; only the source CSV differs.
train_dataset = _csv_to_batched_dataset('../outputs/hack/working/training_set.csv')
eval_dataset = _csv_to_batched_dataset('../outputs/hack/working/eval_set.csv')

## Build CNN

In [31]:
# Build model
# Minimal CNN regressor: one conv layer, one pooling layer, then a single
# linear output unit on the flattened feature map.
model = Sequential()
model.add(Conv2D(64, (3, 3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
# No activation on the output: the model predicts an unbounded real value.
model.add(Dense(units=1, activation=None))

In [32]:
# Print the layer-by-layer summary of the model.
model.summary()

In [33]:
# Draw the model graph with tensor shapes shown and layer names hidden.
# NOTE(review): plot_model usually needs pydot and Graphviz installed - confirm for this environment.
plot_model(model, show_shapes=True, show_layer_names=False)

In [34]:
# Optimise mean-squared error with Adam (default settings) and track RMSE as the metric.
model.compile(
    optimizer=Adam(),
    loss=MeanSquaredError(),
    metrics=[RootMeanSquaredError()],
)

In [35]:
%%time

history = model.fit(train_dataset, validation_data=eval_dataset, epochs=10, batch_size=BATCH_SIZE)

In [36]:
def training_plot(metrics, history):
    """Plot training and validation curves, one panel per metric.

    Args:
        metrics: Names of the metrics to plot; each must be a key of
            `history.history`. The matching `'val_' + name` series is drawn
            too when it is present.
        history: Object with a `history` dict mapping metric names to
            per-epoch value lists (as returned by `model.fit`).

    Returns:
        None. The panels are drawn on a new matplotlib figure.
    """
    if not metrics:
        # Nothing to draw; subplots() cannot create a zero-column grid.
        return
    # squeeze=False always returns a 2-D grid of axes, so indexing also works
    # when a single metric is requested.
    f, ax = plt.subplots(1, len(metrics), figsize=(5*len(metrics), 5), squeeze=False)
    for idx, metric in enumerate(metrics):
        panel = ax[0, idx]
        panel.plot(history.history[metric], ls='dashed', label='train_'+metric)
        val_key = 'val_'+metric
        # Validation values only exist when fit() was given validation data.
        if val_key in history.history:
            panel.plot(history.history[val_key], label=val_key)
        panel.set_xlabel('Epochs')
        panel.set_ylabel(metric)
        panel.legend()

In [37]:
# Plot the per-epoch training and validation curves for the loss and the RMSE metric.
training_plot(['loss', 'root_mean_squared_error'], history)

## Notes & Takeaways

- I followed [this tutorial](https://www.kaggle.com/code/emilyrosesteyn/convolutional-neural-network-for-image-regression) for image regression.
- The stratified shuffle split requires more image data than I had clustered.
- I want to build my pipeline:
    - Data downloading (satellite image data, GCRO data, municipal boundaries data, DHS data for SA)
    - Module for clustering data
    - Module for downloading satellite images for clusters
    - Class for feature extraction/encoding - see geocolab