In [15]:
import os
import numpy as np
import cv2
import boto3
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import tempfile
import time

Note: login credentials should not be embedded in the code. Doing so is acceptable only for quick testing, and future versions of this notebook should avoid it.

In [None]:
# AWS credentials are read from the environment rather than hard-coded in the
# notebook. When a variable is unset the value is None, which lets boto3 fall
# back to its default credential chain (shared credentials file, instance role, ...).
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
s3_bucket_name = "ca1-simpsons"
dataset = "simpsons_dataset"  # key prefix (top-level "folder") of the images inside the bucket

percentage_to_use = 1 # fraction of each character's images to load; lower it to test the code quickly, 1 meaning 100%

img_size = 64 # images are resized to img_size x img_size pixels
batch_size = 128
epochs = 30
input_shape = (img_size, img_size, 3)  # height, width, colour channels

In [None]:
# Class label -> character folder name. The labels are simply the positions
# of the names in this list, starting at 0.
map_characters = dict(enumerate([
    'abraham_grampa_simpson',
    'apu_nahasapeemapetilon',
    'bart_simpson',
    'charles_montgomery_burns',
    'chief_wiggum',
    'comic_book_guy',
    'edna_krabappel',
    'homer_simpson',
    'kent_brockman',
    'krusty_the_clown',
    'lisa_simpson',
    'marge_simpson',
    'milhouse_van_houten',
    'moe_szyslak',
    'ned_flanders',
    'nelson_muntz',
    'principal_skinner',
    'sideshow_bob',
]))

# Number of output classes of the classifier.
num_classes = len(map_characters)

Create the Spark session, setting the memory of the executors (workers) and of the driver to 28 GB each (t2.2xlarge EC2 instances).

In [None]:
# Submit arguments for PySpark: request the hadoop-aws 3.3.1 package.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-aws:3.3.1 pyspark-shell'

# Application name plus 28g of memory for both the executors and the driver.
conf = SparkConf()
conf.setAppName("TrainCNN")
conf.set("spark.executor.memory", "28g")
conf.set("spark.driver.memory", "28g")

# Session used by the rest of the notebook, and its underlying SparkContext.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

The function below is quite complex; essentially, it iterates through the S3 bucket and loads the dataset images for each character.

This time the S3 client is created inside the function, because in a distributed system each worker needs to have its own connection.

It then reads each image and resizes it to 64x64 pixels with 3 colour channels (note that OpenCV decodes the channels in BGR order rather than RGB).

Finally, it normalizes the pixel values to the range [0, 1] by dividing them by 255.

In [None]:
def load_data(character_id, character_name, percentage=0.1):
    """Download and preprocess the images of one character from S3.

    The S3 client is created inside the function so that every Spark worker
    running it opens its own connection.

    Args:
        character_id: Integer class label attached to every returned image.
        character_name: Name of the character's folder below the dataset prefix.
        percentage: Fraction (0-1) of the character's objects to load. Objects
            are ordered by ``LastModified`` and the oldest ones are taken first.

    Returns:
        A list of ``(image, character_id)`` tuples. Each image is a float32
        array resized to ``img_size`` x ``img_size`` with values scaled to
        [0, 1]; the channels are in OpenCV's BGR order. Objects that cannot be
        decoded as an image are skipped.
    """
    data = []
    character_folder = f"{s3_bucket_name}/{dataset}/{character_name}"
    print(f"Loading data from: {character_folder}")

    s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
    paginator = s3.get_paginator("list_objects_v2")

    # The trailing slash keeps the prefix from also matching sibling folders
    # whose name merely starts with this character's name.
    prefix = f"{dataset}/{character_name}/"

    # Keys ending in "/" are folder placeholder objects, not images.
    all_objects = [
        obj
        for page in paginator.paginate(Bucket=s3_bucket_name, Prefix=prefix)
        for obj in page.get("Contents", [])
        if not obj['Key'].endswith("/")
    ]

    # Sorting makes the selected subset deterministic between runs.
    all_objects.sort(key=lambda x: x['LastModified'])

    subset_objects = all_objects[:int(len(all_objects) * percentage)]

    for obj in subset_objects:
        img_path = obj['Key']
        print(f"Loading image: {img_path}")

        with tempfile.TemporaryFile() as fp:
            s3.download_fileobj(s3_bucket_name, img_path, fp)
            fp.seek(0)
            raw = fp.read()

        # imdecode returns None when the bytes are not a readable image and
        # rejects an empty buffer, so guard both cases instead of crashing
        # the whole Spark task in cv2.resize.
        img = cv2.imdecode(np.frombuffer(raw, np.uint8), cv2.IMREAD_COLOR) if raw else None
        if img is None:
            print(f"Skipping unreadable image: {img_path}")
            continue

        img = cv2.resize(img, (img_size, img_size)).astype('float32') / 255
        data.append((img, character_id))

    return data

Distribute the list of characters across the workers as an RDD.

The list of images that `load_data` returns for each character is then flattened into a new RDD.

In [None]:
start_time = time.time()

# One partition per character, so each character's folder is fetched by its own task.
characters_rdd = sc.parallelize(list(map_characters.items()), numSlices=len(map_characters))

# cache() keeps the downloaded images on the executors so that later actions
# on data_rdd do not have to download everything from S3 again.
data_rdd = characters_rdd.flatMap(lambda x: load_data(x[0], x[1], percentage=percentage_to_use)).cache()

# flatMap is lazy: without an action nothing is downloaded and the timer would
# only measure how long it takes to describe the job. count() forces the load.
num_images = data_rdd.count()
print(f"Loaded {num_images} images")
print("Data Loading Time:", time.time() - start_time)

Now the driver (master) node collects the transformed data and trains the model on it.

In [None]:
# Bring every (image, character_id) tuple of the RDD back to the driver as a
# Python list; the whole dataset has to fit in the driver's memory.
collected_data = data_rdd.collect() # Memory intensive

Split the data into training and test sets

In [None]:
# Separate the (image, character_id) tuples into parallel lists.
images = [image for image, _ in collected_data]
character_ids = [character_id for _, character_id in collected_data]

# Stack the images into one array and one-hot encode the labels.
X = np.array(images)
Y = to_categorical(character_ids, num_classes)

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# CNN classifier: three convolution + max-pooling stages (32, 64 and 128
# filters, batch normalization after the first two), followed by a dense
# layer with dropout and a softmax output with one unit per character.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(map_characters), activation='softmax'),
])

# Adam with a learning rate that starts at 1e-3 and is multiplied by 0.9
# every 1000 optimizer steps.
lr_schedule = ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=1000,
    decay_rate=0.9,
)
opt = Adam(learning_rate=lr_schedule)

model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Random augmentation applied on the fly to the training images only.
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

# steps_per_epoch has to be a whole number of batches. Plain division gives a
# float, so round up to make the last, partial batch part of every epoch.
steps_per_epoch = int(np.ceil(len(X_train) / batch_size))

model.fit(datagen.flow(X_train, Y_train, batch_size=batch_size),
          validation_data=(X_test, Y_test),
          steps_per_epoch=steps_per_epoch, epochs=epochs)

# NOTE: the same held-out set is used for validation during training and for
# this final evaluation.
loss, accuracy = model.evaluate(X_test, Y_test)

print(f'Test accuracy: {accuracy * 100:.2f}%')

In [None]:
# Persist the trained model to "model.h5" (path is relative to the working directory).
model.save("model.h5")

# Shut down the Spark context and the Spark session now that training is done.
sc.stop()
spark.stop()