In [None]:
# For PySpark
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# For Delta Lake
from delta import *

# Working with images
from PIL import Image
import io
import base64

import pickle
from itertools import groupby

# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.datasets import load_digits

import numpy as np
import matplotlib.pyplot as plt

import time

import os
import sys
# Point PySpark at the local Spark installation and pin both the driver and
# the worker Python to the interpreter that is running this kernel.
os.environ.update({
    "SPARK_HOME": "/opt/spark-3.0.1-bin-hadoop2.7",
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

In [None]:
# These 2 links include the jar files needed to interact with AWS S3
!wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -P $SPARK_HOME/jars/
!wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar -P $SPARK_HOME/jars/

In [None]:
# AWS credentials are read from the environment so that no secret is ever
# stored in (or committed with) the notebook. Export AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY before starting the Jupyter kernel.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Fail here, with a clear message, rather than later inside an S3 request.
if not AWS_ACCESS_KEY or not AWS_SECRET_KEY:
    raise RuntimeError(
        "AWS credentials not found: set the AWS_ACCESS_KEY_ID and "
        "AWS_SECRET_ACCESS_KEY environment variables before running this notebook."
    )

In [None]:
# Build (or reuse) the Spark session used by the rest of the notebook:
# 12g for driver and executors, the Delta Lake package, Delta's
# single-driver S3 log store, and the S3A credentials defined above.
spark = (
    SparkSession.builder
    .appName("drones")
    .config("spark.executor.memory", "12g")
    .config("spark.driver.memory", "12g")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
    .getOrCreate()
)

In [None]:
# Restore the previously pickled collection of image paths and show its size.
# NOTE: pickle.load executes arbitrary code from the file; only load a pickle
# this project produced itself.
with open('img_path.pickle', 'rb') as paths_file:
    img_path = pickle.load(paths_file)

len(img_path)

In [None]:
# Work on the first three tenths of the image paths only
# (presumably the ~1 TB subset the variable name refers to; confirm).
one_tenth = len(img_path) // 10
img_paths1TB = img_path[:one_tenth * 3]

len(img_paths1TB)

In [None]:
# One flight id (folder name) per selected image path: characters 13..44 of
# the path. Assumes every path shares the same fixed-width prefix; confirm
# against the layout of the pickled paths.
flight_ids = [path_name[13:45] for path_name in img_paths1TB]

len(flight_ids)

In [None]:
# Images per flight: the length of each run of identical, consecutive flight
# ids. groupby only merges neighbours, so a flight whose paths are not
# contiguous in flight_ids would show up as several separate runs.
flight_img_counts = [
    sum(1 for _ in images_of_flight)
    for _, images_of_flight in groupby(flight_ids)
]
print(flight_img_counts[:10])

In [None]:
# Number of consecutive flight-id runs found above, i.e. the number of flights
# when each flight's images are stored contiguously in flight_ids.
len(flight_img_counts)

In [None]:
# Turn VGG16 into a feature extractor: keep the network's inputs but read the
# output of the second-to-last layer instead of the final layer.
vgg16_full = VGG16()
penultimate_output = vgg16_full.layers[-2].output
model = Model(inputs=vgg16_full.inputs, outputs=penultimate_output)

In [None]:
# Dictionary meant to collect the extracted image features; later cells stack
# training.values() into an (n, 4096) matrix for PCA and clustering.
# Re-running this cell discards everything collected so far.
training = {}

In [None]:
# Extract VGG16 features for the training split (the first fifth of the
# images) of every flight from START_FLIGHT onward, storing one feature array
# per image in `training`.
#
# Relies on state created by earlier cells: `spark`, `model`, `training`,
# `flight_ids` and `flight_img_counts`.
#
# NOTE(review): flights before START_FLIGHT are skipped; this looks like a
# resume point from an earlier run. Set it to 0 to process every flight.
START_FLIGHT = 56

k = 0  # running image counter, used as the key into `training`
# Stop at len(flight_img_counts) - 1: the previous range ran one flight past
# the end and indexed flight_ids out of bounds on its last iteration.
for i in range(START_FLIGHT, len(flight_img_counts)):
    starting_time = time.time()
    # Position in flight_ids of the first image belonging to flight i.
    prv_n_images = sum(flight_img_counts[:i])
    flight_id = flight_ids[prv_n_images]

    s3_bucket_name = "s3a://drones-project-test/" + flight_id + "/"
    df = spark.read.format("delta").load(s3_bucket_name)
    print("Total time to read back flight", i, "(#", flight_id, ") from our s3 bucket took", time.time()-starting_time, "seconds.")

    # Bring every encoded image of the flight to the driver with a single
    # Spark job (collect) instead of a count() followed by take(count).
    images_rdd = df.rdd.map(lambda row: row["img_content"])
    content = images_rdd.collect()
    n_images = len(content)
    print("Starting feature extraction for flight", i, "at:", time.time()-starting_time)
    one_fifth = n_images//5
    for j in range(one_fifth): #test set: 4*one_fifth
        # The first character of the stored value is dropped before base64
        # decoding (presumably a prefix added when the image was written; confirm).
        png_decoded = base64.b64decode(content[j][1:]) # j + one_fifth
        print("Image decoded", time.time()-starting_time)
        img = Image.open(io.BytesIO(png_decoded))
        img2 = np.array(img)
        print("Reshaping image", time.time()-starting_time)
        # Assumes the stored image is already 224x224 with 3 channels;
        # reshape raises ValueError otherwise.
        reshaped_img = img2.reshape(1,224,224,3)
        imgx = preprocess_input(reshaped_img)
        print("Extracting features", time.time()-starting_time)
        # use_multiprocessing only applies to generator/Sequence inputs, so it
        # is not passed for this in-memory array.
        features = model.predict(imgx)
        # Store the result; without this `training` stays empty and the
        # PCA / KMeans cells below have nothing to work on.
        training[k] = features
        k = k + 1

    print("Done with flight", i)
    # Release the per-flight data before loading the next flight.
    del images_rdd, df, s3_bucket_name, content

In [None]:
# How many feature entries have been collected so far.
len(training)

In [None]:
# Second name for the same dictionary object: this is an alias, not a copy,
# so any change made through `training` is visible through `training2` too.
training2 = training

In [None]:
# Persist the collected features so they can be reloaded without re-running
# the extraction.
with open('features.pickle', 'wb') as features_out:
    pickle.dump(training2, features_out)

In [None]:
# Reload the saved features into `training`.
# NOTE: pickle.load executes arbitrary code from the file; only load a pickle
# this notebook wrote itself.
with open('features.pickle', 'rb') as features_in:
    training = pickle.load(features_in)

In [None]:
# Stack the stored feature arrays into an (n_samples, 4096) matrix, then
# reduce it to 20 principal components. fit and transform are kept as two
# separate steps on the same data.
feat = np.array([*training.values()]).reshape(-1, 4096)
pca = PCA(n_components=20, random_state=22).fit(feat)
x = pca.transform(feat)

print("Components before PCA:", feat.shape[1])
print("Components after PCA:", pca.n_components)

In [None]:
# Cluster the PCA-reduced features into 4 groups (fixed seed for
# repeatability) and display the cluster label assigned to each sample.
kmeans = KMeans(n_clusters=4, random_state=22).fit(x)
kmeans.labels_

In [None]:
# Cluster labels of the first 2000 samples.
kmeans.labels_[:2000]

In [None]:
# Elbow analysis: fit KMeans for k = 2..19 on the PCA-reduced features and
# record each model's inertia (within-cluster sum of squared distances).
list_k = list(range(2, 20))
sse = []
for k in list_k:
    km = KMeans(n_clusters=k, random_state=22).fit(x)
    sse.append(km.inertia_)

# Plot sse against k
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(list_k, sse)
ax.set_xlabel(r'Number of clusters *k*')
ax.set_ylabel('Sum of squared distance')