In [1]:
# For extracting files from Amazon S3 Buckets
import boto3
from botocore import UNSIGNED
from botocore.client import Config

# For PySpark
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# For Delta Lake
from delta import *

# Working with images
from PIL import Image
import io
import base64

# To speed up, track time
import multiprocessing
import time
import tqdm

# to dump data if something happens
import pickle

#to group flight ids and get counts
from itertools import groupby

import os
import sys
# Point PySpark at the local Spark install and run both the driver and the
# workers on the same interpreter as this kernel.
os.environ.update({
    "SPARK_HOME": "/opt/spark-3.0.1-bin-hadoop2.7",
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

In [2]:
# These 2 links include the jar files needed to interact with AWS S3
!wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -P $SPARK_HOME/jars/
!wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar -P $SPARK_HOME/jars/

--2022-05-03 23:24:21--  https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11948376 (11M) [application/java-archive]
Saving to: ‘/opt/spark-3.0.1-bin-hadoop2.7/jars/aws-java-sdk-1.7.4.jar’


2022-05-03 23:24:21 (108 MB/s) - ‘/opt/spark-3.0.1-bin-hadoop2.7/jars/aws-java-sdk-1.7.4.jar’ saved [11948376/11948376]

--2022-05-03 23:24:22--  https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 126287 (123K) [application/java-archive]
Saving to: ‘/opt/spark-3.0.1-bin-hadoop2.7/jars/hadoop-aws-2.7

In [3]:
# Build (or reuse) the Spark session for this notebook: 25g for the driver and
# for each executor, the Delta Lake 0.7.0 package pulled in through
# spark.jars.packages, and S3SingleDriverLogStore as the Delta log store.
spark = (
    SparkSession.builder
    .appName("drones")
    .config("spark.executor.memory", "25g")
    .config("spark.driver.memory", "25g")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .getOrCreate()
)

sc = spark.sparkContext

# Two string columns for every row written to the Delta tables.
schema = StructType([
    StructField("img_path", StringType()),
    StructField("img_content", StringType()),
])
spark

In [None]:
# Load the pickled collection of image paths and show how many there are.
# pickle can execute arbitrary code while loading: only read files produced by this project.
with open('../img_path.pickle', 'rb') as pickle_in:
    img_path = pickle.load(pickle_in)
len(img_path)

In [None]:
# Split the image paths into ten consecutive batches. The first nine hold
# exactly one_tenth paths each; the last one also takes the remainder.
one_tenth = len(img_path) // 10
(tenth1, tenth2, tenth3, tenth4, tenth5,
 tenth6, tenth7, tenth8, tenth9) = (
    img_path[k * one_tenth:(k + 1) * one_tenth] for k in range(9)
)
last_tenth = img_path[9 * one_tenth:]
# Sanity check: the batch sizes should add up to len(img_path).
sum(len(part) for part in (tenth1, tenth2, tenth3, tenth4, tenth5,
                           tenth6, tenth7, tenth8, tenth9, last_tenth))

In [None]:
# Number of CPUs reported for this machine; used below as the size of the
# multiprocessing pool that downloads the images.
cpus = multiprocessing.cpu_count()
cpus

In [None]:
# Requests are sent unsigned (no AWS credentials are attached).
unsigned_config = Config(signature_version=UNSIGNED)
s3 = boto3.resource('s3', config=unsigned_config)
# Source bucket the images are downloaded from.
bucket = s3.Bucket("airborne-obj-detection-challenge-training")

In [None]:
def download_images(image_name):
    """Download one image from ``bucket`` and return it as base64 text.

    The image is converted to RGB, resized to 224x224, re-encoded as PNG and
    then base64-encoded.

    Parameters
    ----------
    image_name : str
        Object key of the image inside the module-level ``bucket``.

    Returns
    -------
    str
        Base64 text of the resized PNG. The encoded bytes are decoded as
        ASCII, so the result is the bare base64 payload. Calling ``str()`` on
        the bytes (the previous behaviour) yields the Python repr
        ``"b'...'"``, which is not valid base64; data written by the earlier
        version still carries that ``b'`` prefix and trailing quote.
    """
    img_s3 = bucket.Object(image_name)
    img_content = img_s3.get()['Body'].read()
    img_PIL = Image.open(io.BytesIO(img_content))
    img_smaller = img_PIL.convert('RGB').resize((224, 224))

    # Re-encode in memory; nothing is written to local disk.
    temp_img = io.BytesIO()
    img_smaller.save(temp_img, format="png")
    png_encoded = base64.b64encode(temp_img.getvalue())

    return png_encoded.decode("ascii")

In [None]:
starting_time = time.time()

# Download and encode the batch in parallel, one worker process per CPU.
# imap yields results in input order, so img_content lines up with the batch;
# the list must be materialised before the pool is torn down on exit.
with multiprocessing.Pool(cpus) as pool:
    progress = tqdm.tqdm(pool.imap(download_images, tenth3), total=len(tenth3))  # switch name
    img_content = list(progress)

elapsed_download = time.time() - starting_time
print("Multiprocessing time for Part 1.2 with", cpus, " Cores:", elapsed_download)

In [None]:
# Number of encoded images produced by the parallel download step.
len(img_content)

In [None]:
# save a copy as a list
# list(...) builds a separate list object; a plain assignment would only add a
# second name for the same list, so clearing or mutating img_content would
# also change the "copy".
img_content2 = list(img_content)
len(img_content2)

In [None]:
# Persist the downloaded image content so it can be restored without re-downloading.
# NOTE(review): the file name says "tenth1" while the cells in this notebook
# process tenth3 - confirm the name is switched together with the batch.
with open('../tenth1.0.pickle', 'wb') as pickle_out:
    pickle.dump(img_content, pickle_out)

In [None]:
# Recovery path: if the kernel dies, uncomment the lines below to reload the image
# content from the pickle file instead of re-downloading it.
#with open('../tenth1.0.pickle','rb') as file:
#    img_content = pickle.load(file)
#len(img_content)

In [None]:
# Flight id (folder name) for every image path in the batch.
# Characters 13..44 of each path are taken as the 32-character id
# (assumes all paths share that layout - TODO confirm).
flight_ids = [path_name[13:45] for path_name in tenth3]  # switch name
len(flight_ids)

In [None]:
# Number of images in each run of consecutive identical flight ids.
# groupby only merges adjacent equal items, so the counts follow the order of flight_ids.
flight_img_counts = [sum(1 for _ in images) for _, images in groupby(flight_ids)]
print(flight_img_counts[:10])

In [None]:
# Number of consecutive flight-id groups found in this batch.
len(flight_img_counts)

In [None]:
starting_time = time.time()

# Every downloaded image needs exactly one flight id; zip() below would
# silently drop the surplus rows if the two lists ever got out of step
# (e.g. img_content restored from the pickle of a different batch).
if len(img_content) != len(flight_ids):
    raise ValueError(
        "img_content and flight_ids must have the same length, got %d and %d"
        % (len(img_content), len(flight_ids))
    )

# Write one Delta table per flight. flight_img_counts[k] is the number of
# consecutive images belonging to the k-th flight, so a running offset gives
# each flight's slice. (Incrementing the index before using it skipped the
# first flight and raised IndexError on the last iteration.)
flight_start = 0  # offset of the current flight's first image within this batch
for i, n_images in enumerate(flight_img_counts, start=1):
    starting_time2 = time.time()
    flight_end = flight_start + n_images
    flight_ids1 = flight_ids[flight_start:flight_end]
    img_content1 = img_content[flight_start:flight_end]
    flight_id = flight_ids1[0]  # groupby never yields an empty group, so index 0 exists

    print("Starting upload for flight", i)
    # Pair the rows on the driver: parallelize -> zip -> collect produced the
    # same list of tuples after a needless round-trip through the executors.
    # NOTE(review): the first column of `schema` is named img_path but receives
    # the flight id, so every row of a flight carries the same value - confirm
    # whether the full image path was intended here.
    rows = list(zip(flight_ids1, img_content1))
    df = spark.createDataFrame(rows, schema)

    s3_bucket_name = "s3a://drones-project-test/" + flight_id + "/"
    # mode("append") adds to an existing table, so re-running a batch duplicates its rows.
    df.write.format("delta").mode("append").save(s3_bucket_name)
    print("Total time to upload flight", i, "(#", flight_id, ") to our own s3 bucket took", time.time()-starting_time2, "seconds.")

    # Release the per-flight objects before building the next flight's rows.
    del img_content1
    del rows
    del df
    flight_start = flight_end

In [None]:
# Wall-clock time since the upload loop's starting_time was taken.
# NOTE(review): the flight count in the message is hard-coded - confirm it matches this batch.
total_upload_seconds = time.time() - starting_time
print("Upload time for 131 (1 tenth of the) flights with", cpus, "cpus:", total_upload_seconds)