In [4]:
import boto3
import matplotlib.pyplot as plt
import re
import os
import glob
import scipy.io
import numpy as np
import io
import tqdm
import numpy as np
import pandas as pd
from PIL import Image
from multiprocessing import Pool

# Utils

In [5]:
def image_name_formating(aws_key_object):
    """
    Turn an S3 object key into a date-like local file name.

    The last four '/'-separated components of the key are used: the first three are
    joined with '-', and the first four characters of the final component are read
    as two 2-character fields separated by ':'.
    e.g. 'a/b/2017/03/21/1230.jpg' -> '2017-03-21 12:30:00.jpg'

    :param str aws_key_object: key of the object in the bucket
    :return: str formatted file name, always ending in ':00.jpg'
    """
    *date_parts, file_name = aws_key_object.split('/')[-4:]
    return '{} {}:{}:00.jpg'.format('-'.join(date_parts), file_name[:2], file_name[2:4])

def day_of_the_year_computation(universal_time):
    """
    Express a timestamp as a fractional day of the year.

    The integer part is the calendar day of the year; the fractional part is built
    from the hour and minute fields only (seconds are not taken into account).

    :param numpy.datetime64 universal_time: timestamp to convert
    :return: float day of year
    """
    timestamp = pd.to_datetime(universal_time)
    hour_fraction = timestamp.hour / 24.0
    minute_fraction = timestamp.minute / (24.0 * 60)
    return timestamp.dayofyear + hour_fraction + minute_fraction


def zenith_computation(universal_time, longitude, latitude):
    """
    Computes the zenith matrix associated with longitude, latitude at a given universal time,
    reference NOAA Global Monitoring Division
    :param numpy.datetime64 universal_time: universal time
    :param numpy.ndarray longitude: array of the considered longitude (degrees)
    :param numpy.ndarray latitude: array of the considered latitude (degrees)
    :return:numpy.ndarray zenith: matrix of the zenith in degrees, same shape as latitude
    :raises ValueError: if longitude and latitude do not have the same shape
    """
    if longitude.shape != latitude.shape:
        raise ValueError('longitude and latitude must have the same shape, got {} and {}'
                         .format(longitude.shape, latitude.shape))

    day_of_year = day_of_the_year_computation(universal_time)
    x = 2 * np.pi / 365 * (day_of_year - 1)  # day of year in radians, named x because heavily referenced

    # Equation of time, converted from radians to minutes
    eqtime = (0.000075 + 0.001868 * np.cos(x) - 0.032077 * np.sin(x) - 0.014615 * np.cos(2 * x)
              - 0.040849 * np.sin(2 * x)) * ((24 * 60) / (2 * np.pi))

    # Solar declination in radians
    solar_declination = 0.006918 - 0.399912 * np.cos(x) + 0.070257 * np.sin(x) \
                        - 0.006758 * np.cos(2 * x) + 0.000907 * np.sin(2 * x) \
                        - 0.002697 * np.cos(3 * x) + 0.001480 * np.sin(3 * x)

    # 4 minutes of solar time per degree of longitude
    longitude_corrected = 4*longitude
    # The offset is truncated to whole minutes by the cast
    offset = (longitude_corrected + eqtime).astype('timedelta64[m]')

    # Minutes elapsed since local solar midnight, computed on the whole grid at once.
    # Going through datetime64[s] keeps this working whatever the unit of universal_time
    # (a per-element conversion to Python datetime objects breaks for nanosecond input).
    true_solar_time = (offset + universal_time).astype('datetime64[s]')
    solar_midnight = true_solar_time.astype('datetime64[D]')
    true_solar_time_minutes = (true_solar_time - solar_midnight) / np.timedelta64(1, 'm')

    hour_angle = (true_solar_time_minutes/4 - 180)*np.pi/180
    cos_zenith = (np.cos(solar_declination)*np.cos(latitude*np.pi/180)*np.cos(hour_angle)
                  + np.sin(solar_declination)*np.sin(latitude*np.pi/180))
    # Round-off can push the cosine marginally outside [-1, 1], which would give NaN
    zenith = np.arccos(np.clip(cos_zenith, -1.0, 1.0))

    return zenith * 180 / np.pi

# Constants

In [6]:
# Local scratch directory for downloaded images; created up front if missing
TEMP_DIR = '/tmp/images/'
os.makedirs(TEMP_DIR, exist_ok=True)
# S3 bucket the images are listed from and the reference images are written to
bucket_name  = 'edp.engie-digital.prod.instance1.prod.data'

# Latitude / longitude arrays read from the 'geographicCoordinate' struct of the MATLAB file;
# they are passed to zenith_computation, which requires both to have the same shape
coordinates = scipy.io.loadmat('geographicCoordinate.mat')
latitude = coordinates['geographicCoordinate']['latitude'][0][0]
longitude = coordinates['geographicCoordinate']['longitude'][0][0]

# Percentiles handed to np.percentile in compute_image_ref to pick the rank window
percentiles = (3,6)

# Number of chunks the archive list is split into, and number of worker processes in the pool
num_partitions = 53
num_cores = 28 

# Connection

In [7]:
# Resource-level S3 handle, used through `bucket` to list and download objects
s3 = boto3.resource('s3')

bucket = s3.Bucket(bucket_name)

# Low-level S3 client, used for the put_object uploads
s3_client = boto3.client('s3')

In [8]:
%%time
# Keys of every object stored under the processed-images prefix of the bucket
processed_images = [x.key for x  in bucket.objects.filter(Prefix='LCV_SATELLITE_IMAGE_24/lcv_visual_images/processed/')]

CPU times: user 10.4 s, sys: 64.2 ms, total: 10.5 s
Wall time: 18.6 s


In [9]:
# Unique (5th, 7th) '/'-separated components of the processed keys; compute_image_ref
# unpacks each pair as (month, hour), the second element being the image file name.
# Each key is split only once, and the set is sorted so that the order (and therefore
# the partitioning across workers) is the same from one kernel run to the next.
archives = sorted({(parts[4], parts[6])
                   for parts in (key.split('/') for key in processed_images)})
# Keep only the files whose two characters before the extension (presumably the minutes
# of an 'HHMM' name -- TODO confirm) are a multiple of 15
archives = [elt for elt in archives if (int(elt[1].split('.')[0][-2:])%15 == 0)]

In [11]:
# Number of (month, file name) pairs kept for processing
len(archives)

848

In [13]:
def parallelize_dataframe(list_archives, func):
    """
    Apply func to chunks of list_archives in a pool of worker processes.

    list_archives is split into num_partitions chunks with np.array_split, each chunk is
    handed to func in a pool of num_cores processes, and the results are concatenated.

    :param list list_archives: items to process
    :param callable func: function taking one chunk (numpy.ndarray) and returning a pandas object;
        it is sent to the workers through pool.map
    :return: concatenation (pandas.concat) of the objects returned by func
    """
    df_split = np.array_split(list_archives, num_partitions)
    pool = Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always shut the workers down, even when a chunk raised, so no process is leaked
        pool.close()
        pool.join()
    return pd.concat(results)

In [50]:
def compute_image_ref(arch):
    """
    Build and upload the clear-sky and overcast reference images of a batch of archives.

    For each (month, hour) pair, every processed image of that month and file name (all years,
    all days) is downloaded, stacked, and reduced to two images that are written back to the
    bucket as JPEG:
      * overcast reference: pixel-wise maximum of the stack
      * clear-sky reference: pixel-wise mean of a low-rank window of the sorted stack, where
        pixels with a zenith angle above 85 degrees are pushed to the end of the sort
    Pairs with fewer than 15 images are skipped.

    :param arch: iterable of (month, hour) pairs; hour is the image file name, its last four
        characters (presumably the '.jpg' extension) are dropped to name the scratch directory
    :return: pandas.DataFrame with a single 'image' column listing the references uploaded
    """
    done = []
    for month, hour in arch:
        TEMP_DIR_LOOP = TEMP_DIR+month+hour[:-4]+'/'
        os.makedirs(TEMP_DIR_LOOP, exist_ok=True)
        # month and hour are escaped so that the '.' of the file name is matched literally
        pattern = re.compile(r'LCV_SATELLITE_IMAGE_24/lcv_visual_images/processed/\d{4}/' + re.escape(month)
                             + r'/\d{2}/' + re.escape(hour))
        images_to_concatenate = [x for x in processed_images if pattern.match(x)]
        image_names = [image_name_formating(name) for name in images_to_concatenate]

        #Stop criterion, to get enough image

        if len(image_names)<15:
            continue

        try:
            #Downloading of the image
            for image, image_name in zip(images_to_concatenate, image_names):
                bucket.download_file(image, TEMP_DIR_LOOP+image_name)
            # Read the files back in image_names order: the zenith masks below are built in that
            # order, whereas os.listdir gives no ordering guarantee and may contain stale files
            concatenated_images = [plt.imread(TEMP_DIR_LOOP+image_name) for image_name in image_names]

            #Computation of zenith & sorting
            zenith_images = [zenith_computation(np.datetime64(image_name[:-4]), longitude, latitude)>85
                             for image_name in image_names]
            array_images = np.array(concatenated_images, dtype=float)

            #Overcast image
            overcast = array_images.max(axis=0)
            minimum_clearsky = array_images.min(axis=0)

            #Sorting info
            # Pixels with zenith > 85 degrees become +inf so that they end up last once sorted
            array_images[np.array(zenith_images)] = np.inf
            array_images.sort(axis=0)


            #Clearsky image for this reference
            index_to_choose = np.percentile(range(len(image_names)), percentiles)
            index_to_choose[0] = min(np.ceil(index_to_choose[0]), 2)
            temporary_quantile = np.ceil(index_to_choose[1])
            # NOTE(review): with percentiles = (3, 6) the lower index is capped at 2 and is never
            # above temporary_quantile, so this condition looks like it is never true and the
            # window is always [lower, lower + 2) -- confirm whether '>' was intended
            index_to_choose[1] = temporary_quantile if temporary_quantile < index_to_choose[0] else index_to_choose[0]+2
            clearsky_image_ref = np.mean(array_images[int(index_to_choose[0]):int(index_to_choose[1])], axis=0)

            # Pixels still at +inf (masked in the selected window) fall back to the unmasked minimum
            masked_pixels = clearsky_image_ref == np.inf
            clearsky_image_ref[masked_pixels] = minimum_clearsky[masked_pixels]
            clearsky_image_ref[clearsky_image_ref == np.inf] = 0

            clearsky_image_ref = Image.fromarray(clearsky_image_ref).convert('L')
            overcast = Image.fromarray(overcast).convert('L')

            #Save in AWS
            with io.BytesIO() as file:
                clearsky_image_ref.save(file, format='JPEG')
                file.seek(0)
                s3_client.put_object(Body=file, Bucket=bucket_name,
                              Key='LCV_SATELLITE_IMAGE_24/clearsky_ref/month_' + month +'_hour_'+hour)

            with io.BytesIO() as file:
                overcast.save(file, format='JPEG')
                file.seek(0)
                s3_client.put_object(Body=file, Bucket=bucket_name,
                              Key='LCV_SATELLITE_IMAGE_24/overcast_ref/month_' + month +'_hour_'+hour)
        finally:
            # Empty the scratch directory whether or not the processing succeeded
            for leftover in os.listdir(TEMP_DIR_LOOP):
                os.remove(TEMP_DIR_LOOP+leftover)
        done.append('month_'+month+'_hour_'+hour)
    return pd.DataFrame({'image':done})

In [None]:
%%time
computed_image = parallelize_dataframe(archives, compute_image_ref)