In [1]:
import boto3
import matplotlib.pyplot as plt
import re
import os
import glob
import scipy.io
import numpy as np
import io
import numpy as np
import pandas as pd
from PIL import Image
from multiprocessing import Pool

# Utils

In [2]:
def image_name_formating(aws_key_object):
    """
    format the name of the object in aws in proper date
    """
    items = aws_key_object.split('/')
    return '-'.join(items[-4:-1]) +' '+items[-1][:2] +':'+ items[-1][2:4]+':00.jpg'

def day_of_the_year_computation(universal_time):
    """
    Computes the floating value for the day of the year
    :param numpy.datetime64 universal_time:
    :return: float day of year:
    """
    time_object = pd.to_datetime(universal_time)
    return time_object.dayofyear + time_object.hour/24.0 + time_object.minute/(24.0*60)


def zenith_computation(universal_time, longitude, latitude):
    """
    Computes the zenith matrix associated with longitude, latitude at a given universal time,
    reference NOAA Global Monitoring Division
    :param numpy.datetime64 universal_time: universal time
    :param numpy.ndarray longitude: array of the considered longitude
    :param numpy.ndarray latitude: array of the considered latitude
    :return:numpy.ndarray zenith: matrix of the zenith in degrees
    """
    if longitude.shape == latitude.shape:
        day_of_year = day_of_the_year_computation(universal_time)
        x = 2 * np.pi / 365 * (day_of_year - 1)  # day of year un radian, named x because heavily referenced

        eqtime = (0.000075 + 0.001868 * np.cos(x) - 0.032077 * np.sin(x) - 0.014615 * np.cos(2 * x)
                  - 0.040849 * np.sin(2 * x)) * ((24 * 60) / (2 * np.pi))

        solar_declination = 0.006918 - 0.399912 * np.cos(x) + 0.070257 * np.sin(x) \
                            - 0.006758 * np.cos(2 * x) + 0.000907 * np.sin(2 * x) \
                            - 0.002697 * np.cos(3 * x) + 0.001480 * np.sin(3 * x)

        longitude_corrected = 4*longitude
        offset = (longitude_corrected + eqtime).astype('timedelta64[m]')
        true_solar_time = offset + universal_time
        true_solar_time = [x.astype(object) for x in true_solar_time.flatten()]
        true_solar_time_minutes = [(lambda time: (time.hour*60 + time.minute + time.second/60))(time)
                                   for time in true_solar_time]

        hour_angle = np.array([(x/4 - 180)*np.pi/180 for x in true_solar_time_minutes]).reshape(latitude.shape)
        zenith = np.arccos(np.cos(solar_declination)*np.cos(latitude*np.pi/180)*np.cos(hour_angle)
                           + np.sin(solar_declination)*np.sin(latitude*np.pi/180))
        
        return zenith * 180 / np.pi
    else:
        return ValueError

# Constants

In [3]:
TEMP_DIR = '/tmp/images/'
os.makedirs(TEMP_DIR, exist_ok=True)
bucket_name  = 'edp.engie-digital.prod.instance1.prod.data'

coordinates = scipy.io.loadmat('./geographicCoordinate.mat')
latitude = coordinates['geographicCoordinate']['latitude'][0][0]
longitude = coordinates['geographicCoordinate']['longitude'][0][0]

percentiles = (3,6)

num_partitions = 64
num_cores = 32 

# Connexion

In [4]:
s3 = boto3.resource('s3')

bucket = s3.Bucket(bucket_name)

s3_client = boto3.client('s3')

In [77]:
%%time
source_images = [x.key for x  in bucket.objects.filter(Prefix='LCV_SATELLITE_IMAGE_24/clearsky_ref/month')]

CPU times: user 253 ms, sys: 27.9 ms, total: 281 ms
Wall time: 1.18 s


In [73]:
%%time
processed_images = [x.key for x  in bucket.objects.filter(Prefix='LCV_SATELLITE_IMAGE_24/lcv_visual_images/processed/')]

CPU times: user 11.3 s, sys: 261 ms, total: 11.6 s
Wall time: 1min 6s


In [78]:
archives = list(set([(x.split('/')[2].split('_')[1], x.split('/')[2].split('_')[3]) for x in source_images]))
archives = [elt for elt in archives if (int(elt[1].split('.')[0][-2:])%15 == 0)]

In [86]:
def parallelize_dataframe(list_archives, func):
    df_split = np.array_split(list_archives, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [55]:
def compute_cloud_index_walon(archives):
    
    latitude_walon = 42.7983
    longitude_walon = 2.8856
    distance = np.sqrt((latitude - latitude_walon)**2+(longitude-longitude_walon)**2)
    pixel = np.where(distance == distance.min())
    pix = pixel[0][0],pixel[1][0]
    latitude_wal = latitude[pix[0]-5: pix[0]+6,pix[1]-5: pix[1]+6]
    longitude_wal = longitude[pix[0]-5: pix[0]+6,pix[1]-5: pix[1]+6]
    
    date_resultats = []
    cloud_index_a_walon = []
    
    for arch in archives:
        month = arch[0]
        hour = arch[1]
        TEMP_DIR_LOOP = TEMP_DIR+month+hour[:-4]+'/'
        os.makedirs(TEMP_DIR_LOOP, exist_ok=True)
        #print(hour)
        bucket.download_file('LCV_SATELLITE_IMAGE_24/clearsky_ref/month_{}_hour_{}'.format(month, hour), '/tmp/clearsky_ref_month_{}_hour_{}'.format(month,hour))
        bucket.download_file('LCV_SATELLITE_IMAGE_24/overcast_ref/month_{}_hour_{}'.format(month, hour), '/tmp/overcast_ref_month_{}_hour_{}'.format(month,hour))
        clear_sky = plt.imread('/tmp/clearsky_ref_month_{}_hour_{}'.format(month,hour))[pix[0]-5: pix[0]+6,pix[1]-5: pix[1]+6]
        overcast = plt.imread('/tmp/overcast_ref_month_{}_hour_{}'.format(month,hour))[pix[0]-5: pix[0]+6,pix[1]-5: pix[1]+6]
        pattern = re.compile('LCV_SATELLITE_IMAGE_24\/lcv_visual_images\/processed\/\d{4}\/'+month+'\/\d{2}\/'+hour)
        images_to_concatenate = [x for x in processed_images if re.match(pattern, x)]
        [os.remove(TEMP_DIR_LOOP+x) for x in os.listdir(TEMP_DIR_LOOP)];
        image_names = [image_name_formating(name) for name in images_to_concatenate]
        [bucket.download_file(image, TEMP_DIR_LOOP+image_name) for image, image_name in zip(images_to_concatenate, image_names)];
        concatenated_images = [plt.imread(TEMP_DIR_LOOP+x)[pix[0]-5: pix[0]+6,pix[1]-5: pix[1]+6] for x in os.listdir(TEMP_DIR_LOOP)]
        zenith_images = [zenith_computation(np.datetime64(image_name[:-4]), longitude_wal, latitude_wal)<85 
                     for image_name in image_names]

        masks = [(img > clear_sky) & ((255 - clear_sky)>=1)&zenith for img, zenith in zip(concatenated_images, zenith_images)]
        results = [np.zeros(img.shape) for img in concatenated_images]
        for mask, result,img in zip(masks, results, concatenated_images):
            result[mask] = np.divide((img[mask] - clear_sky[mask]),  overcast[mask]-clear_sky[mask])
        cloud_index = results.copy()
        for cloud, result in zip(cloud_index, results):
            cloud[result>1.2] = 0
        date_resultats.append(image_names)
        cloud_index_a_walon.append(cloud_index)
        
    return pd.DataFrame({'date': [x for subl in date_resultats for x in subl],
                         'cloud_index_walon':[x for subl in cloud_index_a_walon for x in subl]})

In [87]:
%%time
computed_cloud_index = parallelize_dataframe(archives, compute_cloud_index_walon)

CPU times: user 2.46 s, sys: 2.53 s, total: 4.99 s
Wall time: 7min 17s


In [117]:
computed_cloud_index.date = computed_cloud_index.date.apply(lambda x : pd.to_datetime(x[:-4]))

In [118]:
computed_cloud_index.to_csv('computed_cloud_index_walon.csv', index = False)

In [109]:
computed_cloud_index.cloud_index_walon.iloc[0]

array([[0.21910112, 0.26704545, 0.24528302, 0.1958042 , 0.19078947,
        0.21333333, 0.22222222, 0.23863636, 0.25543478, 0.20942408,
        0.18716578],
       [0.20994475, 0.25414365, 0.22159091, 0.18709677, 0.18954248,
        0.21917808, 0.21621622, 0.2392638 , 0.26190476, 0.20652174,
        0.19125683],
       [0.16574586, 0.16939891, 0.22285714, 0.19745223, 0.2147651 ,
        0.25547445, 0.23943662, 0.25490196, 0.28571429, 0.21387283,
        0.23595506],
       [0.14772727, 0.12429379, 0.23595506, 0.20118343, 0.22641509,
        0.28368794, 0.25517241, 0.2745098 , 0.31724138, 0.25465839,
        0.2       ],
       [0.18343195, 0.1626506 , 0.24309392, 0.19672131, 0.21142857,
        0.25      , 0.21518987, 0.24550898, 0.32679739, 0.28220859,
        0.23529412],
       [0.20245399, 0.19620253, 0.26785714, 0.19553073, 0.19318182,
        0.21290323, 0.17058824, 0.2032967 , 0.29585799, 0.27272727,
        0.26666667],
       [0.22699387, 0.25      , 0.25      , 0.20338983, 0.

In [104]:
computed_cloud_index.cloud_index_walon[0]

0    [[0.21910112359550563, 0.26704545454545453, 0....
0    [[0.03759398496240601, 0.044444444444444446, 0...
0    [[0.5321100917431193, 0.4642857142857143, 0.47...
0    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
0    [[0.5511363636363636, 0.4819277108433735, 0.46...
0    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
0    [[0.0670391061452514, 0.03296703296703297, 0.0...
0    [[0.07692307692307693, 0.057971014492753624, 0...
0    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
0    [[0.03225806451612903, 0.07792207792207792, 0....
0    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
0    [[0.021052631578947368, 0.05172413793103448, 0...
0    [[0.02027027027027027, 0.026143790849673203, 0...
0    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
0    [[0.24087591240875914, 0.0, 0.0, 0.02127659574...
0    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
0    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
0    [[0.05917159763313609, 0.05421686746987952, 0....
0    [[0.1

In [110]:
adnot_data = pd.DataFrame({'cloud_index_walon_gps':computed_cloud_index.cloud_index_walon.apply(lambda x : x[5,5]),
             'date': computed_cloud_index.date})

In [111]:
adnot_data.to_csv('adnot_data_train.csv', index = False)

# Test