-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataseting.py
523 lines (453 loc) · 24.6 KB
/
dataseting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
# %% #General Imports
import warnings
warnings.filterwarnings ( "ignore" )
import os,sys
from time import time
from datetime import datetime , timedelta # time,
import multiprocess as mp
from IPython.display import display
import logging
from tqdm import tqdm
# %% Scientific and data imports
import matplotlib.pyplot as plt
import sqlite3
import numpy as np
import pandas as pd
import xarray as xr
from scipy.ndimage import gaussian_filter1d , gaussian_filter
from scipy.interpolate import griddata
from functools import partial
# Machine epsilon of double-precision floats; used in this module as a tiny positive placeholder.
# `np.float` was only an alias of the builtin `float` and was removed in NumPy 1.24.
eps = np.finfo(float).eps
# %% Local modules imports
import global_settings as gs
import data_loader as dl
import preprocessing as prep
from utils import create_and_configer_logger
# %% Dataset creating helper functions
def create_dataset(station_name='haifa', start_date=datetime(2017, 9, 1),
                   end_date=datetime(2017, 9, 2), sample_size='29.5min', list_dates=None):
    """
    Create a dataset of calibrated samples for learning the lidar calibration method.

    The samples were generated by TROPOS using Pollynet_Processing_Chain.
    For more info visit: https://github.com/PollyNET/Pollynet_Processing_Chain
    Only 'far_range' telescope records calibrated with 'Klett_Method' are queried.

    Every row of the returned dataset holds:
        key: date, wavelength, cali_method, telescope, cali_start_time, cali_stop_time,
             start_time_period, end_time_period, profile_path
        X:   lidar_path (range corrected signal), molecular_path (attenuated backscatter)
        Y:   LC, LC_std (from the station database), r0, r1, dr, bin_r0, bin_r1 (from the profile file)

    :param station_name: name of the station, passed to gs.Station()
    :param start_date: datetime.datetime, first day to query (used when `list_dates` is empty)
    :param end_date: datetime.datetime, last day to query (used when `list_dates` is empty)
    :param sample_size: pandas frequency string; each calibration period is split to slots of this length.
                        A falsy value keeps one sample per calibration period.
    :param list_dates: optional collection of datetime.datetime objects. When given and not empty,
                       these dates are queried instead of the range [start_date, end_date].
    :return: pd.DataFrame of the samples (empty, with the expected columns, when nothing was found),
             or None when `list_dates` contains objects which are not datetime-like.
    # TODO : organize and update usage
    """
    logger = logging.getLogger()
    station = gs.Station(station_name=station_name)
    db_path = station.db_file
    wavelengths = gs.LAMBDA_nm().get_elastic()
    cali_method = 'Klett_Method'

    if list_dates is not None and len(list_dates) > 0:
        try:
            dates = [datetime.combine(t.date(), t.time()) for t in list_dates]
        except (TypeError, AttributeError) as e:
            # Objects without .date()/.time() raise AttributeError rather than TypeError
            logger.exception(
                f"{e}: `list_dates`, when not empty, is expected to have datetime.datetime() objects.")
            return
    else:
        dates = pd.date_range(start=start_date, end=end_date, freq='D').to_pydatetime().tolist()

    # Final order of the columns
    key = ['date', 'wavelength', 'cali_method', 'telescope', 'cali_start_time', 'cali_stop_time',
           'start_time_period', 'end_time_period', 'profile_path']
    Y_features = ['LC', 'LC_std', 'r0', 'r1', 'dr', 'bin_r0', 'bin_r1']
    X_features = ['lidar_path', 'molecular_path']
    columns = key + X_features + Y_features

    # Daily frames are collected and concatenated once (DataFrame.append was removed in pandas 2.0)
    frames = []
    for wavelength in tqdm(wavelengths):
        for day_date in dates:  # tqdm(dates) #TODO: find a way to run inner loop of tqdm, in a convenient way
            # Query the db for a specific day, wavelength and calibration method
            try:
                query = get_query(wavelength, cali_method, day_date)
                df = query_database(query=query, database_path=db_path)
                if df.empty:
                    raise Exception(
                        f"\n Not existing data for {station.location} station, during {day_date.strftime ( '%Y-%m-%d' )} in '{db_path}'")
                df['date'] = day_date.strftime('%Y-%m-%d')

                df = add_profiles_values(df, station, day_date, file_type='profiles')

                df = add_X_path(df, station, day_date, lambda_nm=wavelength, data_source='lidar',
                                file_type='range_corr')
                df = add_X_path(df, station, day_date, lambda_nm=wavelength, data_source='molecular',
                                file_type='attbsc')

                df = df.rename(
                    {'liconst': 'LC', 'uncertainty_liconst': 'LC_std', 'matched_nc_profile': 'profile_path'},
                    axis='columns')
                expanded_df = get_time_slots_expanded(df, sample_size)

                # reorder the columns
                frames.append(expanded_df[columns])
            except Exception as e:
                # Best effort: a failing day is logged and skipped
                logger.exception(f"{e}, skipping to next day.")
                continue

    if not frames:
        logger.warning("No samples were found for the requested dates, returning an empty dataset.")
        return pd.DataFrame(columns=columns)

    full_df = pd.concat(frames, ignore_index=True)
    # convert height bins to int
    full_df['bin_r0'] = full_df['bin_r0'].astype(np.uint16)
    full_df['bin_r1'] = full_df['bin_r1'].astype(np.uint16)
    return full_df
def get_query(wavelength, cali_method, day_date):
    """
    Build the sqlite query of the 'far_range' calibration records which started during a given day.

    :param wavelength: wavelength value to match in the `wavelength` column
    :param cali_method: calibration method name to match in the `cali_method` column
    :param day_date: datetime.datetime object, only its date part is used
    :return: string, the query for the table `lidar_calibration_constant`
    """
    the_day = day_date.date()
    clock = day_date.time()
    # `min`/`max` are the earliest/latest representable times of a day
    day_begin = datetime.combine(the_day, clock.min)
    day_end = datetime.combine(the_day, clock.max)

    sql_lines = [
        "",
        "SELECT lcc.liconst, lcc.uncertainty_liconst,",
        "lcc.cali_start_time, lcc.cali_stop_time,",
        "lcc.wavelength, lcc.cali_method, lcc.telescope",
        "FROM lidar_calibration_constant as lcc",
        "WHERE",
        f"wavelength == {wavelength} AND",
        f"cali_method == '{cali_method}' AND",
        "telescope == 'far_range' AND",
        f"(cali_start_time BETWEEN '{day_begin}' AND '{day_end}');",
        "",
    ]
    return "\n".join(sql_lines)
def query_database(query="SELECT * FROM lidar_calibration_constant;",
                   database_path="pollyxt_tropos_calibration.db"):
    """
    Query the calibration database directly into a pandas DataFrame.

    Query is a string following sqlite syntax (https://www.sqlitetutorial.net/) to query the .db
    Examples:
    query_basic = "
    SELECT * -- This is a comment. Get all columns from table
    FROM lidar_calibration_constant -- Which table to query
    "
    query_advanced = "
    SELECT lcc.id, lcc.liconst, lcc.cali_start_time, lcc.cali_stop_time -- get only some columns
    FROM lidar_calibration_constant as lcc
    WHERE -- different filtering options on rows
    wavelength == 1064 AND
    cali_method LIKE 'Klet%' AND
    (cali_start_time BETWEEN '2017-09-01' AND '2017-09-02');
    "

    :param query: string, the sqlite query to run
    :param database_path: path of the sqlite database file
    :return: pd.DataFrame of the query result; 'cali_start_time' and 'cali_stop_time' are parsed as dates
    :raises FileNotFoundError: when `database_path` does not exist. sqlite3.connect() would otherwise
                               silently create a new empty database file and the query would fail later.
    """
    from contextlib import closing

    if database_path != ":memory:" and not os.path.isfile(database_path):
        raise FileNotFoundError(f"Database file not found: '{database_path}'")

    # A sqlite3 connection used as a context manager only commits/rolls back the transaction,
    # it does not close the connection - hence the explicit closing().
    with closing(sqlite3.connect(database_path)) as connection:
        # optionally parse 'id' as index column and 'cali_start_time', 'cali_stop_time' as dates
        df = pd.read_sql(sql=query, con=connection, parse_dates=['cali_start_time', 'cali_stop_time'])
    return df
def add_profiles_values(df, station, day_date, file_type='profiles'):
    """
    Add to df the matching TROPOS profile path and the reference range values read from it.

    :param df: pd.DataFrame ( ) result from database query; the columns 'cali_start_time',
               'cali_stop_time' and 'wavelength' are read from each row
    :param station: gs.station() object of the lidar station
    :param day_date: datetime.datetime object of the measuring date
    :param file_type: string object, passed to prep.get_TROPOS_dataset_paths(); default 'profiles'
    :return: df with the added columns: 'matched_nc_profile', 'r0', 'r1', 'dr', 'bin_r0', 'bin_r1'
    :raises Exception: re-raises any failure of resolving the profile paths, after logging it
    """
    logger = logging.getLogger()
    try:
        # The first path returned for the calibration period of each row is taken as its profile
        df['matched_nc_profile'] = df.apply(
            lambda row: prep.get_TROPOS_dataset_paths(station, day_date,
                                                      start_time=row.cali_start_time,
                                                      end_time=row.cali_stop_time,
                                                      file_type=file_type)[0],
            axis=1, result_type='expand')
    except Exception:
        logger.exception(
            f"Non resolved 'matched_nc_profile' for {station.location} station, at date {day_date.strftime ( '%Y-%m-%d' )} ")
        # Without the profile paths the next step can only fail with a misleading KeyError,
        # so the original error is propagated to the caller.
        raise

    def _get_info_from_profile_nc(row):
        """
        Get the r_0,r_1, and delta_r of the selected row. The values are following rebasing according to sea-level height.
        :param row: a row of df, having 'matched_nc_profile' and 'wavelength'
        :return: r0 and r1 (both with the station altitude added), delta_r = r1 - r0,
                 and the indices of the height bins closest to r0 and r1
        """
        data = prep.load_dataset(row['matched_nc_profile'])
        wavelen = row.wavelength
        # get altitude to rebase the reference heights according to sea-level-height
        altitude = data.altitude.item()
        [r0, r1] = data[f'reference_height_{wavelen}'].values
        # The bins are searched on the original (not rebased) height grid of the profile
        [bin_r0, bin_r1] = [np.argmin(abs(data.height.values - r)) for r in [r0, r1]]
        delta_r = r1 - r0
        return r0 + altitude, r1 + altitude, delta_r, bin_r0, bin_r1

    df[['r0', 'r1', 'dr', 'bin_r0', 'bin_r1']] = df.apply(_get_info_from_profile_nc, axis=1,
                                                          result_type='expand')
    return df
def add_X_path(df, station, day_date, lambda_nm=532, data_source='molecular', file_type='attbsc'):
    """
    Add the path of a 'molecular' or 'lidar' dataset to df, as the column '<data_source>_path'.

    :param df: pd.DataFrame ( ) result from database query
    :param station: gs.station() object of the lidar station
    :param day_date: datetime.datetime object of the measuring date
    :param lambda_nm: wavelength [nm] e.g., for the green channel 532 [nm]
    :param data_source: string object: 'molecular' or 'lidar'
    :param file_type: string object: e.g., 'attbsc' for molecular_dataset or 'range_corr' for a lidar_dataset
    :return: df with the added column holding the single matching path
    :raises Exception: when no path, or more than one path, was found
    """
    found = prep.get_prep_dataset_paths(station=station, day_date=day_date, data_source=data_source,
                                        lambda_nm=lambda_nm, file_type=file_type)

    # Happy path first: exactly one matching file
    if found and len(found) == 1:
        df[f"{data_source}_path"] = found[0]
        return df

    if not found:
        raise Exception(
            f"\n Not exiting any '{data_source}' path for {station.location} station, at {day_date.strftime ( '%Y-%m-%d' )}")

    raise Exception(
        f"\n Expected ONE '{data_source}' path for {station.location} station, at {day_date.strftime ( '%Y-%m-%d' )}.\nGot: {found} ")
def get_time_slots_expanded(df, sample_size):
    """
    Split every calibration period of df into consecutive time slots of length `sample_size`.

    Each slot becomes a row which repeats the values of the original row and adds the columns
    'start_time_period' and 'end_time_period'. Rows whose calibration period is too short to hold
    a full slot are dropped. The index label of the original row is kept for all of its slots.

    :param df: pd.DataFrame having the columns 'cali_start_time' and 'cali_stop_time'
    :param sample_size: pandas frequency string (e.g. '29.5min'). When falsy, nothing is split and
                        the period of each row is its whole calibration period.
    :return: pd.DataFrame of the expanded rows. When no slot was created the result is empty,
             but still has the columns of df and the two period columns.
    """
    if not sample_size:
        expanded_df = df.copy()
        expanded_df['start_time_period'] = expanded_df['cali_start_time']
        expanded_df['end_time_period'] = expanded_df['cali_stop_time']
        return expanded_df

    period_columns = ['start_time_period', 'end_time_period']
    columns = list(df.columns) + [col for col in period_columns if col not in df.columns]

    # Rows are collected and the frame is built once: DataFrame.append was removed in pandas 2.0,
    # and appending row by row copies the whole frame on every call.
    labels = []
    records = []
    for indx, row in df.iterrows():
        time_slots = pd.date_range(row.loc['cali_start_time'], row.loc['cali_stop_time'],
                                   freq=sample_size)
        time_slots.freq = None
        if len(time_slots) < 2:
            continue
        base_values = row.to_dict()
        for start_time, end_time in zip(time_slots[:-1], time_slots[1:]):
            labels.append(indx)
            records.append({**base_values, 'start_time_period': start_time, 'end_time_period': end_time})

    return pd.DataFrame(records, index=labels, columns=columns)
# %% Dataset extending helper functions
def extend_calibration_info(df):
    """
    Extend the input dataset (df) in place with the recalculated Lidar Constant values and with the
    middle of the reference range.

    Added columns: 'LC_recalc', 'LCp_recalc' (results of recalc_LC per row), 'bin_rm' (mean of
    'bin_r0' and 'bin_r1', cast to int) and 'rm' (mean of 'r0' and 'r1').

    :param df: Dataset of calibrated samples for learning calibration method.
    The samples were generated by TROPOS using Pollynet_Processing_Chain.
    For more info visit here: https://github.com/PollyNET/Pollynet_Processing_Chain
    :return: Dataset of calibrated samples (the same df object, extended)
    """
    log = logging.getLogger()
    # Registers `progress_apply` on pandas objects
    tqdm.pandas()

    log.info("Start LC calculations")
    recalculated = df.progress_apply(recalc_LC, axis=1, result_type='expand')
    df[['LC_recalc', 'LCp_recalc']] = recalculated

    log.info("Start mid-reference range calculations")
    mid_bins = df[['bin_r0', 'bin_r1']].progress_apply(np.mean, axis=1, result_type='expand')
    df['bin_rm'] = mid_bins.astype(int)
    mid_ranges = df[['r0', 'r1']].progress_apply(np.mean, axis=1, result_type='expand')
    df['rm'] = mid_ranges.astype(float)

    log.info("Done database extension")
    return df
def convert_Y_features_units(df, Y_features=None, scales=None):
    """
    Rescale the Y features of df, in place, by multiplying each column with its scale.

    :param df: pd.DataFrame holding the columns listed in `Y_features`
    :param Y_features: list of column names to rescale.
                       Default: ['LC', 'LC_std', 'r0', 'r1', 'dr']
    :param scales: dict of the scale per column name.
                   Default: 1E-9 for 'LC' and 'LC_std', and 1E-3 for 'r0', 'r1' and 'dr'
    :return: the same df object, after rescaling
    :raises KeyError: when a feature has no scale, or is not a column of df
    """
    # None defaults avoid sharing mutable default objects between calls
    if Y_features is None:
        Y_features = ['LC', 'LC_std', 'r0', 'r1', 'dr']
    if scales is None:
        scales = {'LC': 1E-9, 'LC_std': 1E-9, 'r0': 1E-3, 'r1': 1E-3, 'dr': 1E-3}

    # All scales are resolved before df is touched, so a missing scale leaves df unchanged
    Y_scales = [scales[feature] for feature in Y_features]
    for feature, scale in zip(Y_features, Y_scales):
        df[feature] *= scale
    return df
def create_calibration_ds(df, station, source_file):
    """
    Create calibration dataset from the input df, by splitting it to a dataset according to wavelengths and time (as coordinates).

    The global attributes 'start_time' and 'end_time' are written as '%Y-%m-%d %H:%M:%S', the
    year-month-day order used by the rest of this module (it was previously written year-day-month).

    :param df: extended dataset of calibrated samples, having the columns: 'start_time_period',
               'wavelength', 'LC', 'LC_recalc', 'LCp_recalc', 'r0', 'r1', 'rm', 'bin_r0', 'bin_r1', 'bin_rm'
    :param station: gs.station() object of the lidar station
    :param source_file: string, the name of the extended dataset
    :return: calibration dataset for the the times in df
    """
    times = pd.to_datetime(df['start_time_period'])
    # np.asarray keeps `times` a numpy array of datetime objects, also if pandas returns a Series here
    times = np.asarray(times.dt.to_pydatetime())
    wavelengths = df.wavelength.unique()

    # Name of the dataset variable -> name of its source column in df
    var_columns = {'LC': 'LC', 'LCrecalc': 'LC_recalc', 'LCprecalc': 'LCp_recalc',
                   'r0': 'r0', 'r1': 'r1', 'rm': 'rm',
                   'bin_r0': 'bin_r0', 'bin_r1': 'bin_r1', 'bin_rm': 'bin_rm'}
    ds_chan = []
    tqdm.pandas()
    for wavelength in tqdm(wavelengths):
        # The rows of the channel are selected once, by position, so that `times` is sliced
        # correctly also when df does not have the default 0..n-1 index.
        chan_mask = (df['wavelength'] == wavelength).values
        df_chan = df[chan_mask]
        ds_cur = xr.Dataset(
            data_vars={var_name: ('Time', df_chan[col_name]) for var_name, col_name in var_columns.items()},
            coords={'Time': times[chan_mask],
                    'Wavelength': np.uint16([wavelength])})
        ds_cur.LC.attrs = {'units': r'$\rm{photons\,sr\,km^3}$', 'long_name': r'$\rm{ LC_{TROPOS}}$',
                           'info': 'LC - Lidar constant - by TROPOS'}
        ds_cur.LCrecalc.attrs = {'units': r'$\rm{photons\,sr\,km^3}$', 'long_name': r'$\rm{LC_{recalc}}$',
                                 'info': 'LC - Lidar constant - recalculated'}
        ds_cur.LCprecalc.attrs = {'units': r'$\rm{photons\,sr\,km^3}$', 'long_name': r'$\rm{LC_{pos-recalc}}$',
                                  'info': 'LC - Lidar constant - recalculated on non negative range corrected dignal'}
        ds_cur.r0.attrs = {'units': r'$km$', 'long_name': r'$r_0$', 'info': 'Lower calibration height'}
        ds_cur.r1.attrs = {'units': r'$km$', 'long_name': r'$r_1$', 'info': 'Upper calibration height'}
        ds_cur.rm.attrs = {'units': r'$km$', 'long_name': r'$r_m$', 'info': 'Mid calibration height'}
        ds_cur.Wavelength.attrs = {'units': r'$nm$', 'long_name': r'$\lambda$'}
        ds_chan.append(ds_cur)

    ds_calibration = xr.concat(ds_chan, dim='Wavelength')
    # After concatenating, making sure the 'bin' values are integers.
    for bin_name in tqdm(['bin_r0', 'bin_r1', 'bin_rm']):
        ds_calibration[bin_name] = ds_calibration[bin_name].astype(np.uint16)
        ds_calibration[bin_name].attrs = {'long_name': fr'${bin_name}$'}

    # The sample size is estimated from the first two samples; unknown when there is only one
    if len(times) > 1:
        sample_size = f"{(times[1] - times[0]).seconds}S"
    else:
        sample_size = 'unknown'

    # Global info
    ds_calibration.attrs = {'info': 'Periodic pollyXT calibration info',
                            'location': station.name,
                            'source_file': source_file,
                            'sample_size': sample_size,
                            'start_time': times[0].strftime("%Y-%m-%d %H:%M:%S"),
                            'end_time': times[-1].strftime("%Y-%m-%d %H:%M:%S")}
    return ds_calibration
def recalc_LC(row):
    """
    Recalculate the lidar constant of a single sample, for extending the database.

    The two slices returned by get_sample_ds(row) are divided element-wise (first by second) and
    the ratio is averaged twice: over all the values, and over the non negative values only.

    :param row: a row of the dataset, as expected by get_sample_ds()
    :return: tuple (LC_recalc, LCp_recalc) - the mean ratio, and the mean of the non negative ratios
    """
    slices, _ = get_sample_ds(row)
    numerator = slices[0]
    denominator = slices[1]

    # Work on a deep copy, so the loaded slice itself is not modified by the in-place division
    ratio = numerator.copy(deep=True)
    ratio.data /= denominator.data

    mean_all = ratio.mean().values.item()
    non_negative = ratio.where(ratio >= 0.0)
    mean_non_negative = non_negative.mean().values.item()
    return mean_all, mean_non_negative
# %% General datasets functions
def _df_split(tup_arg, **kwargs):
    """
    Call a method, given by its name, on one split of the data.

    :param tup_arg: tuple of (index of the split, the split object, name of the method to call)
    :param kwargs: keyword arguments passed to the called method
    :return: tuple of (index of the split, result of the method call)
    """
    position, chunk, method_name = tup_arg
    bound_method = getattr(chunk, method_name)
    outcome = bound_method(**kwargs)
    return position, outcome
def df_multi_core(df, df_f_name, subset=None, njobs=-1, **kwargs):
    """
    Run a DataFrame method in parallel, on row-wise splits of df, and concatenate the results.

    Based on: https://gist.github.com/morkrispil/3944242494e08de4643fd42a76cb37ee

    :param df: pd.DataFrame to process
    :param df_f_name: string, name of the DataFrame method to call on every split (e.g. 'apply')
    :param subset: optional column selection, applied as df[subset] before splitting.
                   None (or an empty selection) means the whole df.
    :param njobs: number of worker processes; -1 means one per available cpu. No more processes
                  than rows are started, and at least one is.
    :param kwargs: keyword arguments passed to the called DataFrame method
    :return: the results of all the splits, concatenated in the original order of the rows
    """
    if njobs == -1:
        njobs = mp.cpu_count()

    no_subset = subset is None or (hasattr(subset, '__len__') and len(subset) == 0)
    df_sub = df if no_subset else df[subset]

    # Never create more splits than rows, to avoid sending empty frames to the workers
    n_splits = max(1, min(njobs, len(df_sub)))
    # Split by row positions; np.array_split directly on a DataFrame is deprecated in recent pandas
    positions = np.array_split(np.arange(len(df_sub)), n_splits)
    pool_data = [(split_ind, df_sub.iloc[rows], df_f_name) for split_ind, rows in enumerate(positions)]

    pool = mp.Pool(processes=n_splits)
    try:
        results = pool.map(partial(_df_split, **kwargs), pool_data)
    finally:
        # Release the worker processes also when one of the jobs raised
        pool.close()
        pool.join()

    results = sorted(results, key=lambda x: x[0])
    return pd.concat([split[1] for split in results])
def get_sample_ds(row):
    """
    Load the lidar and the molecular datasets of a sample, and slice them to the sample's time
    period and reference height range.

    :param row: a row of the dataset, having: 'lidar_path', 'molecular_path', 'bin_r0', 'bin_r1',
                'start_time_period' and 'end_time_period'
    :return: tuple (sliced_ds, full_ds). full_ds is the list [lidar dataset, molecular dataset] as
             loaded; sliced_ds is the list of their 'range_corr' and 'attbsc' variables, respectively,
             sliced in Time and Height.
    """
    full_ds = [prep.load_dataset(source_path) for source_path in (row.lidar_path, row.molecular_path)]

    time_window = slice(row.start_time_period, row.end_time_period)
    low_bin, high_bin = row.bin_r0, row.bin_r1

    sliced_ds = []
    for dataset, var_name in zip(full_ds, ('range_corr', 'attbsc')):
        # The bin indices are converted to height values of the dataset's own Height coordinate
        height_window = slice(float(dataset.Height[low_bin].values),
                              float(dataset.Height[high_bin].values))
        sliced_ds.append(dataset.sel(Time=time_window, Height=height_window)[var_name])
    return sliced_ds, full_ds
def make_interpolated_image(nsamples, im):
    """Reconstruct an image from a random selection of its pixels.

    nsamples pixel positions are drawn at random (with np.random, repetitions are possible), and
    the whole image grid is filled by scipy.interpolate.griddata using the 'nearest' method.

    :param nsamples: number of random pixels to draw
    :param im: the image, indexed as im[row, column]
    :return: the interpolated image, on the pixel grid of im
    """
    n_rows, n_cols = im.shape[0], im.shape[1]
    rows_grid, cols_grid = np.indices((n_rows, n_cols))

    # Columns are drawn before rows, keeping the order of the random draws
    col_picks = np.random.randint(n_cols, size=nsamples)
    row_picks = np.random.randint(n_rows, size=nsamples)
    picked_values = im[row_picks, col_picks]

    return griddata((row_picks, col_picks), picked_values, (rows_grid, cols_grid),
                    method='nearest', fill_value=0)
def get_aerBsc_profile_ds(path, profile_df):
    """
    Create a dataset of the aerosol backscatter of a TROPOS profile, for the wavelengths 355, 532, 1064.

    For every wavelength the profile 'aerBsc_klett_<wavelength>' is read, set to eps from a cut-off
    bin and above, smoothed with a gaussian filter, clipped to eps where negative, scaled by 1e+3
    and repeated for every time step (30 seconds apart) between the start and end times of the profile.

    :param path: path of the profile file, loaded with prep.load_dataset()
    :param profile_df: pd.DataFrame having the columns 'wavelength' and 'bin_r0' of this profile
    :return: xr.Dataset with the variable 'aerBsc' on the coordinates Height, Time and Wavelength
    """
    cur_profile = prep.load_dataset(path)
    height_units = 'km'
    height_scale = 1e-3  # converting [m] to [km]
    bsc_scale = 1e+3  # converting [1/m sr] to to [1/km sr]
    wavelengths = [355, 532, 1064]

    heights_indx = height_scale * (cur_profile.height.values + cur_profile.altitude.values)
    start_time = datetime.utcfromtimestamp(cur_profile.start_time.values[0].tolist())
    end_time = datetime.utcfromtimestamp(cur_profile.end_time.values[0].tolist())
    # A Timedelta frequency avoids the deprecated '30S' alias
    time_indx = pd.date_range(start=start_time, end=end_time, freq=pd.Timedelta(seconds=30))

    # NOTE(review): the cut-off bins are read from 'bin_r0', although the original variable was
    # named `profile_r1` - confirm whether 'bin_r1' was intended. The behaviour is kept as is.
    # `int` replaces `np.int`, an alias of the builtin which was removed in NumPy 1.24.
    cutoff_bins = profile_df.groupby(['wavelength'])['bin_r0'].unique().astype(int).values

    ds_chans = []
    var_names = [f"aerBsc_klett_{wavelength}" for wavelength in wavelengths]
    for v_name, wavelength, r in zip(var_names, wavelengths, cutoff_bins):
        vals = cur_profile[v_name].values
        vals[r:] = eps
        vals = gaussian_filter1d(vals, 21, mode='nearest')
        vals = vals.T.reshape(len(heights_indx), 1)
        vals[vals < 0] = eps
        vals *= bsc_scale
        # The single profile is repeated for all the time steps
        mat_vals = np.repeat(vals, len(time_indx), axis=1)
        aerbsc_df = pd.DataFrame(mat_vals, index=heights_indx, columns=time_indx)
        aerBsc_ds_chan = xr.Dataset(
            data_vars={'aerBsc': (('Height', 'Time'), aerbsc_df),
                       'lambda_nm': ('Wavelength', [wavelength])
                       },
            coords={'Height': aerbsc_df.index.to_list(),
                    'Time': aerbsc_df.columns,
                    'Wavelength': [wavelength]
                    })
        aerBsc_ds_chan.aerBsc.attrs = {'long_name': r'$\beta$',  # _{{a}}$' ,
                                       'units': r'$km^{{-1}} sr^{-1}$',
                                       'info': r'$Aerosol backscatter$'}
        # set attributes of coordinates
        aerBsc_ds_chan.Height.attrs = {'units': height_units,
                                       'info': 'Measurements heights above sea level'}
        aerBsc_ds_chan.Wavelength.attrs = {'long_name': r'$\lambda$', 'units': r'$nm$'}
        ds_chans.append(aerBsc_ds_chan)

    return xr.concat(ds_chans, dim='Wavelength')
# %% MAIN
def main(station_name, start_date, end_date, *, do_dataset=False, extend_dataset=True,
         do_calib_dataset=True, use_km_units=True):
    """
    Generate, extend and save the calibration dataset of a station, for a period of dates.

    The steps are chosen by the operating mode flags. Every step saves its result to the current
    folder; a step which follows a skipped one loads its input from the file of the skipped step.

    :param station_name: name of the station, as in 'stations.csv'
    :param start_date: datetime.datetime, first day of the period
    :param end_date: datetime.datetime, last day of the period
    :param do_dataset: generate the dataset with create_dataset() and save it to csv
    :param extend_dataset: extend the dataset with extend_calibration_info() and save it to csv
    :param do_calib_dataset: create the calibration dataset with create_calibration_ds() and save it
    :param use_km_units: convert the Y features units with convert_Y_features_units()
                         (relevant only when `do_dataset` is set)
    """
    logging.getLogger('matplotlib').setLevel(logging.ERROR)  # Fix annoying matplotlib logs
    logging.getLogger('PIL').setLevel(logging.ERROR)  # Fix annoying PIL logs
    logger = create_and_configer_logger('dataseting_log.log', level=logging.INFO)

    # Load data of station
    station = gs.Station(stations_csv_path='stations.csv', station_name=station_name)
    logger.info(f"Loading {station.location} station")

    # Set new paths
    first_day = start_date.strftime('%Y-%m-%d')
    last_day = end_date.strftime('%Y-%m-%d')
    csv_path = f"dataset_{station_name}_{first_day}_{last_day}.csv"
    csv_path_extended = f"dataset_{station_name}_{first_day}_{last_day}_extended.csv"
    ds_path_extended = f"dataset_{station_name}_{first_day}_{last_day}_extended.nc"

    # The dataset held in memory, if a previous step created it. The former check
    # `'df' not in globals()` was always true, since df is a local variable of this function.
    df = None

    if do_dataset:
        logger.info(
            f"Start generating dataset for period: [{first_day},{last_day}]")
        # Generate dataset for learning
        sample_size = '29.5min'
        df = create_dataset(station_name=station_name, start_date=start_date,
                            end_date=end_date, sample_size=sample_size)

        # Convert m to km
        if use_km_units:
            df = convert_Y_features_units(df)

        # TODO: Split the dataset to train, and test, and add a key (column) to each entry to indicate if this is train or test
        df.to_csv(csv_path, index=False)
        logger.info(f"Done database creation, saving to: {os.path.join ( os.path.curdir , csv_path )}")

    # Create extended dataset and save to csv - recalculation of Lidar Constant (currently this is a naive calculation)
    if extend_dataset:
        logger.info(
            f"Start extending dataset for period: [{first_day},{last_day}]")
        if df is None:
            df = pd.read_csv(csv_path)
        df = extend_calibration_info(df)
        display(df)
        df.to_csv(csv_path_extended, index=False)
        logger.info(f"\n The extended dataset saved to :{os.path.join ( os.path.curdir , csv_path_extended )}")

    # Create calibration dataset from the extended df (or csv) = by spliting df to A dataset according to wavelengths and time coordinates.
    if do_calib_dataset:
        logger.info(
            f"Start creating calibration dataset for period: [{first_day},{last_day}]")
        if df is None or not extend_dataset:
            # Without the extension step of this run, the extended values come from the saved csv
            df = pd.read_csv(csv_path_extended)
        ds_calibration = create_calibration_ds(df, station, source_file=csv_path_extended)
        # Save the calibration dataset
        prep.save_dataset(ds_calibration, os.path.curdir, ds_path_extended)
        logger.info(f"The calibration dataset saved to :{os.path.join ( os.path.curdir , ds_path_extended )}")
if __name__ == '__main__':
    # Default run when executed as a script: the 'haifa' station, from 2017-09-01 to 2017-10-31
    station_name = 'haifa'
    start_date, end_date = datetime(2017, 9, 1), datetime(2017, 10, 31)
    main(station_name, start_date, end_date)