In [2]:
import os
import numpy as np
from datetime import datetime
import learning_lidar.utils.global_settings as gs
import pandas as pd
import learning_lidar.preprocessing.preprocessing as prep
%matplotlib inline
from learning_lidar.generation.daily_signals_generations_utils import  calc_poiss_measurement,calc_range_corr_measurement,calc_r2_ds
import learning_lidar.generation.generation_utils as gen_utils
import learning_lidar.dataseting.dataseting as dataseting
# Apply the visualization settings defined in learning_lidar.utils.global_settings.
gs.set_visualization_settings()

# 1. Set parameters

In [3]:
# Station under study and the wavelengths returned by LAMBDA_nm().get_elastic().
station_name = 'haifa'
station = gs.Station(station_name)
wavelengths = gs.LAMBDA_nm().get_elastic()

# main_folder is the parent of the current working directory;
# data_folder is the 'data' directory located one level above main_folder.
main_folder = os.path.dirname(os.path.abspath(os.path.curdir))
data_folder = os.path.join(os.path.dirname(main_folder), 'data')
data_folder

'C:\\Users\\addalin\\Dropbox\\Lidar\\lidar_learning\\data'

# 2. Calculate the Poisson measurement of the "clear" signals (without background) for the given period.
Adds the range-corrected signal, with Poisson noise applied, to the signal database.
# TODO: ADD this to generation.py

In [4]:
# Period to process (both ends inclusive), with one entry per day in `dates`.
start_date, end_date = datetime(2017, 9, 1), datetime(2017, 10, 31)
dates = pd.date_range(start=start_date, end=end_date, freq='D')

In [5]:
# Set to True to regenerate the noisy range-corrected signal (`range_corr_p`) for every
# day in `dates`, save it into the daily signal datasets, and accumulate its statistics.
# The guarded code loads and saves one dataset per day.
CALC_STATS = False
if CALC_STATS:
    base_folder = station.gen_signal_dataset
    paths = [os.path.join(prep.get_month_folder_name(base_folder, dt),
                          gen_utils.get_gen_dataset_file_name(station, dt, data_source='signal'))
             for dt in dates]
    # One accumulator entry per wavelength (was a hard-coded 3).
    mean = np.zeros(len(wavelengths))
    std = np.zeros(len(wavelengths))
    norm_scale = 1 / len(dates)

    for cur_date, nc_path in zip(dates, paths):
        signal_ds = prep.load_dataset(nc_path)
        # lidar measurement: pn ~Poiss(mu_p)
        pn_ds = calc_poiss_measurement(station, cur_date, signal_ds.p)
        # range corrected measurement: pr2n = pn * r^2
        pr2n_ds = calc_range_corr_measurement(station, cur_date, pn_ds, signal_ds.r2)
        pr2n_ds.attrs['info'] += ' - w.o. background'

        # Reduce over height and time, leaving one value per remaining coordinate.
        mean += pr2n_ds.mean(dim=['Height', 'Time']).values
        std += pr2n_ds.std(dim=['Height', 'Time']).values

        signal_ds = signal_ds.assign(range_corr_p=pr2n_ds)
        gen_utils.save_generated_dataset(station, signal_ds,
                                         data_source='signal',
                                         save_mode='both',
                                         profiles=['range_corr_p'])

    # Average the daily values over the period.
    # NOTE(review): `std` is therefore the mean of the daily standard deviations, not the
    # standard deviation over the whole period - confirm this is the intended statistic.
    mean *= norm_scale
    std *= norm_scale

# 3. Statistics for a period of the dataset
> Loading the statistics database created in dataseting.py
# TODO: ADD this to statistics calculation
# TODO: ADD `range_corr_p` as a column to dataseting.py (gen_csv)

In [6]:
# Build the name of the statistics CSV for this station and period, then load it.
period_start = start_date.strftime('%Y-%m-%d')
period_end = end_date.strftime('%Y-%m-%d')
stats_fname = f"stats_gen_{station.name}_{period_start}_{period_end}.csv"
csv_stats_path = os.path.join(data_folder, stats_fname)
df_stats = pd.read_csv(csv_stats_path)
df_stats

Unnamed: 0,wavelength,p_signal_mean,p_signal_std,p_signal_min,p_signal_max,range_corr_signal_mean,range_corr_signal_std,range_corr_signal_min,range_corr_signal_max,range_corr_p_signal_mean,...,attbsc_molecular_min,attbsc_molecular_max,p_bg_bg_mean,p_bg_bg_std,p_bg_bg_min,p_bg_bg_max,LC_mean,LC_std,LC_min,LC_max
0,355,88.280901,812.655,0.003964,21699.117677,15.549797,24.042348,0.936831,134.568434,15.533479,...,0.000491,0.007664,0.136932,0.152953,0.000403,0.595537,10892.717451,765.014962,9648.912511,12252.522087
1,532,94.20655,844.996252,0.008764,23194.884773,22.552368,28.921022,2.071057,145.640789,22.529013,...,0.00021,0.001449,0.356151,0.385006,0.000725,1.365883,32985.833422,2403.3498,29265.571768,37062.944908
2,1064,25.641477,223.603415,0.001039,6651.830518,7.308976,9.713228,0.245562,47.037022,7.304823,...,1.5e-05,8.8e-05,0.025049,0.026637,0.000283,0.146088,25442.852221,1830.105197,22457.129272,28573.149193


In [7]:
# Store the recomputed statistics in the table and write it back to the same CSV.
# `mean` and `std` exist only when the CALC_STATS cell actually ran, hence the same guard.
if CALC_STATS:
    df_stats['range_corr_p_mean'] = mean
    df_stats['range_corr_p_std'] = std
    df_stats.to_csv(csv_stats_path, index=False)


# 4. Split each file in the database into separate files, one per sample time
# TODO: move this to a new function prepare_generated_samples(), to be called after create_generated_dataset()

In [9]:
# Switch to True to run dataseting.prepare_generated_samples for this station and period.
CREATE_SAMPLES = False
if CREATE_SAMPLES:
    dataseting.prepare_generated_samples(station, start_date, end_date)

# Update the current train and test datasets according to new paths in the generated dataset
> such that the keys are not changed.

In [10]:
# CSV tables of the generated dataset: the full table and its train / test splits.
gen_base_name = f"dataset_gen_{station_name}_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}"
csv_gen_path = os.path.join(data_folder, f"{gen_base_name}.csv")
csv_gen_train_path = os.path.join(data_folder, f"{gen_base_name}_train.csv")
csv_gen_test_path = os.path.join(data_folder, f"{gen_base_name}_test.csv")

# Load the three tables.
df_gen = pd.read_csv(csv_gen_path)
df_gen_train = pd.read_csv(csv_gen_train_path)
df_gen_test = pd.read_csv(csv_gen_test_path)



# update p_bg with r2 multiplication
> such that the keys are not changed.

In [33]:
# Parse the 'date' column into datetimes so that rows can be grouped by day.
df_gen['date'] = df_gen['date'].pipe(pd.to_datetime)

In [91]:
# Scan the background files day by day and report the rows whose p_bg mean is NaN.
grps_days = df_gen.groupby('date').groups
for day_dt, inds in grps_days.items():
    # r2_ds is not used inside this loop; a later cell reads the value left over
    # from the final iteration, so it is still computed here.
    r2_ds = calc_r2_ds(station, day_dt)
    nan_inds = []
    for ind in inds:
        # `groups` maps each day to index *labels*, so rows are looked up by label
        # (.loc) rather than by position (.iloc).
        bg_ds = prep.load_dataset(df_gen.loc[ind, 'bg_path'])
        mean_val = bg_ds.p_bg.values.mean()
        if np.isnan(mean_val):
            nan_inds.append(ind)
    if nan_inds:
        print(f'{day_dt}, nan indices: {nan_inds}')

2017-09-30 00:00:00, nan indices: [1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1410, 1411, 1412, 1413, 1414, 1415, 1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423, 1424, 1425, 1426, 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436, 1437, 1438, 1439, 4320, 4321, 4322, 4323, 4324, 4325, 4326, 4327, 4328, 4329, 4330, 4331, 4332, 4333, 4334, 4335, 4336, 4337, 4338, 4339, 4340, 4341, 4342, 4343, 4344, 4345, 4346, 4347, 4348, 4349, 4350, 4351, 4352, 4353, 4354, 4355, 4356, 4357, 4358, 4359, 4360, 4361, 4362, 4363, 4364, 4365, 4366, 4367, 7248, 7249, 7250, 7251, 7252, 7253, 7254, 7255, 7256, 7257, 7258, 7259, 7260, 7261, 7262, 7263, 7264, 7265, 7266, 7267, 7268, 7269, 7270, 7271, 7272, 7273, 7274, 7275, 7276, 7277, 7278, 7279, 7280, 7281, 7282, 7283, 7284, 7285, 7286, 7287, 7288, 7289, 7290, 7291, 7292, 7293, 7294, 7295]
2017-10-31 00:00:00, nan indices: [2880, 2881, 2882, 2883, 2884, 2885, 2886, 2887, 2888, 2889, 2890, 

288

In [71]:
# Reload the background file of row `ind` and show its wavelength.
# `ind` must already exist in the kernel (it is the loop variable of the NaN scan);
# it is an index label, hence the .loc lookup.
bg_ds = prep.load_dataset(df_gen.loc[ind, 'bg_path'])
wavelength = bg_ds.Wavelength.values
wavelength

array(1064, dtype=int64)

In [89]:
# True when the mean of p_bg in the currently loaded background dataset is NaN.
np.isnan(np.mean(bg_ds.p_bg.values))

True

In [None]:
# Select the part of r2_ds that matches the height range, time range and wavelength
# of bg_ds, then multiply the background dataset by it.
height_axis = bg_ds.Height
time_axis = bg_ds.Time
hslice = slice(height_axis[0].values, height_axis[-1].values)
tslice = slice(prep.dt64_2_datetime(time_axis[0].values),
               prep.dt64_2_datetime(time_axis[-1].values))
r2_ds_slice = r2_ds.sel(Height=hslice, Time=tslice, Wavelength=wavelength)
p_bg_r2 = bg_ds * r2_ds_slice
p_bg_r2

In [73]:
# Display the current `bg_ds` (assigned from prep.load_dataset in earlier cells).
bg_ds

In [37]:

"""def update_row_df(row,orig_df):
    idx = row['idx']
    row_orig = orig_df.iloc[idx]
    row['lidar_path'] = row_orig['lidar_path']
    row['signal_path'] = row_orig['signal_path']
    row['signal_p_path'] = row_orig['signal_p_path']
    row['molecular_path'] = row_orig['molecular_path']
    return  row

new_train_df = df_gen_train.apply(update_row_df,
                                  args=(df_gen,),
                                  axis=1,
                                  result_type='expand')
new_train_df

new_test_df = df_gen_test.apply(update_row_df,
                                  args=(df_gen,),
                                  axis=1,
                                  result_type='expand')
new_test_df

csv_genN_train_path = os.path.join(data_folder,f"{gen_base_name}_train_new.csv")
csv_genN_test_path = os.path.join(data_folder,f"{gen_base_name}_test_new.csv")
new_train_df.to_csv(csv_genN_train_path,index=False)
new_test_df.to_csv(csv_genN_test_path,index=False)
"""