In [2]:
import os
import numpy as np
from datetime import datetime
import learning_lidar.utils.global_settings as gs
import pandas as pd
import learning_lidar.preprocessing.preprocessing as prep
%matplotlib inline
from learning_lidar.generation.daily_signals_generations_utils import  calc_poiss_measurement,calc_range_corr_measurement
import learning_lidar.generation.generation_utils as gen_utils
gs.set_visualization_settings()

# 1. Set parameters

In [3]:
# Station to process and the wavelengths returned by `get_elastic()`.
station_name = 'haifa'
station = gs.Station(station_name)
wavelengths = gs.LAMBDA_nm().get_elastic()

# Folders are resolved relative to the current working directory:
# `main_folder` is its parent, `data_folder` is '<parent of main_folder>/data'.
working_dir = os.path.abspath(os.path.curdir)
main_folder = os.path.dirname(working_dir)
data_folder = os.path.join(os.path.dirname(main_folder), 'data')
data_folder

'C:\\Users\\addalin\\Dropbox\\Lidar\\lidar_learning\\data'

# 2. Calculate Poisson noise on "clear" signals, without background, for the given period.
Adds the range-corrected signal, with Poisson noise applied, to the signal database.
# TODO: ADD this to generation.py

In [4]:
# First and last day of the processed period.
start_date = datetime(year=2017, month=9, day=1)
end_date = datetime(year=2017, month=10, day=31)
# One timestamp per calendar day, both end points included.
dates = pd.date_range(start=start_date, end=end_date, freq='D')

In [5]:
# Set CALC_STATS to True to compute the Poisson-noised, range-corrected signal for every
# day in `dates`, save it into the daily signal datasets, and accumulate its statistics.
# When False this cell does nothing.
CALC_STATS = False
if CALC_STATS:
    base_folder = station.gen_signal_dataset
    # One generated 'signal' dataset path per day of the period.
    paths = [os.path.join(prep.get_month_folder_name(base_folder, dt),
                          gen_utils.get_gen_dataset_file_name(station, dt, data_source='signal'))
             for dt in dates]
    # Accumulators are sized from `wavelengths` instead of a hard-coded 3.
    # NOTE(review): assumes the only dimension left after reducing over 'Height' and
    # 'Time' is the wavelength one - confirm against the dataset layout.
    n_channels = len(wavelengths)
    mean = np.zeros(n_channels)
    std = np.zeros(n_channels)
    norm_scale = 1 / len(dates)

    for cur_date, nc_path in zip(dates, paths):
        signal_ds = prep.load_dataset(nc_path)
        # lidar measurement: pn ~Poiss(mu_p)
        pn_ds = calc_poiss_measurement(station, cur_date, signal_ds.p)
        # range corrected measurement: pr2n = pn * r^2
        pr2n_ds = calc_range_corr_measurement(station, cur_date, pn_ds, signal_ds.r2)
        pr2n_ds.attrs['info'] += ' - w.o. background'
        mean += pr2n_ds.mean(dim={'Height', 'Time'}).values
        std += pr2n_ds.std(dim={'Height', 'Time'}).values
        # Add the profile to the day's signal dataset as `range_corr_p` and save it.
        signal_ds = signal_ds.assign(range_corr_p=pr2n_ds)
        gen_utils.save_generated_dataset(station, signal_ds,
                                         data_source='signal',
                                         save_mode='both',
                                         profiles=['range_corr_p'])

    # Average the accumulated daily values over the number of days.
    # NOTE(review): `std` ends up as the mean of the daily standard deviations, which is
    # not the standard deviation over the whole period.
    mean *= norm_scale
    std *= norm_scale

# 3. Statistics for a period of the dataset
> Loading the statistics database created in dataseting.py
# TODO: ADD this to statistics calculation
# TODO: ADD `range_corr_p` as a column to dataseting.py (gen_csv)

In [6]:
# Statistics file for this station and period, located directly under `data_folder`.
stats_fname = f"stats_gen_{station.name}_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}.csv"
csv_stats_path = os.path.join(data_folder, stats_fname)
# Load the existing statistics table.
df_stats = pd.read_csv(filepath_or_buffer=csv_stats_path)

In [7]:
if CALC_STATS:
    # Attach the freshly accumulated statistics as new columns, then overwrite
    # the statistics CSV in place (without writing the index).
    for col_name, col_values in (('range_corr_p_mean', mean),
                                 ('range_corr_p_std', std)):
        df_stats[col_name] = col_values
    df_stats.to_csv(csv_stats_path, index=False)


# 4. Split each file in the database into separate files, one per sample time
# TODO: move this to a new function prepare_generated_samples(), to be called after create_generated_dataset()

In [None]:
from learning_lidar.dataseting import dataseting

# Period bounds are set again here, so this cell does not rely on the earlier date cell.
end_date = datetime(2017, 10, 31)
start_date = datetime(2017, 9, 1)
dataseting.prepare_generated_samples(station, start_date, end_date)

[2021-05-12 07:34:40,353] {C:\Users\addalin\Dropbox\Lidar\lidar_learning\learning_lidar\dataseting\dataseting.py:553} INFO - Load and split datasets for 2017-09-01
Split and save time slices for: range_corr_p, 355: 100%|██████████| 48/48 [00:06<00:00,  7.84it/s]
Split and save time slices for: range_corr_p, 532: 100%|██████████| 48/48 [00:05<00:00,  8.14it/s]
Split and save time slices for: range_corr_p, 1064: 100%|██████████| 48/48 [00:06<00:00,  7.48it/s]
Split and save time slices for: range_corr, 355: 100%|██████████| 48/48 [00:06<00:00,  7.69it/s]
Split and save time slices for: range_corr, 532: 100%|██████████| 48/48 [00:08<00:00,  5.86it/s]
Split and save time slices for: range_corr, 1064: 100%|██████████| 48/48 [00:07<00:00,  6.06it/s]
Split and save time slices for: range_corr, 355: 100%|██████████| 48/48 [00:05<00:00,  9.43it/s]
Split and save time slices for: range_corr, 532: 100%|██████████| 48/48 [00:06<00:00,  7.09it/s]
Split and save time slices for: range_corr, 1064: 10

# Update the current train and test datasets according to new paths in the generated dataset
> such that the keys are not changed.

In [14]:
# Common file-name stem of the generated dataset CSVs for this station and period.
gen_base_name = f"dataset_gen_{station_name}_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}"

# Full dataset, train split and test split, all located under `data_folder`.
csv_gen_path = os.path.join(data_folder, f"{gen_base_name}.csv")
csv_gen_train_path = os.path.join(data_folder, f"{gen_base_name}_train.csv")
csv_gen_test_path = os.path.join(data_folder, f"{gen_base_name}_test.csv")
csv_gen_path, csv_gen_train_path,csv_gen_test_path

# Read the three tables in the same order: full, train, test.
df_gen, df_gen_train, df_gen_test = (
    pd.read_csv(csv_path)
    for csv_path in (csv_gen_path, csv_gen_train_path, csv_gen_test_path)
)

In [37]:
def update_row_df(row, orig_df):
    """Overwrite the path columns of `row` with those of the matching row in `orig_df`.

    The matching row is looked up by position: `row['idx']` is used as an
    integer location into `orig_df` (via `.iloc`).

    :param row: a row (pandas Series) holding an 'idx' entry; it is modified in place.
    :param orig_df: DataFrame providing the 'lidar_path', 'signal_path',
        'signal_p_path' and 'molecular_path' values.
    :return: the same `row` object, after the four path entries were replaced.
    """
    source_row = orig_df.iloc[row['idx']]
    for path_col in ('lidar_path', 'signal_path', 'signal_p_path', 'molecular_path'):
        row[path_col] = source_row[path_col]
    return row

# Both splits are refreshed the same way: each row takes its path columns from
# the row of `df_gen` it points to.
apply_kwargs = dict(args=(df_gen,), axis=1, result_type='expand')

new_train_df = df_gen_train.apply(update_row_df, **apply_kwargs)
new_train_df

new_test_df = df_gen_test.apply(update_row_df, **apply_kwargs)
new_test_df

# The refreshed splits are written next to the originals with a '_new' suffix.
csv_genN_train_path = os.path.join(data_folder, f"{gen_base_name}_train_new.csv")
csv_genN_test_path = os.path.join(data_folder, f"{gen_base_name}_test_new.csv")
for updated_df, out_path in ((new_train_df, csv_genN_train_path),
                             (new_test_df, csv_genN_test_path)):
    updated_df.to_csv(out_path, index=False)