In [1]:
import xarray as xr
from pathlib import Path
import numpy as np

In [2]:
# File names of the train / validation / test netCDF files.
train_data_str = '2021_09_02_TRAIN_For_Nando_shuffle.nc'
val_data_str = '2021_09_02_VALID_For_Nando.nc'
test_data_str = '2021_09_02_TEST_For_Nando_shuffle.nc'

In [3]:
# Name of the directory that holds the data files; it is joined onto ".." when
# the full paths are built.
root_str = 'SPCAM_tb_preproc'

In [4]:
# Full paths to the three data files, located in <parent dir>/<root_str>/.
train_path = Path("..") / root_str / train_data_str
val_path = Path("..") / root_str / val_data_str
test_path = Path("..") / root_str / test_data_str

### Check out dataset

In [5]:
# Open the training netCDF file as an xarray Dataset.
data_ds = xr.open_dataset(train_path)

In [6]:
# Display the dataset summary.
data_ds

In [7]:
# Dimension names and their lengths.
data_ds.dims # dims, coords, attrs, values, vars

Frozen({'var_names': 159, 'sample': 47177728})

### Try out random sampling

In [8]:
# `sizes` is the mapping from dimension name to length; mapping-style access on
# `Dataset.dims` is deprecated in newer xarray releases.
n_samples = data_ds.sizes['sample']
# NOTE: despite its name this holds the *length* of the var_names dimension,
# not the variable names themselves.
var_names = data_ds.sizes['var_names']

In [9]:
# Fraction of the samples to keep (a fraction, not a percentage: 0.01 keeps 1 %).
perc = 0.01

In [10]:
# Number of samples to draw; int() truncates, so the count is rounded down.
num_select = int(perc * n_samples)
print(f"select {num_select} samples")

select 471777 samples


In [11]:
# Seeded generator so the same subsample is drawn after a kernel restart.
rng = np.random.default_rng(seed=0)
# Passing the integer population size avoids materialising range(n_samples)
# as an array; indices are drawn without replacement, in random order.
select_samples = rng.choice(n_samples, size=num_select, replace=False)

In [12]:
# Sanity check: the number of drawn indices should equal num_select.
len(select_samples)

471777

In [13]:
# Entries of "vars" at the sampled positions along the first axis; the second
# axis is kept in full.
selected_vars = data_ds.vars[select_samples, :]

In [14]:
# "time" values at the sampled positions.
selected_time = data_ds.time[select_samples]

In [15]:
# "lat" values at the sampled positions.
selected_lat = data_ds.lat[select_samples]

In [16]:
# "lon" values at the sampled positions; the bare name on the last line
# displays the result.
selected_lon = data_ds.lon[select_samples]
selected_lon

In [17]:
# Display the dataset's "var_names" entry.
data_ds.var_names

In [18]:
# Work on a copy so the attributes of the open source dataset are not modified,
# then blank the 'history' entry for the new file.
ds_resampled_attrs = data_ds.attrs.copy()
ds_resampled_attrs['history'] = ''


In [19]:
# The subsetted arrays that will make up the new dataset.
data_arrays = [selected_time, selected_lat, selected_lon, selected_vars]

In [20]:
# Assemble the subsampled dataset: every array is stored under its own name
# and the prepared attributes are attached.
ds_resampled = xr.Dataset(
    {arr.name: arr for arr in data_arrays},
    attrs=ds_resampled_attrs,
)

In [21]:
# Display the subsampled dataset summary.
ds_resampled

In [22]:
# Preview the form of perc used in the file name ('.' replaced by '-').
str(perc).replace(".","-")

'0-01'

In [23]:
# Output name: input file name up to its first dot, then "_perc" and the
# fraction with '.' replaced by '-'.
output_file = f"{train_data_str.split('.')[0]}_perc{str(perc).replace('.', '-')}.nc"

In [24]:
# Display the generated output file name.
output_file

'2021_09_02_TRAIN_For_Nando_shuffle_perc0-01.nc'

In [25]:
# ds_resampled.to_netcdf(output_file)

In [26]:
# try reading it again
# The cell that writes the file is commented out, so on a fresh run the file
# may not exist yet; create it once instead of failing on open.
if not Path(output_file).exists():
    ds_resampled.to_netcdf(output_file)
new_data_ds = xr.open_dataset(output_file)
new_data_ds

In [27]:
# Dimension lengths of the re-read file; 'sample' should match num_select.
new_data_ds.dims

Frozen({'sample': 471777, 'var_names': 159})

In [28]:
# Release every dataset opened or created during the exploration above.
for opened in (new_data_ds, data_ds, ds_resampled):
    opened.close()

### Make a function out of it

In [29]:
def random_dataset_percentage(perc_, input_filepath, output_dir="test_data", seed=None):
    """Write a random subsample of a netCDF dataset to a new ``.nc`` file.

    A fraction ``perc_`` of the positions along the ``sample`` dimension is
    drawn without replacement. The ``time``, ``lat``, ``lon`` and ``vars``
    variables are subset with the same indices and saved together with a copy
    of the source attributes (``history`` reset to an empty string). The output
    file is named ``<input name up to its first dot>_perc<perc_ with '.'
    replaced by '-'>.nc`` and placed in ``output_dir``.

    Parameters
    ----------
    perc_ : float
        Fraction of samples to keep (a fraction, not a percentage: ``0.01``
        keeps 1 %). Must lie in ``[0, 1]``; the resulting count is truncated
        to an integer.
    input_filepath : str or pathlib.Path
        Path of the netCDF file to subsample.
    output_dir : str or pathlib.Path, optional
        Directory for the output file; created if it does not exist.
    seed : int or None, optional
        Seed for the random generator. ``None`` (default) gives a different
        subsample on every call; pass an integer for a reproducible one.

    Raises
    ------
    ValueError
        If ``perc_`` is outside ``[0, 1]``.
    """
    if not 0 <= perc_ <= 1:
        raise ValueError(f"perc_ must be a fraction in [0, 1], got {perc_!r}.")

    input_filepath = Path(input_filepath)
    output_dir = Path(output_dir)
    rng = np.random.default_rng(seed)

    # The context manager closes the source file even if selection or
    # writing fails part-way through.
    with xr.open_dataset(input_filepath) as ds:
        # Prepare selected indices
        n_samples = ds.sizes['sample']
        num_select = int(perc_ * n_samples)
        print(f"Select {num_select} samples.")

        # Passing the population size directly avoids building range(n_samples)
        # as an array before sampling.
        indices_select = rng.choice(n_samples, size=num_select, replace=False)

        # Select data arrays (same indices for every variable so rows stay aligned)
        data_arrays = [
            ds['time'][indices_select],
            ds['lat'][indices_select],
            ds['lon'][indices_select],
            ds['vars'][indices_select, :],
        ]

        # Create new dataset; attrs are copied so the source dataset is not mutated
        ds_resampled_attrs = ds.attrs.copy()
        ds_resampled_attrs['history'] = ''
        ds_resampled = xr.Dataset({x.name: x for x in data_arrays},
                                  attrs=ds_resampled_attrs)

        # Save new dataset in output_dir
        output_dir.mkdir(parents=True, exist_ok=True)
        output_name = (input_filepath.name.split('.')[0]
                       + "_perc" + str(perc_).replace(".", "-") + ".nc")
        output_file = output_dir / output_name
        print(f"Saving output .nc file as '{output_file}'.")
        try:
            ds_resampled.to_netcdf(output_file)
        finally:
            ds_resampled.close()
    

In [30]:
def get_sample_size(input_filepath):
    """Return the length of the ``sample`` dimension of a netCDF file.

    Parameters
    ----------
    input_filepath : str or pathlib.Path
        Path of the netCDF file to inspect.

    Returns
    -------
    int
        Number of entries along the ``sample`` dimension.

    Raises
    ------
    KeyError
        If the dataset has no ``sample`` dimension.
    """
    # The context manager closes the file even if the dimension lookup raises.
    with xr.open_dataset(input_filepath) as ds:
        return ds.sizes['sample']

In [31]:
# Report the number of samples in each of the three data files.
for split_label, split_path in (
    ("Train", train_path),
    ("Validation", val_path),
    ("Test", test_path),
):
    print(f"{split_label} samples: {get_sample_size(split_path)}")

Train samples: 47177728
Validation samples: 48357376
Test samples: 47964160


In [33]:
# The train and validation calls are kept for reference but disabled; only the
# test file is subsampled here (fraction 0.001 of its samples).
# random_dataset_percentage(0.0001, train_path)
# random_dataset_percentage(0.0002, val_path)
random_dataset_percentage(0.001, test_path)

Select 47964 samples.
Saving output .nc file as 'test_data/2021_09_02_TEST_For_Nando_shuffle_perc0-001.nc'.
