# <a id='toc1_'></a>[Downsampling](#toc0_)

**Table of contents**<a id='toc0_'></a>    
1. [Import dependencies](#toc1_)    
2. [Load scaled parquet into Pandas DataFrame](#toc2_)
3. [Downsampling](#toc3_)

## 1. <a id='toc1'></a>[Import dependencies](#toc1_)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
##> import libraries
import sys
from pathlib import Path
import pandas as pd
root_dir = Path.cwd().resolve().parent
if root_dir.exists():
    sys.path.append(str(root_dir))
else:
    raise FileNotFoundError('Root directory not found')

#> import custom libraries
from src.load import load_df_to_dataset

In [5]:
# assets_dir = root_dir.parent / 'data' / 'local' / 'aistraj' / 'tvt_assets'
assets_dir = root_dir.parents[2] / 'aistraj' / 'tvt_assets'
assets_dir = assets_dir.resolve()
if not assets_dir.exists():
    raise FileNotFoundError('Assets directory not found')

scaled_tvt_data_import_assets_dir = assets_dir / 'scaled' 
scaled_tvt_data_import_assets_dir = scaled_tvt_data_import_assets_dir.resolve()
if not scaled_tvt_data_import_assets_dir.exists():
    raise FileNotFoundError('Train-Validate-Test Pickled Data directory not found')

extend_tvt_data_import_assets_dir = assets_dir / 'extended' 
extend_tvt_data_import_assets_dir = extend_tvt_data_import_assets_dir.resolve()
if not extend_tvt_data_import_assets_dir.exists():
    raise FileNotFoundError('Train-Validate-Test Pickled Data directory not found')

tvt_data_import_assets_dir = assets_dir / 'original' 
tvt_data_import_assets_dir = tvt_data_import_assets_dir.resolve()
if not tvt_data_import_assets_dir.exists():
    raise FileNotFoundError('Train-Validate-Test Pickled Data directory not found')

## 2. <a id='toc2_'></a>[Load scaled parquet into Pandas DataFrame](#toc2_)

In [6]:
# Define the paths to the pickle files
train_pickle_path = scaled_tvt_data_import_assets_dir / 'scaled_cleaned_extended_train_df.parquet'
validate_pickle_path = scaled_tvt_data_import_assets_dir / 'scaled_cleaned_extended_validate_df.parquet'
test_pickle_path = scaled_tvt_data_import_assets_dir / 'scaled_cleaned_extended_test_df.parquet'

train_df = load_df_to_dataset(train_pickle_path).data
validate_df = load_df_to_dataset(validate_pickle_path).data
test_df = load_df_to_dataset(test_pickle_path).data

In [7]:
train_df

Unnamed: 0,epoch,datetime,obj_id,traj_id,month_sin,month_cos,hour_sin,hour_cos,season,part_of_day,...,rot_c,distance_c,dist_ww,dist_ra,dist_cl,dist_ma,speed_c,acc_c,lon,lat
0,1657070180,2022-07-06 01:16:20,255806357,0,-0.5,-0.866025,0.258819,0.965926,1,2,...,0.000532,0.000004,0.005650,0.002328,0.005967,0.002328,-0.311346,0.514889,0.170468,0.069297
1,1657070190,2022-07-06 01:16:30,255806357,0,-0.5,-0.866025,0.258819,0.965926,1,2,...,0.000423,0.000004,0.005645,0.002323,0.005961,0.002323,-0.294849,0.446576,0.170476,0.069296
2,1657070200,2022-07-06 01:16:40,255806357,0,-0.5,-0.866025,0.258819,0.965926,1,2,...,-0.000151,0.000005,0.005640,0.002317,0.005955,0.002317,-0.271051,0.640817,0.170484,0.069295
3,1657070210,2022-07-06 01:16:50,255806357,0,-0.5,-0.866025,0.258819,0.965926,1,2,...,-0.000321,0.000005,0.005634,0.002310,0.005948,0.002310,-0.235102,0.964070,0.170493,0.069294
4,1657070220,2022-07-06 01:17:00,255806357,0,-0.5,-0.866025,0.258819,0.965926,1,2,...,0.000129,0.000005,0.005628,0.002303,0.005941,0.002303,-0.212864,0.599314,0.170502,0.069293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14705495,1683331160,2023-05-05 23:59:20,210524000,1,0.5,-0.866025,-0.258819,0.965926,0,2,...,0.000073,0.000013,0.003270,0.003568,0.004211,0.003568,0.421010,3.151177,0.172816,0.070937
14705496,1683331170,2023-05-05 23:59:30,210524000,1,0.5,-0.866025,-0.258819,0.965926,0,2,...,-0.000481,0.000013,0.003256,0.003554,0.004198,0.003554,0.460453,1.057014,0.172808,0.070925
14705497,1683331180,2023-05-05 23:59:40,210524000,1,0.5,-0.866025,-0.258819,0.965926,0,2,...,-0.000376,0.000014,0.003242,0.003540,0.004184,0.003540,0.521192,1.623529,0.172801,0.070913
14705498,1683331190,2023-05-05 23:59:50,210524000,1,0.5,-0.866025,-0.258819,0.965926,0,2,...,0.000130,0.000014,0.003228,0.003526,0.004170,0.003526,0.512700,-0.218176,0.172793,0.070900


## 3. <a id='toc3_'></a>[Downsampling](#toc3_)

In [8]:
def downsample_and_save(df, save_path, frequency='10T'):
    """
    Downsamples a DataFrame and saves it in .parquet format.

    Parameters:
    - df: The DataFrame to be processed.
    - save_path: The path where the downsampled DataFrame will be saved.
    - frequency: The frequency for downsampling, default is every 10 minutes.
    """
    # Ensure the datetime column is of datetime type
    df['datetime'] = pd.to_datetime(df['datetime'])
    
    # Set the datetime column as the index
    df.set_index('datetime', inplace=True)
    
    # Downsample the DataFrame to the specified frequency, taking the first data point in each period
    df_downsampled = df.resample(frequency).first()
    
    # Reset the index so datetime becomes a column again
    df_downsampled.reset_index(inplace=True)
    
    # Save the downsampled DataFrame as a .parquet file
    df_downsampled.to_parquet(save_path, index=False)
    
    print(f'Downsampled DataFrame saved to {save_path}')

# Define different save paths for train_df, validate_df, and test_df
train_save_path = '/data1/sgao/repos/clean_sv/data/assets/scaled_cleaned_extended_train_df.parquet'
validate_save_path = '/data1/sgao/repos/clean_sv/data/assets/scaled_cleaned_extended_validate_df.parquet'
test_save_path = '/data1/sgao/repos/clean_sv/data/assets/scaled_cleaned_extended_test_df.parquet'



In [9]:
downsample_and_save(train_df, train_save_path)
downsample_and_save(validate_df, validate_save_path)
downsample_and_save(test_df, test_save_path)

  df_downsampled = df.resample(frequency).first()


Downsampled DataFrame saved to /data1/sgao/repos/clean_sv/data/assets/scaled_cleaned_extended_train_df.parquet


  df_downsampled = df.resample(frequency).first()


Downsampled DataFrame saved to /data1/sgao/repos/clean_sv/data/assets/scaled_cleaned_extended_validate_df.parquet


  df_downsampled = df.resample(frequency).first()


Downsampled DataFrame saved to /data1/sgao/repos/clean_sv/data/assets/scaled_cleaned_extended_test_df.parquet


In [10]:
train_df_downsampled = load_df_to_dataset(train_save_path).data
validate_df_downsampled = load_df_to_dataset(validate_save_path).data
test_df_downsampled = load_df_to_dataset(test_save_path).data

In [9]:
train_df_downsampled

Unnamed: 0,datetime,epoch,obj_id,traj_id,month_sin,month_cos,hour_sin,hour_cos,season,part_of_day,...,rot_c,distance_c,dist_ww,dist_ra,dist_cl,dist_ma,speed_c,acc_c,lon,lat
0,2022-03-24 00:00:00,1.648080e+09,211514540.0,0.0,1.000000e+00,6.123234e-17,0.000000,1.000000,0.0,2.0,...,0.000000,0.000000,0.005656,0.002333,0.005970,0.002333,-0.090820,0.007741,0.170469,0.069285
1,2022-03-24 00:10:00,1.648081e+09,244376000.0,0.0,1.000000e+00,6.123234e-17,0.000000,1.000000,0.0,2.0,...,-0.000362,0.000014,0.005084,0.001712,0.005312,0.001712,0.521791,3.263956,0.171289,0.069294
2,2022-03-24 00:20:00,1.648081e+09,244376000.0,0.0,1.000000e+00,6.123234e-17,0.000000,1.000000,0.0,2.0,...,0.000098,0.000014,0.004021,0.002460,0.004266,0.002460,0.503099,-1.596698,0.172284,0.069919
3,2022-03-24 00:30:00,1.648082e+09,244376000.0,0.0,1.000000e+00,6.123234e-17,0.000000,1.000000,0.0,2.0,...,0.000251,0.000017,0.003136,0.003425,0.004082,0.003425,0.755423,-0.166989,0.172823,0.070770
4,2022-03-24 00:40:00,1.648082e+09,244376000.0,0.0,1.000000e+00,6.123234e-17,0.000000,1.000000,0.0,2.0,...,0.000010,0.000018,0.004271,0.004545,0.005221,0.004545,0.835505,-0.017087,0.173720,0.071610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66812,2023-06-30 23:20:00,1.688167e+09,245656000.0,0.0,1.224647e-16,-1.000000e+00,-0.258819,0.965926,1.0,2.0,...,0.000773,0.000003,0.005666,0.002347,0.005986,0.002347,-0.391551,1.089638,0.170441,0.069303
66813,2023-06-30 23:30:00,1.688168e+09,245656000.0,0.0,1.224647e-16,-1.000000e+00,-0.258819,0.965926,1.0,2.0,...,-0.000630,0.000017,0.004794,0.001737,0.004999,0.001737,0.729541,-0.476962,0.171635,0.069388
66814,2023-06-30 23:40:00,1.688168e+09,245656000.0,0.0,1.224647e-16,-1.000000e+00,-0.258819,0.965926,1.0,2.0,...,-0.000308,0.000017,0.003649,0.002825,0.003928,0.002825,0.732914,0.992593,0.172576,0.070194
66815,2023-06-30 23:50:00,1.688169e+09,245656000.0,0.0,1.224647e-16,-1.000000e+00,-0.258819,0.965926,1.0,2.0,...,-0.000118,0.000023,0.003647,0.003928,0.004595,0.003928,1.232469,1.713837,0.173232,0.071145


In [11]:
validate_df.columns

Index(['epoch', 'obj_id', 'traj_id', 'month_sin', 'month_cos', 'hour_sin',
       'hour_cos', 'season', 'part_of_day', 'aad', 'cdd', 'dir_ccs', 'cog_c',
       'rot_c', 'distance_c', 'dist_ww', 'dist_ra', 'dist_cl', 'dist_ma',
       'speed_c', 'acc_c', 'lon', 'lat'],
      dtype='object')

In [12]:
validate_df_downsampled.columns

Index(['datetime', 'epoch', 'obj_id', 'traj_id', 'month_sin', 'month_cos',
       'hour_sin', 'hour_cos', 'season', 'part_of_day', 'aad', 'cdd',
       'dir_ccs', 'cog_c', 'rot_c', 'distance_c', 'dist_ww', 'dist_ra',
       'dist_cl', 'dist_ma', 'speed_c', 'acc_c', 'lon', 'lat'],
      dtype='object')