In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import multiprocessing
# Use the "spawn" start method for worker processes created from this kernel;
# force=True replaces a start method that may already have been set.
# NOTE(review): presumably needed for the torch DataLoader workers — confirm.
multiprocessing.set_start_method("spawn", force=True)

##> import libraries
import sys
from pathlib import Path
import random
import time
from itertools import product

# Make the parent of the notebook's working directory importable so that the
# `src.*` imports below resolve.
root_dir = Path.cwd().resolve().parent
if not root_dir.exists():
    raise FileNotFoundError('Root directory not found')
# Only append once: re-running this cell in the same kernel must not pile up
# duplicate entries in sys.path.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

#> import flower
import flwr as fl

#> import custom libraries
from src.load import load_df_to_dataset
from src.EAE import EvidentialTransformerDenoiseAutoEncoder, evidential_regression
from src.client import train_and_evaluate_local, evaluate_saved_model
from src.datasets import TrajectoryDataset
from src.plot import plot_loss, plot_tsne_with_uncertainty, plot_uncertainty

#> torch libraries
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.model_selection import train_test_split

#> Plot
import matplotlib.pyplot as plt
import seaborn as sns
# import scienceplots  # https://github.com/garrettj403/SciencePlots?tab=readme-ov-file
#plt.style.use(['science', 'grid', 'notebook'])  # , 'ieee'


# %matplotlib inline
#%matplotlib widget


In [None]:
# Define the dataset catalog
# The location can be overridden through the TVT_ASSETS_DIR environment
# variable; when it is unset the original hard-coded path is used, so existing
# setups behave exactly as before.
assets_dir = Path(os.environ.get("TVT_ASSETS_DIR", "/data1/aistraj/bin/tvt_assets")).resolve()
print(f"Assets Directory: {assets_dir}")
# Fail early, before any loading cell runs, if the directory is missing.
if not assets_dir.exists():
    raise FileNotFoundError('Assets directory not found')


In [4]:
# Sequence length handed to TrajectoryDataset, and the batch size
seq_len, batch_size = 960, 32

# Training frame: despite the "pickle" in the variable name, the file is parquet
train_pickle_path_extend = assets_dir.joinpath('extended', 'cleaned_extended_train_df.parquet')
train_df_extend = load_df_to_dataset(train_pickle_path_extend).data

# Features to discard (passed as drop_features_list to TrajectoryDataset)
drop_features_list = ['epoch', 'datetime', 'obj_id', 'traj_id', 'stopped', 'curv', 'abs_ccs']

In [5]:
def clean_outliers_by_quantile(dataframe, columns_to_clean, iqr_multiplier=1.5):
    """
    Drop rows whose values lie outside the IQR fences of the given columns.

    Columns are handled one after another on the progressively filtered data,
    so the quartiles of a later column are computed from the rows that survived
    the earlier columns. Rows with a missing value in a processed column fail
    the range comparison and are therefore removed as well.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to clean; it is not modified.
    columns_to_clean (list): Column names to filter, in processing order.
    iqr_multiplier (float): Factor applied to the IQR to build the fences (default is 1.5).

    Returns:
    pd.DataFrame: A filtered copy of the input.
    """
    rows_before = len(dataframe)
    # Work on a copy so the caller's frame stays intact
    filtered = dataframe.copy()

    for name in columns_to_clean:
        values = filtered[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile

        # Fences: anything outside [lower_limit, upper_limit] is an outlier
        lower_limit = first_quartile - iqr_multiplier * spread
        upper_limit = third_quartile + iqr_multiplier * spread
        print(f"{name}: lower_limit = {lower_limit}, upper_limit = {upper_limit}")

        # Inclusive on both ends, same as (>= lower) & (<= upper)
        filtered = filtered[values.between(lower_limit, upper_limit)]

    # Summary of how many rows were removed overall
    print(f"Total rows before cleaning: {rows_before}")
    print(f"Total rows after cleaning: {len(filtered)}")
    return filtered


In [6]:
def count_na_rows(dataframe):
    """
    Count the rows of a DataFrame and how many of them hold a missing value.

    Both numbers are printed and returned.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to inspect.

    Returns:
    tuple: (total number of rows, number of rows with at least one missing value).
    """
    n_rows = len(dataframe)
    # A row counts as soon as any of its cells is missing
    n_rows_with_na = dataframe.isna().any(axis=1).sum()

    for label, value in (("Total rows", n_rows), ("Rows containing <NA>", n_rows_with_na)):
        print(f"{label}: {value}")

    return n_rows, n_rows_with_na


In [7]:
def analyze_column_distribution(dataframe, columns, surrounding_range=3, bins=30):
    """
    Plot a histogram for each requested column and print details about its maximum.

    For every column the function prints the column name, draws a histogram,
    prints the index label, value and occurrence count of the maximum, and
    prints the rows located up to ``surrounding_range`` positions before and
    after the first occurrence of that maximum.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to analyze.
    columns (list): List of column names to analyze.
    surrounding_range (int): Number of rows to include on each side of the maximum value (default is 3).
    bins (int): Number of bins for the histogram (default is 30).

    Returns:
    None: All results are printed or plotted.

    Raises:
    KeyError: If a name in ``columns`` is not a column of ``dataframe``.
    """
    for column in columns:
        print(column)
        series = dataframe[column]

        # A column without any non-missing value has no maximum to report;
        # say so explicitly instead of failing halfway through the loop.
        if not series.notna().any():
            print(f"{column}: no non-missing values, skipping")
            continue

        # Plot histogram (titled and labelled so each figure stands alone)
        plt.figure()
        ax = series.hist(bins=bins, grid=False, edgecolor='black')
        ax.set_title(f"Distribution of {column}")
        ax.set_xlabel(column)
        ax.set_ylabel("Count")

        # Locate the maximum by POSITION. Going through the index label and
        # looking it up again breaks on a non-unique index (wrong position,
        # duplicated rows from .loc) and needs an O(n) Python list per column.
        max_pos = int(series.argmax())
        max_index = series.index[max_pos]
        max_value = series.max()
        max_count = (series == max_value).sum()

        print(f"Max index and value: {max_index}, {max_value}, Count: {max_count}")

        # Window of rows around the maximum, clamped to the frame boundaries
        start_pos = max(0, max_pos - surrounding_range)
        end_pos = min(len(series) - 1, max_pos + surrounding_range)

        # Extract corresponding rows positionally
        result = dataframe[[column]].iloc[start_pos:end_pos + 1]

        print(result)
        plt.show()


### Without outlier cleaning

In [None]:
# Missing-value summary of the training frame before any outlier removal.
total_rows, na_row_count = count_na_rows(train_df_extend)

In [9]:
# Build the training dataset from the frame without outlier removal,
# requesting the "MinMaxScaler" scaler (no validation dataset is created here).
train_dataset_traj = TrajectoryDataset(
    train_df_extend,
    seq_len=seq_len,
    mode='ae',
    drop_features_list=drop_features_list,
    scaler="MinMaxScaler"
)

In [None]:
# Columns whose distribution is inspected (histogram + rows around the maximum).
# NOTE(review): 'stopped' is also listed in drop_features_list — confirm that
# TrajectoryDataset.dataframe still contains it, otherwise the lookup raises KeyError.
columns_to_analyze = ['stopped', 'cog_c', 'aad', 'rot_c', 'speed_c', 
                      'distance_c', 'acc_c', 'cdd', 'dir_ccs', 
                      'dist_ww', 'dist_ra', 'dist_cl', 'dist_ma', 
                      'lon', 'lat', 'season', 'part_of_day', 
                      'month_sin', 'month_cos', 'hour_sin', 'hour_cos']
analyze_column_distribution(train_dataset_traj.dataframe, columns_to_analyze)

### Outliers cleaned, using MinMaxScaler

In [None]:
# Remove rows outside the IQR fences of these columns (default multiplier 1.5);
# the columns are filtered one after another in the listed order.
columns_to_clean = ['speed_c', 'lon', 'lat']  # Specify columns to clean
cleaned_train_data = clean_outliers_by_quantile(train_df_extend, columns_to_clean)

In [None]:
# Missing-value summary of the training frame after outlier removal.
total_rows_clean, na_row_count_clean = count_na_rows(cleaned_train_data)

In [13]:
# Build the training dataset from the outlier-cleaned frame,
# requesting the "MinMaxScaler" scaler (no validation dataset is created here).
clean_train_dataset_traj = TrajectoryDataset(
    cleaned_train_data,
    seq_len=seq_len,
    mode='ae',
    drop_features_list=drop_features_list,
    scaler="MinMaxScaler"
)

In [None]:
# Same column list as used for the uncleaned dataset (repeated verbatim),
# now applied to the outlier-cleaned, MinMax-scaled dataset.
columns_to_analyze = ['stopped', 'cog_c', 'aad', 'rot_c', 'speed_c', 
                      'distance_c', 'acc_c', 'cdd', 'dir_ccs', 
                      'dist_ww', 'dist_ra', 'dist_cl', 'dist_ma', 
                      'lon', 'lat', 'season', 'part_of_day', 
                      'month_sin', 'month_cos', 'hour_sin', 'hour_cos']
analyze_column_distribution(clean_train_dataset_traj.dataframe, columns_to_analyze)

### Outliers cleaned, using StandardScaler

In [15]:
# Build the training dataset from the outlier-cleaned frame,
# requesting the "StandardScaler" scaler (no validation dataset is created here).
clean_train_dataset_traj_s = TrajectoryDataset(
    cleaned_train_data,
    seq_len=seq_len,
    mode='ae',
    drop_features_list=drop_features_list,
    scaler="StandardScaler"
)

In [None]:
# Inspect the column distributions of the StandardScaler dataset.
analyze_column_distribution(clean_train_dataset_traj_s.dataframe, columns_to_analyze)

### Outliers cleaned, using RobustScaler

In [17]:
# Build the training dataset from the outlier-cleaned frame,
# requesting the "RobustScaler" scaler (no validation dataset is created here).
clean_train_dataset_traj_r = TrajectoryDataset(
    cleaned_train_data,
    seq_len=seq_len,
    mode='ae',
    drop_features_list=drop_features_list,
    scaler="RobustScaler"
)

In [None]:
# Inspect the column distributions of the RobustScaler dataset.
analyze_column_distribution(clean_train_dataset_traj_r.dataframe, columns_to_analyze)

### Outliers cleaned, using QuantileTransformer

In [15]:
# Create training and validation datasets
clean_train_dataset_traj_r = TrajectoryDataset(
    cleaned_train_data,
    seq_len=seq_len,
    mode='ae',
    drop_features_list=drop_features_list,
    scaler="QuantileTransformer"
)

In [None]:
analyze_column_distribution(clean_train_dataset_traj_r.dataframe, columns_to_analyze)