## Training result analysis of Distilled LSTM 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json

# Load metrics
with open('overall_metrics.json', 'r') as f:
    overall_metrics = json.load(f)

# Convert flight_details to a Pandas DataFrame for easier plotting
df = pd.DataFrame(overall_metrics['flight_details'])

# Set a style for the plots
sns.set_theme(style="whitegrid")

# --- 1. Bar Chart: Max Position Error and Average Drift Percentage per Flight ---
plt.figure(figsize=(14, 7))

# Max Position Error
plt.subplot(1, 2, 1) # 1 row, 2 columns, first plot
sns.barplot(x='flight_counter', y='max_position_error', data=df, hue='is_divergent', palette={True: 'red', False: 'green'})
plt.title('Max Position Error per Flight')
plt.xlabel('Flight Counter')
plt.ylabel('Max Position Error (units)')
plt.xticks(rotation=45)
plt.legend(title='Is Divergent')

# Average Drift Percentage
plt.subplot(1, 2, 2) # 1 row, 2 columns, second plot
sns.barplot(x='flight_counter', y='average_drift_percentage', data=df, hue='is_divergent', palette={True: 'red', False: 'green'})
plt.title('Average Drift Percentage per Flight')
plt.xlabel('Flight Counter')
plt.ylabel('Average Drift Percentage (%)')
plt.axhline(10, color='red', linestyle='--', label='Divergence Threshold (10%)')
plt.xticks(rotation=45)
plt.legend(title='Is Divergent')

plt.tight_layout()
plt.show()


# --- 2. Scatter Plot: Sequence Length vs. Mean Position Error (with divergence as hue) ---
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sequence_length', y='mean_position_error', hue='is_divergent', data=df, s=100, palette={True: 'red', False: 'green'})
plt.title('Sequence Length vs. Mean Position Error')
plt.xlabel('Sequence Length (number of steps)')
plt.ylabel('Mean Position Error (units)')
plt.legend(title='Is Divergent')
plt.show()

# --- 3. Combined Plot: All key metrics for each flight (potentially using subplots or a grouped bar chart) ---
# A more complex plot to show multiple metrics per flight in a single view
plt.figure(figsize=(16, 8))

# Melt the DataFrame to easily plot multiple metrics per flight
df_melted = df.melt(id_vars=['flight_counter', 'is_divergent', 'sequence_length'],
                    value_vars=['max_position_error', 'mean_position_error',
                                'average_drift_percentage', 'max_drift_percentage'],
                    var_name='metric_type', value_name='metric_value')

sns.barplot(x='flight_counter', y='metric_value', hue='metric_type', data=df_melted, palette='viridis')
plt.title('Comparison of Key Metrics Across Flights')
plt.xlabel('Flight Counter')
plt.ylabel('Metric Value')
plt.xticks(rotation=45)
plt.legend(title='Metric Type')
plt.tight_layout()
plt.show()

# --- 4. Distribution of Divergent Flights ---
plt.figure(figsize=(6, 5))
sns.countplot(x='is_divergent', data=df, palette={True: 'red', False: 'green'})
plt.title(f'Number of Divergent vs. Non-Divergent Flights (Total: {overall_metrics["total_flights"]})')
plt.xlabel('Is Divergent')
plt.ylabel('Number of Flights')
plt.xticks(ticks=[0, 1], labels=['False (Non-Divergent)', 'True (Divergent)'])
plt.show()

print("\nJSON representation of overall_metrics for reference:")
print(json.dumps(overall_metrics, indent=4))

## Quantization practice

In [None]:
import numpy as np

# Generate random data
data = np.random.rand(5)
print(data)

# Calculate min and max of the data
max_value = np.max(data)
print("Max value:", max_value)
min_value = np.min(data)
print("Min value:", min_value)

# Define the target format - int8
q_min = -128
print("Min int8 value:", q_min)
q_max = 127
print("Max int8 value:", q_max)

# Calculate the scale
scale = (max_value - min_value) / (q_max - q_min)
print(f"Scale - {scale}")

# Calculate the zero point
zero_point_real = q_min - min_value / scale
print(f"Zero Point (real) - {zero_point_real}")
zero_point = np.clip(round(zero_point_real), q_min, q_max)
print(f"Zero Point (clipped) - {zero_point}")

# Quantize the data
# quantized_data = np.round(data / scale) + zero_point # ! There will be cases where this goes out of bounds of int8
# quantized_data = np.clip(np.round(data / scale + zero_point), np.round(q_min / scale + zero_point), np.round(q_max / scale + zero_point)) # ! here also the values are overshooting
quantized_data = np.clip(np.round(data / scale + zero_point), q_min, q_max).astype(np.int8)  # ! Final working version
print("Quantized Data:", quantized_data)

# De-quantize the data
de_quantized_data = scale*(quantized_data - zero_point)
print(f"The de_quantized_Data is - {de_quantized_data}")

In [None]:
import numpy as np

# Generate random data
data = np.random.rand(5)
print(data)

# Calculate min and max of the data
max_value = np.max(data)
print("Max value:", max_value)
min_value = np.min(data)
print("Min value:", min_value)

# Define the target format - int8
q_min = -128
print("Min int8 value:", q_min)
q_max = 127
print("Max int8 value:", q_max)

# Calculate the scale
scale = (max_value - min_value) / (q_max - q_min)
print(f"Scale - {scale}")

# Calculate the zero point
zero_point_real = q_min - min_value / scale
print(f"Zero Point (real) - {zero_point_real}")
zero_point = np.clip(round(zero_point_real), q_min, q_max)
print(f"Zero Point (clipped) - {zero_point}")

# Quantize the data
# quantized_data = np.round(data / scale) + zero_point # ! There will be cases where this goes out of bounds of int8
# quantized_data = np.clip(np.round(data / scale + zero_point), np.round(q_min / scale + zero_point), np.round(q_max / scale + zero_point)) # ! here also the values are overshooting
quantized_data = np.clip(np.round(data / scale + zero_point), q_min, q_max).astype(np.int8)  # ! Final working version
print("Quantized Data:", quantized_data)

# De-quantize the data
de_quantized_data = scale*(quantized_data - zero_point)
print(f"The de_quantized_Data is - {de_quantized_data}")

In [None]:
import numpy as np

# Generate random data
df = np.random.uniform(-10, 20, (3, 5, 5))
print(df)

for i in range(df.shape[0]):
    
    data = df[i]
    # Calculate min and max of the data
    max_value = np.max(data)
    print("Max value:", max_value)
    min_value = np.min(data)
    print("Min value:", min_value)

    # Define the target format - int8
    q_min = -128
    print("Min int8 value:", q_min)
    q_max = 127
    print("Max int8 value:", q_max)

    # Calculate the scale
    scale = (max_value - min_value) / (q_max - q_min)
    print(f"Scale - {scale}")

    # Calculate the zero point
    zero_point_real = q_min - min_value / scale
    print(f"Zero Point (real) - {zero_point_real}")
    zero_point = np.clip(round(zero_point_real), q_min, q_max)
    print(f"Zero Point (clipped) - {zero_point}")

    # Quantize the data
    # quantized_data = np.round(data / scale) + zero_point # ! There will be cases where this goes out of bounds of int8
    # quantized_data = np.clip(np.round(data / scale + zero_point), np.round(q_min / scale + zero_point), np.round(q_max / scale + zero_point)) # ! here also the values are overshooting
    quantized_data = np.clip(np.round(data / scale + zero_point), q_min, q_max).astype(np.int8)  # ! Final working version
    print("Quantized Data:", quantized_data)

    # De-quantize the data
    de_quantized_data = scale*(quantized_data - zero_point)
    print(f"The de_quantized_Data is - {de_quantized_data}")

## Data for TfLite

In [5]:
from torch.utils.data import Dataset
from tqdm.autonotebook import tqdm
import pandas as pd
import torch
import pymap3d as pm
import os


class IODatasetCpu(Dataset):
    def __init__(self, flight_paths, window_size=100, check=False):
        self.flight_csv_paths = flight_paths

        self.window = window_size

        self.x = None
        self.y = None

        self.mode_mapping = {
            "COLLISION": 0,
            "FS BATT": 1,
            "FS COMM": 2,
            "HOLD/30/F": 3,
            "HOME": 4,
            "HOME/40/F": 5,
            "HOME/40/V": 6,
            "HOVER": 7,
            "LAND": 8,
            "LAND/21/F": 9,
            "LAND/22/F": 10,
            "LAND/25/V": 11,
            "MANUAL E": 12,
            "OFF": 13,
            "OFF/0/V": 14,
            "RPV": 15,
            "STARTUP": 16,
            "STARTUP/99/V": 17,
            "TAKEOFF": 18,
            "TAKEOFF/10/V": 19,
            "TAKEOFF/11/V": 20,
            "TAKEOFF/12/T": 21,
            "TAKEOFF/12/V": 22,
            "TAKEOFF/13/T": 23,
        }

        self.flight_last_record_index = []
        self.flight_start_record_index = []
        self.flight_origins = []

        # self.load_data_preproc_every_11_rec()
        self._len = 0
        self.check = check
        self.load_data_preproc_every_1_sec(self.check)

        self.currentFlightIndex = 0
        self.indexOffset = 0

        # self._len = len(self.x) - len(self.flight_csv_paths) * (self.window - 1)

        self.create_index_map()

        assert not self.x.isnan().any(), "x contains NaN values"
        assert not self.y.isnan().any(), "y contains NaN values"
        print("Non NaN values in x and y.")
        print(f"x.device: {self.x.device}")
        print(f"y.device: {self.y.device}")
        print(f"x.shape: {self.x.shape}")
        print(f"y.shape: {self.y.shape}")
        print(f"self.new_flight_started: {len(self.new_flight_started)}")

    def __len__(self):
        return self._len

    def gps_to_ned(self, df):
        # ! Make sure to remove the zero GPS coordinates. Find the first non-zero GPS coordinates for reference
        # Use the first non-zero record as the reference lat, lon, and alt
        lat_ref = df["GPS Lat"].iloc[0]
        lon_ref = df["GPS Lon"].iloc[0]
        alt_ref = df["Altitude"].iloc[0]

        # Convert all GPS coordinates to NED using the reference point
        ned_coords = []
        for i, row in df.iterrows():
            lat, lon, alt = row["GPS Lat"], row["GPS Lon"], row["Altitude"]
            # Convert each (lat, lon, alt) to NED using the reference point
            north, east, down = pm.geodetic2ned(
                lat, lon, alt, lat_ref, lon_ref, alt_ref
            )
            ned_coords.append([north, east, -down])

        # Convert to a DataFrame for easier handling
        ned_df = pd.DataFrame(ned_coords, columns=["x", "y", "z"])
        ned_df["group"] = df["group"]
        ned_df["Mode"] = df["Mode"]

        return ned_df

    def add_delta_NED(self, df_ned_data):
        df_ned_data["delta_x"] = df_ned_data["x"].diff()
        df_ned_data.loc[0, "delta_x"] = 0
        df_ned_data["delta_y"] = df_ned_data["y"].diff()
        df_ned_data.loc[0, "delta_y"] = 0
        df_ned_data["delta_z"] = df_ned_data["z"].diff()
        df_ned_data.loc[0, "delta_z"] = 0

    # Custom aggregation function for delta NED
    def get_last_non_zero_or_last(self, group):
        """
        Return last non zero delta NED if exist else all zero delta NED
        """
        # Identify non-zero rows
        non_zero_rows = group[(group != 0).any(axis=1)]

        if not non_zero_rows.empty:
            # If there are non-zero rows, return the last non-zero row
            return non_zero_rows.iloc[-1]
        else:
            # Otherwise, return the last row (which will be zeros)
            return group.iloc[-1]

    def parse_gps_time(self, df):
        # Ensure GPS Date & Time are zero-padded
        df["date"] = df["GPS Date"].astype(str).str.zfill(6)
        df["time"] = df["GPS Time"].astype(str).str.zfill(6)

        # Combine into one string
        df["datetime_str"] = df["date"] + df["time"]

        # Convert to datetime (invalid values become NaT)
        df["datetime"] = pd.to_datetime(
            df["datetime_str"], format="%d%m%y%H%M%S", errors="coerce"
        )

        # Convert to Unix timestamp (seconds since epoch)
        df["timestamp_seconds"] = df["datetime"].astype("int64") // 10**9

        df.drop(
            columns=[
                "GPS Date",
                "GPS Time",
                "date",
                "time",
                "datetime_str",
                "datetime",
            ],
            inplace=True,
        )

    def get_non_hold_intervals(self, df, start, end):
        """
        Return index pairs (start, end) for intervals where 'Mode' is NOT 'HOLD/30/F'.
        Handles edge cases like starting at index 0 or ending at last row.
        """
        check_value = "OFF/0/V"
        check_mapped_value = self.mode_mapping[check_value]

        intervals = []
        in_non_hold = False
        interval_start = None

        for i in range(start, end):
            if df["Mode"].iloc[i] != check_mapped_value:
                if not in_non_hold:  # entering a non-hold region
                    in_non_hold = True
                    interval_start = i
            else:
                if in_non_hold:  # exiting a non-hold region
                    intervals.append((interval_start, i - 1))
                    in_non_hold = False

        # Edge case: if the sequence ended inside a non-hold region
        if in_non_hold:
            intervals.append((interval_start, end - 1))

        return intervals

    def load_data_preproc_every_1_sec(self, check=False):
        print("Loading data...")
        self.flight_start_record_index = [0]

        x = []
        y = []

        for i in tqdm(range(len(self.flight_csv_paths)), desc="Flight#", mininterval=5):
            input_columns = [
                "Gp",
                "Gq",
                "Gr",
                "Ax",
                "Ay",
                "Az",
                "Bx",
                "By",
                "Bz",
                "Altitude",
                "Mode",
            ]
            output_columns = ["GPS Lat", "GPS Lon", "GPS AGL"]
            other_columns = ["GPS Date", "GPS Time"]
            relevant_columns = other_columns + input_columns + output_columns

            sum_columns = ["Gp", "Gq", "Gr", "Ax", "Ay", "Az"]
            avg_columns = ["Bx", "By", "Bz", "Altitude"]
            
            df = pd.read_csv(
                self.flight_csv_paths[i], on_bad_lines="skip", low_memory=False
            )
            df = df[relevant_columns]

            df_non_zero = df[
                (df["GPS Lat"] != 0) & (df["GPS Lon"] != 0) & (df["GPS AGL"] != 0)
            ]
            df = df[df_non_zero.index[0] :]  # remove initial all-zero rows
            df = df.reset_index()

            self.parse_gps_time(df)
            df["group"] = df["timestamp_seconds"] - df["timestamp_seconds"].iloc[0]

            df["Mode"] = df["Mode"].map(self.mode_mapping)

            df_ned = df.copy(deep=True)

            agg_map = {col: "sum" for col in sum_columns}
            agg_map.update({col: "mean" for col in avg_columns})

            df = df.groupby(["group", "Mode"], as_index=False, sort=False).agg(agg_map)
            assert not df.isnull().values.any(), "df contains NaN values"

            # Collect the indices where there is a lag
            diff_df = df["group"].diff()
            mask = diff_df > 1
            lag_indices = list(
                mask[mask].index
            )  # ! remember to incorporate the 0th element.
            lag_indices = [0] + lag_indices + [len(diff_df)]

            df_in = df[input_columns].copy(deep=True)
            df_in["Altitude"] = df_in["Altitude"].diff()
            df_in.loc[df_in.index[0], "Altitude"] = 0

            df_ned = self.gps_to_ned(df_ned)
            df_ned = df_ned.groupby(["group", "Mode"]).apply(
                self.get_last_non_zero_or_last, include_groups=False
            )
            self.add_delta_NED(df_ned)
            df_delta_ned = df_ned[["delta_x", "delta_y", "delta_z"]]

            if len(df_in) != len(df_delta_ned):
                pass

            # check if df_in and df_delta_ned[['delta_x', 'delta_y', 'delta_z']] contains NaN values
            if df_in.isna().any().any():
                print(
                    f"df_in contains NaN values for flight {self.flight_csv_paths[i]}"
                )
                continue

            if df_delta_ned[["delta_x", "delta_y", "delta_z"]].isna().any().any():
                print(
                    f"df_delta_ned contains NaN values for flight {self.flight_csv_paths[i]}"
                )
                continue

            # Checking for lags and creating sequences accordingly
            for j in range(1, len(lag_indices)):
                # ! Since during differencing we subtract prev from curr means the curr index is after lag.
                prev = lag_indices[j - 1]
                curr = lag_indices[j]

                non_hold_indices = [(prev, curr)]

                if check:
                    non_hold_indices = self.get_non_hold_intervals(df_in, prev, curr)

                for prev, curr in non_hold_indices:
                    val = curr - prev
                    if val > 3 * self.window:
                        self._len += val - (self.window - 1)
                        x.extend(df_in.iloc[prev:curr].values.tolist())
                        y.extend(
                            df_delta_ned[["delta_x", "delta_y", "delta_z"]]
                            .iloc[prev:curr]
                            .values.tolist()
                        )

                        # record current flight's last record index
                        self.flight_last_record_index.append(len(x) - 1)

                        # record current flight's start record index
                        if j != len(lag_indices) - 1:
                            self.flight_start_record_index.append(len(x))

                        # record flight NED origin
                        self.flight_origins.append(
                            df_ned.iloc[prev][["x", "y", "z"]].values.tolist()
                        )

        self.x = torch.tensor(x, dtype=torch.float32).to("cpu")
        self.y = torch.tensor(y, dtype=torch.float32).to("cpu")

    def create_index_map(self):
        self.index_map = {}
        new_flight_started = []
        currentFlightIndex = 0
        indexOffset = 0

        for i in range(self._len):
            if (
                i + indexOffset + self.window - 1
                > self.flight_last_record_index[currentFlightIndex]
            ):
                currentFlightIndex += 1
                indexOffset += self.window - 1
                # new_flight_started[i] = True # for second flight onwards
                new_flight_started.append(True)
            else:
                if not new_flight_started:  # for very record i.e. start of first flight
                    new_flight_started.append(True)
                else:  # for subsequent flights
                    new_flight_started.append(False)

            self.index_map[i] = i + indexOffset

        self.new_flight_started = torch.tensor(new_flight_started, dtype=torch.bool).to(
            "cpu"
        )

    def __getitem__(self, index):
        if index < 0 or index >= self._len:
            raise StopIteration("Index out of bounds")

        i = self.index_map[index]

        return (
            self.x[i : i + self.window],
            self.y[i + self.window - 1],
            self.new_flight_started[index],
        )

    def __old_getitem__(self, index):
        if index < 0 or index >= self._len:
            raise StopIteration("Index out of bounds")

        if (
            index + self.indexOffset + self.window - 1
            >= self.flight_last_record_index[self.currentFlightIndex]
        ):
            if self.currentFlightIndex < len(self.flight_last_record_index) - 1:
                self.currentFlightIndex += 1
                self.indexOffset += self.window
            else:
                self.currentFlightIndex = 0
                self.indexOffset = 0
        _x = self.x[index + self.indexOffset : index + self.indexOffset + self.window]
        _y = self.y[index + self.indexOffset + self.window - 1]
        return _x, _y


  from tqdm.autonotebook import tqdm


In [6]:
import random

csv_files_dir = "/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs"
dir_paths = [] # <--- Leave few directores for blind test
for dir in os.listdir(csv_files_dir):
    dir_path = os.path.join(csv_files_dir, dir)
    
    if not os.path.isdir(dir_path):
        continue

    dir_paths.append(dir_path)

random.shuffle(dir_paths)
train_dirs = dir_paths[:-1]
print(f"Count of UAVs for training: {len(train_dirs)}")

test_dirs = dir_paths[-1:]
print(f"The testing directory is {test_dirs}")
print(f"Count of UAVs for testing: {len(test_dirs)}")

good_train_csvs = []
for dir_path in train_dirs:
    traj_dir = os.path.join(
        dir_path, "trajectory_report_data", "images", "good_trajectories"
    )
    if not os.path.isdir(traj_dir):
        continue

    for f in os.listdir(traj_dir):
        if f.endswith(".png"):
            csv_path = os.path.join(dir_path, f.replace(".png", ".csv"))
            good_train_csvs.append(csv_path)

good_test_csvs = []
for dir_path in test_dirs:
    traj_dir = os.path.join(
        dir_path, "trajectory_report_data", "images", "good_trajectories"
    )
    if not os.path.isdir(traj_dir):
        continue

    for f in os.listdir(traj_dir):
        if f.endswith(".png"):
            csv_path = os.path.join(dir_path, f.replace(".png", ".csv"))
            good_test_csvs.append(csv_path)

frac_of_flights_for_training = 1
frac_of_flights_for_testing = 1
print(f"Number of training flights: {len(good_train_csvs)}")
print(f"Number of testing flights: {len(good_test_csvs)}")

# Separate out the train and val data
flights = good_train_csvs[:]
random.shuffle(flights)
train_flight_paths = flights[ : int(frac_of_flights_for_training*len(good_train_csvs))]

# Separate out the fraction of test data
flights = good_test_csvs[:]
random.shuffle(flights)
test_flight_paths = flights[ : int(frac_of_flights_for_testing*len(good_test_csvs))]


n = len(train_flight_paths)
n_train = int(0.7 * n)
n_val = int(0.15 * n)

train_flights = train_flight_paths[: n_train]
val_flights = train_flight_paths[n_train : n_train + n_val]
test_flights = test_flight_paths[:]

Count of UAVs for training: 25
The testing directory is ['/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW41-121-122-123']
Count of UAVs for testing: 1
Number of training flights: 631
Number of testing flights: 17


In [7]:
train_data = IODatasetCpu(train_flights, window_size=100, check=True)
val_data = IODatasetCpu(val_flights, window_size=100, check=True)
test_data = IODatasetCpu(test_flights, window_size=100, check=True)

Loading data...


Flight#: 100%|██████████| 441/441 [25:20<00:00,  3.45s/it]


Non NaN values in x and y.
x.device: cpu
y.device: cpu
x.shape: torch.Size([368437, 11])
y.shape: torch.Size([368437, 3])
self.new_flight_started: 295870
Loading data...


Flight#: 100%|██████████| 94/94 [05:18<00:00,  3.39s/it]


Non NaN values in x and y.
x.device: cpu
y.device: cpu
x.shape: torch.Size([82810, 11])
y.shape: torch.Size([82810, 3])
self.new_flight_started: 66871
Loading data...


Flight#: 100%|██████████| 17/17 [00:34<00:00,  2.03s/it]

Non NaN values in x and y.
x.device: cpu
y.device: cpu
x.shape: torch.Size([6181, 11])
y.shape: torch.Size([6181, 3])
self.new_flight_started: 4795





In [9]:
import numpy as np
import torch

def convert_to_npy(data, name):
    X, Y = [], []
    temp_x, temp_y = [], []

    for i in range(len(data)):
        x, y, f = data[i]

        # Safely convert tensors -> NumPy via .tolist()
        if isinstance(x, torch.Tensor):
            x = np.array(x.detach().cpu().tolist())
        if isinstance(y, torch.Tensor):
            y = np.array(y.detach().cpu().tolist())

        # If a new trajectory starts, save the previous one (if any)
        if f and len(temp_x) > 0:
            X.append(np.array(temp_x))
            Y.append(np.array(temp_y))
            temp_x, temp_y = [], []

        temp_x.append(x)
        temp_y.append(y)

    # Append the final trajectory
    if len(temp_x) > 0:
        X.append(np.array(temp_x))
        Y.append(np.array(temp_y))

    # Try to stack if all trajectories same length
    try:
        X = np.stack(X)
        Y = np.stack(Y)
    except ValueError:
        X = np.array(X, dtype=object)
        Y = np.array(Y, dtype=object)

    # Save
    np.save(f"{name}_X.npy", X)
    np.save(f"{name}_Y.npy", Y)

    # Print shape info
    if X.dtype == object:
        print("Type:", type(X))
        print("Dtype:", X.dtype)
        print(f"X: {len(X)} trajectories with varying lengths")
        print(f"Y: {len(Y)} trajectories with varying lengths")
        print("Example trajectory shapes:")
        print(f"  X[0].shape = {X[0].shape}")
        print(f"  Y[0].shape = {Y[0].shape}")
        print("First element of trajectory 0 (sample):", X[0][0].shape)
        print("First target of trajectory 0:", Y[0][0])
    else:
        print(f"X shape: {X.shape}")
        print(f"Y shape: {Y.shape}")


In [10]:
# Convert train data to npy
convert_to_npy(train_data, "train_2")

Type: <class 'numpy.ndarray'>
Dtype: object
X: 733 trajectories with varying lengths
Y: 733 trajectories with varying lengths
Example trajectory shapes:
  X[0].shape = (309, 100, 11)
  Y[0].shape = (309, 3)
First element of trajectory 0 (sample): (100, 11)
First target of trajectory 0: [ -6.64892292 -16.83357239   1.50003219]


In [11]:
# Convert train data to npy
convert_to_npy(val_data, "val_2")

Type: <class 'numpy.ndarray'>
Dtype: object
X: 161 trajectories with varying lengths
Y: 161 trajectories with varying lengths
Example trajectory shapes:
  X[0].shape = (277, 100, 11)
  Y[0].shape = (277, 3)
First element of trajectory 0 (sample): (100, 11)
First target of trajectory 0: [-6.64855194 -5.94161081  1.0992614 ]


In [12]:
# Convert train data to npy
convert_to_npy(test_data, "test_2")

Type: <class 'numpy.ndarray'>
Dtype: object
X: 14 trajectories with varying lengths
Y: 14 trajectories with varying lengths
Example trajectory shapes:
  X[0].shape = (242, 100, 11)
  Y[0].shape = (242, 3)
First element of trajectory 0 (sample): (100, 11)
First target of trajectory 0: [  5.54718304 -12.80882454  -0.69965094]


In [31]:
import numpy as np

train_data = np.load("train_X.npy", allow_pickle=True)

# Assuming train_data is a list/array of numpy arrays,
# and each element has shape (216, 100, 11).
# Concatenate all arrays along a new axis, then reshape.

# Step 1: Stack all arrays from train_data into a single NumPy array
# If train_data is already a single array containing multiple arrays,
# you might need to adjust this depending on its structure.
# Let's assume train_data is a list of arrays.
all_data_stacked = np.concatenate(train_data, axis=0) # Or np.vstack if 2D arrays, or creating a new first dimension

# If each item in train_data is (216, 100, 11), and you concatenate them along axis=0,
# the result would be (len(train_data) * 216, 100, 11)

# Reshape the combined data to (total_observations, 11)
# total_observations = len(train_data) * 216 * 100
reshaped_all_data = all_data_stacked.reshape(-1, all_data_stacked.shape[-1])

# Find the global minimum and maximum for each column
global_min_values = np.min(reshaped_all_data, axis=0)
global_max_values = np.max(reshaped_all_data, axis=0)

print("Global Minimum values for each of the 11 columns")
for i, vals in enumerate(global_min_values):
    print(f"Column {i+1} = ", vals)
print("\n") 
print("********************************************")
print("\n")
print("Global Maximum values for each of the 11 columns")
for i, vals in enumerate(global_max_values):
    print(f"Column {i+1} = ", vals)

Global Minimum values for each of the 11 columns
Column 1 =  -12200.0
Column 2 =  -4981.0
Column 3 =  -14992.0
Column 4 =  -7696.0
Column 5 =  -4080.0
Column 6 =  -3840.0
Column 7 =  -86.0
Column 8 =  -105.0
Column 9 =  7.0
Column 10 =  -42.349998474121094
Column 11 =  1.0


********************************************


Global Maximum values for each of the 11 columns
Column 1 =  11166.0
Column 2 =  8805.0
Column 3 =  12368.0
Column 4 =  5408.0
Column 5 =  2448.0
Column 6 =  10410.0
Column 7 =  105.0
Column 8 =  86.0
Column 9 =  121.0
Column 10 =  78.75357055664062
Column 11 =  23.0


In [32]:
test_data = np.load("test_X.npy", allow_pickle=True)

# Assuming train_data is a list/array of numpy arrays,
# and each element has shape (216, 100, 11).
# Concatenate all arrays along a new axis, then reshape.

# Step 1: Stack all arrays from train_data into a single NumPy array
# If train_data is already a single array containing multiple arrays,
# you might need to adjust this depending on its structure.
# Let's assume train_data is a list of arrays.
all_data_stacked = np.concatenate(test_data, axis=0) # Or np.vstack if 2D arrays, or creating a new first dimension

# If each item in train_data is (216, 100, 11), and you concatenate them along axis=0,
# the result would be (len(train_data) * 216, 100, 11)

# Reshape the combined data to (total_observations, 11)
# total_observations = len(train_data) * 216 * 100
reshaped_all_data = all_data_stacked.reshape(-1, all_data_stacked.shape[-1])

# Find the global minimum and maximum for each column
global_min_values = np.min(reshaped_all_data, axis=0)
global_max_values = np.max(reshaped_all_data, axis=0)

print("Global Minimum values for each of the 11 columns")
for i, vals in enumerate(global_min_values):
    print(f"Column {i+1} = ", vals)
print("\n") 
print("********************************************")
print("\n")
print("Global Maximum values for each of the 11 columns")
for i, vals in enumerate(global_max_values):
    print(f"Column {i+1} = ", vals)

Global Minimum values for each of the 11 columns
Column 1 =  -5776.0
Column 2 =  -2832.0
Column 3 =  -8128.0
Column 4 =  -3232.0
Column 5 =  -1058.0
Column 6 =  410.0
Column 7 =  -80.0
Column 8 =  -84.0
Column 9 =  20.5
Column 10 =  -9.137499809265137
Column 11 =  3.0


********************************************


Global Maximum values for each of the 11 columns
Column 1 =  5320.0
Column 2 =  4904.0
Column 3 =  8624.0
Column 4 =  2284.0
Column 5 =  1080.0
Column 6 =  6656.0
Column 7 =  95.0
Column 8 =  82.0
Column 9 =  113.0
Column 10 =  9.25
Column 11 =  23.0


In [33]:
val_data = np.load("val_X.npy", allow_pickle=True)

# Assuming train_data is a list/array of numpy arrays,
# and each element has shape (216, 100, 11).
# Concatenate all arrays along a new axis, then reshape.

# Step 1: Stack all arrays from train_data into a single NumPy array
# If train_data is already a single array containing multiple arrays,
# you might need to adjust this depending on its structure.
# Let's assume train_data is a list of arrays.
all_data_stacked = np.concatenate(val_data, axis=0) # Or np.vstack if 2D arrays, or creating a new first dimension

# If each item in train_data is (216, 100, 11), and you concatenate them along axis=0,
# the result would be (len(train_data) * 216, 100, 11)

# Reshape the combined data to (total_observations, 11)
# total_observations = len(train_data) * 216 * 100
reshaped_all_data = all_data_stacked.reshape(-1, all_data_stacked.shape[-1])

# Find the global minimum and maximum for each column
global_min_values = np.min(reshaped_all_data, axis=0)
global_max_values = np.max(reshaped_all_data, axis=0)

print("Global Minimum values for each of the 11 columns")
for i, vals in enumerate(global_min_values):
    print(f"Column {i+1} = ", vals)
print("\n") 
print("********************************************")
print("\n")
print("Global Maximum values for each of the 11 columns")
for i, vals in enumerate(global_max_values):
    print(f"Column {i+1} = ", vals)

Global Minimum values for each of the 11 columns
Column 1 =  -5840.0
Column 2 =  -5384.0
Column 3 =  -9152.0
Column 4 =  -5536.0
Column 5 =  -2096.0
Column 6 =  288.0
Column 7 =  -81.5
Column 8 =  -123.0
Column 9 =  13.0
Column 10 =  -11.751667022705078
Column 11 =  3.0


********************************************


Global Maximum values for each of the 11 columns
Column 1 =  6336.0
Column 2 =  5056.0
Column 3 =  18639.0
Column 4 =  3648.0
Column 5 =  2616.0
Column 6 =  7240.0
Column 7 =  119.0
Column 8 =  91.0
Column 9 =  101.0
Column 10 =  14.131428718566895
Column 11 =  23.0


## Reducing the csv size

In [2]:
import os

csv_dir = "/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs"

csv_path = []
for dirs in os.listdir(csv_dir):
    dir_path = os.path.join(csv_dir, dirs)
    
    if not os.path.isdir(dir_path):
        continue
    print(dir_path)
    for files in os.listdir(dir_path):
        path = os.path.join(dir_path, files)
        
        if path.endswith(".csv"):
            csv_path.append(path)

/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW10-28-29-30
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW35-103-04-105
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW36-106-107-108
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW05-13-14-15
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW31-91-92-93
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW04-10-11-12
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW23-67-68-69
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW32-94-95-96
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW22-64-65-66
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW06-16-17-18
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW19-55-56-57
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW30-88-89-90
/home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW11-31-32-33
/home/arjav.singh/Projects/DeepIO

In [4]:
import pandas as pd
import os

input_columns = [
    "Gp","Gq","Gr","Ax","Ay","Az",
    "Bx","By","Bz","Altitude","Mode",
]
output_columns = ["GPS Lat", "GPS Lon", "GPS AGL"]
other_columns = ["GPS Date", "GPS Time"]

relevant_columns = other_columns + input_columns + output_columns

for path in csv_path:
    if not os.path.exists(path):
        print(f"[WARNING] File not found: {path}")
        continue

    try:
        df = pd.read_csv(path, on_bad_lines='skip', low_memory=False)
    except Exception as e:
        print(f"[ERROR] Could not read {path}: {e}")
        continue

    # Handle empty CSV (0 rows OR 0 columns)
    if df.empty or df.shape[1] == 0:
        print(f"[WARNING] Empty or invalid CSV: {path}")
        continue

    # Clean column names
    df.columns = df.columns.str.strip()

    # Keep only columns that exist
    keep_cols = [c for c in relevant_columns if c in df.columns]

    # Handle case: no relevant columns found
    if len(keep_cols) == 0:
        print(f"[WARNING] No relevant columns in {path}. Skipping.")
        continue

    # Filter and save
    df = df[keep_cols].copy()

    df.to_csv(path, index=False)
    print(f"[OK] Processed: {path} (kept {len(keep_cols)} columns)")


[OK] Processed: /home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW10-28-29-30/20 July 2022 12_48_08.csv (kept 16 columns)
[OK] Processed: /home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW35-103-04-105/29 April 2022 08_48_07.csv (kept 16 columns)
[OK] Processed: /home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW35-103-04-105/27 March 2022 19_28_08.csv (kept 16 columns)
[OK] Processed: /home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW35-103-04-105/01 May 2022 07_01_30.csv (kept 16 columns)
[OK] Processed: /home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW35-103-04-105/01 May 2022 17_45_09.csv (kept 16 columns)
[OK] Processed: /home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW35-103-04-105/23 August 2022 18_59_15.csv (kept 16 columns)
[OK] Processed: /home/arjav.singh/Projects/DeepIO/Production_Data/SIDM/csvs/SW35-103-04-105/30 April 2022 12_02_11.csv (kept 16 columns)
[OK] Processed: /home/arjav.singh/Projects/Deep

# Finding the optimal batch size

In [None]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F

def get_batch_size(
    model: nn.Module,
    device: torch.device,
    input_shape: t.Tuple[int, int, int],
    output_shape: t.Tuple[int],
    dataset_size: int,
    max_batch_size: int = None,
    num_iterations: int = 5,
) -> int:
    model.to(device)
    model.train(True)
    optimizer = torch.optim.Adam(model.parameters())

    batch_size = 2
    while True:
        if max_batch_size is not None and batch_size >= max_batch_size:
            batch_size = max_batch_size
            break
        if batch_size >= dataset_size:
            batch_size = batch_size // 2
            break
        try:
            for _ in range(num_iterations):
                # dummy inputs and targets
                inputs = torch.rand(*(batch_size, *input_shape), device=device)
                targets = torch.rand(*(batch_size, *output_shape), device=device)
                outputs = model(inputs)
                loss = F.mse_loss(targets, outputs)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            batch_size *= 2
        except RuntimeError:
            batch_size //= 2
            break
    del model, optimizer
    torch.cuda.empty_cache()
    return batch_size

# Model Architecture analysis

In [26]:
import torch.nn as nn

class LSTMModelV3(nn.Module):
    def __init__(
        self, in_dim = 11, hidden_size = 1400, output_size = 2, num_layers = 1
    ):
        super(LSTMModelV3, self).__init__()
        
        self.name = "LSTM V3"
        self.lstm_1 = nn.LSTM(in_dim, hidden_size, num_layers, batch_first=True)
        self.lstm_2 = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.lstm_3 = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x, _ = self.lstm_1(x)
        x, _ = self.lstm_2(x)
        x, _ = self.lstm_3(x)
        output = self.fc(x[:, -1, :])
        return output
        
        

In [29]:
import torch

def get_model_size(model, dtype_bytes=4):
    """
    dtype_bytes:
        float32 -> 4
        float16 -> 2
        int8    -> 1
    """
    total_params = sum(p.numel() for p in model.parameters())
    total_bytes = total_params * dtype_bytes
    total_mb = total_bytes / (1024 * 1024)
    total_gb = total_bytes / (1024 * 1024 * 1024)

    return total_params, total_mb, total_gb

model = LSTMModelV3(in_dim=11, hidden_size=1400, output_size=2)
params, mb, gb = get_model_size(model)

print(f"Total parameters: {params:,}")
print(f"Model size (float32): {mb:.2f} MB ({gb:.3f} GB)")


Total parameters: 39,298,002
Model size (float32): 149.91 MB (0.146 GB)


In [None]:
import torch
import torch.nn as nn

class DecoderOnlyTransformer(nn.Module):
    def __init__(
        self,
        in_dim=11,
        hidden_size=400,
        out_dim=2,
        num_layers=2,
        num_heads=8,
        ff_dim=2048,
        max_seq_len=2000,
        dropout=0.1,
    ):
        super().__init__()

        self.name = "DecoderOnlyTransformer"

        # Project 11 → hidden_size
        self.input_proj = nn.Linear(in_dim, hidden_size)

        # Learnable positional embeddings
        self.pos_emb = nn.Parameter(torch.randn(1, max_seq_len, hidden_size))

        # One decoder layer
        layer = nn.TransformerDecoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True,
            activation="gelu"
        )

        # Stack layers (2 to match your LSTM depth)
        self.decoder = nn.TransformerDecoder(layer, num_layers=num_layers)

        # Final linear layer to match LSTM output shape
        self.fc = nn.Linear(hidden_size, out_dim)

    def _causal_mask(self, T, device):
        # (T, T) upper triangular mask (GPT-style)
        mask = torch.triu(torch.ones(T, T, device=device), diagonal=1)
        return mask.masked_fill(mask == 1, float("-inf"))

    def forward(self, x):
        """
        x: (batch, seq_len, 11)
        returns: (batch, 2)
        """
        B, T, _ = x.shape
        device = x.device

        # 1. Input projection + positional encoding
        x = self.input_proj(x) + self.pos_emb[:, :T, :]

        # 2. Causal mask
        causal_mask = self._causal_mask(T, device)

        # 3. Decoder-only transformer (no encoder ⇒ memory=None)
        out = self.decoder(
            tgt=x,
            memory=None,
            tgt_mask=causal_mask
        )

        # 4. Predict only LAST time-step (LSTM-like behavior)
        out = self.fc(out[:, -1, :])

        return out

In [32]:
transformer_model = DecoderOnlyTransformer(in_dim=11, hidden_size=1000, out_dim=2, num_layers=3)
params, mb, gb = get_model_size(transformer_model)

print(f"Total parameters: {params:,}")
print(f"Model size (float32): {mb:.2f} MB ({gb:.3f} GB)")

Total parameters: 38,353,146
Model size (float32): 146.31 MB (0.143 GB)
