# External sort algorithm on large data files

## Imports, constants and setup

In [1]:
import os

# Ask which environment the notebook runs in. The answer selects the input,
# script and output locations that the rest of the notebook reads.
notebook_mode: int = int(
    input(
        """
    Select notebook mode: 
    1. Google Colab  2. Local
    """
    )
)

if notebook_mode == 1:
    # Run on Colab.
    INPUT_PATH: str = "/content/drive/MyDrive/Ellinbank/video_observation/data/"
    SCRIPT_PATH: str = "/content/drive/MyDrive/Ellinbank/video_observation/training_testing/data_labelling/"
    OUTPUT_PATH: str = "/content/drive/MyDrive/Ellinbank/video_observation/output/"
    # Copy the helper scripts from SCRIPT_PATH into the current working directory.
    for script_name in ("custom_model.py", "inference.py", "utils.py", "operation.py"):
        os.system(command="cp {}{} .".format(SCRIPT_PATH, script_name))
elif notebook_mode == 2:
    # Run locally: paths are relative to the notebook's working directory.
    INPUT_PATH: str = "../../../../data/"
    SCRIPT_PATH: str = "./"
    OUTPUT_PATH: str = "./out/"
else:
    # Fail here instead of leaving the three paths undefined, which would only
    # surface as a NameError in a later cell.
    raise ValueError(
        "Unknown notebook mode {!r}: expected 1 (Google Colab) or 2 (Local).".format(
            notebook_mode
        )
    )

In [2]:
# Start from an empty output directory: later cells list OUTPUT_PATH and merge
# every file they find there, so leftovers from a previous run must not survive.
import shutil

# ignore_errors=True mirrors the "-f" of the former `rm -rf`: a missing
# directory is not an error.
shutil.rmtree(OUTPUT_PATH, ignore_errors=True)
# makedirs also creates missing parent directories; exist_ok keeps the cell
# re-runnable.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [3]:
import pandas as pd


# Keep only the `.zip` entries of the input directory.
files: list[str] = [
    entry for entry in os.listdir(path=INPUT_PATH) if entry.endswith(".zip")
]

# A sensor name is the part of the file name that precedes the first underscore.
sensor_names: list[str] = [archive.partition("_")[0] for archive in files]
print(sensor_names)

['MOS2E03230475']


In [4]:
import random


# Seed the global random generator so every run draws the same sequence.
# Seed 715 looks good as well; 785 makes "other" look bad, otherwise is good.
random.seed(785)

# Sizes used by the rest of the notebook.
batch_size: int = 64
epoch: int = 1
window_per_epoch: int = 200
window_size: int = 600  # 300 samples correspond to 10 seconds.

## Split the large data file and sort each partition

In [5]:
import gc

import pandas.io.parsers.readers

for f in files:
    # The part of the file name before the first underscore prefixes every
    # chunk file written for this input.
    sensor_prefix: str = f.split("_")[0]

    # Remove chunk files with this prefix that already exist in OUTPUT_PATH.
    os.system("rm -rf {}{}_*.txt".format(OUTPUT_PATH, sensor_prefix))

    # Read the input lazily, window_size * 100 rows at a time.
    data_chunks: pandas.io.parsers.readers.TextFileReader = pd.read_csv(
        filepath_or_buffer="{}/{}".format(INPUT_PATH, f),
        chunksize=window_size * 100,
    )

    df_count: int = 1
    raw_data: pd.DataFrame
    for raw_data in data_chunks:
        # Order the chunk by timestamp and renumber its rows from zero.
        raw_data = raw_data.sort_values(
            by=["timestamps"],
            ascending=True,
        ).reset_index(drop=True)

        # Persist the sorted chunk as `<prefix>_<n>.txt`, header included.
        chunk_path: str = "{}{}_{}.txt".format(OUTPUT_PATH, sensor_prefix, df_count)
        raw_data.to_csv(
            path_or_buf=chunk_path,
            header=True,
            index=False,
        )
        df_count += 1

        gc.collect()

In [6]:
from pathlib import Path

import numpy as np


def merge_external_files(
    f1: str,
    f2: str,
) -> None:
    """
    Merge two timestamp-sorted CSV text files into one sorted text file.

    Both inputs are streamed row by row, so only one pending row per input is
    held in memory instead of the full contents of both files. The merged rows
    are written under a single header row to ``{OUTPUT_PATH}merged_{uid}.txt``,
    where ``uid`` is drawn from the global ``random`` generator and re-drawn
    only if that file already exists. Both inputs are deleted after the merged
    file has been written.

    Rows are ordered by the ``timestamps`` column, located through the header
    row (falling back to the fifth column when the header has no such name).
    On equal timestamps the row from ``f1`` is written first.

    Parameters
    ----------
        f1: str
            Path to the first data file. Its first line is used as the header
            row of the merged file.

        f2: str
            Path to the second data file. Its first line is skipped as a
            header; the remaining rows are assumed to use the same column
            layout as ``f1``.

    Returns
    -------
    None
    """

    with open(f1) as first_file, open(f2) as second_file:
        # Need a header row to keep the format correct.
        header_row: str = first_file.readline()
        second_header: str = second_file.readline()
        if not header_row:
            # The first file is completely empty: use the second file's header.
            header_row = second_header

        # Locate the timestamp column by name instead of assuming its position.
        column_names: list[str] = header_row.rstrip("\n").split(",")
        timestamp_column: int = (
            column_names.index("timestamps") if "timestamps" in column_names else 4
        )

        def _timestamp_of(line: str) -> np.datetime64:
            """Parse the timestamp field of one CSV row."""
            fields = line.rstrip("\n").split(",", timestamp_column + 1)
            return np.datetime64(fields[timestamp_column])

        # Draw an output name that is not taken yet. Without a collision this
        # consumes exactly one value from the global random generator, so the
        # generated names for a given seed stay the same as before.
        while True:
            file_uid: int = random.randint(0, 9999999)
            output_path: str = "{}merged_{}.txt".format(OUTPUT_PATH, file_uid)
            if not os.path.exists(output_path):
                break

        with open(file=output_path, mode="w") as output_file:
            # Write the head row first.
            if header_row:
                output_file.write(header_row.rstrip("\n") + "\n")

            # The "merge" step of merge sort, performed on the two streams.
            row1 = next(first_file, None)
            row2 = next(second_file, None)
            time1 = _timestamp_of(row1) if row1 is not None else None
            time2 = _timestamp_of(row2) if row2 is not None else None

            while row1 is not None and row2 is not None:
                if time1 <= time2:
                    output_file.write(row1.rstrip("\n") + "\n")
                    row1 = next(first_file, None)
                    if row1 is not None:
                        time1 = _timestamp_of(row1)
                else:
                    output_file.write(row2.rstrip("\n") + "\n")
                    row2 = next(second_file, None)
                    if row2 is not None:
                        time2 = _timestamp_of(row2)

            # At most one input still has rows; copy them through unchanged.
            for pending_row, source in ((row1, first_file), (row2, second_file)):
                if pending_row is None:
                    continue
                output_file.write(pending_row.rstrip("\n") + "\n")
                for line in source:
                    output_file.write(line.rstrip("\n") + "\n")

    # Remove the two files after merged. A path that is already gone (for
    # example when f1 and f2 are the same file) is not an error.
    for merged_input in (f1, f2):
        try:
            os.remove(merged_input)
        except FileNotFoundError:
            pass

In [7]:
# Merge the files in OUTPUT_PATH pairwise, pass after pass, until a single
# file remains.
while True:
    split_files: list[str] = os.listdir(path=OUTPUT_PATH)
    no_of_files: int = len(split_files)

    # Stop when nothing is left to merge. Zero files must stop the loop too;
    # otherwise an empty directory would make it spin forever.
    if no_of_files <= 1:
        break

    # Pair up neighbours (0, 1), (2, 3), ...; with an odd count the last file
    # is left untouched and takes part in the next pass.
    for file_index in range(0, no_of_files - 1, 2):
        merge_external_files(
            f1="{}{}".format(
                OUTPUT_PATH,
                split_files[file_index],
            ),
            f2="{}{}".format(
                OUTPUT_PATH,
                split_files[file_index + 1],
            ),
        )
        gc.collect()

In [9]:
import pandas as pd

# Verify that the merged output is sorted by timestamp from start to end.
# The merged file name contains a random uid, so look it up instead of
# hard-coding it; the merge loop leaves exactly one file in OUTPUT_PATH.
merged_candidates: list[str] = os.listdir(path=OUTPUT_PATH)
assert len(merged_candidates) == 1, "expected exactly one merged file in OUTPUT_PATH"
merged_path: str = "{}{}".format(OUTPUT_PATH, merged_candidates[0])

chunked_df: pandas.io.parsers.readers.TextFileReader = pd.read_csv(
    filepath_or_buffer=merged_path,
    chunksize=600,
    sep=",",
)

merged_is_sorted: bool = True
previous_last_ts = None  # Last timestamp of the previous chunk, if any.
df: pd.DataFrame
for df in chunked_df:
    ts: np.ndarray = df["timestamps"].to_numpy()
    if ts.size == 0:
        continue
    # A chunk is in order when it does not start before the previous chunk
    # ended and its own timestamps never decrease.
    starts_in_order: bool = previous_last_ts is None or previous_last_ts <= ts[0]
    if not (starts_in_order and np.all(ts[:-1] <= ts[1:])):
        merged_is_sorted = False
        break
    previous_last_ts = ts[-1]

# Last expression of the cell, so the result is displayed.
merged_is_sorted

Unnamed: 0,serial_number,nickname,animalID,sample_rate,timestamps,acc_axis1,acc_axis2,acc_axis3,acc_mag,lpf_axis1,...,dis_axis1_denoised,dis_axis2_denoised,dis_axis3_denoised,dis_mag_denoised,tilt_axis1,tilt_axis2,tilt_axis3,tilt_axis1_denoised,tilt_axis2_denoised,tilt_axis3_denoised
0,MOS2E03230475,,,30,2023-04-14 18:01:01.000,0.645,0.309,0.715,1.011302,0.636648,...,1874.156185,1694.906462,192.005856,2534.173547,39.120814,18.021817,44.646077,39.105397,18.027755,44.634478
1,MOS2E03230475,,,30,2023-04-14 18:01:01.034,0.633,0.309,0.719,1.006544,0.636649,...,1875.197894,1695.848197,192.113417,2535.581948,39.120815,18.021889,44.64612,39.105401,18.027766,44.634488
2,MOS2E03230475,,,30,2023-04-14 18:01:01.067,0.629,0.313,0.715,1.002415,0.63665,...,1876.239602,1696.789933,192.220978,2536.990348,39.120819,18.021961,44.646166,39.105405,18.027776,44.634498
3,MOS2E03230475,,,30,2023-04-14 18:01:01.100,0.633,0.309,0.715,1.003691,0.636651,...,1877.281309,1697.731668,192.328538,2538.398747,39.12082,18.022032,44.64621,39.105409,18.027787,44.634509
4,MOS2E03230475,,,30,2023-04-14 18:01:01.134,0.633,0.309,0.719,1.006544,0.636651,...,1878.323016,1698.673403,192.436099,2539.807146,39.120823,18.022104,44.646254,39.105413,18.027798,44.634519


In [10]:
def is_sorted(a):
    """Return whether every element of ``a`` is <= the element after it."""
    return np.all(a[:-1] <= a[1:])


# NOTE(review): `test_df` is not defined in any cell visible in this notebook
# (the execution counter jumps from 7 to 9) - confirm where it comes from.
ts: np.ndarray = test_df["timestamps"].to_numpy()
is_sorted(ts)

True