# External sort algorithm on large data files

## Imports, constants and setup

In [1]:
import os

# Ask which environment the notebook runs in. The answer selects the input,
# script and output locations used by the cells below.
notebook_mode: int = int(
    input(
        """
    Select notebook mode:
    1. Google Colab  2. Local
    """
    )
)

INPUT_PATH: str
SCRIPT_PATH: str
OUTPUT_PATH: str

if notebook_mode == 1:
    # Run on Colab.
    INPUT_PATH = "/content/drive/MyDrive/Ellinbank/video_observation/data/"
    SCRIPT_PATH = "/content/drive/MyDrive/Ellinbank/video_observation/training_testing/data_labelling/"
    # Results are written next to the notebook; a Drive location that was used
    # previously is
    # "/content/drive/MyDrive/Ellinbank/video_observation/output/".
    OUTPUT_PATH = "./out/"
    # Copy the helper scripts into the working directory. This stays
    # best-effort, as before: a failed copy is not treated as an error here.
    for helper_script in (
        "custom_model.py",
        "inference.py",
        "utils.py",
        "operation.py",
    ):
        os.system(command="cp {}{} .".format(SCRIPT_PATH, helper_script))
elif notebook_mode == 2:
    # Run locally, relative to the notebook's directory.
    INPUT_PATH = "../../../../data/"
    SCRIPT_PATH = "./"
    OUTPUT_PATH = "./out/"
else:
    # Without this guard the paths stay undefined and a later cell fails with
    # a NameError that does not point back to the wrong answer given here.
    raise ValueError(
        "Unknown notebook mode {!r}: expected 1 (Google Colab) or 2 (Local).".format(
            notebook_mode
        )
    )

In [2]:
# Start from an empty output directory: the merge loop further down treats
# every file it finds in OUTPUT_PATH as a partition to merge.
import shutil

# shutil.rmtree replaces the shell "rm -rf" so the path is not re-parsed by a
# shell (spaces, metacharacters) and the cell also works without a POSIX shell.
# ignore_errors=True keeps the "missing directory is fine" behaviour.
shutil.rmtree(OUTPUT_PATH, ignore_errors=True)
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [3]:
import pandas as pd


# Collect the zipped data archives found directly inside INPUT_PATH.
files: list[str] = [
    entry for entry in os.listdir(path=INPUT_PATH) if entry.endswith(".zip")
]

# The sensor name is the part of each archive name before the first "_".
sensor_names: list[str] = [archive.split("_")[0] for archive in files]
print(sensor_names)

['MOS2E03230475']


In [4]:
import random


# Seed the global RNG first so that every later random draw is repeatable.
# Seed 785 makes the "other" class look bad but is otherwise good;
# seed 715 was the alternative that looked good.
random.seed(785)

# Windowing / batching parameters.
batch_size: int = 64
epoch: int = 1
window_per_epoch: int = 200
# Rows per window. Per the original note, 300 rows correspond to 10 seconds.
window_size: int = 600

## Split the large data file into partitions and sort each partition

In [5]:
import gc

import pandas.io.parsers.readers

# Partition step of the external sort: read every input archive in chunks of
# `window_size` rows, sort each chunk by its "timestamps" column in memory and
# write it to OUTPUT_PATH as "<sensor>_<n>.txt" (n starts at 1).
for f in files:
    sensor_name: str = f.split("_")[0]

    # Drop partitions of this sensor left over from an earlier run. Done with
    # os calls instead of a shell "rm -rf <glob>" so file names are never
    # interpreted by a shell.
    partition_prefix: str = "{}_".format(sensor_name)
    for existing_name in os.listdir(path=OUTPUT_PATH):
        if existing_name.startswith(partition_prefix) and existing_name.endswith(".txt"):
            existing_path: str = os.path.join(OUTPUT_PATH, existing_name)
            if os.path.isfile(existing_path):
                os.remove(existing_path)

    data_chunks: pandas.io.parsers.readers.TextFileReader
    # The context manager closes the underlying file handle once all chunks
    # have been consumed (or if an error interrupts the loop).
    with pd.read_csv(
        filepath_or_buffer=os.path.join(INPUT_PATH, f),
        chunksize=window_size,
    ) as data_chunks:
        raw_data: pd.DataFrame
        for df_count, raw_data in enumerate(data_chunks, start=1):
            # Sort values based on timestamps. A stable algorithm keeps rows
            # with equal timestamps in their original order, matching the
            # tie-breaking of the merge step.
            # NOTE(review): the column is sorted with whatever dtype read_csv
            # inferred (no parse_dates) while the merge step compares
            # np.datetime64 values -- presumably the timestamps are ISO
            # strings, for which both orders agree; confirm.
            raw_data = raw_data.sort_values(
                by=["timestamps"],
                ascending=True,
                kind="mergesort",
            )
            # index=False already leaves the index out of the file, so no
            # reset_index is needed before writing.
            raw_data.to_csv(
                path_or_buf="{}{}_{}.txt".format(
                    OUTPUT_PATH,
                    sensor_name,
                    df_count,
                ),
                header=True,
                index=False,
            )

    # One collection per input file is enough; each chunk is released as soon
    # as the loop variable is rebound.
    gc.collect()

In [6]:
from pathlib import Path

import numpy as np


def merge_external_files(
    f1: str,
    f2: str,
    output_dir: "str | None" = None,
    time_column: int = 4,
) -> None:
    """
    Merge two timestamp-sorted text files into one sorted file.

    This is the "merge" step of merge sort, done line by line so that neither
    input has to fit in memory. Both inputs must start with a header row and
    already be sorted by the timestamp column. The result is written to
    ``<output_dir>merged_<random id>.txt`` and both inputs are deleted
    afterwards.

    Parameters
    ----------
        f1: str
            Path to the first data file.

        f2: str
            Path to the second data file.

        output_dir: str | None
            Directory prefix (including the trailing separator) for the merged
            file. Defaults to the notebook-level ``OUTPUT_PATH``.

        time_column: int
            Zero-based index of the comma-separated field holding the
            timestamp. Defaults to 4, the column used previously.

    Returns
    -------
    None

    Notes
    -----
    Rows are split on plain commas, so a quoted field containing a comma
    before the timestamp column would shift the index.
    """

    if output_dir is None:
        output_dir = OUTPUT_PATH

    # Open the inputs first so that a bad input path does not leave an empty
    # merged file behind in the output directory.
    with open(f1) as file1, open(f2) as file2:
        # Claim an unused output name. Mode "x" fails if the file already
        # exists, so a random id that collides with a live merged file is
        # retried instead of overwriting (and thereby losing) that file.
        while True:
            file_uid: int = random.randint(0, 9999999)
            merged_path: str = "{}merged_{}.txt".format(output_dir, file_uid)
            try:
                output_file = open(file=merged_path, mode="x")
            except FileExistsError:
                continue
            break

        with output_file:
            # Need a header row to keep the format correct: keep the first
            # file's header and skip the second file's. readline() (unlike
            # next()) simply returns "" when the file is empty.
            header_row: str = file1.readline()
            file2.readline()
            output_file.write(header_row)

            line1: str = file1.readline()
            line2: str = file2.readline()
            # Parsed timestamps of the pending lines; None means "not parsed
            # yet", so each line is parsed exactly once.
            time1: np.datetime64 | None = None
            time2: np.datetime64 | None = None

            while line1 and line2:
                if time1 is None:
                    time1 = np.datetime64(line1.split(sep=",")[time_column].strip())
                if time2 is None:
                    time2 = np.datetime64(line2.split(sep=",")[time_column].strip())

                # "<=" takes the row from the first file on ties, which keeps
                # the merge stable.
                if time1 <= time2:
                    output_file.write(line1)
                    line1 = file1.readline()
                    time1 = None
                else:
                    output_file.write(line2)
                    line2 = file2.readline()
                    time2 = None

            # At most one input still has rows: flush its pending line and
            # copy the rest unchanged.
            if line1:
                output_file.write(line1)
                output_file.writelines(file1)
            if line2:
                output_file.write(line2)
                output_file.writelines(file2)

    # Remove the two files after merged. A file that is already gone is
    # tolerated, as it was with "rm -rf".
    for merged_input in (f1, f2):
        try:
            os.remove(merged_input)
        except FileNotFoundError:
            pass

In [7]:
# Merge the partition files pairwise, pass after pass, until a single fully
# sorted file remains in OUTPUT_PATH.
# NOTE(review): every entry of OUTPUT_PATH is treated as a partition, so
# partitions of different sensors would be merged into one file -- confirm
# that only one sensor is processed per run.
while True:
    split_files: list[str] = os.listdir(path=OUTPUT_PATH)
    no_of_files: int = len(split_files)

    # One file left means the sort is finished. Zero files means there is
    # nothing to merge; without this check the loop would never terminate.
    if no_of_files <= 1:
        break

    # Pairs (0, 1), (2, 3), ... With an odd count the last file has no
    # partner in this pass and is picked up by the next pass.
    for file_index in range(0, no_of_files - 1, 2):
        merge_external_files(
            f1="{}{}".format(
                OUTPUT_PATH,
                split_files[file_index],
            ),
            f2="{}{}".format(
                OUTPUT_PATH,
                split_files[file_index + 1],
            ),
        )
        gc.collect()

In [8]:
import pandas as pd

# Verify the external sort: read the merged file back in chunks and print,
# per chunk, whether its "timestamps" column is in non-decreasing order.
def is_sorted(a: np.ndarray) -> bool:
    """Return True when every element of *a* is <= its successor."""
    return bool(np.all(a[:-1] <= a[1:]))


# The merged file's name contains a random id, so look it up instead of
# hard-coding it. After the merge loop exactly one file should be left.
remaining_files: list[str] = os.listdir(path=OUTPUT_PATH)
if len(remaining_files) != 1:
    raise RuntimeError(
        "Expected exactly one merged file in {}, found {}.".format(
            OUTPUT_PATH,
            len(remaining_files),
        )
    )

chunked_df: pandas.io.parsers.readers.TextFileReader = pd.read_csv(
    filepath_or_buffer="{}{}".format(OUTPUT_PATH, remaining_files[0]),
    chunksize=600,
    sep=",",
)

# Last timestamp of the previous chunk. Checking it against the first
# timestamp of the next chunk catches ordering violations that fall exactly
# on a chunk boundary.
previous_last = None
df: pd.DataFrame
for df in chunked_df:
    ts: np.ndarray = df["timestamps"].to_numpy()
    chunk_is_sorted: bool = is_sorted(ts)
    if ts.size > 0:
        if previous_last is not None:
            chunk_is_sorted = chunk_is_sorted and bool(previous_last <= ts[0])
        previous_last = ts[-1]
    print(chunk_is_sorted)

True
True
True
True
True
