## Sorting algorithms
The following code block are the sorting algorithms and helper methods for sorting data

In [2]:
import time
import random
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
import csv
from array import array


In [3]:
def merge_sort_helper(arr):
    if len(arr) <= 1:
        return arr
    mid = len(arr) // 2
    left = merge_sort_helper(arr[:mid])
    right = merge_sort_helper(arr[mid:])
    return merge(left, right)

def merge(left, right):
    result = []
    i = j = 0
    while i < len(left) and j < len(right):
        if left[i] < right[j]:
            result.append(left[i])
            i += 1
        else:
            result.append(right[j])
            j += 1
    result.extend(left[i:])
    result.extend(right[j:])
    return result

def quick_sort_helper(arr):
    if len(arr) <= 1:
        return arr
    #pivot = arr[len(arr) // 2]
    pivot = random.choice(arr)
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quick_sort_helper(left) + middle + quick_sort_helper(right)

def heapify(arr, n, i):
    largest = i
    left = 2 * i + 1
    right = 2 * i + 2

    if left < n and arr[largest] < arr[left]:
        largest = left

    if right < n and arr[largest] < arr[right]:
        largest = right

    if largest != i:
        arr[i], arr[largest] = arr[largest], arr[i]
        heapify(arr, n, largest)

def counting_sort(arr, exp):
    n = len(arr)
    output = [0] * n
    count = [0] * 10

    for i in range(n):
        index = (arr[i] // exp) % 10
        count[index] += 1

    for i in range(1, 10):
        count[i] += count[i - 1]

    i = n - 1
    while i >= 0:
        index = (arr[i] // exp) % 10
        output[count[index] - 1] = arr[i]
        count[index] -= 1
        i -= 1

    for i in range(n):
        arr[i] = output[i:][0]

def bubble_sort(data):
    if isinstance(data, tuple) or isinstance(data, set):
        data = list(data)

    n = len(data)
    for i in range(n):
        for j in range(0, n - i - 1):
            if data[j] > data[j + 1]:
                data[j], data[j + 1] = data[j + 1], data[j]
    return data

def insertion_sort(data):
    if isinstance(data, tuple) or isinstance(data, set):
        data = list(data)
    for i in range(1, len(data)):
        key = data[i]
        j = i - 1
        while j >= 0 and key < data[j]:
            data[j + 1] = data[j]
            j -= 1
        data[j + 1] = key
    return data

def merge_sort(data):
    if isinstance(data, (tuple, set, pd.Series)):
        data = list(data)
    return merge_sort_helper(data)
"""
def quick_sort(data):
    if isinstance(data, tuple) or isinstance(data, set):
        data = list(data)
    return quick_sort_helper(data)
"""
def quick_sort(data):
    if isinstance(data, tuple) or isinstance(data, set):
        data = list(data)

    stack = [(0, len(data) - 1)]

    while stack:
        start, end = stack.pop()
        if start >= end:
            continue

        pivot = data[end]
        i = start
        for j in range(start, end):
            if data[j] < pivot:
                data[i], data[j] = data[j], data[i]
                i += 1
        data[i], data[end] = data[end], data[i]

        stack.append((start, i - 1))
        stack.append((i + 1, end))
    return data

def heap_sort(data):
    if isinstance(data, tuple) or isinstance(data, set):
        data = list(data)
    n = len(data)
    for i in range(n // 2 - 1, -1, -1):
        heapify(data, n, i)

    for i in range(n - 1, 0, -1):
        data[0], data[i] = data[i], data[0]
        heapify(data, i, 0)
    return data

def radix_sort(data):
    if isinstance(data, tuple) or isinstance(data, set):
        data = list(data)
    if len(data) == 0:
        return data

    max_num = max(data)
    exp = 1
    while max_num // exp > 0:
        counting_sort(data, exp)
        exp *= 10
    return data

def bucket_sort(data):
    #Must convert np array and pd Series to list because they
    #dont have clear and extend methods
    #Thats ok, just another quirk of the data the model will have to predict
    if isinstance(data, (tuple, set, np.ndarray, pd.Series)):
        data = list(data)
    if len(data) == 0:
        return data

    min_value = min(data)
    max_value = max(data)
    bucket_count = len(data)
    buckets = [[] for _ in range(bucket_count)]

    for num in data:
        index = int((num - min_value) * (bucket_count - 1) / (max_value - min_value))
        buckets[index].append(num)

    data.clear()
    for bucket in buckets:
        insertion_sort(bucket)
        data.extend(bucket)
    return data

## Methods to load our data from csv files

In [4]:
def load_data():
    with open('./data/data.csv', 'r') as data_file:
        reader = csv.reader(data_file)
        matrix = [row for row in reader]
    return matrix

def load_features():
    with open('./data/features.csv', 'r') as data_file:
        reader = csv.reader(data_file)
        matrix = [row for row in reader]
    return matrix

def convert_to_data_type():
    features = load_features()
    data = load_data()
    converted_data = data.copy()
    for j in range(0, len(features)):
        if features[j][4] == '1':
            pass
            #TODO: Fix
            #converted_data[j] = array(data[j])
        elif features[j][5] == '1':
            converted_data[j] = list(data[j])
        elif features[j][6] == '1':
            converted_data[j] = tuple(data[j])
        elif features[j][7] == '1':
            converted_data[j] = set(data[j])
        elif features[j][8] == '1':
            converted_data[j] = np.array(data[j])
        elif features[j][9] == '1':
            converted_data[j] = pd.Series(data[j])
    
        if isinstance(converted_data[j], np.ndarray):
            converted_data[j] = converted_data[j].astype(int)
        elif isinstance(converted_data[j], list):
            converted_data[j] = [int(x) for x in converted_data[j]]
        elif isinstance(converted_data[j], tuple):
            converted_data[j] = tuple(int(x) for x in converted_data[j])
        elif isinstance(converted_data[j], set):
            converted_data[j] = set(int(x) for x in converted_data[j])
        elif isinstance(converted_data[j], pd.Series):
            converted_data[j] = converted_data[j].astype(int)
        else:
            raise TypeError(f"Unsupported data type: {type(converted_data[j])}")


    return converted_data


## Methods to record and display results of sorting algorithms

Running on 10 collections without multiprocessing takes: 1 min 45 seconds (105 seconds)

Running on 100 collections without multiprocessing takes: 


In [None]:
from tqdm.notebook import tqdm  # <--- Jupyter-compatible tqdm
import time
import pandas as pd

def time_sort():
    timing_data = []
    data = convert_to_data_type()

    outer_bar = tqdm(range(100), desc="Datasets", unit="dataset", position=0)

    for i in outer_bar:
        dataset_results = {}
        dataset_results["Dataset"] = f"Dataset {i+1}"
        current_data = data[i]

        sorts = [
            ("Bubble", bubble_sort),
            ("Insertion", insertion_sort),
            ("Merge", merge_sort),
            ("Quick", quick_sort),
            ("Heap", heap_sort),
            ("Radix", radix_sort),
            ("Bucket", bucket_sort),
        ]

        inner_bar = tqdm(sorts, desc=f"Sorting Dataset {i+1}", unit="sort", position=1, leave=False)

        for sort_name, sort_fn in inner_bar:
            start = time.perf_counter()
            sort_fn(current_data)
            end = time.perf_counter()
            dataset_results[sort_name] = end - start

        timing_data.append(dataset_results)

    return timing_data

df = pd.DataFrame(time_sort())
print(df.to_string(index=False, float_format="{:.6f}".format))


Datasets:   0%|          | 0/10 [00:00<?, ?dataset/s]

Sorting Dataset 1:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 2:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 3:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 4:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 5:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 6:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 7:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 8:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 9:   0%|          | 0/7 [00:00<?, ?sort/s]

Sorting Dataset 10:   0%|          | 0/7 [00:00<?, ?sort/s]

   Dataset    Bubble  Insertion    Merge     Quick     Heap    Radix   Bucket
 Dataset 1  6.376637   0.001623 0.020738  3.197355 0.046633 0.843092 0.052007
 Dataset 2  0.654868   0.245273 0.006054  0.261019 0.009798 0.064619 0.002645
 Dataset 3  8.449229   0.002870 0.025780  5.692334 0.079053 0.257614 0.006440
 Dataset 4  5.177029   0.002780 0.018552  4.002104 0.063878 0.150931 0.005842
 Dataset 5  0.036435   0.014811 0.002501  0.007850 0.002682 0.006454 0.000934
 Dataset 6 22.442759   0.004290 0.044359 17.413733 0.144876 0.839602 0.015702
 Dataset 7  0.101074   0.041992 0.003156  0.017344 0.004963 0.013571 0.001463
 Dataset 8  0.462432   0.000484 0.004169  0.283591 0.014697 0.013490 0.001677
 Dataset 9 13.618844   0.003447 0.035283  9.481417 0.117404 0.473700 0.008580
Dataset 10  0.180640   0.072203 0.005438  0.002908 0.006116 0.023295 0.002297


In [8]:
def time_sort():
    timing_data = []
    data = convert_to_data_type()
    print(data[1])
    for i in range(10):
        dataset_results = {}
        name = f"Dataset {i+1}"
        dataset_results["Dataset"] = name

        dataset_results["Bubble"] = time.perf_counter()
        bubble_sort(data[i])
        dataset_results["Bubble"] = time.perf_counter() - dataset_results["Bubble"]
        print(f"{name} bubble sort finished")

        dataset_results["Insertion"] = time.perf_counter()
        insertion_sort(data[i])
        dataset_results["Insertion"] = time.perf_counter() - dataset_results["Insertion"]
        print(f"{name} insertion sort finished")

        dataset_results["Merge"] = time.perf_counter()
        merge_sort(data[i])
        dataset_results["Merge"] = time.perf_counter() - dataset_results["Merge"]
        print(f"{name} merge sort finished")

        dataset_results["Quick"] = time.perf_counter()
        quick_sort(data[i])
        dataset_results["Quick"] = time.perf_counter() - dataset_results["Quick"]
        print(f"{name} quick sort finished")

        dataset_results["Heap"] = time.perf_counter()
        heap_sort(data[i])
        dataset_results["Heap"] = time.perf_counter() - dataset_results["Heap"]
        print(f"{name} heap sort finished")

        dataset_results["Radix"] = time.perf_counter()
        radix_sort(data[i])
        dataset_results["Radix"] = time.perf_counter() - dataset_results["Radix"]
        print(f"{name} radix sort finished")

        dataset_results["Bucket"] = time.perf_counter()
        bucket_sort(data[i])
        dataset_results["Bucket"] = time.perf_counter() - dataset_results["Bucket"]
        print(f"{name} bucket sort finished")

        timing_data.append(dataset_results)
    return timing_data

df = pd.DataFrame(time_sort())
print(df.to_string(index=False, float_format="{:.6f}".format))

{0, 3, 4, 6, 8, 10, 14, 16, 18, 20, 22, 24, 26, 33, 35, 37, 43, 45, 47, 51, 53, 55, 56, 58, 60, 62, 66, 68, 70, 72, 74, 76, 78, 82, 83, 87, 89, 91, 93, 95, 97, 101, 105, 107, 108, 110, 112, 114, 116, 118, 124, 128, 130, 132, 134, 135, 137, 147, 149, 153, 155, 157, 159, 164, 166, 168, 170, 172, 174, 176, 180, 182, 184, 186, 191, 199, 203, 205, 211, 214, 220, 222, 228, 230, 232, 234, 236, 238, 240, 241, 251, 253, 259, 261, 263, 265, 266, 268, 270, 274, 276, 278, 282, 284, 288, 292, 293, 295, 297, 299, 301, 303, 307, 311, 313, 319, 320, 322, 324, 328, 330, 332, 334, 338, 342, 344, 345, 349, 353, 355, 363, 365, 369, 371, 372, 378, 380, 382, 384, 386, 388, 390, 392, 394, 398, 401, 403, 405, 407, 409, 411, 413, 415, 419, 423, 424, 428, 434, 436, 438, 440, 442, 448, 450, 451, 453, 455, 457, 459, 461, 463, 467, 469, 471, 475, 478, 480, 482, 484, 486, 488, 492, 494, 496, 498, 502, 505, 507, 509, 513, 515, 521, 523, 525, 527, 530, 534, 538, 540, 544, 546, 550, 552, 554, 556, 557, 561, 563, 565, 

KeyboardInterrupt: 