## SORTS (comparisons)

### Insertion sort

In [1]:
def insertion_sort(A):
    comparisons = 0
    for i in range(1, len(A)):
        key = A[i]
        j = i - 1
        comparisons += 1
        while j >= 0 and A[j] > key:
            comparisons += 1
            A[j + 1] = A[j]
            j -= 1
        A[j + 1] = key
    return comparisons

### Selection sort

In [2]:
def selection_sort(A):
    comparisons = 0
    n = len(A)
    for i in range(n - 1):
        min_index = i
        for j in range(i + 1, n):
            comparisons += 1
            if A[j] < A[min_index]:
                min_index = j
        A[i], A[min_index] = A[min_index], A[i]
    return comparisons

### Merge sort

In [3]:
def merge_sort(A):
    if len(A) <= 1:
        return 0
    mid = len(A) // 2
    left = A[:mid]
    right = A[mid:]

    # Recursive call on each half
    comparecount = merge_sort(left) + merge_sort(right)

    # Two iterators for traversing the two halves
    i = 0
    j = 0
    
    # Iterator for the main list
    k = 0
    
    while i < len(left) and j < len(right):
        comparecount += 1
        if left[i] <= right[j]:
            # The value from the left half has been used
            A[k] = left[i]
            # Move the iterator forward
            i += 1
        else:
            A[k] = right[j]
            j += 1
        # Move to the next slot
        k += 1

    # For all the remaining values
    while i < len(left):
        comparecount += 1
        A[k] = left[i]
        i += 1
        k += 1

    while j < len(right):
        comparecount += 1
        A[k]=right[j]
        j += 1
        k += 1

    return comparecount

### Heap sort

In [4]:
def heapify(A, n, i, comparisons):
    largest = i
    l = 2 * i + 1
    r = 2 * i + 2

    if l < n and A[l] > A[largest]:
        largest = l
        comparisons[0] += 1

    if r < n and A[r] > A[largest]:
        largest = r
        comparisons[0] += 1

    if largest != i:
        A[i], A[largest] = A[largest], A[i]
        heapify(A, n, largest, comparisons)

def heap_sort(A):
    comparisons = [0]
    n = len(A)

    for i in range(n // 2 - 1, -1, -1):
        heapify(A, n, i, comparisons)

    for i in range(n - 1, 0, -1):
        A[i], A[0] = A[0], A[i]
        heapify(A, i, 0, comparisons)

    return comparisons[0]

### Quick sort

In [5]:
def quicksort(A):
    comparisons = [0]
    quickSorter(A, 0, len(A) - 1, comparisons)
    return comparisons[0]

def partition(A, low, high, comparisons):
    pivot = A[high]
    i = low - 1
    for j in range(low, high):
        comparisons[0] += 1
        if A[j] <= pivot:
            i += 1
            A[i], A[j] = A[j], A[i]
    A[i + 1], A[high] = A[high], A[i + 1]
    return i + 1

def quickSorter(A, low, high, comparisons):
    if low < high:
        pi = partition(A, low, high, comparisons)
        quickSorter(A, low, pi - 1, comparisons)
        quickSorter(A, pi + 1, high, comparisons)

### Timsort

In [6]:
MINIMUM = 32

def find_minrun(n): 
    r = 0
    while n >= MINIMUM: 
        r |= n & 1
        n >>= 1
    return n + r 

def tim_insertion_sort(array, left, right): 
    global comparisons
    for i in range(left + 1, right + 1):
        key = array[i]
        j = i - 1
        comparisons += 1
        while j >= left and key < array[j]:
            array[j + 1] = array[j]
            j -= 1
            comparisons += 1
        array[j + 1] = key
    return array
              
def tim_merge(array, l, m, r): 
    global comparisons
    array_length1 = m - l + 1
    array_length2 = r - m 
    left = []
    right = []
    for i in range(array_length1): 
        left.append(array[l + i]) 
    for i in range(array_length2): 
        right.append(array[m + 1 + i]) 
  
    i = 0
    j = 0
    k = l
   
    while j < array_length2 and i < array_length1: 
        if left[i] <= right[j]: 
            array[k] = left[i] 
            i += 1
        else: 
            array[k] = right[j] 
            j += 1
        k += 1
        comparisons += 1
  
    while i < array_length1: 
        array[k] = left[i] 
        k += 1
        i += 1
        comparisons += 1
  
    while j < array_length2: 
        array[k] = right[j] 
        k += 1
        j += 1
        comparisons += 1
  
def timsort(array): 
    n = len(array) 
    minrun = find_minrun(n) 
  
    for start in range(0, n, minrun): 
        end = min(start + minrun - 1, n - 1) 
        tim_insertion_sort(array, start, end) 
   
    size = minrun 
    while size < n: 
        for left in range(0, n, 2 * size): 
            mid = min(n - 1, left + size - 1) 
            right = min((left + 2 * size - 1), (n - 1)) 
            tim_merge(array, left, mid, right) 
        size = 2 * size

    return comparisons

### Introsort

In [7]:
def introsort(arr):

    def partition(arr, low, high):
        pivot = arr[high]
        i = low - 1
        for j in range(low, high):
            comparisons[0] += 1
            if arr[j] <= pivot:
                i += 1
                arr[i], arr[j] = arr[j], arr[i]
        arr[i + 1], arr[high] = arr[high], arr[i + 1]
        return i + 1

    def insertion_sort(arr, low, high):
        for i in range(low + 1, high + 1):
            key = arr[i]
            j = i - 1
            comparisons[0] += 1
            while j >= low and arr[j] > key:
                comparisons[0] += 1
                arr[j + 1] = arr[j]
                j -= 1
            arr[j + 1] = key

    def heap_sort(arr):
        def heapify(arr, n, i):
            largest = i
            l = 2 * i + 1
            r = 2 * i + 2

            if l < n and arr[i] < arr[l]:
                largest = l

            if r < n and arr[largest] < arr[r]:
                largest = r

            if largest != i:
                arr[i], arr[largest] = arr[largest], arr[i]
                heapify(arr, n, largest)

        n = len(arr)

        for i in range(n // 2 - 1, -1, -1):
            heapify(arr, n, i)

        for i in range(n - 1, 0, -1):
            arr[i], arr[0] = arr[0], arr[i]
            heapify(arr, i, 0)

    def introsort_util(arr, low, high, depth_limit):
        size = high - low + 1

        if size < 16:
            insertion_sort(arr, low, high)
            return

        if depth_limit == 0:
            heap_sort(arr)
            return

        pivot = partition(arr, low, high)

        introsort_util(arr, low, pivot - 1, depth_limit - 1)
        introsort_util(arr, pivot + 1, high, depth_limit - 1)

    comparisons = [0]
    introsort_util(arr, 0, len(arr) - 1, 2 * math.log(len(arr)))
    return comparisons[0]

## PRESORTEDNESS

### Number of Runs
The number of runs, is the number of increasing sequences in an array minus one.

In [8]:
def runs(arr):
    count = 0

    for key in range(1,len(arr)):
        if arr[key] < arr[key-1]:
            count += 1

    return count

arr = [3, 4, 4, 7]
print(runs(arr))

0


In [9]:
def runs_comp(arr):
    count = 0
    comparisons = 0
    for key in range(1,len(arr)):
        comparisons += 1
        if arr[key] < arr[key-1]:
            count += 1

    return comparisons

### Number of Deletions
minimum number of elements that need to be removed from array to obtain a sorted sequence.

In [10]:
def deletions(arr):
    def ceil_index(sub, val):
        l, r = 0, len(sub)-1
        while l <= r:
            mid = (l + r) // 2
            if sub[mid] >= val:
                r = mid - 1
            else:
                l = mid + 1
        return l
 
    sub = [arr[0]]
    for i in range(1, len(arr)):
        if arr[i] >= sub[-1]:
            sub.append(arr[i])
        else:
            sub[ceil_index(sub, arr[i])] = arr[i]
 
    return len(arr) - len(sub)

arr = [3, 4, 5, 7]
print(deletions(arr))

0


In [11]:
def deletions_comp(arr):
    global comparisons
    comparisons = 0
    def ceil_index(sub, val):
        global comparisons
        l, r = 0, len(sub)-1
        while l <= r:
            mid = (l + r) // 2
            comparisons += 1
            if sub[mid] >= val:
                r = mid - 1
            else:
                l = mid + 1
        return l
 
    sub = [arr[0]]
    for i in range(1, len(arr)):
        comparisons += 1
        if arr[i] >= sub[-1]:
            sub.append(arr[i])
        else:
            sub[ceil_index(sub, arr[i])] = arr[i]
 
    return comparisons

### Number of Inversions
The number of inversion in an array, is the number of pairs j < key such that arr[j] > key.

In [12]:
def inversions(arr):
    count = 0

    for key in range(len(arr)):
        for j in range(key):
            if arr[key] < arr[j]:
                count += 1

    return count

arr = [3, 4, 4, 7]
print(inversions(arr))

0


In [13]:
def inversions_comp(arr):
    count = 0
    comparisons = 0
    for key in range(len(arr)):
        for j in range(key):
            comparisons += 1
            if arr[key] < arr[j]:
                count += 1

    return comparisons

### Max Distance by inversion
Computes the longest distance between two elements that have to be inverted.

In [14]:
def max_dist_inversion(arr):
    c_max_dist = 0

    for key in range(len(arr)):
        for j in range(key):
            if arr[key] < arr[j]:
                c_max_dist = max(key-j,c_max_dist)

    return c_max_dist

arr = [3, 4, 4, 7]
print(max_dist_inversion(arr))

0


In [15]:
def max_dist_inversion_comp(arr):
    c_max_dist = 0
    count = 0

    for key in range(len(arr)):
        for j in range(key):
            count += 1
            if arr[key] < arr[j]:
                c_max_dist = max(key-j,c_max_dist)

    return count

### Inv(arr) and Dis(arr) combination
Note that the amount of comparisons needed to perform inversion(arr) and max_dist_inversion(arr) are exactly the same. We can combine the two algorithms in to one without having to do any more comparison than just doing one of the two.

In [16]:
def inv_dis(arr):
    c_max_dist = 0
    inv = 0
    
    for key in range(len(arr)):
        for j in range(key):
            if arr[key] < arr[j]:
                c_max_dist = max(key-j,c_max_dist)
                inv += 1

    return inv, c_max_dist

arr = [3, 4, 4, 7]
print(inv_dis(arr))

(0, 0)


In [17]:
def inv_dis_comp(arr):
    c_max_dist = 0
    inv = 0
    comparisons = 0
    
    for key in range(len(arr)):
        for j in range(key):
            comparisons += 1
            if arr[key] < arr[j]:
                c_max_dist = max(key-j,c_max_dist)
                inv += 1

    return comparisons

## Training Set creation (sorting)

In [18]:
import pickle
import os
import pandas as pd
import numpy as np
import math
import random
import matplotlib.pyplot as plt
random.seed(42)
np.random.seed(42)


results = []

with open('dataset_dfs.pkl', 'rb') as f:
    dataset_dfs = pickle.load(f)

for key, df in dataset_dfs.items():
    for column in df.columns:
        arr = df[column].values
        if len(arr) < 200:
            continue

        # SAMPLING STRATEGIES: RANDOM, BEGINNING MIDDLE END
        """indices = np.random.choice(len(arr), size=sample_size, replace=False)
        indices.sort()
        arr_random = arr[indices]
        
        sample_size3 = math.floor(sample_size/3)
        middle = len(arr)//2
        arr1 = arr[:sample_size3]
        arr2 = arr[middle - sample_size3//2:middle+sample_size3//2]
        arr3 = arr[-sample_size3-1:-1]
        arr_mixed = np.concatenate((arr1,arr2,arr3))"""
        
        # DISTRIBUTED SAMPLING STRATEGY
        sample_size = 10
        step = len(arr) // sample_size
        arr_dist = [arr[i] for i in range(0, len(arr), step)]
        
        step = 10

        
        def sampling(sample_size):
            return [arr[i] for i in range(0, sample_size * 10, 10)]
        
        # these sampling strategies for presortedness calculation dont perform as well
        """runs_val_random = runs(arr_random)
        inversions_val_random = inversions(arr_random)
        deletions_val_random = deletions(arr_random)
        
        runs_val_mixed = runs(arr_mixed)
        inversions_val_mixed = inversions(arr_mixed)
        deletions_val_mixed = deletions(arr_mixed)"""

        # distributed sampling presortedness calculation (performs best)
        runs_val_dist = runs(arr_dist)
        inversions_val_dist, dis_val_dist = inv_dis(arr_dist)
        deletions_val_dist = deletions(arr_dist)

        #inversions_comp_dist = inversions_comp(arr_dist)
        inv_dis_comp_dist = inv_dis_comp(arr_dist)
        deletions_comp_dist = deletions_comp(arr_dist)
        runs_comp_dist = runs_comp(arr_dist)

        # full array presortedness
        runs_val = runs(arr)
        inversions_val, dis_val = inv_dis(arr)
        deletions_val = deletions(arr)

        # sorting algorithm comparison calculation
        comp_merge = merge_sort(arr.copy())
        comp_selection = selection_sort(arr.copy())
        comp_quick = quicksort(arr.copy())
        comp_intro = introsort(arr.copy())
        comp_insertion = insertion_sort(arr.copy())
        global comparisons
        comparisons = 0
        comp_tim = timsort(arr.copy())
        comparisons = 0
        comp_heap = heap_sort(arr.copy())
        
        comparison_counts = {
            'introsort': comp_intro,
            'insertion_sort': comp_insertion,
            'merge_sort': comp_merge,
            'timsort': comp_tim,
            'quick_sort': comp_quick,
            'selection_sort': comp_selection,
            #'heap_sort': comp_heap, this is bugged (sometimes outputs 0)
        }

        min_algorithm = min(comparison_counts, key=comparison_counts.get)
        min_comparisons = comparison_counts[min_algorithm]

        results.append({
            'Dataset': key,
            'Column': column,
            'Algorithm': min_algorithm,
            'Comparisons': min_comparisons,
            'Runs': runs_val,
            'Inversions': inversions_val,
            'Deletions': deletions_val,
            'Dis': dis_val,
            
            'deletions_val_dist1': deletions(sampling(1)),
            'runs_val_dist1': runs(sampling(1)),
            'deletions_comp_dist1': deletions_comp(sampling(1)),
            'runs_comp_dist1': runs_comp(sampling(1)),

            'deletions_val_dist2': deletions(sampling(2)),
            'runs_val_dist2': runs(sampling(2)),
            'deletions_comp_dist2': deletions_comp(sampling(2)),
            'runs_comp_dist2': runs_comp(sampling(2)),

            'deletions_val_dist3': deletions(sampling(3)),
            'runs_val_dist3': runs(sampling(3)),
            'deletions_comp_dist3': deletions_comp(sampling(3)),
            'runs_comp_dist3': runs_comp(sampling(3)),

            'deletions_val_dist4': deletions(sampling(4)),
            'runs_val_dist4': runs(sampling(4)),
            'deletions_comp_dist4': deletions_comp(sampling(4)),
            'runs_comp_dist4': runs_comp(sampling(4)),

            'deletions_val_dist5': deletions(sampling(5)),
            'runs_val_dist5': runs(sampling(5)),
            'deletions_comp_dist5': deletions_comp(sampling(5)),
            'runs_comp_dist5': runs_comp(sampling(5)),

            'deletions_val_dist6': deletions(sampling(6)),
            'runs_val_dist6': runs(sampling(6)),
            'deletions_comp_dist6': deletions_comp(sampling(6)),
            'runs_comp_dist6': runs_comp(sampling(6)),

            'deletions_val_dist7': deletions(sampling(7)),
            'runs_val_dist7': runs(sampling(7)),
            'deletions_comp_dist7': deletions_comp(sampling(7)),
            'runs_comp_dist7': runs_comp(sampling(7)),

            'deletions_val_dist8': deletions(sampling(8)),
            'runs_val_dist8': runs(sampling(8)),
            'deletions_comp_dist8': deletions_comp(sampling(8)),
            'runs_comp_dist8': runs_comp(sampling(8)),

            'deletions_val_dist9': deletions(sampling(9)),
            'runs_val_dist9': runs(sampling(9)),
            'deletions_comp_dist9': deletions_comp(sampling(9)),
            'runs_comp_dist9': runs_comp(sampling(9)),

            'deletions_val_dist10': deletions(sampling(10)),
            'runs_val_dist10': runs(sampling(10)),
            'deletions_comp_dist10': deletions_comp(sampling(10)),
            'runs_comp_dist10': runs_comp(sampling(10)),

            'deletions_val_dist11': deletions(sampling(11)),
            'runs_val_dist11': runs(sampling(11)),
            'deletions_comp_dist11': deletions_comp(sampling(11)),
            'runs_comp_dist11': runs_comp(sampling(11)),

            'deletions_val_dist12': deletions(sampling(12)),
            'runs_val_dist12': runs(sampling(12)),
            'deletions_comp_dist12': deletions_comp(sampling(12)),
            'runs_comp_dist12': runs_comp(sampling(12)),

            'deletions_val_dist13': deletions(sampling(13)),
            'runs_val_dist13': runs(sampling(13)),
            'deletions_comp_dist13': deletions_comp(sampling(13)),
            'runs_comp_dist13': runs_comp(sampling(13)),

            'deletions_val_dist14': deletions(sampling(14)),
            'runs_val_dist14': runs(sampling(14)),
            'deletions_comp_dist14': deletions_comp(sampling(14)),
            'runs_comp_dist14': runs_comp(sampling(14)),

            'deletions_val_dist15': deletions(sampling(15)),
            'runs_val_dist15': runs(sampling(15)),
            'deletions_comp_dist15': deletions_comp(sampling(15)),
            'runs_comp_dist15': runs_comp(sampling(15)),

            'deletions_val_dist16': deletions(sampling(16)),
            'runs_val_dist16': runs(sampling(16)),
            'deletions_comp_dist16': deletions_comp(sampling(16)),
            'runs_comp_dist16': runs_comp(sampling(16)),

            'deletions_val_dist17': deletions(sampling(17)),
            'runs_val_dist17': runs(sampling(17)),
            'deletions_comp_dist17': deletions_comp(sampling(17)),
            'runs_comp_dist17': runs_comp(sampling(17)),

            'deletions_val_dist18': deletions(sampling(18)),
            'runs_val_dist18': runs(sampling(18)),
            'deletions_comp_dist18': deletions_comp(sampling(18)),
            'runs_comp_dist18': runs_comp(sampling(18)),

            'deletions_val_dist19': deletions(sampling(19)),
            'runs_val_dist19': runs(sampling(19)),
            'deletions_comp_dist19': deletions_comp(sampling(19)),
            'runs_comp_dist19': runs_comp(sampling(19)),

            'deletions_val_dist20': deletions(sampling(20)),
            'runs_val_dist20': runs(sampling(20)),
            'deletions_comp_dist20': deletions_comp(sampling(20)),
            'runs_comp_dist20': runs_comp(sampling(20)),

            'introsort': comp_intro,
            'insertion_sort': comp_insertion,
            'merge_sort': comp_merge,
            'timsort': comp_tim,
            'heap_sort': comp_heap,
            'quick_sort': comp_quick,
            'selection_sort': comp_selection,
        })
                

df_results = pd.DataFrame(results)
print(df_results)
df_results.to_csv('trainingData.csv')

                 Dataset           Column       Algorithm  Comparisons  Runs  \
0             cp_ratings       Unnamed: 0  insertion_sort          199     0   
1             cp_ratings       max_rating      quick_sort         1541    99   
2             cp_ratings         contest1      quick_sort         1454   108   
3             cp_ratings         contest2      merge_sort         1544    99   
4             cp_ratings         contest3      merge_sort         1544    98   
...                  ...              ...             ...          ...   ...   
2246               train            Parch         timsort         1181    38   
2247               train             Fare      merge_sort         1544    95   
2248  c4_epa_air_quality       Unnamed: 0  insertion_sort          199     0   
2249  c4_epa_air_quality  arithmetic_mean      merge_sort         1544    92   
2250  c4_epa_air_quality              aqi      merge_sort         1544    77   

      Inversions  Deletions  Dis  delet