## SORTS

### Insertion sort comparison count

In [31]:
def insertion_comparisons(A):
    x=0
    for i in range(1, len(A)):
        key = A[i]
        j = i - 1
        x += 1
        while j >= 0 and A[j] > key:
            x += 1
            A[j + 1] = A[j]
            j -= 1
        A[j + 1] = key
    return x, A

## PRESORTEDNESS

### Number of Runs
The number of runs, is the number of increasing sequences in an array minus one.

In [32]:
def runs(arr):
    count = 0

    for key in range(1,len(arr)):
        if arr[key] < arr[key-1]:
            count += 1

    return count

arr = [3, 4, 4, 7]
print(runs(arr))

0


### Number of Deletions
minimum number of elements that need to be removed from array to obtain a sorted sequence.

In [33]:
def deletions(arr):
    def ceil_index(sub, val):
        l, r = 0, len(sub)-1
        while l <= r:
            mid = (l + r) // 2
            if sub[mid] >= val:
                r = mid - 1
            else:
                l = mid + 1
        return l
 
    sub = [arr[0]]
    for i in range(1, len(arr)):
        if arr[i] >= sub[-1]:
            sub.append(arr[i])
        else:
            sub[ceil_index(sub, arr[i])] = arr[i]
 
    return len(arr) - len(sub)

arr = [3, 4, 5, 7]
print(deletions(arr))

0


### Number of Inversions
The number of inversion in an array, is the number of pairs j < key such that arr[j] > key.

In [34]:
def inversions(arr):
    count = 0

    for key in range(len(arr)):
        for j in range(key):
            if arr[key] < arr[j]:
                count += 1

    return count

arr = [3, 4, 4, 7]
print(inversions(arr))

0


## Setup and Dataset import/cleaning

In [35]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
os.getcwd()

dataset_directory = './kaggledatasets'
dataset_dfs = {}

# import files in the directory
for file_name in os.listdir(dataset_directory):
        dataset_name = os.path.splitext(file_name)[0]
        dataset_dfs[dataset_name] = pd.read_csv(os.path.join(dataset_directory, file_name))

print("amount of dataframes imported:")
print(len(dataset_dfs))

for key, df in dataset_dfs.items():
    for column in df.columns:
        if not df[column].apply(lambda x: isinstance(x, (int, float))).all():
            df.drop(column, axis=1, inplace=True)
            
    dataset_dfs[key] = df.dropna().head(30)


amount of dataframes imported:
15


## Sorting and training set creation

In [36]:
results = []


print(df.iloc[5,1])

for key, df in dataset_dfs.items():
    for column in df.columns:
        arr = df[column].values
        runs_val = runs(arr)
        inversions_val = inversions(arr)
        deletions_val = deletions(arr)
        comp, _ = insertion_comparisons(arr)

        results.append({
            'Dataset': key,
            'Column': column,
            'Comparisons': comp,
            'Runs': runs_val,
            'Inversions': inversions_val,
            'Deletions': deletions_val
        })

df_results = pd.DataFrame(results)
print(df_results)

3481324860
                           Dataset                  Column  Comparisons  Runs  \
0               amazon_kitchenware                   stars          224    12   
1               amazon_kitchenware            reviewsCount          329    14   
2               amazon_kitchenware             price/value          247    12   
3                 Cartoon_datasets                  Rating          278    16   
4                    freedom_index              Unnamed: 0           29     0   
5                    freedom_index                    Year           29     0   
6                    freedom_index           Overall Score          458    24   
7                    freedom_index         Property Rights          307    18   
8                    freedom_index    Government Integrity          336    15   
9                    freedom_index  Judicial Effectiveness          323    15   
10                   freedom_index              Tax Burden          233    15   
11               

## Linear Regression model

The Presortedness metric "Inversions" equals to the amount of comparisons insertion sort requires minus 29 (n-1, because "Inversions" doesn't have to verify with a comparison that the position of an element is correct).

Therefore a linear regression model can perfectly predict how many comparison insertion sort will need.

But even if we remove the Inversion metric from the features we can accurately predict the amount of comparisons needed.

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

# features with inversions and without inversions
X_inv = df_results[['Runs',  'Inversions', 'Deletions']]
X = df_results[['Runs', 'Deletions']]

y = df_results['Comparisons']

# Split the dataset into training and testing sets
X_train_inv, X_test_inv, y_train_inv, y_test_inv = train_test_split(X_inv, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_inv = LinearRegression()
model_inv.fit(X_train_inv, y_train_inv)
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model with inversions and without
y_pred_inv = model_inv.predict(X_test_inv)
mse_inv = mean_squared_error(y_test_inv, y_pred_inv)
print("Mean Squared Error with inversions:", mse_inv)
mae_inv = mean_absolute_error(y_test_inv, y_pred_inv)
print("Mean Absolute Error with inversions:", mae_inv)
print("Coefficients:", model_inv.coef_)
print("Intercept:", model_inv.intercept_)
print("\n")

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Mean Squared Error with inversions: 2.6253290925755273e-27
Mean Absolute Error with inversions: 3.0790185216271006e-14
Coefficients: [-1.38803070e-14  1.00000000e+00  2.34534614e-15]
Intercept: 28.99999999999997


Mean Squared Error: 2422.0382395328256
Mean Absolute Error: 32.6874654308263
Coefficients: [11.27764925  3.63507379]
Intercept: 17.684493395612748
