In [1]:
from MiniProject1 import Reader
from MiniProject1 import Sorter
from MiniProject1 import TimeoutException
from datetime import datetime as dt
from collections import defaultdict
import time
import pandas as pd
from statistics import mean
from pprint import pprint as pp
from matplotlib import pyplot as plt

In [2]:
# Load the corpus once and report how many words it contains.
reader = Reader('shakespeare-complete-works.txt')
n = len(reader.words)
print(n, 'words')
sorter = Sorter()

# Benchmark input sizes as fractions of the corpus, smallest first:
# n/512, n/256, n/128, ..., n/2, n/1 (each truncated to an int).
limits = [int(n * 1 / 2 ** exponent) for exponent in range(9, -1, -1)]

904153 words


In [3]:
# Tiny smoke test: every algorithm must sort the same six strings identically.
array_to_sort = ['selection', 'insertion', 'heap', 'merge', 'bubble', 'quick']

demo_sorts = (
    ('Selection', sorter.selection),
    ('Insertion', sorter.insertion),
    ('Heap', sorter.heap),
    ('Merge', sorter.merge),
    ('Bubble', sorter.bubble),
    ('Quick', sorter.quick),
)

print('Unsorted'.ljust(10), array_to_sort)
for label, sort in demo_sorts:
    # Each algorithm receives its own copy so the unsorted input stays intact.
    print(label.ljust(10), sort(array_to_sort[:]))

Unsorted   ['selection', 'insertion', 'heap', 'merge', 'bubble', 'quick']
Selection  ['bubble', 'heap', 'insertion', 'merge', 'quick', 'selection']
Insertion  ['bubble', 'heap', 'insertion', 'merge', 'quick', 'selection']
Heap       ['bubble', 'heap', 'insertion', 'merge', 'quick', 'selection']
Merge      ['bubble', 'heap', 'insertion', 'merge', 'quick', 'selection']
Bubble     ['bubble', 'heap', 'insertion', 'merge', 'quick', 'selection']
Quick      ['bubble', 'heap', 'insertion', 'merge', 'quick', 'selection']


In [4]:
def log_time(name, runtime, n):
    """Append one benchmark measurement to 'sorting-runtimes.csv'.

    A header row is written first when the file does not exist yet (or is
    empty), so that a fresh run produces a CSV whose columns can be looked up
    by name ('Algorithm', 'Runtime', 'Data size') when it is read back with
    pandas. Existing, non-empty files are only appended to.

    Args:
        name: Name of the sorting algorithm that was measured.
        runtime: Measured runtime value to record.
        n: Number of elements that were sorted.
    """
    import os  # local import keeps this cell self-contained

    path = 'sorting-runtimes.csv'
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0

    with open(path, 'a') as file:
        if needs_header:
            # No spaces after the commas: pandas would otherwise keep them as
            # part of the column names (' Runtime' instead of 'Runtime').
            file.write('Algorithm,Runtime,Data size,Timestamp\n')
        file.write(f'{name}, {runtime}, {n}, {dt.now()}\n')

# Selection Sort

In [5]:
# Time selection sort on progressively larger prefixes of the corpus.
for limit in limits:
    data = reader.words[:limit]
    try:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring durations; time.time() can jump with system clock
        # adjustments and is coarse on some platforms.
        start = time.perf_counter()
        sorter.selection(data)
        end = time.perf_counter()
        runtime = int(round((end - start) * 1000))  # seconds -> whole ms
        print(
            f'Data size: {str(limit).rjust(7)}: Execution time: {str(runtime).rjust(10)} ms')
        log_time('selection', runtime, len(data))
    except TimeoutException:
        # The sort gave up (presumably a time budget inside Sorter - confirm
        # in MiniProject1). `limits` is ascending, so stop here.
        print(f'Data size: {limit}: Too much data')
        break

Data size:    1765: Execution time:        143 ms
Data size:    3531: Execution time:        586 ms
Data size:    7063: Execution time:       2311 ms
Data size:   14127: Execution time:       9084 ms
Data size:   28254: Execution time:      35944 ms
Data size:   56509: Execution time:     146845 ms
Data size: 113019: Too much data


# Insertion Sort

In [6]:
# Time insertion sort on progressively larger prefixes of the corpus.
for limit in limits:
    data = reader.words[:limit]
    try:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring durations; time.time() can jump with system clock
        # adjustments and is coarse on some platforms.
        start = time.perf_counter()
        sorter.insertion(data)
        end = time.perf_counter()
        runtime = int(round((end - start) * 1000))  # seconds -> whole ms
        print(
            f'Data size: {str(limit).rjust(7)}: Execution time: {str(runtime).rjust(10)} ms')
        log_time('insertion', runtime, len(data))
    except TimeoutException:
        # The sort gave up (presumably a time budget inside Sorter - confirm
        # in MiniProject1). `limits` is ascending, so stop here.
        print(f'Data size: {limit}: Too much data')
        break

Data size:    1765: Execution time:        151 ms
Data size:    3531: Execution time:        647 ms
Data size:    7063: Execution time:       2532 ms
Data size:   14127: Execution time:      10195 ms
Data size:   28254: Execution time:      40800 ms
Data size:   56509: Execution time:     171317 ms
Data size: 113019: Too much data


# Heap Sort

In [7]:
# Time heap sort on every input size, up to the full corpus.
for limit in limits:
    data = reader.words[:limit]
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() can jump with system clock adjustments
    # and is coarse on some platforms, which matters for runs of a few ms.
    start = time.perf_counter()
    sorter.heap(data)
    end = time.perf_counter()
    runtime = int(round((end - start) * 1000))  # seconds -> whole ms
    print(
        f'Data size: {str(limit).rjust(7)}: Execution time: {str(runtime).rjust(10)} ms')
    log_time('heap', runtime, len(data))

Data size:    1765: Execution time:         13 ms
Data size:    3531: Execution time:         27 ms
Data size:    7063: Execution time:         57 ms
Data size:   14127: Execution time:        129 ms
Data size:   28254: Execution time:        266 ms
Data size:   56509: Execution time:        593 ms
Data size:  113019: Execution time:       1312 ms
Data size:  226038: Execution time:       2728 ms
Data size:  452076: Execution time:       5768 ms
Data size:  904153: Execution time:      12173 ms


# Merge Sort

In [8]:
# Time merge sort on every input size, up to the full corpus.
for limit in limits:
    data = reader.words[:limit]
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() can jump with system clock adjustments
    # and is coarse on some platforms, which matters for runs of a few ms.
    start = time.perf_counter()
    sorter.merge(data)
    end = time.perf_counter()
    runtime = int(round((end - start) * 1000))  # seconds -> whole ms
    print(
        f'Data size: {str(limit).rjust(7)}: Execution time: {str(runtime).rjust(10)} ms')
    log_time('merge', runtime, len(data))

Data size:    1765: Execution time:         10 ms
Data size:    3531: Execution time:         20 ms
Data size:    7063: Execution time:         44 ms
Data size:   14127: Execution time:         91 ms
Data size:   28254: Execution time:        205 ms
Data size:   56509: Execution time:        423 ms
Data size:  113019: Execution time:        918 ms
Data size:  226038: Execution time:       1949 ms
Data size:  452076: Execution time:       4104 ms
Data size:  904153: Execution time:       8715 ms


# Bubble Sort

In [9]:
# Time bubble sort on progressively larger prefixes of the corpus.
for limit in limits:
    data = reader.words[:limit]
    try:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring durations; time.time() can jump with system clock
        # adjustments and is coarse on some platforms.
        start = time.perf_counter()
        sorter.bubble(data)
        end = time.perf_counter()
        runtime = int(round((end - start) * 1000))  # seconds -> whole ms
        print(
            f'Data size: {str(limit).rjust(7)}: Execution time: {str(runtime).rjust(10)} ms')
        log_time('bubble', runtime, len(data))
    except TimeoutException:
        # The sort gave up (presumably a time budget inside Sorter - confirm
        # in MiniProject1). `limits` is ascending, so stop here.
        print(f'Data size: {limit}: Too much data')
        break

Data size:    1765: Execution time:       1280 ms
Data size:    3531: Execution time:       5105 ms
Data size:    7063: Execution time:      20086 ms
Data size:   14127: Execution time:      80293 ms
Data size: 28254: Too much data


# Quick Sort

In [10]:
# Time quick sort on progressively larger prefixes of the corpus.
for limit in limits:
    data = reader.words[:limit]
    try:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring durations; time.time() can jump with system clock
        # adjustments and is coarse on some platforms.
        start = time.perf_counter()
        sorter.quick(data)
        end = time.perf_counter()
        runtime = int(round((end - start) * 1000))  # seconds -> whole ms
        print(
            f'Data size: {str(limit).rjust(7)}: Execution time: {str(runtime).rjust(10)} ms')
        log_time('quick', runtime, len(data))
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate, and report the
        # exception type so a genuine bug is not silently presented as
        # "too much data". `limits` is ascending, so stop at the first failure.
        print(f'Data size: {limit}: Too much data ({type(exc).__name__})')
        break

Data size:    1765: Execution time:          6 ms
Data size:    3531: Execution time:         27 ms
Data size:    7063: Execution time:         29 ms
Data size:   14127: Execution time:         89 ms
Data size:   28254: Execution time:        269 ms
Data size:   56509: Execution time:        860 ms
Data size: 113019: Too much data


In [7]:
%matplotlib notebook
def get_values(_dict):
    """Split a mapping of sample lists into parallel key and mean lists.

    Args:
        _dict: Mapping whose values are non-empty sequences of numbers.

    Returns:
        A ``(keys, values)`` tuple of two lists in the mapping's iteration
        order, where ``values[i]`` is the arithmetic mean of the samples
        stored under ``keys[i]``.
    """
    keys = list(_dict)
    values = [mean(samples) for samples in _dict.values()]
    return keys, values


# Load every logged measurement. The columns used below ('Algorithm',
# 'Runtime', 'Data size') are looked up by name, so they must be present in
# the CSV header.
df = pd.read_csv('sorting-runtimes.csv')

# One {data size: [runtimes]} mapping per algorithm; repeated runs of the same
# size accumulate in the list and are averaged by get_values() below.
sel_times = defaultdict(list)
ins_times = defaultdict(list)
heap_times = defaultdict(list)
merge_times = defaultdict(list)
bubble_times = defaultdict(list)
quick_times = defaultdict(list)

# Algorithm name as stored in the CSV -> the mapping that collects its samples.
times_by_algorithm = {
    'selection': sel_times,
    'insertion': ins_times,
    'heap': heap_times,
    'merge': merge_times,
    'bubble': bubble_times,
    'quick': quick_times,
}

for _, row in df.iterrows():
    # The runtime is deliberately not stored in a variable named `time`: that
    # would rebind the imported `time` module and break the benchmark cells
    # if they are re-run after this one.
    runtime_ms = row['Runtime']
    samples = times_by_algorithm.get(row['Algorithm'])
    if samples is not None:  # rows for unknown algorithms are ignored
        samples[row['Data size']].append(runtime_ms)

# (sizes, mean runtimes) pairs, one per algorithm.
sel_xys = get_values(sel_times)
ins_xys = get_values(ins_times)
heap_xys = get_values(heap_times)
merge_xys = get_values(merge_times)
bubble_xys = get_values(bubble_times)
quick_xys = get_values(quick_times)

# Label each curve explicitly so the legend cannot drift out of sync with the
# plotting order (and does not depend on a variable from another cell).
series = (
    ('selection', sel_xys),
    ('insertion', ins_xys),
    ('heap', heap_xys),
    ('merge', merge_xys),
    ('bubble', bubble_xys),
    ('quick', quick_xys),
)
for algorithm, (sizes, mean_runtimes) in series:
    plt.loglog(sizes, mean_runtimes, label=algorithm)

plt.title('Mean sorting runtime by input size')
plt.xlabel('Data size')
plt.ylabel('Time (ms)')
plt.legend();

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x1fd52fc42c8>