In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

%matplotlib inline

# БПМИ203 old edition...
np.random.seed(203)

%config InlineBackend.figure_format = 'retina'

# sns.set(style='whitegrid', palette='deep')
# sns.set(style='darkgrid', palette='rocket')
sns.set(style='darkgrid', palette='deep')

plt.rcParams['figure.figsize'] = 8, 5
plt.rcParams['font.size'] = 12
plt.rcParams['savefig.format'] = 'pdf'


In [None]:
def parse_type(name):
    """Return the middle part of a '/'-separated test name.

    The first and the last path components are dropped and whatever remains
    is re-joined with '/'. Names with fewer than three components yield ''.
    """
    components = name.split('/')
    middle = components[1:-1]
    return "/".join(middle)

def parse_size(name):
    """Return the integer encoded in the last path component of a test name.

    Takes the text after the final '/', keeps the part before the first '.',
    and converts it with ``int`` (so a non-numeric stem raises ValueError).
    """
    last_component = name.rpartition('/')[2]
    stem = last_component.partition('.')[0]
    return int(stem)

def prepare_df(df, max_rows=None):
    """Return a copy of ``df`` with ``test_type`` and ``rows`` columns derived from ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column holding '/'-separated test names.
    max_rows : int or None, optional
        When given, only tests with ``rows < max_rows`` (exclusive bound) are
        kept. The default ``None`` keeps every row, which matches the previous
        behaviour where the cap was a hard-coded ``None``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    # assign() builds a new frame, so re-running a cell never stacks
    # changes onto a frame that an earlier cell already displayed.
    prepared = df.assign(
        test_type=df["name"].apply(parse_type),
        rows=df["name"].apply(parse_size),
    )
    if max_rows is not None:
        prepared = prepared[prepared["rows"] < max_rows]
    return prepared


In [None]:
# Per-test metadata: read the CSV and derive the test_type / rows columns in one step.
df_info = prepare_df(pd.read_csv("tests_results/tests_info.csv"))
df_info


In [None]:
# Per-algorithm measurements: read the CSV and derive the test_type / rows columns in one step.
df_algo = prepare_df(pd.read_csv("tests_results/algorithms_results.csv"))
df_algo


In [None]:
# Distinct algorithm names, in order of first appearance in df_algo.
algorithm_names = pd.unique(df_algo["algorithm_name"])
algorithm_names


In [None]:
# Distinct test sets present in the algorithm results, in order of first appearance.
tests_types = pd.unique(df_algo["test_type"])
tests_types


In [None]:
def _prettify_algorithm_name(algorithm_name):
    """Build the legend label for an algorithm name.

    If the last '-'-separated token consists only of digits, the label is
    "<remaining tokens joined by spaces> budget: <N>ms", where N is that
    token integer-divided by 10**6 (presumably a budget in nanoseconds -
    TODO confirm). Any other name is returned unchanged.
    """
    *leading_tokens, last_token = algorithm_name.split('-')
    if not last_token.isdigit():
        return algorithm_name
    return " ".join(leading_tokens) + " budget: " + str(int(last_token) // 10**6) + "ms"


def plot_for_test_type_compression_sorts(test_type):
    """Draw, save and show a 2x2 comparison figure for one test set.

    Reads the notebook-level ``df_info``, ``df_algo`` and ``algorithm_names``.
    For every algorithm that has rows for ``test_type`` one line is added to
    each panel, plotted against ``rows``:

    * top-left: ``find_permutation_time_ns`` divided by 1e9 (seconds);
    * top-right: ``compression_time_ns`` divided by 1e9 (seconds);
    * bottom-left: ``compressed_size``, plus ``serialized_size`` from
      ``df_info`` as the "without compression" line;
    * bottom-right: ``1 - compressed_size / serialized_size`` (log x-axis).

    The figure is saved as ``tests_results/<last '/' component of test_type>.svg``.

    Raises
    ------
    AssertionError
        If an algorithm has a different number of rows for this test set
        than ``df_info`` does.
    """
    ns_per_second = 1000000000
    line_kwargs = dict(marker='o', markersize=3, alpha=0.75)

    plt.ion()
    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    (search_ax, compress_ax), (size_ax, ratio_ax) = axes

    fig.suptitle(f'Analysis of algorithms for tests from the set {test_type}', fontsize=22, y = 0.93)

    info = df_info[df_info["test_type"] == test_type].copy()
    info.index = np.arange(len(info))

    in_this_set = df_algo["test_type"] == test_type
    for algorithm_name in algorithm_names:
        runs = df_algo[in_this_set & (df_algo["algorithm_name"] == algorithm_name)].copy()
        if not len(runs):
            # This algorithm was not run on this test set.
            continue
        assert len(runs) == len(info)
        # Same 0..n-1 index as `info`, so the ratio below pairs rows by position.
        runs.index = np.arange(len(runs))

        legend_label = _prettify_algorithm_name(algorithm_name)
        panels = (
            (search_ax, runs['find_permutation_time_ns'] / ns_per_second),
            (compress_ax, runs['compression_time_ns'] / ns_per_second),
            (size_ax, runs['compressed_size']),
            (ratio_ax, 1 - runs['compressed_size'] / info["serialized_size"]),
        )
        for ax, values in panels:
            ax.plot(runs['rows'], values, label=legend_label, **line_kwargs)

    # Baseline: size of the data without any compression.
    size_ax.plot(info['rows'], info['serialized_size'], label="without compression")

    legend_locations = (
        (search_ax, 'upper left'),
        (compress_ax, 'upper left'),
        (size_ax, 'upper left'),
        (ratio_ax, 'best'),
    )
    for ax, location in legend_locations:
        ax.legend(loc=location)

    # Only the ratio panel uses a logarithmic x-axis.
    ratio_ax.set_xscale('log')

    decorations = (
        (search_ax, "Dependence of the algorithm's running time", 'Seconds'),
        (compress_ax, 'Dependence of the compression time after applying the permutation', 'Seconds'),
        (size_ax, 'Dependence of the compressed size', 'Bytes'),
        (ratio_ax, 'Dependence of the compression ratio', 'Compression ratio'),
    )
    for ax, title, y_label in decorations:
        ax.set_title(title, fontsize = 16)
        ax.set_xlabel('Number of lines', fontsize = 15)
        ax.set_ylabel(y_label, fontsize = 15)

    output_stem = test_type.split("/")[-1]
    fig.savefig(f'tests_results/{output_stem}.svg')

    plt.show()


In [None]:
# Draw, save and show the four-panel algorithm comparison for every test set in tests_types.
for test_type in tests_types:
    plot_for_test_type_compression_sorts(test_type)


In [None]:
# ClickHouse measurements: read the CSV and derive the test_type / rows columns in one step.
df_clickhouse = prepare_df(pd.read_csv("tests_results/clickhouse_tests.csv"))
df_clickhouse


In [None]:
# NOTE: rebinds tests_types - from here on it holds the test sets found in the ClickHouse results.
tests_types = pd.unique(df_clickhouse["test_type"])
tests_types


In [None]:
def plot_for_test_type_clickhouse(test_type):
    """Draw, save and show a two-panel ClickHouse comparison for one test set.

    Reads the notebook-level ``df_info`` and ``df_clickhouse``. Only tests
    with more than ``max(10, largest rows value / 1000)`` rows are plotted.
    One line per value of ``use_compression_optimization`` (True, then False)
    is added to each panel, plotted against ``rows``:

    * left: ``size``;
    * right: ``1 - size / serialized_size`` (log x-axis), with
      ``serialized_size`` taken from ``df_info``.

    The figure is saved as
    ``tests_results/clickhouse_<last '/' component of test_type>.svg``.

    Raises
    ------
    AssertionError
        If the filtered ClickHouse rows and the filtered ``df_info`` rows
        differ in count.
    """
    size_fraction = 1000
    line_kwargs = dict(marker='o', markersize=3, alpha=0.75)
    # Line colour for each setting; iteration order is True, then False.
    color_by_setting = {True: "darkgreen", False: "blueviolet"}

    plt.ion()
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    size_ax, ratio_ax = axes

    fig.suptitle(f'Analysis of the effectiveness of compression in the ClickHouse for the set {test_type}', fontsize=20)

    info = df_info[df_info["test_type"] == test_type].copy()
    # Skip the smallest tests: keep those above 1/1000 of the largest one, and never below 10 rows.
    min_rows = max(10, max(info["rows"]) / size_fraction)
    info = info[info["rows"] > min_rows]
    info.index = np.arange(len(info))

    for use_compression_optimization, color in color_by_setting.items():
        selected = (
            (df_clickhouse["test_type"] == test_type)
            & (df_clickhouse["use_compression_optimization"] == use_compression_optimization)
            & (df_clickhouse["rows"] > min_rows)
        )
        runs = df_clickhouse[selected].copy()
        assert len(runs) == len(info)
        # Same 0..n-1 index as `info`, so the ratio below pairs rows by position.
        runs.index = np.arange(len(info))

        size_ax.plot(
            runs['rows'],
            runs['size'],
            label=f"optimize={use_compression_optimization}",
            color=color,
            **line_kwargs,
        )
        ratio_ax.plot(
            runs['rows'],
            1 - runs['size'] / info['serialized_size'],
            label = f"allow optimize = {use_compression_optimization}",
            color=color,
            **line_kwargs,
        )

    ratio_ax.set_xscale('log')

    decorations = (
        (size_ax, 'Dependence of the compressed size', 'Bytes'),
        (ratio_ax, 'Dependence of the compression ratio', 'Compression ratio'),
    )
    for ax, title, y_label in decorations:
        ax.legend(loc='upper left')
        ax.set_title(title, fontsize = 16)
        ax.set_xlabel('Number of lines', fontsize = 15)
        ax.set_ylabel(y_label, fontsize = 15)

    output_stem = test_type.split("/")[-1]
    fig.savefig(f'tests_results/clickhouse_{output_stem}.svg')

    plt.show()


In [None]:
# Draw, save and show the two-panel ClickHouse comparison for every test set in tests_types.
for test_type in tests_types:
    plot_for_test_type_clickhouse(test_type)
