# Elastic Sizing UberCalculator


## Step 1 - Loading Prereqs

In this step we import the required libraries (pandas and numpy) and define the tier constants and the classes that the rest of the sizing calculation builds on.

In [37]:
import pandas as pd
import numpy as np

# Tier identifiers. Workload tiers and cluster tiers are paired up by comparing
# these values, and ClusterTier treats COLD and FROZEN specially (object store).
HOT = 'HOT'
WARM = 'WARM'
COLD = 'COLD'
FROZEN = 'FROZEN'
# NOTE(review): MASTER is not referenced by the code visible in this section.
MASTER = 'MASTER'

class TimeSeriesIngestWorkloadTier:
    """Retention settings for one tier of a time-series ingest workload.

    Attributes:
        tier: Tier identifier (e.g. HOT, WARM, FROZEN).
        retention_days: Number of days data stays in this tier.
        replicas: Replica count reported for this tier.
        rollover_buffer_days: Extra days added to the retention window.
        size_for_failure: Whether sizing should account for a failed zone.
        previous_retention: Days of retention in the tiers that precede this
            one; starts at 0 and is overwritten by TimeSeriesIngestWorkload.
    """

    def __init__(self, tier, retention_days, replicas, rollover_buffer_days,
                 size_for_failure):
        self.tier = tier
        self.retention_days = retention_days
        self.replicas = replicas
        self.rollover_buffer_days = rollover_buffer_days
        self.size_for_failure = size_for_failure
        self.previous_retention = 0

    def slice_retention(self, offset, size_array):
        """Sum the entries of ``size_array`` held by this tier at ``offset``.

        The window ends (exclusive) at ``offset + 1 - previous_retention`` and
        spans ``retention_days + rollover_buffer_days`` entries. Both bounds
        are clamped at 0, so early offsets yield a shorter or empty window.

        Args:
            offset: Index into ``size_array`` being evaluated.
            size_array: Sliceable sequence of sizes, one entry per index.

        Returns:
            The sum of the entries inside the window (0 when it is empty).
        """
        window_length = self.retention_days + self.rollover_buffer_days
        window_stop = offset + 1 - self.previous_retention
        window_start = window_stop - window_length
        first = max(0, window_start)
        last = max(0, window_stop)
        return sum(size_array[first:last])


class TimeSeriesIngestWorkload:
    """A time-series ingest workload spread over an ordered list of tiers.

    Args:
        name: Label for the workload.
        raw_size_mb: Sequence of raw sizes in MB, one entry per day index.
        expansion_factor: Multiplier applied to each raw size.
        contingency_factor: Second multiplier applied to each raw size.
        tiers: Ordered tier objects. Each is stored under its ``tier`` value
            and has ``previous_retention`` set to the summed
            ``retention_days`` of the tiers listed before it.
    """

    def __init__(self, name, raw_size_mb, expansion_factor,
                 contingency_factor, tiers):
        self.name = name
        self.raw_size_mb = raw_size_mb
        self.expansion_factor = expansion_factor
        self.contingency_factor = contingency_factor

        self.tiers = {}
        days_in_earlier_tiers = 0
        for tier in tiers:
            # Each tier starts where the retention of the earlier tiers ends.
            tier.previous_retention = days_in_earlier_tiers
            days_in_earlier_tiers += tier.retention_days
            self.tiers[tier.tier] = tier

        # Per-entry size after expansion and contingency, rounded up.
        self.estimated_size_mb = [
            np.ceil(raw * expansion_factor * contingency_factor)
            for raw in raw_size_mb
        ]

    def requirements(self):
        """Return the peak disk requirement per (YEAR, MONTH, TIER).

        One row is built per tier and per day index; the day index is mapped
        to a year with a 366-day divisor and to a month with a 30.5-day
        divisor. Rows are then reduced with ``max`` per YEAR/MONTH/TIER.

        Returns:
            DataFrame with columns YEAR, MONTH, TIER, REPLICAS, FAILURE_ZONES
            and DATA_DISK_GB (retained MB / 1024, rounded up).
        """
        day_count = len(self.estimated_size_mb)
        rows = []
        for tier in self.tiers.values():
            for day in range(0, day_count):
                retained_mb = tier.slice_retention(day, self.estimated_size_mb)
                rows.append([
                    np.floor(day / 366 + 1),
                    np.floor(day / 30.5 % 12 + 1),
                    tier.tier,
                    tier.replicas,
                    1 if tier.size_for_failure else 0,
                    np.float64(np.ceil(retained_mb / 1024)),
                ])

        daily = pd.DataFrame(
            columns=['YEAR', 'MONTH', 'TIER', 'REPLICAS', 'FAILURE_ZONES',
                     'DATA_DISK_GB'],
            data=rows,
        )
        monthly_peaks = daily.groupby(
            by=['YEAR', 'MONTH', 'TIER'],
            as_index=False,
        )
        return monthly_peaks.max()

class SearchWorkload:
    """Search load described by a per-day peak search rate.

    Args:
        name: Label for the workload.
        average_search_response_ms: Average search response time in ms.
        threads_per_core: Divisor used to turn a thread count into vCPUs.
        peak_searches_per_second: Sequence of peak searches per second, one
            entry per day index (it is indexed and measured with ``len``).
    """

    def __init__(self, name, average_search_response_ms, threads_per_core,
                 peak_searches_per_second):
        self.name = name
        self.average_search_response_ms = average_search_response_ms
        self.threads_per_core = threads_per_core
        self.peak_searches_per_second = peak_searches_per_second

    def requirements(self):
        """Return the peak vCPU requirement per (YEAR, MONTH).

        For each day index the thread count is
        ``ceil(ceil(peak * response_ms / 1000) * 1.5 + 1)`` and the vCPU count
        is that value divided by ``threads_per_core``, rounded up. The 1.5 and
        +1 are fixed constants of the formula. Days are mapped to a year with
        a 366-day divisor and to a month with a 30.5-day divisor, then reduced
        with ``max``.

        NOTE(review): the returned frame has no TIER, REPLICAS or
        FAILURE_ZONES columns; confirm how it is meant to be combined with
        frames that are grouped on those columns.

        Returns:
            DataFrame with columns YEAR, MONTH and VCPU.
        """
        # The attribute set in __init__ is peak_searches_per_second; the
        # previous code read a non-existent peak_searches_per_s attribute.
        peaks = self.peak_searches_per_second
        return pd.DataFrame(
            columns = [ 'YEAR', 'MONTH', 'VCPU' ],
            data = [
                [
                    np.floor(d / 366 + 1),
                    np.floor(d / 30.5 % 12 + 1),
                    np.float64(np.ceil(
                        np.ceil(
                            np.ceil(peaks[d] * self.average_search_response_ms / 1000) * 1.5 + 1
                        ) / self.threads_per_core
                    ))
                ] for d in range(len(peaks))
            ]
        ).groupby(
            by = [ 'YEAR', 'MONTH' ],
            as_index = False
        ).max()

class Workload:
    """A named group of workloads whose requirements are summed together.

    Args:
        name: Label for the group.
        workloads: Iterable of objects exposing ``requirements()`` that
            returns a DataFrame.
    """

    def __init__(self, name, workloads):
        self.name = name
        self.workloads = workloads

    def requirements(self):
        """Return the summed requirements of all child workloads.

        Child frames are stacked and summed per
        (YEAR, MONTH, TIER, REPLICAS, FAILURE_ZONES). The result always has
        the VCPU, RAM_GB and DATA_DISK_GB columns; a resource that no child
        reports for a group stays NaN instead of becoming 0.

        pandas drops rows whose group key is NaN by default, so a child frame
        lacking TIER/REPLICAS/FAILURE_ZONES contributes no rows here.
        NOTE(review): confirm that is the intended handling for such frames.

        Returns:
            DataFrame with the five key columns followed by resource columns.
        """
        key_columns = ['YEAR', 'MONTH', 'TIER', 'REPLICAS', 'FAILURE_ZONES']
        expected_columns = key_columns + ['VCPU', 'RAM_GB', 'DATA_DISK_GB']

        # DataFrame.append was removed in pandas 2.0; stack once with concat.
        frames = [w.requirements() for w in self.workloads]
        frames = [frame for frame in frames if not frame.empty]
        reqs = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        # Guarantee the expected columns while keeping any extra ones.
        extra_columns = [c for c in reqs.columns if c not in expected_columns]
        reqs = reqs.reindex(columns=expected_columns + extra_columns)

        return reqs.groupby(
            by=key_columns,
            as_index=False
        ).sum(min_count=1)

class Zone:
    """A named zone of a cluster.

    Args:
        name: Zone label; used as the ZONE value in requirement tables.
        master_only: When True the zone is flagged as master-only. Cluster
            lists such zones after the other zones.
    """

    def __init__(self, name, master_only=False):
        self.master_only = master_only
        self.name = name


class ClusterTier:
    """Sizing rules for one tier of a cluster.

    Args:
        tier: Tier identifier (HOT, WARM, COLD, FROZEN, ...).
        reserved_storage: Fraction of disk kept free; disk requirements are
            divided by ``1 - reserved_storage``.
        data_zones: Zone objects (with a ``name``) that hold this tier's data.
    """

    def __init__(self, tier, reserved_storage, data_zones):
        self.tier = tier
        self.reserved_storage = reserved_storage
        self.data_zones = data_zones

    def requirements(self, data):
        """Expand workload requirements for this tier into per-zone rows.

        Args:
            data: DataFrame with columns YEAR, MONTH, TIER, REPLICAS,
                FAILURE_ZONES, VCPU, RAM_GB and DATA_DISK_GB. It is not
                modified.

        Returns:
            DataFrame with columns YEAR, MONTH, TIER, ZONE, VCPU, RAM_GB,
            DATA_DISK_GB, OBJECT_GB: one summed row per data zone and
            (YEAR, MONTH, TIER), followed for COLD/FROZEN tiers by the
            object-store rows (ZONE is '*' for COLD and empty for FROZEN).

        Raises:
            ValueError: If FAILURE_ZONES is not smaller than the number of
                data zones, which would make the per-zone share infinite or
                negative.
        """
        output_columns = ['YEAR', 'MONTH', 'TIER', 'ZONE', 'VCPU', 'RAM_GB',
                          'DATA_DISK_GB', 'OBJECT_GB']
        resource_columns = ['VCPU', 'RAM_GB', 'DATA_DISK_GB']

        # Work on a copy: the FROZEN branch below rewrites DATA_DISK_GB and
        # must not leak that change into the caller's frame.
        data = data.copy()
        for column in resource_columns + ['REPLICAS', 'FAILURE_ZONES']:
            # Columns that only hold None/NaN may arrive as object dtype.
            data[column] = pd.to_numeric(data[column])

        object_store = None
        if self.tier in (COLD, FROZEN):
            # The full (unscaled) data size is what lands in the object store.
            object_store = data[['YEAR', 'MONTH', 'TIER', 'DATA_DISK_GB']].rename(
                columns={'DATA_DISK_GB': 'OBJECT_GB'}
            )
            if self.tier == COLD:
                object_store = object_store.assign(ZONE='*')
            else:
                # FROZEN keeps only a tenth of the data size on local disk.
                data['DATA_DISK_GB'] = np.ceil(data['DATA_DISK_GB'] / 10)

        # Copies of the data (primary + replicas) spread over the zones that
        # remain once FAILURE_ZONES zones are assumed lost.
        surviving_zones = len(self.data_zones) - data['FAILURE_ZONES']
        if (surviving_zones <= 0).any():
            raise ValueError(
                'tier %s has %d data zone(s), which is not more than the '
                'FAILURE_ZONES requested by its workloads'
                % (self.tier, len(self.data_zones))
            )
        zone_multiplier = (data['REPLICAS'] + 1) / surviving_zones

        # NaN (resource not reported) propagates through the arithmetic.
        data['VCPU'] = np.ceil(data['VCPU'] * zone_multiplier)
        data['RAM_GB'] = np.ceil(data['RAM_GB'] * zone_multiplier)
        data['DATA_DISK_GB'] = np.ceil(
            data['DATA_DISK_GB'] * zone_multiplier / (1 - self.reserved_storage)
        )

        # DataFrame.append was removed in pandas 2.0; build every zone's copy
        # and stack them once.
        per_zone_columns = ['YEAR', 'MONTH', 'TIER'] + resource_columns
        zone_frames = [
            data[per_zone_columns].assign(ZONE=zone.name)
            for zone in self.data_zones
        ]
        if zone_frames:
            zone_reqs = pd.concat(zone_frames, ignore_index=True).groupby(
                by=['YEAR', 'MONTH', 'TIER', 'ZONE'],
                as_index=False
            ).sum(min_count=1)  # keep all-NaN resources as NaN, not 0
        else:
            zone_reqs = pd.DataFrame(columns=output_columns)

        parts = [zone_reqs]
        if object_store is not None and not object_store.empty:
            parts.append(object_store)
        return pd.concat(parts, ignore_index=True).reindex(columns=output_columns)

class Cluster:
    """A cluster: its zones, its tiers and the workloads placed on it.

    Args:
        name: Label for the cluster.
        zones: Zone objects; stored with non-master-only zones first and
            master-only zones last.
        tiers: Cluster tier objects; stored in a dict keyed by ``tier``.
        workloads: Objects exposing ``requirements()`` returning a DataFrame.
    """

    def __init__(self, name, zones, tiers, workloads):
        self.name = name
        self.zones = [ z for z in zones if not z.master_only ] + \
                     [ z for z in zones if z.master_only ]
        self.tiers = { t.tier : t for t in tiers }
        self.workloads = workloads

    def requirements(self):
        """Return the per-tier requirements of every workload on the cluster.

        Workload frames are stacked, then each cluster tier receives a copy
        of the rows whose TIER equals its own ``tier`` and expands them via
        its ``requirements(data)`` method. Rows whose TIER matches no cluster
        tier are not included in the result.

        Returns:
            DataFrame made of the stacked per-tier results (empty when there
            are no tiers).
        """
        base_columns = ['YEAR', 'MONTH', 'TIER', 'REPLICAS', 'FAILURE_ZONES',
                        'VCPU', 'RAM_GB', 'DATA_DISK_GB', 'OBJECT_GB']

        # DataFrame.append was removed in pandas 2.0; stack once with concat.
        workload_frames = [w.requirements() for w in self.workloads]
        workload_frames = [f for f in workload_frames if not f.empty]
        if workload_frames:
            stacked = pd.concat(workload_frames, ignore_index=True)
        else:
            stacked = pd.DataFrame()
        # Guarantee the columns the tiers read, keeping any extra ones.
        extra_columns = [c for c in stacked.columns if c not in base_columns]
        initial_reqs = stacked.reindex(columns=base_columns + extra_columns)

        tier_frames = []
        for cluster_tier in self.tiers.values():
            # Independent copy so a tier cannot alter rows seen by others.
            tier_rows = initial_reqs[
                initial_reqs['TIER'] == cluster_tier.tier
            ].copy()
            tier_frames.append(cluster_tier.requirements(tier_rows))

        populated = [f for f in tier_frames if not f.empty]
        if populated:
            return pd.concat(populated, ignore_index=True)
        if tier_frames:
            # Nothing to size: keep the column layout of the first tier.
            return tier_frames[0].reset_index(drop=True)
        return pd.DataFrame()



## Step 2 - Add some workload(s)!

In [38]:
# Workloads to size. Each Workload groups ingest workloads whose requirements
# are summed per tier.
workloads = [
    Workload(
        'workload_1',
        [
            # 732 daily entries of 1048576 MB of raw data; 732 = 2 * 366, i.e.
            # two "years" under the 366-day convention used by requirements().
            TimeSeriesIngestWorkload(
                name               = "ingest_workload_1",
                raw_size_mb        = [1048576] * 732,
                expansion_factor   = 1.2,
                contingency_factor = 1.3,
                # Tiers are listed in the order data moves through them; each
                # tier's window starts after the retention of the tiers above.
                tiers              = [
                    TimeSeriesIngestWorkloadTier(
                        tier                 = HOT,
                        retention_days       = 7,
                        replicas             = 1,
                        rollover_buffer_days = 1,
                        size_for_failure     = True
                    ),
                    TimeSeriesIngestWorkloadTier(
                        tier                 = WARM,
                        retention_days       = 24,
                        replicas             = 1,
                        rollover_buffer_days = 0,
                        size_for_failure     = True
                    ),
                    TimeSeriesIngestWorkloadTier(
                        tier                 = FROZEN,
                        retention_days       = 335,
                        replicas             = 0,
                        rollover_buffer_days = 0,
                        size_for_failure     = False
                    )
                ]
            ),
            # A much smaller feed (100 MB of raw data per daily entry) that is
            # kept in the HOT tier only.
            TimeSeriesIngestWorkload(
                name               = "ingest_workload_2",
                raw_size_mb        = [100] * 732,
                expansion_factor   = 1.2,
                contingency_factor = 1.3,
                tiers              = [
                    TimeSeriesIngestWorkloadTier(
                        tier                 = HOT,
                        retention_days       = 14,
                        replicas             = 1,
                        rollover_buffer_days = 1,
                        size_for_failure     = True
                    ),
                ]
            )
        ]
    )
]

## Step 3 - Add one or more Clusters

In [39]:
# Zones available to the cluster. The commented-out entry is an alternative
# third zone flagged master_only, which the HOT and WARM data_zones filters
# below would exclude.
zones = [
    Zone(name='Zone 1'),
    Zone(name='Zone 2'),
    #Zone(name='Witness Zone', master_only=True)
    Zone(name='Zone 3')
]

# Clusters to size. reserved_storage is the fraction of disk kept free: the
# tier's disk requirement is divided by (1 - reserved_storage).
clusters = [
    Cluster(
        name = "cluster",
        zones = zones,
        tiers = [
            # HOT and WARM data is spread over every zone that is not master-only.
            ClusterTier(
                tier = HOT,
                reserved_storage = 0.2,
                data_zones = [z for z in zones if not z.master_only]
            ),
            ClusterTier(
                tier = WARM,
                reserved_storage = 0.2,
                data_zones = [z for z in zones if not z.master_only]
            ),
            # FROZEN uses only the first zone in the list.
            ClusterTier(
                tier = FROZEN,
                reserved_storage = 0.1,
                data_zones = zones[0:1]
            )
        ],
        workloads = workloads
    )
]

## Step 4 - Determine Raw Cluster Needs by Tier by Month



In [41]:
# Lift the pandas row limit so the full requirements table is printed.
# This is a global display option and stays in effect for later cells.
pd.set_option('display.max_rows', None)
# Print each cluster's name followed by its per-tier, per-zone requirements.
for cluster in clusters:
    print('Cluster: ' + cluster.name)
    print(cluster.requirements())

Cluster: cluster
     YEAR  MONTH    TIER    ZONE DATA_DISK_GB VCPU RAM_GB  OBJECT_GB
0     1.0    1.0     HOT  Zone 1      15979.0  NaN    NaN        NaN
1     1.0    1.0     HOT  Zone 2      15979.0  NaN    NaN        NaN
2     1.0    1.0     HOT  Zone 3      15979.0  NaN    NaN        NaN
3     1.0    2.0     HOT  Zone 1      15979.0  NaN    NaN        NaN
4     1.0    2.0     HOT  Zone 2      15979.0  NaN    NaN        NaN
5     1.0    2.0     HOT  Zone 3      15979.0  NaN    NaN        NaN
6     1.0    3.0     HOT  Zone 1      15979.0  NaN    NaN        NaN
7     1.0    3.0     HOT  Zone 2      15979.0  NaN    NaN        NaN
8     1.0    3.0     HOT  Zone 3      15979.0  NaN    NaN        NaN
9     1.0    4.0     HOT  Zone 1      15979.0  NaN    NaN        NaN
10    1.0    4.0     HOT  Zone 2      15979.0  NaN    NaN        NaN
11    1.0    4.0     HOT  Zone 3      15979.0  NaN    NaN        NaN
12    1.0    5.0     HOT  Zone 1      15979.0  NaN    NaN        NaN
13    1.0    5.0 