# General Analysis Notebook
This notebook intends to tackle the problem of analyzing a whole batch. It iterates over the batch's report directories and processes each one, then plots the resulting value against the expected one. This is a neat way to understand how well the analyzer performs in general.

## Requirements
This notebook requires that the root directory containing the reports also contains a ```description.yml``` file describing the properties of the batch: the connection's top speed, the throttling configuration, the start time of each change, and the time lapse for each throttling speed.

In [13]:
# Setup and import everything
%matplotlib notebook

import pandas as pd
import yaml

from os import getcwd, listdir
from os.path import join, isdir
from math import fabs

from processor import analysis, reports

from IPython.display import display

# All report batches live under <cwd>/batch-test-reports, next to the
# description.yml file that describes the experiment.
base_directory = join(getcwd(), 'batch-test-reports')
with open(join(base_directory, 'description.yml')) as fp:
    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1
    # and raises TypeError in PyYAML 6. The description only holds plain
    # mappings, lists and integers, so safe_load is sufficient and cannot
    # construct arbitrary Python objects.
    batch_description = yaml.safe_load(fp)
batch_description

{'connection_speed': 1500,
 'experiment': [{'lapse': 3600, 'speed': 1500, 'start': 1511625600},
  {'lapse': 3600, 'speed': 1125, 'start': 1511629200},
  {'lapse': 3600, 'speed': 750, 'start': 1511632800},
  {'lapse': 3600, 'speed': 375, 'start': 1511636400},
  {'lapse': 3600, 'speed': 0, 'start': 1511640000}]}

In [14]:
def get_analyzer_results(reports_batch_dir):
    """Run the analyzer over one reports batch directory and return its results.

    ``reports_batch_dir`` is joined onto ``base_directory`` and handed to
    ``reports.ReportHandler``; the processable observations it yields are
    fed to ``analysis.Analyzer``.

    Args:
        reports_batch_dir: directory of the batch (joined with
            ``base_directory``; an absolute path is left untouched by
            ``os.path.join``).

    Returns:
        Whatever ``Analyzer.get_results()`` returns for the batch.

    Raises:
        ValueError: if the handler reports neither an IP nor an
            observation set for the directory.
    """
    batch_path = join(base_directory, reports_batch_dir)
    handler = reports.ReportHandler(batch_path)
    ip, observations = handler.get_ip_and_processable_observations()

    nothing_to_process = ip is None and observations is None
    if nothing_to_process:
        message = 'Reports batch directory {} has no reports to use!'
        raise ValueError(message.format(reports_batch_dir))

    return analysis.Analyzer(observations).get_results()

def get_expected_usage(batch_start_time, batch_end_time, description=None):
    """Return the expected usage of a batch as a fraction of the connection speed.

    The expected usage is the time-weighted average of the throttling speed
    of every experiment configuration the batch overlaps, divided by the
    connection speed. Weighting every overlapped configuration (instead of
    only the ones containing the batch start and end) keeps the result
    correct for batches that span more than two configurations.

    Args:
        batch_start_time: start of the batch, in the same unit as the
            ``start``/``lapse`` values of the description.
        batch_end_time: end of the batch, same unit.
        description: optional mapping with a ``connection_speed`` entry and
            an ``experiment`` list whose items have ``start``, ``lapse`` and
            ``speed`` entries. Defaults to the notebook-level
            ``batch_description``.

    Returns:
        The expected usage as a float (``speed / connection_speed``,
        averaged over the time of the batch covered by configurations).

    Raises:
        ValueError: if neither the start nor the end of the batch falls
            inside any configuration window.
    """
    if description is None:
        description = batch_description
    connection_speed = description['connection_speed']

    endpoint_conf = None   # first configuration containing a batch endpoint
    covered_time = 0       # batch time that falls inside some configuration
    weighted_speed = 0     # sum of overlap * speed over all configurations
    for conf in description['experiment']:
        conf_start = conf['start']
        conf_end = conf_start + conf['lapse']
        if endpoint_conf is None and (conf_start <= batch_start_time < conf_end or
                                      conf_start <= batch_end_time < conf_end):
            endpoint_conf = conf
        # Length of the intersection of [batch_start, batch_end] with the
        # configuration window; non-positive when they do not overlap.
        overlap = min(batch_end_time, conf_end) - max(batch_start_time, conf_start)
        if overlap > 0:
            covered_time += overlap
            weighted_speed += overlap * conf['speed']

    if endpoint_conf is None:
        raise ValueError('Batch for start time: {} and end time {} has no config.'.format(batch_start_time,
                                                                                          batch_end_time))
    if covered_time == 0:
        # Zero-length batch: there is nothing to average over, so use the
        # configuration the batch falls in.
        return endpoint_conf['speed'] / connection_speed
    return weighted_speed / covered_time / connection_speed
    

In [15]:
# Column layout of the per-batch results table built below.
columns = ['batch_start_time', 'batch_end_time',
           'downstream_hurst_rs', 'downstream_hurst_wavelet',
           'downstream_quality', 'downstream_usage',
           'upstream_hurst_rs', 'upstream_hurst_wavelet',
           'upstream_quality', 'upstream_usage',
           'expected_downstream_usage', 'error', 'relative_error']

# Collect one row per batch directory and build the frame once at the end.
# DataFrame.append in a loop copies the whole frame on every iteration and
# was removed in pandas 2.0.
records = []
for reports_batch_dir in listdir(base_directory):
    reports_batch_dir_path = join(base_directory, reports_batch_dir)
    if not isdir(reports_batch_dir_path):
        # Skip plain files such as description.yml.
        continue
    results = get_analyzer_results(reports_batch_dir_path)
    # The directory name is used as the batch start time.
    batch_start_time = int(reports_batch_dir)
    batch_end_time = results['timestamp']
    expected_usage = get_expected_usage(batch_start_time, batch_end_time)

    downstream, upstream = results['downstream'], results['upstream']
    error = fabs(expected_usage - downstream['usage'])
    # With an expected usage of zero the relative error is undefined, so the
    # absolute error is reported instead.
    relative_error = error / expected_usage if expected_usage != 0 else error

    records.append((batch_start_time, batch_end_time,
                    downstream['hurst']['rs'], downstream['hurst']['wavelet'],
                    downstream['quality'], downstream['usage'],
                    upstream['hurst']['rs'], upstream['hurst']['wavelet'],
                    upstream['quality'], upstream['usage'],
                    expected_usage, error, relative_error))

data = (pd.DataFrame(records, columns=columns)
        .sort_values(by=['batch_start_time'])
        .reset_index(drop=True))
display(data)

Unnamed: 0,batch_start_time,batch_end_time,downstream_hurst_rs,downstream_hurst_wavelet,downstream_quality,downstream_usage,upstream_hurst_rs,upstream_hurst_wavelet,upstream_quality,upstream_usage,expected_downstream_usage,error,relative_error
0,1511625711,1511626797,0.522214,0.627488,1.0,0.977723,0.515216,0.680305,1.0,1.102273,1.0,0.022277,0.022277
1,1511626270,1511626797,0.50124,0.45112,1.0,0.98088,0.500642,0.511709,1.0,1.156126,1.0,0.01912,0.01912
2,1511626831,1511627554,0.514083,0.367388,1.0,1.056478,0.567162,0.40468,1.0,1.083333,1.0,0.056478,0.056478
3,1511627395,1511628612,0.499209,0.289634,1.0,0.697674,0.482101,0.380996,1.0,0.851948,1.0,0.302326,0.302326
4,1511628089,1511628843,0.505553,0.431548,1.0,0.973978,0.527217,0.429702,1.0,0.825,1.0,0.026022,0.026022
5,1511628721,1511629582,0.56519,0.558028,1.0,0.730453,0.560772,0.60908,1.0,0.801386,0.889082,0.15863,0.17842
6,1511631494,1511632561,0.516871,0.510157,1.0,0.516908,0.513665,0.64833,1.0,0.695864,0.75,0.233092,0.310789
7,1511632242,1511632975,0.507401,0.586395,1.0,0.334096,0.462707,0.334901,1.0,0.646789,0.690314,0.356218,0.516023
8,1511632791,1511633742,0.54039,0.464451,1.0,0.426804,0.447449,0.264603,1.0,0.635492,0.502366,0.075562,0.150412
9,1511633340,1511634191,0.546359,0.646349,1.0,0.481328,0.484537,0.493431,1.0,0.645098,0.5,0.018672,0.037344


In [16]:
# Measured vs. expected downstream usage per batch; the title lets the
# figure stand on its own when the notebook is skimmed.
data.plot(x='batch_start_time', y=['downstream_usage', 'expected_downstream_usage'], kind='line',
          title='Measured vs. expected downstream usage')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fda2b6a3438>

In [21]:
# Summary statistics of the absolute and relative error over all batches.
error_summary = data[['error', 'relative_error']].describe(include='all')
display(error_summary)

Unnamed: 0,error,relative_error
count,19.0,19.0
mean,0.115114,0.155228
std,0.098024,0.142702
min,0.018672,0.01912
25%,0.043133,0.067808
50%,0.083532,0.1
75%,0.15744,0.168828
max,0.356218,0.516023


In [17]:
# Absolute error of the downstream usage per batch.
data.plot(x='batch_start_time', y='error', kind='line',
          title='Absolute error of downstream usage')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fda2af71080>

In [18]:
# Relative error of the downstream usage per batch.
data.plot(x='batch_start_time', y='relative_error', kind='line',
          title='Relative error of downstream usage')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fda2aefeda0>