In [1]:
%matplotlib inline 

# Caching Rest Data Investigation
***

## Table of Contents
***

## Aim <a class="anchor" id="aim"></a>
***

The aim of this notebook is to perform exploratory data analysis on the data obtained from the Caching Rest Application.

## Code Setup <a class="anchor" id="code-setup"></a>
***

### Import Libraries

In [2]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import seaborn as sns

plt.rcParams['figure.figsize'] = [12, 6]
sns.set_style("darkgrid")
sns.set_context("paper")

### Custom Classes

In [3]:
class CachePerformanceMetrics:
    """Hit and miss counts together with the hit ratio derived from them.

    Attributes:
        cache_hit_size: number of cache hits.
        cache_miss_size: number of cache misses.
        total_size: cache_hit_size + cache_miss_size.
        hit_ratio: cache_hit_size / total_size, or 0.0 when total_size is 0.
    """

    def __init__(self, cache_hit_size, cache_miss_size):
        self.cache_hit_size = cache_hit_size
        self.cache_miss_size = cache_miss_size
        self.total_size = cache_hit_size + cache_miss_size
        # With no hits and no misses the division would raise
        # ZeroDivisionError; report a ratio of 0.0 in that case instead.
        self.hit_ratio = cache_hit_size / self.total_size if self.total_size else 0.0

    def __repr__(self):
        return "Hit Ratio : " + str(self.hit_ratio)

### Constants

In [4]:
# Connection settings for the database the notebook reads from. Each value can
# be overridden through an environment variable so real credentials do not have
# to be committed with the notebook; the fallbacks are the original values.
DB_HOST = os.environ.get("CACHETEST_DB_HOST", "pidb")
DB_USER = os.environ.get("CACHETEST_DB_USER", "dbuser")
DB_PASSWORD = os.environ.get("CACHETEST_DB_PASSWORD", "password")
DB_NAME = os.environ.get("CACHETEST_DB_NAME", "cachetest")

# Keyword arguments are required: PyMySQL 1.0 made connect() keyword-only, so
# the positional form fails on current releases.
db_connection = pymysql.connect(host=DB_HOST, user=DB_USER, password=DB_PASSWORD, database=DB_NAME)

### Useful Functions

#### Table Read

In [5]:
def read_full_db_table(table_name):
    """Return every row of the named database table as a DataFrame.

    The table name is concatenated directly into the SQL text and the
    statement is run on the shared ``db_connection``.
    """
    select_all_statement = 'SELECT * FROM ' + table_name
    return pd.read_sql(select_all_statement, con=db_connection)

#### Query to Dataframe

In [6]:
def obtain_df_from_query(query_string):
    """Run ``query_string`` on the shared ``db_connection`` and return the rows as a DataFrame."""
    result_df = pd.read_sql(query_string, con=db_connection)
    return result_df

#### Cache Hit Bytes to Boolean

In [7]:
def convert_cache_hit_to_boolean(df):
    r"""Convert the ``cache_hit`` column of ``df`` from single-byte values to booleans.

    ``b'\x00'`` becomes False and ``b'\x01'`` becomes True. Values that are
    already booleans are kept as they are, so converting an already converted
    frame again (for example when a cell is re-executed) does not wipe the
    column out. Any other value becomes NaN.

    The column is replaced on ``df`` itself and the same DataFrame is returned.
    """
    byte_to_bool = {b'\x00': False, b'\x01': True}

    def to_bool(raw):
        # Values converted by an earlier call pass through unchanged.
        if isinstance(raw, (bool, np.bool_)):
            return bool(raw)
        return byte_to_bool.get(raw, np.nan)

    df.cache_hit = df.cache_hit.map(to_bool)
    return df

#### DF ids to comma separated string

In [8]:
def df_ids_to_string(df):
    """Return the values of ``df.id`` as one comma-separated string, e.g. ``"1,2,3"``.

    The result is used as the contents of a SQL ``in (...)`` list. A frame
    without rows yields an empty string.
    """
    # The separator matters: joining with '' fused the ids into one number
    # ("12" instead of "1,2") whenever more than one row was passed.
    return ','.join(str(row_id) for row_id in df.id.tolist())

#### Process a Set of Parameters

In [18]:
def _fetch_rows_for_batches(table_name, batch_ids_sql):
    """Load every row of ``table_name`` whose ``batch_id`` is in the comma-separated id list."""
    return obtain_df_from_query("SELECT * FROM " + table_name
                                + " WHERE batch_id IN (" + batch_ids_sql + ")")


def _split_by_batch(df, batch_ids):
    """Map each batch id to the rows of ``df`` that carry that ``batch_id``."""
    return {batch_id: df.query("batch_id == '" + str(batch_id) + "'") for batch_id in batch_ids}


def crunch_the_data(cache_type, cache_size_mb, eviction_policy):
    """Print the cache hit ratio of every batch that was run with the given parameters.

    Selects the rows of the global ``batch_df`` whose ``cache_type``,
    ``cache_size_mb`` and ``eviction_policy`` match the arguments, loads the
    ``cache_performance``, ``cache_initial_content`` and ``cache_remainder``
    rows of those batches from the database, and prints hits, misses and hit
    ratio per batch followed by the average hit ratio over the batches.

    Args:
        cache_type: value matched against the ``cache_type`` column.
        cache_size_mb: value matched against the ``cache_size_mb`` column; it
            is compared in its string form.
        eviction_policy: value matched against the ``eviction_policy`` column.

    Returns:
        None. All results are printed.
    """
    query_for_batch = ("cache_type == '" + cache_type + "' and cache_size_mb == '" + str(cache_size_mb)
                       + "' and eviction_policy == '" + eviction_policy + "'")
    # Batches fitting criteria
    batches_df = batch_df.query(query_for_batch)
    if batches_df.empty:
        # Without this guard the SQL below would contain an empty "IN ()" list
        # and the average further down would divide by zero.
        print("No batches found for cache type ", cache_type, ", cache size (MB) ", cache_size_mb,
              ", eviction policy ", eviction_policy)
        return

    # All three tables are filtered by the same id list, so build it once.
    batch_ids_sql = df_ids_to_string(batches_df)
    batch_ids = batches_df.id.unique().tolist()

    # Cache Performance
    # Issue with Booleans coming in as bytes and not booleans
    cache_performance_df = convert_cache_hit_to_boolean(
        _fetch_rows_for_batches("cache_performance", batch_ids_sql))
    # Cache Initial Content
    cache_initial_content_df = _fetch_rows_for_batches("cache_initial_content", batch_ids_sql)
    # Cache Remainder
    cache_remainder_df = _fetch_rows_for_batches("cache_remainder", batch_ids_sql)

    # Separate data into dictionary based on batches
    cache_performance_per_batch = _split_by_batch(cache_performance_df, batch_ids)
    # NOTE(review): the initial-content and remainder splits are loaded but not
    # used by the metrics below; kept because the original loaded them too.
    cache_initial_content_per_batch = _split_by_batch(cache_initial_content_df, batch_ids)
    cache_remainder_per_batch = _split_by_batch(cache_remainder_df, batch_ids)

    # Compute the Cache Performance Metrics
    cache_performance_metrics = []
    for batch_id, cp_df in cache_performance_per_batch.items():
        print("For batch id : ", batch_id)
        hit_size = int((cp_df.cache_hit == True).sum())
        miss_size = int((cp_df.cache_hit == False).sum())
        if hit_size + miss_size == 0:
            # A batch without hit/miss rows has no hit ratio; skipping it avoids
            # a division by zero and keeps it from skewing the average.
            print("No cache performance rows recorded, batch skipped")
            continue
        cache_performance_metric = CachePerformanceMetrics(hit_size, miss_size)
        cache_performance_metrics.append(cache_performance_metric)
        print("Cache Hits ", cache_performance_metric.cache_hit_size, ", Misses : ",
              cache_performance_metric.cache_miss_size, " Hit Ratio : ", cache_performance_metric.hit_ratio)

    if not cache_performance_metrics:
        print("Average Hit Ratio : not available (no cache performance rows)")
        return
    average_hit_ratio = (sum(metric.hit_ratio for metric in cache_performance_metrics)
                         / len(cache_performance_metrics))
    print("Average Hit Ratio : ", average_hit_ratio)

### Global Data

In [10]:
# Load the complete batch table once; the cells below filter this frame.
batch_df = read_full_db_table(table_name="batch")

## Data Import <a class="anchor" id="data-import"></a>
***

In [11]:
# Print each distinct cache type that appears in the batch table.
list_of_cache_types = batch_df["cache_type"].unique().tolist()
for cache_type in list_of_cache_types:
    print(cache_type)

ehcache
hazelcast


## Ehcache Data

### Ehcache LRU 128 MB

In [19]:
# Hit ratio for Ehcache with a 128 MB cache and LRU eviction.
crunch_the_data(cache_type="ehcache", cache_size_mb=128, eviction_policy="LRU")

For batch id :  3
Cache Hits  268 , Misses :  3961  Hit Ratio :  0.06337195554504611
Average Hit Ratio :  0.06337195554504611


### Ehcache LRU 256 MB

In [13]:
# Hit ratio for Ehcache with a 256 MB cache and LRU eviction.
crunch_the_data(cache_type="ehcache", cache_size_mb=256, eviction_policy="LRU")

For batch id :  1
Cache Hits  1084 , Misses :  7576  Hit Ratio :  0.12517321016166283
[Hit Ratio : 0.12517321016166283]


### Ehcache LRU 512 MB

In [14]:
# Hit ratio for Ehcache with a 512 MB cache and LRU eviction.
crunch_the_data(cache_type="ehcache", cache_size_mb=512, eviction_policy="LRU")

For batch id :  2
Cache Hits  5961 , Misses :  17795  Hit Ratio :  0.25092608183195825
[Hit Ratio : 0.25092608183195825]


### Ehcache LRU 1024 MB

In [17]:
# Hit ratio for Ehcache with a 1024 MB cache and LRU eviction.
crunch_the_data(cache_type="ehcache", cache_size_mb=1024, eviction_policy="LRU")

For batch id :  4
Cache Hits  36688 , Misses :  35786  Hit Ratio :  0.5062229213235091
[Hit Ratio : 0.5062229213235091]
