# Serverless cost simulator for web-based traffic

This notebook simulates a synthetic month of requests based on the shape of Wikipedia traffic. You can tune the `monthly_scale_factor` to adjust the total number of requests for the month.

After simulation, the total cost for the requests is calculated, plotting nice graphs with [plotly](https://plot.ly/).

We have chosen the English Wikipedia as the data source, as it can be a fair representation of worldwide traffic. Other Wikipedia language editions can be used to localize it further.

**TO-DO:**
- Study if datetime objects are really necessary
- Replace datetime.now with some other date


## 0. Initial setup

### Imports

In [None]:
# dataframe-related imports
from datetime import datetime
import pandas as pd
import numpy as np

import webish_simulator
import wikimedia_scraper as ws

## User variables

In [None]:
# Language edition of Wikipedia used as the traffic source
wikipedia_language = 'en'

# Lambda sizing: memory in MiB, execution time in ms
lambda_memory, lambda_time = 128, 200

# Image type handed to every iplot() call; None leaves image export off.
image_type = None
# Uncomment next line to produce pngs
# image_type = 'png'


### Plotly setup

In [None]:
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode before any figure is drawn
init_notebook_mode(connected=True)

# Base layout that every figure layout in this notebook is copied from:
# a horizontal legend at y = -0.2 and an auto-ranged, free-anchored date x-axis
common_layout = go.Layout(
    legend={'orientation': 'h', 'y': -0.2},
    xaxis={'autorange': True, 'type': 'date', 'anchor': 'free'},
)

# Layout for the request-distribution plots. The title and tick format are
# overridden by each plotting cell before the figure is built.
requests_date_layout = go.Layout(common_layout)
requests_date_layout.title = 'Requests distribution'
requests_date_layout.xaxis.title = 'Time'
requests_date_layout.yaxis.title = '# Requests'

# Layout for the cost-over-time plot (its title is set by the plotting cell)
time_cost_layout = go.Layout(common_layout)
time_cost_layout.xaxis.title = 'Time'
time_cost_layout.yaxis.title = 'Cost ($)'

# Layout for the break-even vs. scale plot. Its x-axis is a plain number
# (average requests per second), so the inherited 'date' type is replaced.
breakeven_scale_layout = go.Layout(common_layout)
# No trailing comma here: a trailing comma would turn the title into a 1-tuple
breakeven_scale_layout.title = 'Break-even - scale'
breakeven_scale_layout.xaxis.title = 'AVG. reqs. per second'
breakeven_scale_layout.xaxis.type = 'linear'
breakeven_scale_layout.yaxis.title = '% total monthly reqs to hit breakeven '

# Colour palette: each entry is a marker-style dict that can be passed
# directly as the `marker` argument of a trace.
BBVAcore = {'color': '#072146'}
BBVAnavy = {'color': '#004481'}
BBVAaqua = {'color': '#2dcccd'}
BBVAmedium = {'color': '#2A86CA'}
BBVAlight = {'color': '#5BBEFF'}
BBVAred = {'color': '#da3851'}
BBVAcoral = {'color': '#f35e61'}
BBVAorange = {'color': '#f7893b'}
BBVAgreen = {'color': '#48ae64'}
BBVAyellow = {'color': '#f8cd51'}
BBVApink = {'color': '#f78be8'}
BBVAgold = {'color': '#d8be75'}
BBVApurple = {'color': '#8f7ae5'}

## 1. Get data from the Wikipedia source

`ws.get_traffic_generator` connects to wikimedia and downloads traffic logs for the selected date range. Data is cached for faster access in subsequent executions.

**Warning:** During the first execution this could take a long time!

Parameters to tune:
- `project`: project from wikipedia (language) to use.
- `start_date` and `end_date`: date range to extract from wikipedia.

In [None]:
# Wikipedia project (language) whose traffic is requested
project = wikipedia_language

# Date range to extract
start_date, end_date = datetime(2017, 1, 1), datetime(2017, 12, 31)

ws.output_notebook()

# Build the traffic generator for the selected project and date range,
# then materialize everything it yields into a DataFrame
traffic_generator = ws.get_traffic_generator(
    start_date,
    end_date,
    projects=(project,),
)
df = pd.DataFrame(list(traffic_generator))

In [None]:
# Index the frame by the timestamps of its 'date' column, then drop that
# column since the index now carries the same information
df = (
    df.set_index(pd.DatetimeIndex(df['date']))
      .drop(['date'], axis=1)
)

# Weekly rolling mean of the hits: a window of 24*7 samples, producing a
# value as soon as 3 samples are available
df['rolling'] = df['hits'].rolling(window=24 * 7, min_periods=3).mean()

df.head()

### 1.1 A (random) week of Wikipedia requests

In [None]:
# One fixed week sliced out of the original DataFrame
random_week_df = df.loc['2017-01-23':'2017-01-29']

# Raw request counts for that week
requests_trace = go.Scatter(
    x=random_week_df.index,
    y=random_week_df.hits,
    name='Requests (EN)',
    marker=BBVAnavy,
)

# Rolling mean (24-sample window) of the same subset
rolling_trace = go.Scatter(
    x=random_week_df.index,
    y=random_week_df.hits.rolling(window=24, min_periods=0).mean(),
    name='Requests (EN - day mean)',
    marker=BBVAaqua,
)

data = [requests_trace, rolling_trace]

# Customize the title and the date format of the x-axis ticks
requests_date_layout.title = "Wikipedia requests distribution (one week)"
requests_date_layout.xaxis.tickformat = '%a'

fig = go.Figure(data=data, layout=requests_date_layout)
iplot(
    fig,
    image=image_type,
    image_height=480,
    image_width=800,
    filename='serverless.wikipedia.random_week',
)

### 1.2 Plot a whole year of requests to the English Wikipedia

In [None]:
# Request counts over the full index of the DataFrame
requests_trace = go.Scatter(
    x=df.index,
    y=df['hits'],
    name='Requests (EN)',
    marker=BBVAnavy,
)

# Precomputed weekly rolling mean stored in the 'rolling' column
rolling_trace = go.Scatter(
    x=df.index,
    y=df['rolling'],
    name='Requests (EN - week mean)',
    marker=BBVAaqua,
)

data = [requests_trace, rolling_trace]

# Customize the title and the date format: month ticks, at most 12 of them
requests_date_layout.title = "Wikipedia requests (2017)"
requests_date_layout.xaxis.tickformat = '%b'
requests_date_layout.xaxis.nticks = 12

fig = go.Figure(data=data, layout=requests_date_layout)
iplot(
    fig,
    image=image_type,
    image_height=480,
    image_width=800,
    filename='serverless.wikipedia.synth_year',
)

## 2. Build a synthetic month of requests

Taking the hourly Wikipedia traffic shape, simulate a _mean month_ whose request distribution has the shape of the selected Wikipedia project.

The resulting scale (i.e. the total number of requests in a month) is configurable.

Note: The scale (total reqs in a month) can be controlled by using the param `monthly_scale_factor` in `webish_simulator.simulate`

In [None]:
# Total number of requests in the simulated month (100 million)
monthly_scale_factor = 100 * 10 ** 6

# Build the synthetic month from the traffic DataFrame
month_df = webish_simulator.simulate(
    df,
    monthly_scale_factor=monthly_scale_factor,
)

month_df.head()

### 2.1 Plot synthetic month

In [None]:
# Simulated request counts
requests_trace = go.Scatter(
    x=month_df.index,
    y=month_df.requests,
    name='Requests',
    marker=BBVAnavy,
)

# Rolling mean (24-sample window) of the simulated requests
rolling_trace = go.Scatter(
    x=month_df.index,
    y=month_df.requests.rolling(window=24, min_periods=0).mean(),
    name='Requests (day mean)',
    marker=BBVAaqua,
)

data = [requests_trace, rolling_trace]

# Customize the title and the date format: day-of-month ticks, at most 30
requests_date_layout.title = "Requests distribution (synthetic)"
requests_date_layout.xaxis.tickformat = '%d'
requests_date_layout.xaxis.nticks = 30

fig = go.Figure(data=data, layout=requests_date_layout)
iplot(
    fig,
    image=image_type,
    image_height=480,
    image_width=800,
    filename='serverless.wikipedia.synth_month',
)

### 2.2 Calculate costs

In [None]:
# EC2 flavors to compare against Lambda, mapped to the value passed as
# `max_reqs_per_second` to webish_simulator.get_ec2_cost
ec2_flavors = {
    'm3.medium':  1000,
    'm4.large':   1500,
    'm4.4xlarge': 5000,
}

# Add the Lambda cost columns for the configured memory and execution time
month_df = webish_simulator.get_lambda_cost(month_df, MB_per_request=lambda_memory, ms_per_req=lambda_time)

# For every flavor: add its cost columns, store the difference between the
# 'lambda_sum' and '<flavor>_sum' columns, and look up its break-even point
break_even = {}
for flavor, reqs in ec2_flavors.items():
    month_df = webish_simulator.get_ec2_cost(month_df, flavor=flavor, max_reqs_per_second=reqs)
    month_df[flavor + '_break_even'] = month_df['lambda_sum'] - month_df[flavor + '_sum']
    break_even[flavor] = webish_simulator.find_breakeven(month_df, flavor)

# Get break-even points for the current DF as (index label, lambda_sum) pairs.
# Flavors without a break-even point (None) get no entry.
time_at_break_even = {}
for flavor in ec2_flavors:
    if break_even[flavor] is None:
        continue
    break_even_row = month_df.loc[month_df['req_sum'] == break_even[flavor]]
    # Use .iloc[0] for the first matching row: a plain [0] on a Series is a
    # label lookup first and only falls back to position on a non-integer
    # index, a fallback that recent pandas versions deprecate.
    time_at_break_even[flavor] = (
        break_even_row.index[0],
        break_even_row['lambda_sum'].iloc[0],
    )

month_df.tail()

### 2.3 Plot costs (synthetic month)

In [None]:
# TODO: Show number of instances in the plot (the original note was left unfinished)
from itertools import cycle

# Cycle through the palette so that having more EC2 flavors than colors
# reuses colors instead of raising StopIteration.
# Alternative palette: BBVAred, BBVAorange, BBVAgreen, BBVAyellow
color = cycle([BBVAlight, BBVAaqua, BBVAnavy])

# Lambda cost over the month
lambda_trace = go.Scatter(
    x=month_df.index,
    y=month_df['lambda_sum'],
    name='Lambda',
    marker=BBVAcoral,
)

data = [lambda_trace]

# One cost trace per EC2 flavor
for flavor in ec2_flavors:
    ec2_trace = go.Scatter(
        x=month_df.index,
        y=month_df[flavor + '_sum'],
        name=flavor,
        marker=next(color),
    )
    data.append(ec2_trace)

# Customize date format
time_cost_layout.xaxis.tickformat = '%d'
time_cost_layout.xaxis.ticklen = 1
time_cost_layout.xaxis.nticks = 30
time_cost_layout.title = "Cost evolution during a synthetic month (" + str(lambda_memory) + " MiB Lambda)"

# Print breakeven points (if they exist).
# The list is built first and assigned once, so this does not depend on the
# layout's `annotations` property being mutable in place. Plain dicts are
# used instead of the deprecated go.Annotation class.
time_cost_layout['annotations'] = [
    dict(
        text="Break-even (" + flavor + ")",
        x=time_at_break_even[flavor][0],
        y=time_at_break_even[flavor][1],
    )
    for flavor in ec2_flavors
    if break_even[flavor] is not None
]

# NOTE(review): the axis ranges are hard-coded; they assume the synthetic
# month is January 2018 and that costs stay below 120 - confirm.
mylayout = dict(time_cost_layout)
mylayout['xaxis']['range'] = [datetime(2018, 1, 1), datetime(2018, 1, 31)]
mylayout['yaxis']['range'] = [0, 120]

fig = go.Figure(data=data, layout=mylayout)
iplot(fig, image=image_type, image_height=480, image_width=800, filename='serverless.wikipedia.synth_month_costs')

## 3. Simulate multiple scenarios with different total requests in a month

In [None]:
# Lambda configuration handed to the break-even simulation
lambda_flavor = dict(memory=lambda_memory, exec_time=lambda_time)

# EC2 flavors to evaluate, each mapped to its requests-per-second figure
ec2_flavors = {
    'm3.medium': 1000,
    'm4.large': 1500,
    'm4.4xlarge': 5000,
}

# Run the break-even calculation for monthly totals from 30 million up to
# (but excluding) 800 million requests, in steps of 5 million
x, y = webish_simulator.get_breakeven(
    df,
    range(30 * 10 ** 6, 800 * 10 ** 6, 5 * 10 ** 6),
    ec2_flavors,
    lambda_flavor,
)

In [None]:
from itertools import cycle

# Cycle through the palette so that more flavors than colors reuses colors
# instead of raising StopIteration
color = cycle([BBVAlight, BBVAaqua, BBVAnavy])

# One trace per flavor: break-even points plotted against the shared x values
data = []
for flavor, breakeven_points in y.items():
    breakeven_trace = go.Scatter(
        x=x,
        y=breakeven_points,
        name=flavor,
        marker=next(color),
    )
    data.append(breakeven_trace)

fig = go.Figure(data=data, layout=breakeven_scale_layout)

# This figure gets its own export name. It previously reused
# 'serverless.wikipedia.synth_month_costs', the name already used by the
# monthly cost plot, so both figures shared a single image name.
iplot(fig, image=image_type, image_height=480, image_width=800, filename='serverless.wikipedia.breakeven_scale')