# Starting with Bokeh
This notebook contains the first functions for working with Bokeh.


In [None]:
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, output_file

from bokeh.io import show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar,
)

from math import pi
import sys
import matplotlib as mpl
mpl.rcParams['axes.titlesize'] = 20
mpl.rcParams['axes.labelsize'] = 18
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Data pre-processing: read the semicolon-separated CSV export.
data = pd.read_csv('data/data_elog_eindhoven.csv', sep=';')

# Drop the 'Unnamed: 0' and 'index' columns.
to_delete = ['Unnamed: 0', 'index']
data = data.drop(to_delete, axis=1)

# Sort the rows by location first and by UTC_time second, both ascending.
data = data.sort_values(by=['location', 'UTC_time'], ascending=True)
def calculate_diff(data):
    """
    Compute the consumption difference between consecutive rows per location.

    Params:
    data: frame with at least the columns 'location' and 'total'; rows are
          differenced in their current order, so sort them beforehand
    Return:
    A new frame with a fresh 0..n-1 index and an extra column 'delta_total'
    holding total minus the previous total of the same location (NaN for the
    first row of every location). The frame passed in is left untouched.
    """
    # Work on the re-indexed copy so the caller's frame is not modified.
    result = data.reset_index(drop=True)
    # groupby(...).diff() keeps the row index of the frame, unlike
    # groupby(...).apply(), whose result may be keyed by group and then
    # cannot be assigned back as a column.
    result['delta_total'] = result.groupby('location')['total'].diff()
    return result

data = calculate_diff(data)

# Create the new variables: a constant marker column, the parsed timestamp
# and the calendar parts derived from it. assign() evaluates the entries in
# order, so later entries can read 'datetime64'.
data = data.assign(
    dummy=1,
    datetime64=lambda frame: pd.to_datetime(frame['UTC_time']),
    norm_date=lambda frame: frame['datetime64'].dt.normalize(),
    year=lambda frame: frame['datetime64'].dt.year,
    month=lambda frame: frame['datetime64'].dt.month,
    day=lambda frame: frame['datetime64'].dt.day,
    hour=lambda frame: frame['datetime64'].dt.hour,
)

# Keep only the rows recorded in 2017.
data = data.loc[data['year'] == 2017]

## Heat-Map for Water Consumption
Visualization of the average water consumption per hour in different time sections of the day. Only averages where the number of observations satisfies $n \geq H$ are considered (0 otherwise).

### Data Aggregation


In [None]:
def heat_map_mpl(hour_consuption, num_locations, H = 0):
    """
    Draw a seaborn heat map of the aggregated consumption matrix.

    Params:
    hour_consuption: matrix of aggregated consumption values to plot
    num_locations: matrix with the same layout as hour_consuption holding the
                   count behind every cell
    H: minimum count; cells whose count is below H are plotted as 0
    Return:
    None, the figure is displayed with plt.show()
    """
    # Zero the under-populated cells on a copy so the caller's matrix is
    # not modified by plotting it.
    plot_data = hour_consuption.copy()
    plot_data[num_locations < H] = 0

    _, ax = plt.subplots(figsize=(25, 20))
    cmap = sns.cubehelix_palette(light=1, as_cmap=True)
    # Draw on the axes created above instead of relying on the implicit
    # "current axes".
    sns.heatmap(plot_data, cmap=cmap, linewidths=.5, square=False, ax=ax)
    ax.set_xticks([])
    plt.show()


def data_aggregation(data, aggegation_method = 'sum'):
    """
    This function creates the matrices that are used to generate the heat maps.

    Rows containing missing values are dropped first. Then 'delta_total' is
    aggregated for every (month, hour) pair and the number of distinct
    locations found in every pair is counted.

    Params:
    data: the elog data set; it must provide the columns 'month', 'hour',
          'delta_total' and 'location'
    aggegation_method: how to aggregate the data ['sum', 'mean', 'median']
    Return:
    hour_consuption: matrix with the aggregated consumption, one row per hour
                     and one column per month (labels converted to strings)
    num_locations: matrix with the number of distinct locations per cell,
                   same layout as hour_consuption
    Raises:
    ValueError: if aggegation_method is not one of the supported options
    """
    supported_methods = ('sum', 'mean', 'median')
    if aggegation_method not in supported_methods:
        # Raise instead of calling sys.exit(): a helper should not terminate
        # the interpreter of whoever calls it.
        raise ValueError(
            'The option {} does not exist, please select [sum, mean, median]'.format(aggegation_method)
        )

    # Here we create the matrices that will be shown at the heat-map
    data = data.dropna()
    grouped = data.groupby(by=['month', 'hour'])

    # Both results are indexed by (month, hour); unstack() moves the hour
    # level into the columns.
    hour_consuption = grouped['delta_total'].agg(aggegation_method).unstack()
    num_locations = grouped['location'].nunique().unstack()

    # Change formats: string labels on both axes. Every matrix converts its
    # own labels so a misalignment is not hidden by overwriting them.
    hour_consuption.index = hour_consuption.index.astype(str)
    hour_consuption.columns = hour_consuption.columns.astype(str)
    num_locations.index = num_locations.index.astype(str)
    num_locations.columns = num_locations.columns.astype(str)

    # Test for errors
    assert hour_consuption.shape == num_locations.shape, 'different shapes'

    # Transpose so that rows are hours and columns are months.
    return hour_consuption.T, num_locations.T

# Aggregate the whole data set with the median: both results have one row per
# hour and one column per month.
hour_consuption, num_locations = data_aggregation(data, 'median')

In [None]:
# Preview the first rows of the aggregated consumption matrix.
hour_consuption.head()

In [None]:
# Create a file per location
def create_files_HM(data, output_dir='data/Data_heat_maps'):
    """
    Write the heat-map input files, one set of CSV files per location.

    For every distinct value of data['location'] the rows of that location
    are aggregated with data_aggregation(..., 'sum') and three files named
    '<location>.csv' are written:
      - <output_dir>/hour_consuption: the aggregated consumption matrix
      - <output_dir>/num_locations: the matching count matrix
      - <output_dir>/aggregated_day: the column totals of the consumption
        matrix, stored as the columns 'norm_date' and 'total_consuption'

    Params:
    data: the elog data set, must contain the column 'location' plus the
          columns required by data_aggregation
    output_dir: base folder of the three output folders; the default is the
                location that used to be hard-coded
    Return:
    None
    """
    import os

    folders = {
        name: os.path.join(output_dir, name)
        for name in ('hour_consuption', 'num_locations', 'aggregated_day')
    }
    # Create the output folders up front so a fresh checkout does not fail
    # on the first to_csv call.
    for folder in folders.values():
        os.makedirs(folder, exist_ok=True)

    for location in data['location'].unique():
        location_data = data[data['location'] == location]
        hour_consuption, num_locations = data_aggregation(location_data, 'sum')

        # Sum over the rows of the matrix: one total per column label.
        aggregated_day = hour_consuption.sum(axis=0).reset_index()
        aggregated_day.columns = ['norm_date', 'total_consuption']

        file_name = '{}.csv'.format(location)
        hour_consuption.to_csv(os.path.join(folders['hour_consuption'], file_name))
        num_locations.to_csv(os.path.join(folders['num_locations'], file_name))
        aggregated_day.to_csv(os.path.join(folders['aggregated_day'], file_name), index=False)
        
                             
# Write the per-location CSV files for the pre-processed data set.
create_files_HM(data)

In [None]:
# #These heat-maps can be used to spot mistakes
# heat_map_mpl(hour_consuption, num_locations, 0)

### Image
Let's have some fun with the Bokeh library


In [None]:
# Simple Image plot, it seems 
def create_heat_map(data):
    """
    Show the matrix in data as a Bokeh image and save it to image.html.

    Params:
    data: two-dimensional table; its .values array is drawn with the
          "Spectral11" palette, one cell per unit of both axis ranges
    Return:
    None
    """
    # Based on: https://bokeh.pydata.org/en/latest/docs/gallery/image.html
    n_rows, n_cols = data.shape[0], data.shape[1]

    output_file("image.html", title="image.py example")

    plot = figure(x_range=(0, n_cols), y_range=(0, n_rows))
    plot.image(
        image=[data.values],
        x=0,
        y=0,
        dw=n_cols,
        dh=n_rows,
        palette="Spectral11",
    )
    show(plot)

In [None]:
# Draw the aggregated consumption matrix as a Bokeh image.
create_heat_map(hour_consuption)

### Interactive scatter plot
from: https://bokeh.pydata.org/en/latest/docs/gallery/color_scatter.html


### Interactive heat-map
from: http://bokeh.pydata.org/en/latest/docs/gallery/unemployment.html

In [None]:
def plot_bokeh(data):
    """
    Show an interactive Bokeh heat map of the matrix in data.

    The index labels of data become the y axis and the column labels the
    x axis. Every cell is drawn as a 1x1 rectangle coloured by its value, a
    colour bar is added on the right and a hover tool reports the cell.

    Params:
    data: matrix to plot. NOTE(review): the glyph and the tooltips read the
          fields "month" and "hour", so the column and index levels are
          presumably named that way - confirm against the caller.
    Return:
    None
    """
    y_labels = list(data.index)
    x_labels = list(data.columns)

    # Long format: one row per cell with the cell value stored under 'rate'.
    stacked = data.stack()
    cells = pd.DataFrame(stacked, columns=['rate']).reset_index()
    source = ColumnDataSource(cells)

    # this is the colormap from the original NYTimes plot
    palette = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
    lowest = cells.rate.min()
    highest = cells.rate.max()
    color_mapper = LinearColorMapper(palette=palette, low=lowest, high=highest)

    # NOTE(review): plot_width/plot_height were replaced by width/height in
    # Bokeh 3.0 - confirm against the installed Bokeh version.
    heading = "Water Consumption (months from {0} to {1})".format(x_labels[0], x_labels[-1])
    plot = figure(
        title=heading,
        x_range=x_labels,
        y_range=y_labels,
        x_axis_location="above",
        plot_width=900,
        plot_height=400,
        tools="hover,save,pan,box_zoom,reset,wheel_zoom",
        toolbar_location='below',
    )

    # Strip grid and axis decoration so only the coloured cells remain.
    plot.grid.grid_line_color = None
    plot.axis.axis_line_color = None
    plot.axis.major_tick_line_color = None
    plot.axis.major_label_text_font_size = "5pt"
    plot.axis.major_label_standoff = 0
    plot.xaxis.major_label_orientation = pi / 3

    plot.rect(
        x="month",
        y="hour",
        width=1,
        height=1,
        source=source,
        fill_color={'field': 'rate', 'transform': color_mapper},
        line_color=None,
    )

    legend = ColorBar(
        color_mapper=color_mapper,
        major_label_text_font_size="5pt",
        ticker=BasicTicker(desired_num_ticks=len(palette)),
        label_standoff=6,
        border_line_color=None,
        location=(0, 0),
    )
    plot.add_layout(legend, 'right')

    hover = plot.select_one(HoverTool)
    hover.tooltips = [
        ('date', '@month @hour'),
        ('Water Consumption (L)', '@rate'),
    ]

    show(plot)  # show the plot
    
# Draw the interactive heat map for the aggregated consumption matrix.
plot_bokeh(hour_consuption)