In [1]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.models import Span, CustomJS, Select, DateRangeSlider, Spinner, Toggle, CheckboxGroup, Panel, DataRange, FactorRange
from bokeh.models.annotations import Title
from bokeh.models.widgets import Tabs
from bokeh.models import ColumnDataSource, HoverTool, ColorBar, FixedTicker, SingleIntervalTicker, LinearAxis, Button, DatePicker
from bokeh.layouts import gridplot, column, row, Column, layout
from bokeh.transform import linear_cmap
from bokeh.palettes import all_palettes
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
import datetime
from bokeh.io import curdoc
from selenium import webdriver
import chromedriver_binary
output_notebook()

In [2]:
#!pip install pyspark

In [3]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

## Importamos la base de datos `ads_produccion.csv`

In [4]:
#link = 'https://drive.google.com/file/d/16NNM2GhenGdPJ4celJhmOgDn7YTHNQyb/view?usp=sharing'
#downloaded = drive.CreateFile({'id':'16NNM2GhenGdPJ4celJhmOgDn7YTHNQyb'})
#downloaded.GetContentFile('ads_produccion.csv')

In [5]:
# Reader settings for the production dataset: comma-separated CSV with a
# header row, column types inferred by Spark.
file_location = "ads_produccion.csv"
file_type = "csv"
delimiter = ","
first_row_is_header = "True"
infer_schema = "True"

# Load the file into a Spark DataFrame, passing all reader options in one call.
ads_produccion = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

### Filtramos la base de datos

In [6]:
# Parse `Fecha_Hora` into the timestamp column `fecha`, keep only rows whose
# `ton_total` is non-null and strictly positive, then drop the listed columns.
ads_produccion = (
    ads_produccion
    .withColumn('fecha', F.to_timestamp('Fecha_Hora', 'yyyy-MM-dd HH:mm:ss'))
    .filter(F.col('ton_total').isNotNull())
    .filter(F.col('ton_total') > 0)
    .drop(*['Fecha_Hora', 'n_perfo', 'n_eq_apoyo', 'n_aljibe'])
)

## Importamos la base de datos `Datos_movimiento_mina.csv`

In [7]:
#link = 'https://drive.google.com/file/d/1imqAv7vgnkNNm_g9sBZz2D7u5xtERy_B/view?usp=sharing'
#downloaded = drive.CreateFile({'id':'1imqAv7vgnkNNm_g9sBZz2D7u5xtERy_B'})
#downloaded.GetContentFile('Datos_movimiento_mina.csv')

In [8]:
# Reader settings for the mine-movement dataset: semicolon-separated CSV with
# a header row, column types inferred by Spark.
file_location = "Datos_movimiento_mina.csv"
file_type = "csv"
delimiter = ";"
first_row_is_header = "True"
infer_schema = "True"
# Load the file into a Spark DataFrame.
movimiento_mina = (
    spark.read
    .format(file_type)
    .options(sep=delimiter, header=first_row_is_header, inferSchema=infer_schema)
    .load(file_location)
)

### Preprocesamiento de la base de datos

In [9]:
# Raw column name -> short name, for every numeric measurement kept from the
# source file. The order here is the column order of the resulting DataFrame.
_NUMERIC_COLUMNS = {
    'TONELAJE FC': 'ton_camion',
    '# Baldes': 'n_baldes',
    'TIEMPO DE ACULATAMIENTO': 't_aculatamiento',
    'TIEMPO DE CARGA': 't_carga',
    'TIEMPO DE DESCARGA': 't_descarga',
    'TIEMPO ESPERA PALA': 't_espera_pala',
    'TIEMPO COLA CAMION': 't_cola_camion',
    'TIEMPO VIAJE HACIA LA PALA': 't_a_pala',
    'TIEMPO DE VIAJE HACIA DESCARGA': 't_viaje_descarga',
    'TIEMPO DE CICLO TOTAL CAEX': 't_ciclo_caex',
    'TIEMPO DE CICLO EFECTIVO PALA': 't_ciclo_pala',
    'DISTANCIA DE VIAJE VACIO': 'd_viaje_vacio',
    'DISTANCIA DE VIAJE LLENO': 'd_viaje_lleno',
    'DISTANCIA EQUIVALENTE DE VIAJE VACIO': 'd_eq_vacio',
    'DISTANCIA EQUIVALENTE DE VIAJE LLENO': 'd_eq_lleno',
}


def _null_marker_to_none(name):
    """Return column `name` with the literal text "NULL" turned into a real null.

    The comparison is done on a string view of the column so it is valid
    whatever type Spark inferred; the untouched values keep their original type.
    """
    column = F.col(name)
    return F.when(column.cast('string') == 'NULL', None).otherwise(column)


# The "NULL" placeholder is removed BEFORE casting. Doing it after the cast
# (as this cell previously did) compares already-typed double/timestamp columns
# with a string, which never matches and is rejected when ANSI mode is on.
movimiento_mina = movimiento_mina.select(
    *[
        _null_marker_to_none(raw_name).cast(T.DoubleType()).alias(short_name)
        for raw_name, short_name in _NUMERIC_COLUMNS.items()
    ],
    F.to_timestamp(_null_marker_to_none('LoadingTimestamp'), 'yyyy-MM-dd HH:mm:ss.SSS').alias('Loading'),
    F.to_timestamp(_null_marker_to_none('DumpingTimestamp'), 'yyyy-MM-dd HH:mm:ss.SSS').alias('Dumping'),
)

### Eliminamos las filas que contengan valores negativos y los valores nulos

In [10]:
# Keep only rows where tonnage, unload time, load time and both cycle times are
# strictly positive; drop four timing columns; finally discard any row that
# still contains a null.
movimiento_mina = (
    movimiento_mina
    .filter(F.col('ton_camion') > 0)
    .filter(F.col('t_descarga') > 0)
    .filter(F.col('t_carga') > 0)
    .filter(F.col('t_ciclo_caex') > 0)
    .filter(F.col('t_ciclo_pala') > 0)
    .drop(*['t_espera_pala', 't_cola_camion', 't_aculatamiento', 't_a_pala'])
    .dropna()
)

### Realizamos una agrupacion por hora de cada variable en `Datos_movimiento_mina.csv` para que coincida con la base de datos `ads_produccion.csv` y podamos hacer un `join` de ambas

In [11]:
# Rounded per-group mean of every measurement column.
# `Loading` and `Dumping` are timestamps that only serve to build the hour key,
# so they are excluded here instead of being averaged and then dropped.
expression = [
    F.round(F.mean(col), 0).alias(col)
    for col in movimiento_mina.columns
    if col not in ('Loading', 'Dumping')
]
mov_historico_hora = (
    movimiento_mina
    # One group per (year, month, day, hour) of the dump timestamp.
    .groupBy(F.year(F.col('Dumping')).alias('y'),
             F.month(F.col('Dumping')).alias('m'),
             F.dayofmonth(F.col('Dumping')).alias('d'),
             F.hour(F.col('Dumping')).alias('h'))
    .agg(*expression)
    # Rebuild a "y-m-d h:00:00" string and parse it into the hourly `date` key.
    .withColumn('tmp', F.concat(F.col("y"), F.lit("-"), F.col('m'), F.lit("-"), F.col('d'), F.lit(" "), F.col('h'), F.lit(':00:00')))
    .withColumn('date', F.to_timestamp(F.col('tmp')))
    .drop('y', 'm', 'd', 'h', 'tmp')
    .orderBy('date')
)

## Unimos ambas bases de datos en una sola, donde podremos extraer datos globales por hora, diarios, mensuales y anuales si queremos

In [12]:
# Inner-join the production rows with the hourly averages where `fecha` equals
# `date`, then remove the now-redundant `date` column.
global_por_hora = (
    ads_produccion
    .join(mov_historico_hora, ads_produccion['fecha'] == mov_historico_hora['date'], 'inner')
    .drop('date')
)

In [13]:
# Print the column names and types of the joined hourly DataFrame.
global_por_hora.printSchema()

root
 |-- ton_total: double (nullable = true)
 |-- n_descargas: double (nullable = true)
 |-- n_cam: double (nullable = true)
 |-- n_shov: double (nullable = true)
 |-- ton_chancador: double (nullable = true)
 |-- ton_botadero: double (nullable = true)
 |-- descargas_botadero: double (nullable = true)
 |-- ton_chancador_1: double (nullable = true)
 |-- ton_chancador_2: double (nullable = true)
 |-- cam_chancador: double (nullable = true)
 |-- cam_botadero: double (nullable = true)
 |-- ton_alta_ley: double (nullable = true)
 |-- ton_media_ley: double (nullable = true)
 |-- ton_baja_ley: double (nullable = true)
 |-- ton_lastre: double (nullable = true)
 |-- fecha: timestamp (nullable = true)
 |-- ton_camion: double (nullable = true)
 |-- n_baldes: double (nullable = true)
 |-- t_carga: double (nullable = true)
 |-- t_descarga: double (nullable = true)
 |-- t_viaje_descarga: double (nullable = true)
 |-- t_ciclo_caex: double (nullable = true)
 |-- t_ciclo_pala: double (nullable = true)


In [14]:
# Number of rows in the joined hourly DataFrame.
global_por_hora.count()

11771

In [15]:
# Maps each DataFrame column name to the human-readable label shown in the
# widgets; the interactive apps below look up the column from the chosen label.
# NOTE(review): the four `d_*` distance labels end in "(seg)" like the time
# labels — presumably a copy-paste slip; confirm the real unit before changing.
variables_fmt = {'ton_total': 'Tonelaje Total',
'n_descargas': 'Numero descargas' ,
'n_cam': 'Numero de camiones',
'n_shov': 'Numero de paladas',
'ton_chancador': 'Tonelaje a chancador',
'ton_botadero': 'Tonelaje a botadero',
'descargas_botadero': 'Numero descargas a botadero',
'ton_chancador_1': 'Tonelaje a chancador 1',
'ton_chancador_2': 'Tonelaje a chancador 2',
'cam_chancador': 'Camiones hacia chancador',
'cam_botadero': 'Camiones hacia el botadero',
'ton_alta_ley': 'Tonelaje alta ley',
'ton_media_ley': 'Tonelaje ley media',
'ton_baja_ley': 'Tonelaje baja ley',
'ton_lastre': 'Toneladas de lastre',
'ton_camion': 'Toneladas camion lleno',
'n_baldes': 'Numero de baldadas',
't_carga': 'Tiempo de carga (seg)',
't_descarga': 'Tiempo de descarga (seg)',
't_viaje_descarga': 'Tiempo viaje a descarga (seg)',
't_ciclo_caex': 'Tiempo ciclo CAEX (seg)',
't_ciclo_pala': 'Tiempo ciclo Pala (seg)',
'd_viaje_vacio': 'Distancia viaje camion vacio (seg)',
'd_viaje_lleno': 'Distancia viaje camion lleno (seg)',
'd_eq_vacio': 'Distancia equivalente camion vacio (seg)',
'd_eq_lleno': 'Distancia equivalente camion lleno (seg)'}

### global_por_hora no se exporta bien como csv asi que hay que solucionar esto

In [16]:
#global_por_hora.write.option('header', True).csv('mycsv.csv')

In [17]:
# Reader settings for the previously exported hourly dataset: comma-separated
# CSV with a header row, column types inferred by Spark.
file_location = "global_mina_por_hora.csv"
file_type = "csv"
delimiter = ","
first_row_is_header = "True"
infer_schema = "True"

# Load the file into a Spark DataFrame.
global_hora = (
    spark.read.format(file_type)
    .options(header=first_row_is_header, inferSchema=infer_schema, sep=delimiter)
    .load(file_location)
)

In [18]:
# Summary statistics of `ton_total` in the reloaded hourly dataset.
global_hora.select('ton_total').describe().show()

+-------+------------------+
|summary|         ton_total|
+-------+------------------+
|  count|             11771|
|   mean|17192.457371505512|
| stddev| 6313.571945259422|
|    min|             287.0|
|    max|           50949.1|
+-------+------------------+



In [19]:
# Number of rows in the reloaded hourly dataset.
global_hora.count()

11771

### Calcular valores outliers para filtrar la base de datos `global_por_hora`

Creamos la variable `bounds` que calcula el `q1`, `q2`, `q3` y los valores minimos y maximos para poder realizar diagramas de cajas

In [20]:
def _box_stats(q1, q2, q3):
    """Return the box-plot statistics dict for one column from its quartiles.

    `q1` and `q3` are rounded to 2 decimals; `min`/`max` are the rounded
    quartiles moved 1.5 * IQR outwards, where the IQR uses the unrounded values.
    """
    iqr = q3 - q1
    q1_rounded = np.round(q1, 2)
    q3_rounded = np.round(q3, 2)
    return {
        'q1': q1_rounded,
        'q2': q2,
        'q3': q3_rounded,
        'min': np.round(q1_rounded - (iqr * 1.5), 2),
        'max': np.round(q3_rounded + (iqr * 1.5), 2),
    }


# Quartiles of every double column, requested in ONE approxQuantile call
# (relativeError=0, i.e. exact) instead of one Spark job per column.
_double_columns = [name for name, dtype in global_hora.dtypes if dtype == "double"]
_quartiles = (
    global_hora.approxQuantile(_double_columns, [0.25, 0.5, 0.75], 0)
    if _double_columns else []  # nothing to compute when there are no double columns
)

# Column name -> {'q1', 'q2', 'q3', 'min', 'max'}.
bounds = {c: _box_stats(*qs) for c, qs in zip(_double_columns, _quartiles)}

In [21]:
def outlier_values(df, col, bounds):
    """Return the distinct values of `col` that fall outside its bounds.

    Args:
        df: Spark DataFrame containing `col`.
        col: Name of the column to inspect.
        bounds: Mapping column name -> dict with at least 'min' and 'max'.

    Returns:
        NumPy array with the unique values strictly below `min` or strictly
        above `max` (as produced by `np.unique`).
    """
    selected = df.select(col)
    outside = (F.col(col) < bounds[col]['min']) | (F.col(col) > bounds[col]['max'])
    # Each collected row has a single field; unwrap it.
    values = [row[0] for row in selected.where(outside).collect()]
    return np.unique(values)

def outliers(df, cols=None, all_cols=False):
    """Collect the outlier values of several columns.

    Uses the module-level `bounds` mapping to decide what counts as an outlier.

    Args:
        df: Spark DataFrame to inspect.
        cols: Column names to check when `all_cols` is False. `None` (the
            default) or an empty sequence yields an empty result.
        all_cols: When True, ignore `cols` and check every column of `df`
            that has an entry in `bounds`.

    Returns:
        Dict mapping column name -> array returned by `outlier_values`.
    """
    if all_cols:
        # Columns without computed bounds cannot be checked. They are skipped
        # explicitly so that genuine errors are no longer hidden by a bare
        # `except`.
        targets = [col for col in df.columns if col in bounds]
    else:
        targets = list(cols) if cols is not None else []
    return {col: outlier_values(df, col, bounds=bounds) for col in targets}

In [22]:
# Outlier values of every column of the hourly dataset, keyed by column name.
dict_outliers_df = outliers(global_hora, [], True)

### Graficos de cajas de las distintas variables (estatico)

In [23]:
def box_plot_mina(bounds, col, out_dict):
    """Build a static Bokeh box plot for a single variable.

    Args:
        bounds: Mapping column name -> dict with 'q1', 'q2', 'q3', 'min', 'max'.
        col: Column to draw; also used as the single categorical x value.
        out_dict: Mapping column name -> array of outlier values.

    Returns:
        The Bokeh figure.
    """
    stats = pd.DataFrame(bounds).T.loc[col]
    fig = figure(width = 350, height=350, tools="", background_fill_color="#efefef", x_range=[col], toolbar_location='above')

    # Stems from the fences to the quartiles.
    fig.segment([col], stats['max'], [col], stats['q3'], line_color="black")
    fig.segment([col], stats['min'], [col], stats['q1'], line_color="black")

    # Upper (q2..q3) and lower (q1..q2) boxes.
    fig.vbar([col], 0.3, stats['q2'], stats['q3'], fill_color="#E08E79", line_color="black")
    fig.vbar([col], 0.3, stats['q1'], stats['q2'], fill_color="#3B8686", line_color="black")

    # Whisker caps drawn as almost-zero-height rectangles.
    for fence in ('max', 'min'):
        fig.rect([col], stats[fence], 0.1, 0.01, line_color="black")

    # Outlier markers, only when there is something to draw.
    outlier_values_for_col = out_dict[col]
    if outlier_values_for_col.size != 0:
        fig.circle([col]*len(outlier_values_for_col), outlier_values_for_col, size=6, color="#F38630", fill_alpha=0.6)

    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = "white"
    fig.grid.grid_line_width = 2
    fig.xaxis.major_label_text_font_size="16px"

    return fig

In [24]:
#show(box_plot_mina(bounds=bounds, col='ton_baja_ley', out_dict=dict_outliers_df))

### BoxPlot dinamico

In [25]:
def boxPlot(doc):
    """Bokeh app: box plot of one variable, switchable through a dropdown.

    Reads the module-level `bounds`, `dict_outliers_df` and `variables_fmt`
    and adds a single-tab layout (dropdown + figure) to `doc`.
    """

    def make_ds(bounds, target, dict_outliers_df=dict_outliers_df):
        """Return the six data sources (stems, two boxes, two whisker caps, outliers) for `target`."""
        stats = pd.DataFrame(bounds)[target]

        # Stems: q3 -> max and min -> q1, both drawn at x = 0.
        segments = ColumnDataSource(data={
            'x0': [0, 0],
            'y0': [stats['q3'], stats['min']],
            'x1': [0, 0],
            'y1': [stats['max'], stats['q1']],
        })

        # Boxes: q2..q3 and q1..q2.
        box_1 = ColumnDataSource(data={'y0': [stats['q2']], 'y1': [stats['q3']]})
        box_2 = ColumnDataSource(data={'y0': [stats['q1']], 'y1': [stats['q2']]})

        # Whisker caps at max and min.
        whis_1 = ColumnDataSource(data={'y': [stats['max']]})
        whis_2 = ColumnDataSource(data={'y': [stats['min']]})

        outlier_source = ColumnDataSource(data={'out': dict_outliers_df[target]})
        return segments, box_1, box_2, whis_1, whis_2, outlier_source

    def boxbox(src, src_v1, src_v2, whis_1, whis_2, outliers):
        """Create the figure and bind every glyph to its data source."""
        p = figure(width = 450, height=450, tools="", background_fill_color="#efefef", x_axis_type=None, x_range=(-1, 1))
        p.segment(x0="x0", y0="y0", x1="x1", y1="y1", line_color="black", source=src)
        p.vbar(0, 0.8, 'y0', 'y1', fill_color="#E08E79", line_color="black", source=src_v1)
        p.vbar(0, 0.8, 'y0', 'y1', fill_color="#3B8686", line_color="black", source=src_v2)
        p.rect(0, 'y', 0.1, 0.01, line_color="black", source=whis_1)
        p.rect(0, 'y', 0.1, 0.01, line_color="black", source=whis_2)

        # Always create the outlier glyph, even when the initial variable has
        # none: `update` only swaps the data, so a glyph that was never added
        # could not show the outliers of a variable selected later.
        p.circle(0, 'out', size=6, color="#F38630", fill_alpha=0.6, source=outliers)

        p = style(p)

        return p

    def update(attr, old, new):
        """Dropdown callback: rebuild the sources for the chosen variable and swap them in."""
        selected_variable = get_key(select.value, variables_fmt)
        new_src, new_src_v1, new_src_v2, new_whis_1, new_whis_2, new_outliers = make_ds(
            bounds=bounds, target=selected_variable, dict_outliers_df=dict_outliers_df)

        outliers_src.data = dict(new_outliers.data)
        whis_1.data = dict(new_whis_1.data)
        whis_2.data = dict(new_whis_2.data)
        src_v2.data = dict(new_src_v2.data)
        src_v1.data = dict(new_src_v1.data)
        src.data = dict(new_src.data)

    def style(p):
        """Apply font sizes and grid styling to the figure."""
        # Axis titles and font size
        p.xaxis.axis_label_text_font_size = '14pt'
        p.xaxis.axis_label_text_font_style = 'bold'
        p.yaxis.axis_label_text_font_size = '14pt'
        p.yaxis.axis_label_text_font_style = 'bold'

        # Size of the tick labels
        p.xaxis.major_label_text_font_size = '12pt'
        p.yaxis.major_label_text_font_size = '12pt'

        p.xgrid.grid_line_color = None
        p.ygrid.grid_line_color = "white"
        p.grid.grid_line_width = 2

        return p

    def get_key(val, variables_fmt):
        """Return the column name whose label equals `val` (None when no label matches)."""
        for key, value in variables_fmt.items():
            if val == value:
                return key

    select = Select(title = "Seleccionar variable:", value = 'Tonelaje Total',
                    options=list(variables_fmt.values()), width=370)
    select.on_change('value', update)

    # Named `outliers_src` so it does not shadow the module-level `outliers` function.
    src, src_v1, src_v2, whis_1, whis_2, outliers_src = make_ds(
        bounds=bounds, target=get_key(select.value, variables_fmt), dict_outliers_df=dict_outliers_df)

    p = boxbox(src=src, src_v1=src_v1, src_v2=src_v2, whis_1=whis_1, whis_2=whis_2, outliers=outliers_src)

    layout = column(select, p)

    tab = Panel(child=layout, title = 'BoxPlot')
    tabs = Tabs(tabs=[tab])

    doc.add_root(tabs)

# Wrap the function as a Bokeh application and display it.
handler = FunctionHandler(boxPlot)
app = Application(handler)
show(app)

#### Filtramos la base de datos `global_por_hora` para eliminar los `outliers`

In [26]:
# One row per variable, one column per statistic ('q1', 'q2', 'q3', 'min', 'max').
df = pd.DataFrame(bounds).transpose()

In [27]:
# Lower bound computed for `n_baldes`.
df.loc['n_baldes', 'min']

4.0

In [28]:
# Keep only rows where every variable (all columns except `fecha`) lies within
# its [min, max] bounds, both ends included.
for col in [name for name in global_hora.columns if name != 'fecha']:
    global_hora = global_hora.filter(
        F.col(col).between(df.loc[col, 'min'], df.loc[col, 'max'])
    )

### Hacemos una agrupacion global por dia de la nueva base de datos `global_por_hora` para hacer analisis diarios

In [29]:
# Rounded per-group mean of every variable.
# `fecha` only provides the grouping key, so it is excluded here instead of
# being averaged and then dropped.
expression = [
    F.round(F.mean(col), 0).alias(col)
    for col in global_hora.columns
    if col != 'fecha'
]
global_por_dia = (
    global_hora
    # One group per calendar day of `fecha`.
    .groupBy(F.year(F.col('fecha')).alias('y'),
             F.month(F.col('fecha')).alias('m'),
             F.dayofmonth(F.col('fecha')).alias('d'))
    .agg(*expression)
    # Rebuild a "y-m-d" string and parse it into the daily `date` key.
    .withColumn('tmp', F.concat(F.col("y"), F.lit("-"), F.col('m'), F.lit("-"), F.col('d')))
    .withColumn('date', F.to_date(F.col('tmp')))
    .drop('y', 'm', 'd', 'tmp')
    .orderBy('date')
)
# Preview the first 10 days without truncating the cell values.
global_por_dia.select('date', 'ton_total', 'n_cam', 'n_shov').show(10, False)

+----------+---------+-----+------+
|date      |ton_total|n_cam|n_shov|
+----------+---------+-----+------+
|2020-12-31|22155.0  |46.0 |6.0   |
|2021-01-01|18888.0  |39.0 |6.0   |
|2021-01-02|20964.0  |42.0 |6.0   |
|2021-01-03|18782.0  |42.0 |7.0   |
|2021-01-04|15044.0  |37.0 |6.0   |
|2021-01-05|16669.0  |40.0 |5.0   |
|2021-01-06|16704.0  |41.0 |6.0   |
|2021-01-07|17748.0  |42.0 |6.0   |
|2021-01-08|18421.0  |42.0 |7.0   |
|2021-01-09|19078.0  |42.0 |7.0   |
+----------+---------+-----+------+
only showing top 10 rows



### Creamos la base `tiempos_mina_global_hora` que agrupa de manera global las variables por rango de horas del dia, para identificar si la hora de día influye en el comportamiento de las variables

In [30]:
# Rounded per-group mean of every variable.
# `fecha` only provides the hour-of-day key, so it is excluded here instead of
# being averaged and then dropped.
expression = [
    F.round(F.mean(col), 0).alias(col)
    for col in global_hora.columns
    if col != 'fecha'
]
tiempos_mina_global_hora = (
    global_hora
    # One group per hour of the day, stored in the `Time` column.
    .groupBy(F.hour(F.col('fecha')).alias('Time'))
    .agg(*expression)
    .orderBy('Time')
)
# Preview the first 10 hours as a pandas table.
tiempos_mina_global_hora.toPandas().head(10)

Unnamed: 0,Time,ton_total,n_descargas,n_cam,n_shov,ton_chancador,ton_botadero,descargas_botadero,ton_chancador_1,ton_chancador_2,...,n_baldes,t_carga,t_descarga,t_viaje_descarga,t_ciclo_caex,t_ciclo_pala,d_viaje_vacio,d_viaje_lleno,d_eq_vacio,d_eq_lleno
0,0,13572.0,44.0,37.0,7.0,5134.0,8438.0,28.0,2591.0,2543.0,...,4.0,124.0,62.0,987.0,1970.0,187.0,5260.0,4735.0,7995.0,8200.0
1,1,21261.0,69.0,45.0,7.0,8130.0,13131.0,43.0,4059.0,4071.0,...,4.0,123.0,64.0,982.0,2025.0,187.0,5404.0,4843.0,8249.0,8359.0
2,2,21271.0,69.0,45.0,7.0,8048.0,13223.0,43.0,4037.0,4011.0,...,4.0,125.0,63.0,976.0,1963.0,187.0,5377.0,4824.0,8120.0,8331.0
3,3,20882.0,68.0,45.0,7.0,7941.0,12941.0,42.0,4050.0,3890.0,...,4.0,127.0,64.0,982.0,1961.0,190.0,5304.0,4819.0,7984.0,8364.0
4,4,8521.0,28.0,24.0,6.0,3338.0,5183.0,17.0,1893.0,1444.0,...,4.0,124.0,63.0,1013.0,2004.0,191.0,5124.0,4599.0,7530.0,7733.0
5,5,18367.0,60.0,41.0,7.0,6999.0,11368.0,37.0,3584.0,3415.0,...,4.0,124.0,64.0,966.0,2066.0,188.0,5302.0,4705.0,8139.0,8147.0
6,6,20066.0,65.0,43.0,7.0,7505.0,12562.0,41.0,3825.0,3680.0,...,4.0,127.0,64.0,965.0,1992.0,191.0,5324.0,4772.0,8127.0,8207.0
7,7,21484.0,70.0,45.0,7.0,8058.0,13426.0,44.0,4061.0,3997.0,...,4.0,126.0,63.0,958.0,1956.0,190.0,5335.0,4771.0,7959.0,8122.0
8,8,9028.0,30.0,26.0,6.0,3330.0,5698.0,19.0,1664.0,1667.0,...,4.0,128.0,63.0,936.0,1893.0,192.0,5249.0,4797.0,7853.0,8187.0
9,9,15789.0,51.0,39.0,6.0,6504.0,9285.0,30.0,3398.0,3106.0,...,4.0,120.0,64.0,835.0,1258.0,184.0,5073.0,4406.0,8002.0,7392.0


## Intento de histograma interactivo

In [31]:
def histogram(doc):
    """Bokeh app: histogram of one variable with selectable variable and bin count.

    Reads the module-level `global_hora` and `variables_fmt` and adds a
    single-tab layout (spinner + dropdown + figure) to `doc`.
    """
    # Collect the Spark DataFrame once per document. Every widget callback then
    # re-bins this in-memory copy instead of re-running the Spark job.
    hourly_data = global_hora.toPandas()

    def make_dataset(df, selected, dataset_bins = 15):
        """Return the histogram of column `selected` of the pandas frame `df` as a ColumnDataSource."""
        values = df[selected]

        # Compute the histogram over the full range of the column.
        arr_hist, edges = np.histogram(values,
                                       bins = dataset_bins,
                                       range = [values.min(), values.max()])

        arr_df = pd.DataFrame({'proportion': arr_hist / np.sum(arr_hist),
                               'left': edges[:-1], 'right': edges[1:]})

        # Pre-formatted strings shown by the hover tool.
        arr_df['f_proportion'] = ["{:.2f} %".format(prop*100) for prop in arr_df['proportion']]

        arr_df['interval'] = [f'{left:.0f} a {right:.0f}' for left,
                              right in zip(arr_df['left'], arr_df['right'])]

        arr_df['name'] = variables_fmt[selected]

        return ColumnDataSource(arr_df)

    def style(p):
        """Apply title, axis and tick styling; anchor the y axis at 0."""
        # Title
        p.title.align = 'center'
        p.title.text_font_size = '20pt'
        p.title.text_font = 'serif'

        # Axis titles
        p.xaxis.axis_label_text_font_size = '14pt'
        p.xaxis.axis_label_text_font_style = 'bold'
        p.yaxis.axis_label_text_font_size = '14pt'
        p.yaxis.axis_label_text_font_style = 'bold'

        # Tick labels
        p.xaxis.major_label_text_font_size = '12pt'
        p.yaxis.major_label_text_font_size = '12pt'

        # Tick Axis
        p.y_range.start = 0

        return p

    def make_plot(src):
        """Create the histogram figure bound to `src`."""
        # The label of the plotted variable is stored in the `name` column.
        target = np.unique(src.data["name"])[0]

        # `width`/`height` match the other figures in this notebook.
        p = figure(width = 700, height = 650,
                   title = f'Histograma {target}', tools='',
                   x_axis_label = f'{target}', y_axis_label = 'Proporcion')

        # Quad glyphs to create a histogram
        p.quad(source = src, bottom = 0, top = 'proportion', left = 'left', right = 'right',
               color = 'red', fill_alpha = 0.7, hover_fill_color = 'black', hover_fill_alpha = 1,
               line_color = 'black')

        # Hover tool with vline mode
        hover = HoverTool(tooltips=[('Rango', '@interval'),
                                    ('Proporcion', '@f_proportion')],
                          mode='vline')

        p.add_tools(hover)

        # Styling
        p = style(p)

        return p

    def update(attr, old, new):
        """Widget callback: recompute the histogram for the current variable and bin count."""
        selected_variable = get_key(select.value, variables_fmt)
        bins = spinner.value

        new_src = make_dataset(df=hourly_data, selected=selected_variable, dataset_bins=bins)

        # Update the title and the x-axis label
        p.title.text = f'Histograma {select.value}'
        p.xaxis.axis_label = f'{select.value}'

        # Replace all columns at once (the number of bins may have changed).
        src.data = dict(new_src.data)

    def get_key(val, variables_fmt):
        """Return the column name whose label equals `val` (None when no label matches)."""
        for key, value in variables_fmt.items():
            if val == value:
                return key

    select = Select(title = "Seleccionar variable:", value = 'Tonelaje Total',
                    options=list(variables_fmt.values()), width=550)
    select.on_change('value', update)

    spinner = Spinner(title="Bins:", low=10, high=40, step=1, value=15, width=100)
    spinner.on_change('value', update)

    src = make_dataset(df=hourly_data, selected=get_key(select.value, variables_fmt))

    p = make_plot(src)

    layout = column(row(spinner, select), p)

    tab = Panel(child=layout, title = 'Histogram')
    tabs = Tabs(tabs=[tab])

    doc.add_root(tabs)

    #return Panel(child=layout, title = 'Histogram')

# Set up an application
handler = FunctionHandler(histogram)
app = Application(handler)

In [32]:
# Display the Bokeh application currently bound to `app`.
show(app)

## Scatter interactivo diario (rango de fechas sin funcionamiento)

In [43]:
i = 0  # Legacy click counter; no longer read by `scatter_diario`, kept so the module namespace is unchanged.
def scatter_diario(doc):
    """Bokeh app: daily scatter of a selectable variable against `ton_total`.

    Reads the module-level `global_por_dia` and `variables_fmt` and adds a
    single-tab layout (dropdown, span toggle button, two date pickers and the
    figure) to `doc`.
    """

    def make_dataset(df, x, date_column='date', start_date='2021-01-01', end_date='2021-02-01'):
        """Return the rows of `df` dated between `start_date` and `end_date` as a ColumnDataSource.

        Both dates are 'YYYY-MM-DD' strings. The source has the columns
        `x` (values of column `x`), `y` (`ton_total`) and `date`.
        """
        first_day = datetime.datetime.strptime(f'{start_date}', '%Y-%m-%d')
        last_day = datetime.datetime.strptime(f'{end_date}', '%Y-%m-%d')
        arr_df = df.filter(F.col(date_column).between(first_day, last_day)).toPandas()

        return ColumnDataSource(data = {'x': arr_df[x], 'y': arr_df['ton_total'], 'date': arr_df[date_column]})

    def midpoint(values):
        """Return the mid-range (min + max) / 2 of `values`, or None when it is empty.

        An empty date selection has no min/max, so the guide line gets no
        location instead of an invalid one.
        """
        if len(values) == 0:
            return None
        return (values.min() + values.max()) * 0.5

    def make_plot(src, col):
        """Create the scatter figure plus its vertical and horizontal guide lines."""
        p = figure(width=600, height=600, x_axis_label = f'{col}', y_axis_label = 'Tonelaje Total', tools='',
                   title='Rango de fecha 2021-01-01 a 2021-02-01')

        p.circle(x = 'x', y = 'y', size=10, color='#2171b5', alpha=0.7, source=src, hover_color = 'magenta')

        # The tooltip label for x is refreshed in `update` when the variable changes.
        hover = HoverTool(tooltips=[('Fecha', '@date{%Y-%m-%d}'),
                                    ('Tonelaje Total', '@{y}{,}'),
                                    (f'{col}', '@{x}{,}')],
                         formatters={'@date': 'datetime'})

        # Guide lines through the mid-range of each axis.
        hline = Span(location=midpoint(src.data['y']), dimension='width', line_color='black', line_width=3, line_alpha=0.7)
        p.add_layout(hline)
        vline = Span(location=midpoint(src.data['x']), dimension='height', line_color='black', line_width=3, line_alpha=0.7)
        p.add_layout(vline)

        p.add_tools(hover)
        p = style(p)

        return p, vline, hline

    def update(attr, old, new):
        """Widget callback: reload the data for the current variable and date range."""
        selected_variable = get_key(select.value, variables_fmt)

        new_src = make_dataset(df=global_por_dia, x=selected_variable, start_date=date_start.value, end_date=date_end.value)

        # Update the title and the x-axis label
        p.title.text = f'Rango de fecha {date_start.value} a {date_end.value}'
        p.xaxis.axis_label = f'{select.value}'

        # Update the tooltips
        p.hover[0].tooltips = [('Fecha', '@date{%Y-%m-%d}'),
                               ('Tonelaje Total', '@{y}{,}'),
                               (f'{select.value}', '@{x}{,}')]

        vline.location = midpoint(new_src.data['x'])
        hline.location = midpoint(new_src.data['y'])

        # Replace all columns at once (the number of rows may have changed).
        src.data = dict(new_src.data)

    def style(p):
        """Apply title, axis and tick styling."""
        # Title
        p.title.align = 'center'
        p.title.text_font_size = '18pt'
        p.title.text_font = 'serif'

        # Axis titles
        p.xaxis.axis_label_text_font_size = '14pt'
        p.xaxis.axis_label_text_font_style = 'bold'
        p.yaxis.axis_label_text_font_size = '14pt'
        p.yaxis.axis_label_text_font_style = 'bold'

        # Tick labels
        p.xaxis.major_label_text_font_size = '12pt'
        p.yaxis.major_label_text_font_size = '12pt'

        return p

    def get_key(val, variables_fmt):
        """Return the column name whose label equals `val` (None when no label matches)."""
        for key, value in variables_fmt.items():
            if val == value:
                return key

    def callback():
        """Button callback: toggle both guide lines.

        The new state is derived from the lines' own visibility rather than a
        module-level click counter, which was shared by every open session and
        could get out of step with what a given document shows.
        """
        show_spans = not hline.visible
        hline.visible = show_spans
        vline.visible = show_spans

    # Select Widget
    select = Select(title = "Seleccionar variable:", value = 'Numero de camiones',
                    options=list(variables_fmt.values()), width=285, margin = (20, 5, 5, 5))
    select.on_change('value', update)

    button = Button(label = 'Span', margin = (38, 5, 5, 5), width=50, button_type="primary")
    button.on_click(callback)


    date_start = DatePicker(title='Fecha inicio', value="2021-01-01", min_date="2020-12-31", max_date="2022-05-11", width=170, margin = (20, 5, 5, 5))
    date_start.on_change('value', update)

    date_end = DatePicker(title='Fecha termino', value="2021-02-01", min_date="2020-12-31", max_date="2022-05-11", width=170, margin = (20, 5, 5, 5))
    date_end.on_change('value', update)

    src = make_dataset(df=global_por_dia, x=get_key(select.value, variables_fmt), start_date=date_start.value, end_date=date_end.value)

    p, vline, hline = make_plot(src, select.value)

    layout = row(column(row(select, button), row(date_start, date_end), width=375), p)

    # Make a tab with the layout
    tab = Panel(child=layout, title = 'Plot')
    tabs = Tabs(tabs=[tab])

    doc.add_root(tabs)

    # Attempt at returning a Panel for a multi-tab layout
    #return Panel(child=layout, title = 'Plot')

# Wrap the function as a Bokeh application.
handler = FunctionHandler(scatter_diario)
app = Application(handler)

In [44]:
# Display the Bokeh application currently bound to `app`.
show(app)

In [41]:
def barras_hora(doc):
    """Bokeh app: bar chart of a selectable variable per hour of the day.

    Reads the module-level `tiempos_mina_global_hora` and `variables_fmt` and
    adds a single-tab layout (dropdown + figure) to `doc`.
    """
    # Run the Spark aggregation once per document; the dropdown callback then
    # only picks another column of this in-memory pandas copy.
    hourly_means = tiempos_mina_global_hora.toPandas()

    def make_dataset(df, target):
        """Return a ColumnDataSource with `x` = hour (`Time`) and `y` = column `target` of the pandas frame `df`."""
        return ColumnDataSource(data = dict(x = df['Time'], y = df[target]))

    def style(p):
        """Apply title, axis and tick styling and fix the axis ranges."""
        # Title
        p.title.align = 'center'
        p.title.text_font_size = '20pt'
        p.title.text_font = 'serif'

        # Axis titles
        p.xaxis.axis_label_text_font_size = '12pt'
        p.xaxis.axis_label_text_font_style = 'bold'
        p.yaxis.axis_label_text_font_size = '14pt'
        p.yaxis.axis_label_text_font_style = 'bold'

        # Tick labels
        p.xaxis.major_label_text_font_size = '12pt'
        p.yaxis.major_label_text_font_size = '12pt'

        # Tick Axis
        p.y_range.start = 0
        # `Time` comes from F.hour and therefore includes hour 0; the range
        # starts at -0.5 so that bar is not cut off (a start of 0.5 hid it).
        p.x_range.start = -0.5
        p.x_range.end = 23.5
        return p

    def make_plot(src, y):
        """Create the bar chart bound to `src`; `y` is the label used for the y axis and tooltip."""
        # `width`/`height` match the other figures in this notebook.
        p = figure(width=700, height=450, x_axis_type=None)

        p.vbar(x='x', bottom = 0, top='y', source=src, width=0.75)

        # Custom x axis with one tick per hour.
        ticker = SingleIntervalTicker(interval=1, num_minor_ticks=1)
        xaxis = LinearAxis(ticker=ticker)
        p.add_layout(xaxis, 'below')

        hover = HoverTool()
        hover.tooltips = [(f'{y}', '@{y}{,}')]

        p.xaxis.axis_label = 'Hora del dia'
        p.yaxis.axis_label = f'{y}'
        p.add_tools(hover)

        p = style(p)

        return p

    def update(attr, old, new):
        """Dropdown callback: show the selected variable and refresh the labels."""
        selected_variable = get_key(select.value, variables_fmt)

        new_src = make_dataset(df=hourly_means, target=selected_variable)
        p.yaxis.axis_label = f'{select.value}'

        p.hover[0].tooltips = [(f'{select.value}', '@{y}{,}')]

        src.data = dict(new_src.data)

    def get_key(val, variables_fmt):
        """Return the column name whose label equals `val` (None when no label matches)."""
        for key, value in variables_fmt.items():
            if val == value:
                return key

    # Select Widget
    select = Select(title = "Seleccionar variable:", value = 'Numero de camiones',
                    options=list(variables_fmt.values()), width=285, margin = (5, 5, 5, 5))
    select.on_change('value', update)

    src = make_dataset(df=hourly_means, target=get_key(select.value, variables_fmt))

    p = make_plot(src, select.value)

    layout = column(select, p)

    # Make a tab with the layout
    tab = Panel(child=layout, title = 'Barras')
    tabs = Tabs(tabs=[tab])

    doc.add_root(tabs)

# Wrap the function as a Bokeh application.
handler = FunctionHandler(barras_hora)
app = Application(handler)

In [42]:
# Display the Bokeh application currently bound to `app`.
show(app)

### Observaciones

1. <del>Intentar obtener este grafico pero por rango de fecha</del>
2. <del>Agregar un CheckButton para indicar si mostrar o no el Span, o con un Toggle</del>
3. Calcular lo mismo pero para las otras funciones, agregandole el colormap con un CheckButton
4. <del> Dejar todo en un solo Panel </del>
5. Agregar otras variables categoricas, como habia mencionado antes, para mostrarlas en el HoverTool, con un buen formato con JavaScript (video de YT)
6. Comentar acerca de nuevas variables que incorporar al analisis como condiciones de los caminos de los camiones, equipos secundarios como camiones aljibe, los numeros de fase que se trabajan en el dia etc.
7. Agregar la hora 24 como las 00 para que el colormap range quede bien 
8. <del>Format the HoverTool with <a href="https://stackoverflow.com/questions/53324833/creating-dynamic-hovertool-tooltips-in-bokeh-based-off-of-a-dropdown-using-custo">this</a> code</del>
9. Informacion de histograma (mean, min, max) como `table`
10. Modificar `main.py` para que se actualice con lo que tengo aca

<a href="https://towardsdatascience.com/data-visualization-with-bokeh-in-python-part-ii-interactions-a4cf994e2512">Visualizacion de datos en Bokeh</a>

Hacer un <a href="https://medium.com/pursuitnotes/python-data-visualization-with-bokeh-a7ada195087c">Hover Tooltip</a> pero con JS code

#### Esta funcion permite graficar el DataFrame `tiempos_mina_global_hora` comparando todas las variables respecto al tonelaje, donde tambien se puede añadir informacion extra con `extra_info`