# Libraries and Dependencies

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Bokeh visualization library
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import (ColumnDataSource, HoverTool, Select, CustomJS, 
                         LinearColorMapper, ColorBar)
from bokeh.layouts import column, row
from bokeh.transform import factor_cmap, transform
from bokeh.palettes import Category10, Category20, Viridis256

# Initialize Bokeh for Jupyter notebook: after this call, show() renders
# figures inline in the cell output.
output_notebook()

# Data Processing

## Load and process

In [23]:
# Read the PITS visits export into the working frame
df = pd.read_csv('Datos_de_las_Visitas_a_los_Puntos_de_Información_Turística_(PITS)_en_Boyacá_20250919.csv')

# The origin city and the five destination slots go through the same cleaning steps.
# NOTE(review): normalize_text and the apply_* mapping helpers are not defined in
# this cell; they must already exist in the kernel when it runs.
city_columns = ['CIUDAD DE PROCEDENCIA'] + [f'CIUDAD DE DESTINO {slot}' for slot in range(1, 6)]

# Pass 1: add a "<column>_NORMALIZED" companion for every city column
for city_col in city_columns:
    df[city_col + '_NORMALIZED'] = df[city_col].apply(normalize_text)

# Pass 2: feed the normalized values through the manual city mapping -> "<column>_CLEAN"
for city_col in city_columns:
    df[city_col + '_CLEAN'] = df[city_col + '_NORMALIZED'].apply(apply_city_mapping)

# Travel motivation and transportation mode follow the same normalize -> map
# sequence, each with its own mapping helper.
for raw_col, prefix, mapper in (
    ('MOTIVO DEL VIAJE', 'MOTIVO', apply_travel_motivation_mapping),
    ('TIPO TRANSPORTE', 'TRANSPORTE', apply_transportation_mapping),
):
    df[f'{prefix}_NORMALIZED'] = df[raw_col].apply(normalize_text)
    df[f'{prefix}_CLEAN'] = df[f'{prefix}_NORMALIZED'].apply(mapper)

# Short alias used by the analysis cells for the cleaned origin city
df['CIUDAD_ORIGEN_CLEAN'] = df['CIUDAD DE PROCEDENCIA_CLEAN']


# Analysis

In [None]:
# DATA ANALYSIS AND INTERACTIVE VISUALIZATIONS

# Temporal analysis - Monthly distribution
# Count visits per distinct MES value, ordered by that value.
month_counts = df['MES'].value_counts().sort_index()
month_data = month_counts.reset_index()
month_data.columns = ['Mes', 'Cantidad']
# The x axis is categorical (its factors are strings). Bokeh interprets numeric
# values on a categorical axis as raw coordinates rather than factors, so the
# plotted column must contain the very same strings that x_range is built from;
# otherwise the bars do not line up with their ticks.
month_data['Mes'] = month_data['Mes'].astype(str)
source_month = ColumnDataSource(month_data)

p2 = figure(x_range=month_data['Mes'].tolist(), height=350,
           title='Distribución Mensual de Visitas',
           tools='pan,wheel_zoom,box_zoom,reset,save')

p2.vbar(x='Mes', top='Cantidad', width=0.8, source=source_month, color='navy', alpha=0.7)
p2.add_tools(HoverTool(tooltips=[('Mes', '@Mes'), ('Visitas', '@Cantidad')]))
p2.title.text_font_size = "14pt"
show(p2)

# Yearly distribution
# Count visits per distinct AÑO value, in chronological order.
year_counts = df['AÑO'].value_counts().sort_index()
year_data = year_counts.reset_index()
year_data.columns = ['Año', 'Cantidad']
# The x axis is categorical (string factors). A numeric year such as 2019 would
# be read by Bokeh as the coordinate 2019 on that axis, far outside the handful
# of factors, and no bar would be visible. Store the years as the same strings
# used for x_range so every bar sits on its own tick.
year_data['Año'] = year_data['Año'].astype(str)
source_year = ColumnDataSource(year_data)

p3 = figure(x_range=year_data['Año'].tolist(), height=350,
           title='Distribución Anual de Visitas',
           tools='pan,wheel_zoom,box_zoom,reset,save')

p3.vbar(x='Año', top='Cantidad', width=0.8, source=source_year, color='darkgreen', alpha=0.7)
p3.add_tools(HoverTool(tooltips=[('Año', '@Año'), ('Visitas', '@Cantidad')]))
p3.title.text_font_size = "14pt"
show(p3)

# Cities analysis (using cleaned data) - Origin cities
# Frequency of every cleaned origin city (value_counts sorts by count, descending).
origin_counts = df['CIUDAD_ORIGEN_CLEAN'].value_counts()
# The destination analysis in this notebook treats the literal 'NA' in the
# cleaned city columns as "no city". Apply the same rule to the ranking so the
# placeholder cannot take one of the ten slots. origin_counts itself is left
# unfiltered for any other use.
origin_data = origin_counts[origin_counts.index != 'NA'].head(10).reset_index()
origin_data.columns = ['Ciudad', 'Cantidad']
source_origin = ColumnDataSource(origin_data)

# Categorical y axis: one factor per ranked city.
p4 = figure(y_range=origin_data['Ciudad'].tolist(), height=400,
           title='Top 10 Ciudades de Origen (Datos Limpios)',
           tools='pan,wheel_zoom,box_zoom,reset,save')

p4.hbar(y='Ciudad', right='Cantidad', height=0.8, source=source_origin,
        color='green', alpha=0.7)
p4.add_tools(HoverTool(tooltips=[('Ciudad', '@Ciudad'), ('Visitantes', '@Cantidad')]))
p4.title.text_font_size = "14pt"
show(p4)

# Destination cities analysis (using cleaned data)
# Cleaned versions of the five destination slots recorded per visit.
destination_cols_clean = [f'CIUDAD DE DESTINO {slot}_CLEAN' for slot in range(1, 6)]

# Flatten the five slots into one long sequence (slot 1 first, then slot 2, ...),
# discarding missing values and the literal 'NA' placeholder.
stacked_destinations = pd.concat(
    [df[slot_col] for slot_col in destination_cols_clean],
    ignore_index=True,
).dropna()
all_destinations = stacked_destinations[stacked_destinations != 'NA'].tolist()

# Rank destinations by how often they were mentioned across all slots.
destination_counts = pd.Series(all_destinations).value_counts()
destination_data = destination_counts.head(10).reset_index()
destination_data.columns = ['Ciudad', 'Cantidad']
source_dest = ColumnDataSource(destination_data)

p5 = figure(
    y_range=destination_data['Ciudad'].tolist(),
    height=400,
    title='Top 10 Ciudades de Destino (Datos Limpios)',
    tools='pan,wheel_zoom,box_zoom,reset,save',
)
p5.hbar(
    y='Ciudad',
    right='Cantidad',
    height=0.8,
    source=source_dest,
    color='orange',
    alpha=0.7,
)
p5.add_tools(HoverTool(tooltips=[('Ciudad', '@Ciudad'), ('Visitas', '@Cantidad')]))
p5.title.text_font_size = "14pt"
show(p5)

# Calculate popularity by number of destinations visited
# A slot counts as a real destination when it is neither missing nor the
# literal 'NA' placeholder; summing the flags row-wise gives the per-visit total.
destination_slots = df[destination_cols_clean]
has_destination = destination_slots.notna() & (destination_slots != 'NA')
df['num_destinations'] = has_destination.sum(axis=1)

# How many records list 0, 1, 2, ... destinations
destination_popularity = df['num_destinations'].value_counts().sort_index()
multi_dest_data = destination_popularity.reset_index()
multi_dest_data.columns = ['Num_Destinos', 'Cantidad']
source_multi = ColumnDataSource(multi_dest_data)

p6 = figure(
    title="Distribución por Número de Destinos Visitados",
    x_axis_label='Número de Destinos',
    y_axis_label='Número de Turistas',
    width=600,
    height=400,
    tools="pan,wheel_zoom,box_zoom,reset,save",
)
p6.vbar(
    x='Num_Destinos',
    top='Cantidad',
    width=0.8,
    source=source_multi,
    color='coral',
    alpha=0.8,
)
multi_dest_tooltips = [('Destinos', '@Num_Destinos'), ('Turistas', '@Cantidad')]
p6.add_tools(HoverTool(tooltips=multi_dest_tooltips))
p6.title.text_font_size = "14pt"
show(p6)

# Travel motivation analysis (using cleaned data)
# Frequency of each cleaned travel motive, most common first.
travel_reason = df['MOTIVO_CLEAN'].value_counts()
reason_data = travel_reason.reset_index()
reason_data.columns = ['Motivo', 'Cantidad']
source_reason = ColumnDataSource(reason_data)

motivo_factors = reason_data['Motivo'].tolist()
# Pick exactly one Viridis color per motive, evenly spaced over the full
# 256-color ramp. A fixed-stride slice (Viridis256[::30]) only yields 9 colors,
# so any motive beyond the ninth would have no palette entry and fall back to
# the mapper's nan_color.
palette_positions = np.linspace(0, len(Viridis256) - 1, num=max(len(motivo_factors), 1))
motivo_palette = [Viridis256[int(pos)] for pos in palette_positions]

p7 = figure(x_range=motivo_factors, height=400,
           title='Distribución de Motivos de Viaje (Datos Limpios)',
           tools='pan,wheel_zoom,box_zoom,reset,save')

p7.vbar(x='Motivo', top='Cantidad', width=0.8, source=source_reason,
        color=factor_cmap('Motivo', palette=motivo_palette,
                         factors=motivo_factors))
p7.add_tools(HoverTool(tooltips=[('Motivo', '@Motivo'), ('Cantidad', '@Cantidad')]))
# Rotate the category labels (radians) so long motive names do not overlap.
p7.xaxis.major_label_orientation = 1.2
p7.title.text_font_size = "14pt"
show(p7)

# Transportation mode analysis (using cleaned data)
# NOTE(review): this plot is currently disabled. Before re-enabling it, fix the
# palette expression: bokeh.palettes.Category20 is a dict keyed by palette size
# (Category20[n] for n in 3..20), so slicing it with [:len(transport_data)]
# raises instead of returning a list of colors.
# transport = df['TRANSPORTE_CLEAN'].value_counts()
# transport_data = transport.reset_index()
# transport_data.columns = ['Transporte', 'Cantidad']
# source_transport = ColumnDataSource(transport_data)

# p8 = figure(x_range=transport_data['Transporte'], height=400, 
#            title='Distribución de Medios de Transporte (Datos Limpios)',
#            tools='pan,wheel_zoom,box_zoom,reset,save')

# p8.vbar(x='Transporte', top='Cantidad', width=0.8, source=source_transport, 
#         color=factor_cmap('Transporte', palette=Category20[:len(transport_data)], 
#                          factors=transport_data['Transporte'].tolist()), alpha=0.7)
# p8.add_tools(HoverTool(tooltips=[('Transporte', '@Transporte'), ('Cantidad', '@Cantidad')]))
# p8.xaxis.major_label_orientation = 1.2
# p8.title.text_font_size = "14pt"
# show(p8)

# Trip duration analysis
# Summary statistics of the stay length (kept in a variable; not displayed here).
duration_stats = df['TIEMPO DE PERMANENCIA EN DIAS'].describe()
# Number of records per distinct stay length, ordered by number of days.
duration_counts = df['TIEMPO DE PERMANENCIA EN DIAS'].value_counts().sort_index()
duration_data = duration_counts.reset_index()
duration_data.columns = ['Dias', 'Cantidad']
source_duration = ColumnDataSource(duration_data)

p9 = figure(height=400, title='Distribución de Duración del Viaje (Días)',
           tools='pan,wheel_zoom,box_zoom,reset,save')

p9.line(x='Dias', y='Cantidad', source=source_duration, line_width=3, color='purple')
# scatter() draws screen-sized circle markers by default. It replaces
# circle(size=...), which Bokeh deprecated in 3.4 in favour of scatter(size=...).
p9.scatter(x='Dias', y='Cantidad', source=source_duration, size=8, color='purple', alpha=0.7)
p9.add_tools(HoverTool(tooltips=[('Días', '@Dias'), ('Cantidad', '@Cantidad')]))
p9.title.text_font_size = "14pt"
show(p9)

# Group size analysis (how many people travel together)
group_sizes = df['CANTIDAD DE VIAJEROS']

# Summary statistics (stored only; nothing is displayed from this section)
travelers_stats = group_sizes.describe()

# Number of records for each distinct group size, smallest group first
travelers_counts = group_sizes.value_counts().sort_index()
travelers_data = travelers_counts.rename_axis('Tamaño_Grupo').reset_index(name='Cantidad')
source_travelers = ColumnDataSource(travelers_data)

p10 = figure(
    height=400,
    title='Distribución del Tamaño de Grupos de Viaje',
    tools='pan,wheel_zoom,box_zoom,reset,save',
)
p10.vbar(
    x='Tamaño_Grupo',
    top='Cantidad',
    width=0.8,
    source=source_travelers,
    color='teal',
    alpha=0.7,
)
group_tooltips = [
    ('Tamaño de Grupo', '@Tamaño_Grupo'),
    ('Número de Grupos', '@Cantidad'),
]
p10.add_tools(HoverTool(tooltips=group_tooltips))
p10.title.text_font_size = "14pt"
show(p10)

# Country of origin analysis
# NOTE(review): this top-10 countries plot is disabled; the reason is not
# evident from this cell. Either re-enable it or remove the block.
# country_counts = df['PAIS DE PROCEDENCIA'].value_counts()
# country_data = country_counts.head(10).reset_index()
# country_data.columns = ['Pais', 'Cantidad']
# source_country = ColumnDataSource(country_data)

# p11 = figure(y_range=country_data['Pais'], height=400, 
#            title='Top 10 Países de Procedencia',
#            tools='pan,wheel_zoom,box_zoom,reset,save')

# p11.hbar(y='Pais', right='Cantidad', height=0.8, source=source_country, 
#          color='red', alpha=0.7)
# p11.add_tools(HoverTool(tooltips=[('País', '@Pais'), ('Visitantes', '@Cantidad')]))
# p11.title.text_font_size = "14pt"
# show(p11)




