#### **Librerías a utilizar**

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

#### **Carga de la base de datos**

In [3]:
# Load the tourism workbook (the "clean" version, going by its file name).
# The path is relative to the notebook's working directory and is written
# with forward slashes so that it resolves on Windows, Linux and macOS alike;
# a backslash is a path separator on Windows only.
base_turismo = pd.read_excel(
    "../bdd/bdd_limpia/base_turismo_clean.xlsx")
base_turismo

Unnamed: 0,num_noches_durmieron,gasto_alojamiento,gasto_alimentacion,gasto_transporte,gasto_visitas,gasto_discotecas,mes_viaje,principal_actividad1,destino_principal_viaje,gasto_tot
0,0,0,20,0,0,0,Febrero,"Visita a festivales, ferias",Gualaceo,20
1,0,0,100,0,0,0,Diciembre,Visita a playas,Playas,100
2,2,0,60,0,0,0,Enero,Visita a atractivos naturales,Machala,60
3,3,0,15,20,0,0,Febrero,Visita a balnearios,Calvas,35
4,1,0,30,10,0,0,Febrero,Visita a balnearios,Paute,40
...,...,...,...,...,...,...,...,...,...,...
2146,2,0,75,0,0,0,Febrero,Actividades relacionadas con su trabajo,Quito,75
2147,0,0,40,0,0,0,Enero,Visita a atractivos naturales,Loreto,40
2148,3,0,20,32,0,0,Febrero,Visita a balnearios,Machala,52
2149,0,0,0,2,2,2,Febrero,Visita a atractivos históricos,Santo Domingo,6


In [4]:
# Show the column labels of the loaded DataFrame.
base_turismo.columns

Index(['num_noches_durmieron', 'gasto_alojamiento', 'gasto_alimentacion',
       'gasto_transporte', 'gasto_visitas', 'gasto_discotecas', 'mes_viaje',
       'principal_actividad1', 'destino_principal_viaje', 'gasto_tot'],
      dtype='object')

In [None]:
# Exploratory analysis of `base_turismo`: text summary, univariate
# distributions, relation of each variable with the total spend, correlations.

# Numeric and categorical columns to explore. The total-spend column is not
# part of `numeric_cols`; it is named once below and used as the y variable
# of the bivariate plots.
numeric_cols = ['num_noches_durmieron', 'gasto_alojamiento',
    'gasto_alimentacion', 'gasto_transporte', 'gasto_visitas',
    'gasto_discotecas'
]

categorical_cols = [
    'mes_viaje', 'principal_actividad1', 'destino_principal_viaje'
]

target_col = 'gasto_tot'

# Numeric columns other than the target. Filtering here keeps the target from
# being plotted against itself should it ever be added to `numeric_cols`.
feature_cols = [col for col in numeric_cols if col != target_col]

# Largest number of dimensions for which the scatter matrix is still drawn.
max_pairplot_dims = 6

# General summary
print("=== Dimensiones del DataFrame ===")
print(base_turismo.shape)
print("\n=== Primeras filas ===")
print(base_turismo.head())
print("\n=== Valores nulos por columna ===")
print(base_turismo.isnull().sum())

# Descriptive statistics for the numeric columns
print("\n=== Estadísticas descriptivas (numéricas) ===")
print(base_turismo[numeric_cols].describe())

# Interactive histograms of the numeric variables
for col in numeric_cols:
    fig = px.histogram(base_turismo, x=col, nbins=30, title=f"Distribución de {col}")
    fig.show()

# Interactive box plots to spot outliers; points="all" also draws every
# individual observation next to the box
for col in numeric_cols:
    fig = px.box(base_turismo, y=col, points="all", title=f"Boxplot de {col}")
    fig.show()

# Frequency of each level of the categorical variables
for col in categorical_cols:
    counts = base_turismo[col].value_counts().reset_index()
    # Name both columns explicitly instead of relying on the labels that
    # reset_index() produces
    counts.columns = [col, 'conteo']
    fig = px.bar(counts, x=col, y='conteo', title=f"Distribución de {col}")
    fig.show()

# Scatter plots: total spend vs. each of the other numeric variables.
# trendline="ols" adds a least-squares fit line; plotly needs the statsmodels
# package installed for it.
for col in feature_cols:
    fig = px.scatter(
        base_turismo, x=col, y=target_col,
        title=f"{target_col} vs {col}",
        trendline="ols"
    )
    fig.show()

# Interactive correlation heatmap. The target is included so that the
# correlation of every numeric variable with the total spend is visible too.
corr = base_turismo[feature_cols + [target_col]].corr()
fig = go.Figure(data=go.Heatmap(
    z=corr.values,
    x=corr.columns,
    y=corr.columns,
    colorscale='Viridis',
    zmin=-1, zmax=1  # fix the colour scale to the full correlation range
))
fig.update_layout(title="Mapa de calor de correlación (numéricas)")
fig.show()

# Box plots of the total spend for each level of every categorical variable
for col in categorical_cols:
    fig = px.box(
        base_turismo,
        x=col,
        y=target_col,
        points="all",
        title=f"{target_col} por {col}")
    fig.show()

# Interactive scatter matrix of the numeric variables, coloured by travel
# month (optional: skipped when there are too many dimensions)
if len(numeric_cols) <= max_pairplot_dims:
    fig = px.scatter_matrix(
        base_turismo,
        dimensions=numeric_cols,
        color='mes_viaje',
        title="Pairplot numéricas")
    fig.show()

=== Dimensiones del DataFrame ===
(2151, 10)

=== Primeras filas ===
   num_noches_durmieron  gasto_alojamiento  gasto_alimentacion  \
0                     0                  0                  20   
1                     0                  0                 100   
2                     2                  0                  60   
3                     3                  0                  15   
4                     1                  0                  30   

   gasto_transporte  gasto_visitas  gasto_discotecas  mes_viaje  \
0                 0              0                 0    Febrero   
1                 0              0                 0  Diciembre   
2                 0              0                 0      Enero   
3                20              0                 0    Febrero   
4                10              0                 0    Febrero   

            principal_actividad1 destino_principal_viaje  gasto_tot  
0    Visita a festivales, ferias                Gualaceo     

In [6]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
base_turismo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2151 entries, 0 to 2150
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   num_noches_durmieron     2151 non-null   int64 
 1   gasto_alojamiento        2151 non-null   int64 
 2   gasto_alimentacion       2151 non-null   int64 
 3   gasto_transporte         2151 non-null   int64 
 4   gasto_visitas            2151 non-null   int64 
 5   gasto_discotecas         2151 non-null   int64 
 6   mes_viaje                2151 non-null   object
 7   principal_actividad1     2151 non-null   object
 8   destino_principal_viaje  2151 non-null   object
 9   gasto_tot                2151 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 168.2+ KB


In [7]:
# Cardinality check: number of distinct values in every object-dtype column.
(
    base_turismo
    .select_dtypes(include='object')
    .nunique()
)

mes_viaje                    3
principal_actividad1         9
destino_principal_viaje    181
dtype: int64

In [8]:
# Inspect the numerical variables: count, mean, std, min, quartiles and max
# of every numeric column.
base_turismo.describe()

Unnamed: 0,num_noches_durmieron,gasto_alojamiento,gasto_alimentacion,gasto_transporte,gasto_visitas,gasto_discotecas,gasto_tot
count,2151.0,2151.0,2151.0,2151.0,2151.0,2151.0,2151.0
mean,1.467225,4.943747,26.617387,11.452813,0.537889,3.299861,46.851697
std,2.405366,21.554519,34.435218,22.096585,2.726328,8.110848,54.039812
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,7.0,0.0,0.0,0.0,15.0
50%,1.0,0.0,20.0,2.0,0.0,0.0,30.0
75%,2.0,0.0,30.0,16.0,0.0,0.0,60.0
max,30.0,400.0,350.0,360.0,30.0,70.0,430.0
