In [9]:
# Importaciones 
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np 
from datetime import datetime 
from IPython.display import display

# Imputación de nulos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames
import warnings
warnings.filterwarnings("ignore")


In [10]:
# Apertura del csv a utilizar
df = pd.read_csv('../files_/finanzas-hotel-bookings.csv', index_col= 0, low_memory=False)

In [11]:
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0
0,Resort Hotel,False,342.000000,2015.0,July,27.0,1.0,0.000000,0.000000,2.0,,0.0,BB,PRT,,Direct,0.0,,0.0,C,C,3.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01 00:00:00,
1,Resort Hotel,False,737.000000,,July,27.0,1.0,0.000000,0.000000,2.0,,0.0,BB,,,Direct,0.0,0.0,0.0,,C,4.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01 00:00:00,
2,Resort Hotel,False,7.000000,2015.0,July,27.0,1.0,0.000000,1.000000,1.0,0.0,0.0,BB,GBR,,Direct,0.0,0.0,0.0,A,C,0.0,,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02 00:00:00,
3,Resort Hotel,False,13.000000,,July,27.0,1.0,0.000000,1.000000,1.0,,0.0,BB,GBR,Corporate,Corporate,0.0,0.0,0.0,A,A,0.0,304.0,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02 00:00:00,
4,Resort Hotel,False,14.000000,,July,,1.0,0.000000,2.000000,2.0,,0.0,BB,,Online TA,TA/TO,0.0,0.0,0.0,A,A,0.0,240.0,,0.0,Transient,98.0,0.0,1.0,Check-Out,2015-07-03 00:00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182872,,,224.337762,,,,,19.005545,19.819823,,,,,,,,,,,,,,,,,,,,,,,
182873,,,390.141963,,,,,14.751794,19.989726,,,,,,,,,,,,,,,,,,,,,,,
182874,,,230.689826,,,,,11.409496,20.461372,,,,,,,,,,,,,,,,,,,,,,,
182875,,,304.888534,,,,,16.744472,15.400773,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
df.name = "Dataset Hotel"

In [13]:
def exploracion_general (lista):
    """Esta función proporciona toda la informacion necesaria de uno o varios DataFrame
    
    Args:
    lista : lista de los DataFrame que queremos explorar
    
    Returns:
    La funcion no tiene return pero devuelve varios prints con
    la informacion que necesitamos:
    - Descripciones separadas por columnas numericas y categoricas
    - Tipos de dato por columna
    - Numero total de filas y columnas
    - Informacion adicional
    - Total de nulos
    - Total de duplicados"""


    for df in lista:

        print("------Exploracion del dataframe: {} ------".format(df.name))
        try:
            print("-------Descripción columnas numéricas:---------")
            print(df.describe())
        except:
            print("Este DataFrame no contiene columnas numericas")

        try:
            print("-------Descripción columnas categoricas:---------")
            print(df.describe(include="O"))
        except: 
            print("Este DataFrame no contiene columnas categoricas")

        print("------Tipos de datos:---------")
        print(df.dtypes)
        print("------Numero de filas y columnas:------")
        print(df.shape)
        print("------Información adicional:---------")
        print(df.info())
        print("------Cantidad de nulos:---------")
        print(df.isnull().sum())
        print("------Cantidad de duplicados:---------")
        print(df.duplicated().sum())

In [14]:
exploracion_general([df])

------Exploracion del dataframe: Dataset Hotel ------
-------Descripción columnas numéricas:---------
           lead_time  arrival_date_year  arrival_date_week_number  \
count  119490.000000       64829.000000             101004.000000   
mean      104.172628        2016.156196                 27.175785   
std       106.975949           0.706674                 13.613871   
min         0.000000        2015.000000                  1.000000   
25%        18.000000        2016.000000                 16.000000   
50%        69.000000        2016.000000                 28.000000   
75%       161.000000        2017.000000                 38.000000   
max       737.000000        2017.000000                 53.000000   

       arrival_date_day_of_month  stays_in_weekend_nights  \
count              119271.000000            119490.000000   
mean                   15.795977                 0.939461   
std                     8.780503                 1.082472   
min                     1.000000

In [15]:
# Eliminar duplicados
df_sin_dup = df.drop_duplicates()

In [16]:
# Cambio negativo a nan
df_sin_dup[df_sin_dup["adr"]== -6.38] = df_sin_dup[df_sin_dup["adr"]== np.nan]

In [17]:
# Eliminación de los decimales
df_sin_dup["stays_in_weekend_nights"] = df_sin_dup["stays_in_weekend_nights"].round()
df_sin_dup["stays_in_week_nights"] = df_sin_dup["stays_in_week_nights"].round()

In [18]:
dicc =  {
    'January': '1',
    'February': '2',
    'March': '3',
    'April': '4',
    'May': '5',
    'June': '6',
    'July': '7',
    'August': '8',
    'September': '9',
    'October': '10',
    'November': '11',
    'December': '12'
}

In [19]:
df_sin_dup["arrival_date_month"] = df_sin_dup["arrival_date_month"].map(dicc)

In [20]:
df_sin_dup["arrival_date_month"].dtype

dtype('O')

In [21]:
df_sin_dup.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'agent', 'company',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', '0'],
      dtype='object')

In [22]:
df_sin_dup[df_sin_dup["0"].notnull()]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0
119390,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,City Hotel
119391,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
119392,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,226
119394,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,June
119395,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182838,,,210.765755,,,,,15.0,23.0,,,,,,,,,,,,,,,,,,,,,,,Transient
182839,,,343.704995,,,,,20.0,16.0,,,,,,,,,,,,,,,,,,,,,,,108.0
182840,,,339.463972,,,,,13.0,21.0,,,,,,,,,,,,,,,,,,,,,,,0.0
182841,,,231.666473,,,,,15.0,24.0,,,,,,,,,,,,,,,,,,,,,,,0.0


In [23]:
df_sin_dup.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'agent', 'company',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', '0'],
      dtype='object')

In [24]:
df_sin_dup["arrival_date_month"].unique()

array(['7', '8', '9', '10', '11', '12', nan, '1', '2', '3', '4', '5', '6'],
      dtype=object)

In [25]:
df_sin_dup.rename

<bound method DataFrame.rename of                hotel is_canceled   lead_time  arrival_date_year  \
0       Resort Hotel       False  342.000000             2015.0   
1       Resort Hotel       False  737.000000                NaN   
2       Resort Hotel       False    7.000000             2015.0   
3       Resort Hotel       False   13.000000                NaN   
4       Resort Hotel       False   14.000000                NaN   
...              ...         ...         ...                ...   
182872           NaN         NaN  224.337762                NaN   
182873           NaN         NaN  390.141963                NaN   
182874           NaN         NaN  230.689826                NaN   
182875           NaN         NaN  304.888534                NaN   
182876           NaN         NaN  341.238166                NaN   

       arrival_date_month  arrival_date_week_number  \
0                       7                      27.0   
1                       7                      27.0

In [26]:
# Definir el mapeo para renombrar las columnas en minúsculas
rename_mapping = {
    'hotel': 'hotel',
    'is_canceled': 'canceled',
    'lead_time': 'lead_time',
    'arrival_date_year': 'arrival_year',
    'arrival_date_month': 'arrival_month',
    'arrival_date_week_number': 'arrival_week',
    'arrival_date_day_of_month': 'arrival_day',
    'stays_in_weekend_nights': 'weekend_nights',
    'stays_in_week_nights': 'week_nights',
    'adults': 'adults',
    'children': 'children',
    'babies': 'babies',
    'meal': 'meal',
    'country': 'country',
    'market_segment': 'market_segment',
    'distribution_channel': 'distribution_channel',
    'is_repeated_guest': 'repeated_guest',
    'previous_cancellations': 'prev_cancellations',
    'previous_bookings_not_canceled': 'prev_not_canceled',
    'reserved_room_type': 'reserved_room_type',
    'assigned_room_type': 'assigned_room_type',
    'booking_changes': 'booking_changes',
    'agent': 'agent',
    'company': 'company',
    'days_in_waiting_list': 'days_waiting_list',
    'customer_type': 'customer_type',
    'adr': 'adr',
    'required_car_parking_spaces': 'parking_spaces',
    'total_of_special_requests': 'special_requests',
    'reservation_status': 'reservation_status',
    'reservation_status_date': 'status_date',
    '0': '0'
}

# Renombrar las columnas según el mapeo en minúsculas
df = df_sin_dup.rename(columns=rename_mapping)

In [27]:
df.columns

Index(['hotel', 'canceled', 'lead_time', 'arrival_year', 'arrival_month',
       'arrival_week', 'arrival_day', 'weekend_nights', 'week_nights',
       'adults', 'children', 'babies', 'meal', 'country', 'market_segment',
       'distribution_channel', 'repeated_guest', 'prev_cancellations',
       'prev_not_canceled', 'reserved_room_type', 'assigned_room_type',
       'booking_changes', 'agent', 'company', 'days_waiting_list',
       'customer_type', 'adr', 'parking_spaces', 'special_requests',
       'reservation_status', 'status_date', '0'],
      dtype='object')

In [28]:
# Modificacion columna 0
df["0"].unique()

array([nan, 'City Hotel', 'True', ..., '29/10/15', '197.1', '4/02/17'],
      dtype=object)

In [29]:
len(df)

119837

In [30]:
df_clean = df[df["0"].isnull()] # Identificamos las filas con valores nulos en la columna "0"

In [31]:
len(df_clean)

118115

In [32]:
df_clean.drop("0", axis=1, inplace=True) # Eliminamos la columa "0"

In [33]:
df_clean.columns

Index(['hotel', 'canceled', 'lead_time', 'arrival_year', 'arrival_month',
       'arrival_week', 'arrival_day', 'weekend_nights', 'week_nights',
       'adults', 'children', 'babies', 'meal', 'country', 'market_segment',
       'distribution_channel', 'repeated_guest', 'prev_cancellations',
       'prev_not_canceled', 'reserved_room_type', 'assigned_room_type',
       'booking_changes', 'agent', 'company', 'days_waiting_list',
       'customer_type', 'adr', 'parking_spaces', 'special_requests',
       'reservation_status', 'status_date'],
      dtype='object')

In [34]:
# Modificación columna "status_date"
df_clean["status_date"].unique()

array(['2015-07-01 00:00:00', '2015-07-02 00:00:00',
       '2015-07-03 00:00:00', '2015-05-06 00:00:00',
       '2015-04-22 00:00:00', '2015-06-23 00:00:00',
       '2015-07-05 00:00:00', nan, '2015-07-07 00:00:00',
       '2015-07-08 00:00:00', '2015-05-11 00:00:00',
       '2015-07-15 00:00:00', '2015-07-16 00:00:00',
       '2015-05-29 00:00:00', '2015-05-19 00:00:00',
       '2015-06-19 00:00:00', '2015-07-06 00:00:00',
       '2015-05-23 00:00:00', '2015-05-18 00:00:00',
       '2015-07-09 00:00:00', '2015-06-02 00:00:00',
       '2015-07-04 00:00:00', '2015-06-29 00:00:00',
       '2015-06-16 00:00:00', '2015-06-18 00:00:00',
       '2015-06-12 00:00:00', '2015-06-09 00:00:00',
       '2015-05-26 00:00:00', '2015-07-11 00:00:00',
       '2015-07-12 00:00:00', '2015-07-13 00:00:00',
       '2015-07-17 00:00:00', '2015-04-15 00:00:00',
       '2015-05-13 00:00:00', '2015-07-10 00:00:00',
       '2015-05-20 00:00:00', '2015-05-12 00:00:00',
       '2015-07-14 00:00:00', '2015-06-17

In [35]:
df_clean["status_date"].isnull().sum()

12974

In [36]:
df_clean["status_date"] = pd.to_datetime(df_clean["status_date"], errors = "coerce")

In [37]:
# Ver fecha presente
fecha_actual = datetime.now()    # Obtener la fecha actual
fecha_actual

datetime.datetime(2024, 4, 23, 12, 53, 54, 630440)

In [38]:
# Ver fechas pasadas
fechas_pasadas = df_clean[df_clean['status_date'] > fecha_actual]   # Filtrar fechas pasadas
fechas_pasadas

Unnamed: 0,hotel,canceled,lead_time,arrival_year,arrival_month,arrival_week,arrival_day,weekend_nights,week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,repeated_guest,prev_cancellations,prev_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_waiting_list,customer_type,adr,parking_spaces,special_requests,reservation_status,status_date


In [39]:
# Verificar duplicados
duplicados = df_clean[df_clean.duplicated()]
duplicados

Unnamed: 0,hotel,canceled,lead_time,arrival_year,arrival_month,arrival_week,arrival_day,weekend_nights,week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,repeated_guest,prev_cancellations,prev_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_waiting_list,customer_type,adr,parking_spaces,special_requests,reservation_status,status_date
119393,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT
122688,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT
131036,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT
152300,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT
159155,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT
182004,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT


In [40]:
df_clean.drop_duplicates()

Unnamed: 0,hotel,canceled,lead_time,arrival_year,arrival_month,arrival_week,arrival_day,weekend_nights,week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,repeated_guest,prev_cancellations,prev_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_waiting_list,customer_type,adr,parking_spaces,special_requests,reservation_status,status_date
0,Resort Hotel,False,342.000000,2015.0,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,PRT,,Direct,0.0,,0.0,C,C,3.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01
1,Resort Hotel,False,737.000000,,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,,,Direct,0.0,0.0,0.0,,C,4.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01
2,Resort Hotel,False,7.000000,2015.0,7,27.0,1.0,0.0,1.0,1.0,0.0,0.0,BB,GBR,,Direct,0.0,0.0,0.0,A,C,0.0,,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02
3,Resort Hotel,False,13.000000,,7,27.0,1.0,0.0,1.0,1.0,,0.0,BB,GBR,Corporate,Corporate,0.0,0.0,0.0,A,A,0.0,304.0,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02
4,Resort Hotel,False,14.000000,,7,,1.0,0.0,2.0,2.0,,0.0,BB,,Online TA,TA/TO,0.0,0.0,0.0,A,A,0.0,240.0,,0.0,Transient,98.0,0.0,1.0,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182872,,,224.337762,,,,,19.0,20.0,,,,,,,,,,,,,,,,,,,,,,NaT
182873,,,390.141963,,,,,15.0,20.0,,,,,,,,,,,,,,,,,,,,,,NaT
182874,,,230.689826,,,,,11.0,20.0,,,,,,,,,,,,,,,,,,,,,,NaT
182875,,,304.888534,,,,,17.0,15.0,,,,,,,,,,,,,,,,,,,,,,NaT


In [41]:
# Verificar valores nulos
df_clean.isnull().sum()

hotel                      111
canceled                   111
lead_time                   39
arrival_year             54093
arrival_month             9454
arrival_week             18449
arrival_day                230
weekend_nights              39
week_nights                 39
adults                     101
children                 49732
babies                     104
meal                       111
country                  53864
market_segment           58875
distribution_channel     13577
repeated_guest            5123
prev_cancellations       42963
prev_not_canceled          111
reserved_room_type       39918
assigned_room_type         111
booking_changes            106
agent                    16261
company                 114266
days_waiting_list          111
customer_type            25204
adr                        111
parking_spaces             111
special_requests           111
reservation_status         111
status_date              13009
dtype: int64

In [42]:
porcentaje_nulos = (df_clean.isnull().sum()/ df_clean.shape[0]) * 100  # Calcular el porcentaje de valores nulos en 'status_date'
porcentaje_nulos

hotel                    0.093976
canceled                 0.093976
lead_time                0.033019
arrival_year            45.796893
arrival_month            8.004064
arrival_week            15.619523
arrival_day              0.194725
weekend_nights           0.033019
week_nights              0.033019
adults                   0.085510
children                42.104728
babies                   0.088050
meal                     0.093976
country                 45.603014
market_segment          49.845490
distribution_channel    11.494730
repeated_guest           4.337298
prev_cancellations      36.373873
prev_not_canceled        0.093976
reserved_room_type      33.795877
assigned_room_type       0.093976
booking_changes          0.089743
agent                   13.767091
company                 96.741311
days_waiting_list        0.093976
customer_type           21.338526
adr                      0.093976
parking_spaces           0.093976
special_requests         0.093976
reservation_st

In [43]:
df_clean["market_segment"].value_counts()

Online TA        28313
Offline TA/TO    11833
Groups            9626
Direct            6351
Corporate         2635
Complementary      363
Aviation           118
Undefined            1
Name: market_segment, dtype: int64

In [44]:
df_clean["distribution_channel"].unique()

array(['Direct', 'Corporate', 'TA/TO', nan, 'Undefined', 'GDS'],
      dtype=object)

In [45]:
df_clean[(df_clean["distribution_channel"].isnull()) & (df_clean["market_segment"].notnull())][["distribution_channel", "market_segment"]]

Unnamed: 0,distribution_channel,market_segment
16,,Offline TA/TO
39,,Direct
44,,Online TA
45,,Online TA
91,,Online TA
...,...,...
119277,,Groups
119304,,Direct
119312,,Online TA
119314,,Offline TA/TO


In [46]:
# Modificar las columnas con datos "Nan" a "Desconocido" de las siguiente lista de columnas.

lista_columnas = ['hotel', 'canceled', 'arrival_year', 'arrival_month', 'arrival_week', 'arrival_day', 'meal', 'country', 'market_segment', 'distribution_channel', 'repeated_guest', 'reserved_room_type', 'assigned_room_type', 'agent', 'company', 'customer_type', 'reservation_status']

for columna in lista_columnas: # Iterar sobre cada columna en la lista de columnas
    df_clean[columna] = df_clean[columna].fillna("Desconocido") # Reemplazar valores nulos en la columna actual con "Desconocido"

In [47]:
df_clean.head()  # Verificar los cambios realizados

Unnamed: 0,hotel,canceled,lead_time,arrival_year,arrival_month,arrival_week,arrival_day,weekend_nights,week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,repeated_guest,prev_cancellations,prev_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_waiting_list,customer_type,adr,parking_spaces,special_requests,reservation_status,status_date
0,Resort Hotel,False,342.0,2015.0,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,PRT,Desconocido,Direct,0.0,,0.0,C,C,3.0,Desconocido,Desconocido,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01
1,Resort Hotel,False,737.0,Desconocido,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,Desconocido,Desconocido,Direct,0.0,0.0,0.0,Desconocido,C,4.0,Desconocido,Desconocido,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01
2,Resort Hotel,False,7.0,2015.0,7,27.0,1.0,0.0,1.0,1.0,0.0,0.0,BB,GBR,Desconocido,Direct,0.0,0.0,0.0,A,C,0.0,Desconocido,Desconocido,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02
3,Resort Hotel,False,13.0,Desconocido,7,27.0,1.0,0.0,1.0,1.0,,0.0,BB,GBR,Corporate,Corporate,0.0,0.0,0.0,A,A,0.0,304.0,Desconocido,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02
4,Resort Hotel,False,14.0,Desconocido,7,Desconocido,1.0,0.0,2.0,2.0,,0.0,BB,Desconocido,Online TA,TA/TO,0.0,0.0,0.0,A,A,0.0,240.0,Desconocido,0.0,Transient,98.0,0.0,1.0,Check-Out,2015-07-03


In [48]:
# Redondear los datos de las columnas seleccionadas (lista_redondear)
lista_redondear = ['lead_time', 'weekend_nights', 'week_nights','adults', 'children', 'babies','prev_cancellations','prev_not_canceled', 'booking_changes', 'days_waiting_list','adr','parking_spaces','special_requests']

df_clean[lista_redondear] = df_clean[lista_redondear].apply(lambda x: round(x))  # Redondear las columnas numéricas en la lista

In [49]:
df_clean.head() # Verificar los cambios

Unnamed: 0,hotel,canceled,lead_time,arrival_year,arrival_month,arrival_week,arrival_day,weekend_nights,week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,repeated_guest,prev_cancellations,prev_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_waiting_list,customer_type,adr,parking_spaces,special_requests,reservation_status,status_date
0,Resort Hotel,False,342.0,2015.0,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,PRT,Desconocido,Direct,0.0,,0.0,C,C,3.0,Desconocido,Desconocido,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01
1,Resort Hotel,False,737.0,Desconocido,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,Desconocido,Desconocido,Direct,0.0,0.0,0.0,Desconocido,C,4.0,Desconocido,Desconocido,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01
2,Resort Hotel,False,7.0,2015.0,7,27.0,1.0,0.0,1.0,1.0,0.0,0.0,BB,GBR,Desconocido,Direct,0.0,0.0,0.0,A,C,0.0,Desconocido,Desconocido,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02
3,Resort Hotel,False,13.0,Desconocido,7,27.0,1.0,0.0,1.0,1.0,,0.0,BB,GBR,Corporate,Corporate,0.0,0.0,0.0,A,A,0.0,304.0,Desconocido,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02
4,Resort Hotel,False,14.0,Desconocido,7,Desconocido,1.0,0.0,2.0,2.0,,0.0,BB,Desconocido,Online TA,TA/TO,0.0,0.0,0.0,A,A,0.0,240.0,Desconocido,0.0,Transient,98.0,0.0,1.0,Check-Out,2015-07-03


In [50]:
# creamos columna arrival_date, uniendo arrival_year, arrival_month y arrival_day
# Definimos una función para manejar la conversión condicional
def create_date(row):
    if row['arrival_year'] != "Desconocido" and row['arrival_month'] != "Desconocido" and row['arrival_day'] != "Desconocido":
        # Convertimos a entero antes de pasar a pd.to_datetime
        year = int(float(row['arrival_year']))
        month = int(float(row['arrival_month']))
        day = int(float(row['arrival_day']))
        return pd.to_datetime(f"{year}-{month}-{day}")
    else:
        return pd.NaT

# Aplicamos la función a cada fila del DataFrame
df_clean['arrival_date'] = df_clean.apply(create_date, axis=1)

In [51]:
df_clean["arrival_year"].value_counts() ## podriamos imputar los años?

Desconocido    54093
2016.0         30461
2017.0         21867
2015.0         11694
Name: arrival_year, dtype: int64

In [56]:
len(df_clean["country"].unique())

164

In [53]:
df_clean.head() # Verificar los cambios

Unnamed: 0,hotel,canceled,lead_time,arrival_year,arrival_month,arrival_week,arrival_day,weekend_nights,week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,repeated_guest,prev_cancellations,prev_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_waiting_list,customer_type,adr,parking_spaces,special_requests,reservation_status,status_date,arrival_date
0,Resort Hotel,False,342.0,2015.0,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,PRT,Desconocido,Direct,0.0,,0.0,C,C,3.0,Desconocido,Desconocido,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01,2015-07-01
1,Resort Hotel,False,737.0,Desconocido,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,Desconocido,Desconocido,Direct,0.0,0.0,0.0,Desconocido,C,4.0,Desconocido,Desconocido,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01,NaT
2,Resort Hotel,False,7.0,2015.0,7,27.0,1.0,0.0,1.0,1.0,0.0,0.0,BB,GBR,Desconocido,Direct,0.0,0.0,0.0,A,C,0.0,Desconocido,Desconocido,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02,2015-07-01
3,Resort Hotel,False,13.0,Desconocido,7,27.0,1.0,0.0,1.0,1.0,,0.0,BB,GBR,Corporate,Corporate,0.0,0.0,0.0,A,A,0.0,304.0,Desconocido,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02,NaT
4,Resort Hotel,False,14.0,Desconocido,7,Desconocido,1.0,0.0,2.0,2.0,,0.0,BB,Desconocido,Online TA,TA/TO,0.0,0.0,0.0,A,A,0.0,240.0,Desconocido,0.0,Transient,98.0,0.0,1.0,Check-Out,2015-07-03,NaT


In [54]:
df_clean.to_csv("final_clean.csv")

## Preguntas que nos podemos hacer:

¿Cuál es la tasa de cancelación de este hotel?

Ha cambiado la tasa de cancelación con el tiempo.

Se cancelan más las reservas de verano que de invierno. Se cancelan más las reservas de entre semana que las de los fines de semana.

Los clientes que cancelan presentan algún tipo de característica demográfica.

Las reservas que se hacen con mayor anticipación tienen mucho riesgo de cancelarse.

Las reservas que incluyen hijos tienen menor riesgo.

Los usuarios que realizaron algún cambio en su reserva tienen menor riesgo.

Cuando el usuario ha realizado una solicitud especial el riesgo es menor.

Las reservas que tienen un “adr” bajo el riesgo es menor.

Relación con otras variables: Analizar cómo otras variables, como el tipo de comida incluida en la reserva o el número de solicitudes especiales, están relacionadas con las cancelaciones.