In [104]:
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np 
from IPython.display import display

# Imputación de nulos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames
import warnings
# Silences every warning for the rest of the session, including pandas' own
# diagnostics (e.g. chained-assignment warnings).
warnings.filterwarnings("ignore")


In [105]:
# Load the bookings CSV; its first column is used as the row index.
df = pd.read_csv(
    'project-da-promo-G-module-4-team-3/files /finanzas-hotel-bookings.csv',
    index_col=0,
    low_memory=False,
)


In [106]:
# Preview the loaded frame.
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0
0,Resort Hotel,False,342.000000,2015.0,July,27.0,1.0,0.000000,0.000000,2.0,,0.0,BB,PRT,,Direct,0.0,,0.0,C,C,3.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01 00:00:00,
1,Resort Hotel,False,737.000000,,July,27.0,1.0,0.000000,0.000000,2.0,,0.0,BB,,,Direct,0.0,0.0,0.0,,C,4.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01 00:00:00,
2,Resort Hotel,False,7.000000,2015.0,July,27.0,1.0,0.000000,1.000000,1.0,0.0,0.0,BB,GBR,,Direct,0.0,0.0,0.0,A,C,0.0,,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02 00:00:00,
3,Resort Hotel,False,13.000000,,July,27.0,1.0,0.000000,1.000000,1.0,,0.0,BB,GBR,Corporate,Corporate,0.0,0.0,0.0,A,A,0.0,304.0,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02 00:00:00,
4,Resort Hotel,False,14.000000,,July,,1.0,0.000000,2.000000,2.0,,0.0,BB,,Online TA,TA/TO,0.0,0.0,0.0,A,A,0.0,240.0,,0.0,Transient,98.0,0.0,1.0,Check-Out,2015-07-03 00:00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182872,,,224.337762,,,,,19.005545,19.819823,,,,,,,,,,,,,,,,,,,,,,,
182873,,,390.141963,,,,,14.751794,19.989726,,,,,,,,,,,,,,,,,,,,,,,
182874,,,230.689826,,,,,11.409496,20.461372,,,,,,,,,,,,,,,,,,,,,,,
182875,,,304.888534,,,,,16.744472,15.400773,,,,,,,,,,,,,,,,,,,,,,,


In [107]:
# Label the frame; exploracion_general reads this attribute for its report header.
# NOTE(review): this is a plain Python attribute, not pandas-managed metadata, so
# frames derived from df (e.g. via drop_duplicates) presumably do not carry it — confirm.
df.name = "Dataset Hotel"

In [108]:
def exploracion_general(lista):
    """Print a general exploratory report for one or more DataFrames.

    Args:
        lista: iterable with the DataFrames to explore. A frame may carry a
            ``name`` attribute, which is used in the report header; frames
            without one are reported as "sin nombre".

    Returns:
        None. Everything is printed to standard output. For each frame:
        - describe() of the numeric columns
        - describe() of the object (categorical) columns
        - dtype of every column
        - number of rows and columns
        - the DataFrame.info() summary
        - number of nulls per column
        - number of fully duplicated rows
    """
    for df in lista:
        # Ad-hoc attributes such as .name are not guaranteed to exist on every
        # frame, so fall back to a placeholder instead of raising.
        nombre = getattr(df, "name", "sin nombre")
        print("------Exploracion del dataframe: {} ------".format(nombre))

        print("-------Descripción columnas numéricas:---------")
        # describe() does not raise when there are no numeric columns (it falls
        # back to describing the other columns), so check explicitly.
        if df.select_dtypes(include="number").shape[1] > 0:
            print(df.describe())
        else:
            print("Este DataFrame no contiene columnas numericas")

        print("-------Descripción columnas categoricas:---------")
        try:
            print(df.describe(include="O"))
        except ValueError:
            # pandas raises ValueError when no column matches the requested dtype.
            print("Este DataFrame no contiene columnas categoricas")

        print("------Tipos de datos:---------")
        print(df.dtypes)
        print("------Numero de filas y columnas:------")
        print(df.shape)
        print("------Información adicional:---------")
        # info() writes its summary itself and returns None; wrapping it in
        # print() would add a stray "None" line to the report.
        df.info()
        print("------Cantidad de nulos:---------")
        print(df.isnull().sum())
        print("------Cantidad de duplicados:---------")
        print(df.duplicated().sum())

In [109]:
# Print the general report for the raw frame.
exploracion_general([df])

------Exploracion del dataframe: Dataset Hotel ------
-------Descripción columnas numéricas:---------
           lead_time  arrival_date_year  arrival_date_week_number  \
count  119490.000000       64829.000000             101004.000000   
mean      104.172628        2016.156196                 27.175785   
std       106.975949           0.706674                 13.613871   
min         0.000000        2015.000000                  1.000000   
25%        18.000000        2016.000000                 16.000000   
50%        69.000000        2016.000000                 28.000000   
75%       161.000000        2017.000000                 38.000000   
max       737.000000        2017.000000                 53.000000   

       arrival_date_day_of_month  stays_in_weekend_nights  \
count              119271.000000            119490.000000   
mean                   15.795977                 0.939461   
std                     8.780503                 1.082472   
min                     1.000000

In [110]:
# Drop fully duplicated rows. The explicit copy makes df_sin_dup an independent
# frame, so the in-place column assignments in the following cells never act on
# (or warn about) a slice of df.
df_sin_dup = df.drop_duplicates().copy()

In [111]:
# Replace negative ADR values with NaN.
# Only the "adr" cell is blanked. Comparing against np.nan with == is always
# False, so selecting rows that way yields an empty frame, and assigning it to
# the matching rows would overwrite every column of those rows with NaN.
df_sin_dup.loc[df_sin_dup["adr"] < 0, "adr"] = np.nan

In [112]:
# Round both night-count columns to whole numbers in a single assignment.
df_sin_dup[["stays_in_weekend_nights", "stays_in_week_nights"]] = (
    df_sin_dup[["stays_in_weekend_nights", "stays_in_week_nights"]].round()
)

In [113]:
# English month name -> month number as a string ('1' .. '12').
dicc = {
    mes: str(numero)
    for numero, mes in enumerate(
        [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ],
        start=1,
    )
}

In [114]:
# Convert month names to their month number, stored as strings ('1'..'12').
# Series.map yields NaN for values that are not keys of dicc, so re-running this
# cell on the already-converted column would turn every value into NaN.
df_sin_dup["arrival_date_month"] = df_sin_dup["arrival_date_month"].map(dicc)

In [115]:
# Check the dtype of the mapped month column.
df_sin_dup["arrival_date_month"].dtype

dtype('O')

In [116]:
# List the column labels of the de-duplicated frame.
df_sin_dup.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'agent', 'company',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', '0'],
      dtype='object')

In [117]:
# Show the rows in which the stray "0" column holds a value.
df_sin_dup.loc[df_sin_dup["0"].notna()]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0
119390,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,City Hotel
119391,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
119392,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,226
119394,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,June
119395,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182838,,,210.765755,,,,,15.0,23.0,,,,,,,,,,,,,,,,,,,,,,,Transient
182839,,,343.704995,,,,,20.0,16.0,,,,,,,,,,,,,,,,,,,,,,,108.0
182840,,,339.463972,,,,,13.0,21.0,,,,,,,,,,,,,,,,,,,,,,,0.0
182841,,,231.666473,,,,,15.0,24.0,,,,,,,,,,,,,,,,,,,,,,,0.0


In [118]:
# Write the de-duplicated frame to sin_dup.csv in the working directory.
df_sin_dup.to_csv("sin_dup.csv")

In [119]:
def exploracion_columna(lista):
    """Print a per-column exploratory report for one or more DataFrames.

    Args:
        lista: iterable with the DataFrames to explore. A frame may carry a
            ``name`` attribute, which is used in the report header; frames
            without one are reported as "sin nombre".

    Returns:
        None. Everything is printed to standard output. For each column:
        - frequency of every distinct value (value_counts)
        - number of nulls
        - number of values that repeat an earlier value in the column
    """
    for dataframe in lista:
        # Ad-hoc attributes such as .name are not guaranteed to exist on every
        # frame (e.g. one derived from a labelled frame), so use a placeholder.
        nombre = getattr(dataframe, "name", "sin nombre")
        print("------Exploracion del dataframe: {} ------".format(nombre))
        for columna in dataframe.columns:
            serie = dataframe[columna]
            # Column labels are not necessarily strings (they can be ints),
            # so convert before upper-casing.
            print(f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{str(columna).upper()}' -----------\n")
            print(f"Frecuencia de valores en la columna: \n {serie.value_counts()}")
            print(f"Suma de datos nulos {serie.isnull().sum()}")
            print(f"Suma de datos duplicados {serie.duplicated().sum()}")