In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import chi2_contingency
from IPython.display import display
from itertools import combinations
from scipy.stats import kstest, spearmanr, pearsonr
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None) # show every DataFrame column instead of truncating wide frames


class AutoEDA:

    def __init__(self):
        self.colors = ["#2146B2", "#E0CA27", "#F8C895", "#D98162", "#F2EFEB", "#26261B"]

    def read_file(self, file_path):
        """
        Reads a file and returns a pandas DataFrame.

        The reader is chosen from the text after the last dot of the path
        (case-insensitive): csv, xls/xlsx, json or pkl.

        Parameters:
        - file_path (str or os.PathLike): The path to the file.

        Returns:
        - DataFrame: A pandas DataFrame when the file was read successfully.
        - str: The message "File reading failed, error: <error>." when the
          extension is not supported or the underlying reader raises. No
          exception is propagated, so callers must check the type of the
          result before treating it as a DataFrame.
        """
        import os  # local import: `os` is not part of this module's import block

        try:
            # os.fspath accepts both str and pathlib.Path objects; calling
            # `.split` directly on the argument only worked for str.
            path_text = os.fspath(file_path)

            # Determine the file extension
            file_extension = path_text.rsplit('.', 1)[-1].lower()

            # Read the file based on the file extension
            if file_extension == 'csv':
                # The first column of the CSV is used as the index.
                return pd.read_csv(file_path, index_col=0)
            elif file_extension in ['xls', 'xlsx']:
                return pd.read_excel(file_path)
            elif file_extension == 'json':
                return pd.read_json(file_path)
            elif file_extension == 'pkl':
                # NOTE(security): unpickling can execute arbitrary code; only
                # load pickle files that come from a trusted source.
                return pd.read_pickle(file_path)
            else:
                raise ValueError("Reading this format is not yet implemented.")
        except Exception as e:
            # Deliberate best-effort contract: report the failure as text.
            return f"File reading failed, error: {e}."


    def explo_df(self, DataFrame, column=None):
        """
        Explores a DataFrame or a specific column and prints various statistics.

        Parameters:
        - DataFrame: The DataFrame to explore.
        - column (str or list, optional): The column or columns to explore. If None, explore the entire DataFrame.

        Returns:
        - None. Every result is printed or rendered with display().
        """
        if column is None:
            # General DataFrame exploration
            print("DataFrame Information:")
            # info() prints its own report and returns None, so wrapping it in
            # display() only appended a stray "None" to the output.
            DataFrame.info()
            print("\nFirst 10 rows of the DataFrame:")
            display(DataFrame.head(10))
            print("\nLast 10 rows of the DataFrame:")
            display(DataFrame.tail(10))
            print("\nStatistical description of the DataFrame (numeric):")
            display(DataFrame.describe().T)
            print("\nStatistical description of the DataFrame (categorical):")
            try:
                display(DataFrame.describe(include='object').T)
            except ValueError:
                # describe() raises ValueError when no column matches the
                # requested dtype; report that instead of aborting the run.
                print("There are no categorical (object) columns.")
            print("\nCount of null values per column:")
            null_counts = DataFrame.isnull().sum()
            display(null_counts)
            print("\nPercentage of null values per column (only columns with nulls):")
            null_percentage = round(null_counts / DataFrame.shape[0] * 100, 2)
            display(null_percentage[null_percentage > 0])
            print("\nRows with all values as null:")
            all_null_rows = DataFrame[DataFrame.isnull().all(axis=1)]
            if not all_null_rows.empty:
                display(all_null_rows)
            else:
                print("There is no rows with all values as null.")
            print("\nCount of duplicate rows:")
            display(DataFrame.duplicated().sum())
        else:
            # Column(s) exploration
            if isinstance(column, str):
                column = [column]  # Convert to list if a single column is passed as a string
            for col in column:
                series = DataFrame[col]
                print(f"\nExploration of the column: {col}")
                # Recognise every numeric dtype (int32, float32, nullable
                # integers, ...), not only int64/float64. Booleans stay on the
                # categorical side, as before.
                is_numeric = (
                    pd.api.types.is_numeric_dtype(series)
                    and not pd.api.types.is_bool_dtype(series)
                )
                if is_numeric:
                    print("\nStatistical description (numeric):")
                else:
                    print("\nStatistical description (categorical):")
                # Series.describe() selects the summary from the dtype itself;
                # the `include` argument has no effect on a Series.
                display(series.describe())
                print(f"\nCount of null values: {series.isnull().sum()}")
                print(f"\nUnique values: {series.unique()}")
                print(f"\nValue Counts: {series.value_counts()}")
                print(f"\nCount of duplicates in the column: {DataFrame.duplicated(subset=[col]).sum()}")

    
    def __identify_linearity(self, dataframe, column_combinations_list):
        """
        Identifies if the relationships between pairs of variables in a DataFrame are linear or not.

        Parameters:
        -----------
        dataframe : pandas.DataFrame
            The DataFrame containing the variables to be analyzed.

        column_combinations_list : list of tuples
            A list of tuples where each tuple contains two column names from the DataFrame to be analyzed.

        Returns:
        --------
        linear_relationships : list of tuples
            A list of tuples containing the names of the columns that have a linear relationship.

        non_linear_relationships : list of tuples
            A list of tuples containing the names of the columns that do not have a linear relationship.
        """
        linear_relationships = []
        non_linear_relationships = []

        for pair in column_combinations_list: 
            # Perform the normality test
            _, p_value1 = kstest(dataframe[pair[0]], "norm")
            _, p_value2 = kstest(dataframe[pair[1]], "norm")

            if p_value1 > 0.05 and p_value2 > 0.05:
                linear_relationships.append(pair)
            else:
                non_linear_relationships.append(pair)

        return linear_relationships, non_linear_relationships


    def numeric_correlations(self, dataframe):
        """
        Identifies correlations among numeric columns in the dataframe using Pearson or Spearman methods.

        Parameters:
        -----------
        dataframe : pandas.DataFrame
            The DataFrame containing the variables to analyze.

        Returns:
        --------
        results : dict
            A dictionary containing the correlation DataFrames. The keys are 'pearson' and 'spearman'.
            If all relationships are either linear or non-linear, only one key will be present.
        """
        # Select numeric columns
        numerics = dataframe.select_dtypes(include=np.number).columns
        
        # Generate all possible combinations of numeric columns
        num_combinations = list(combinations(numerics, 2))
        
        # Identify if the relationships are linear or non-linear
        linear, non_linear = self.__identify_linearity(dataframe, num_combinations)
        
        # Initialize the results dictionary
        results = {}

        if linear:
            # Apply Pearson correlation for linear relationships
            linear_columns = set([item for sublist in linear for item in sublist])
            df_pearson = dataframe[list(linear_columns)].corr(method="pearson")
            results['pearson'] = df_pearson

        if non_linear:
            # Apply Spearman correlation for non-linear relationships
            non_linear_columns = set([item for sublist in non_linear for item in sublist])
            df_spearman = dataframe[list(non_linear_columns)].corr(method="spearman")
            results['spearman'] = df_spearman
        
        return results


    def classify_correlations(self, correlation_df):
        """
        Print every off-diagonal correlation grouped by strength.

        The absolute value decides the group: weak is [0.1, 0.3), moderate is
        [0.3, 0.7) and strong is 0.7 or more. Values below 0.1 (and NaN) are
        not printed. Each unordered pair of labels is reported once, using the
        first orientation met while scanning rows, then columns.

        Parameters:
        -----------
        correlation_df : pandas.DataFrame
            DataFrame containing the correlation values between pairs of variables.

        Returns:
        --------
        None
        """
        weak, moderate, strong = [], [], []
        seen = set()  # both orientations of every pair already handled

        for row_label in correlation_df.index:
            for col_label in correlation_df.columns:
                if row_label == col_label or (col_label, row_label) in seen:
                    continue

                value = correlation_df.at[row_label, col_label]
                seen.update(((row_label, col_label), (col_label, row_label)))

                strength = abs(value)
                entry = (row_label, col_label, value)
                if strength >= 0.7:
                    strong.append(entry)
                elif strength >= 0.3:
                    moderate.append(entry)
                elif strength >= 0.1:
                    weak.append(entry)

        sections = (
            ("Weak Correlations:", weak),
            ("\nModerate Correlations:", moderate),
            ("\nStrong Correlations:", strong),
        )
        for heading, entries in sections:
            print(heading)
            for first, second, value in entries:
                print(f"Between {first} and {second}: {value:.2f}")

        # Nothing is returned: the report is the printed text.
        return None


    def identify_categorical_cols(self, df):
        """Return the labels of the columns of ``df`` whose dtype is object."""
        object_only = df.select_dtypes(include='O')
        return object_only.columns
 
    def __cramers_v(self, confusion_matrix):
        chi2 = chi2_contingency(confusion_matrix)[0]
        n = confusion_matrix.sum().sum()
        r, k = confusion_matrix.shape
        return np.sqrt(chi2 / (n * (min(k, r) - 1)))

    def categorical_correlations(self, df, categorical_columns):
        correlations = []
        processed_pairs = set()  # Para rastrear los pares ya procesados

        
        for col1 in categorical_columns:
            for col2 in categorical_columns:
                if col1 != col2 and (col2, col1) not in processed_pairs:
                    confusion_matrix = pd.crosstab(df[col1], df[col2])
                    correlation = self.__cramers_v(confusion_matrix)
                    correlations.append((col1, col2, correlation))
                    processed_pairs.add((col1, col2))  # Añadir el par a los procesados

        weak_correlations = [item for item in correlations if 0.1 <= item[2] < 0.3]
        moderate_correlations = [item for item in correlations if 0.3 <= item[2] < 0.5]
        strong_correlations = [item for item in correlations if item[2] >= 0.5]
    
        # Print the results
        print("Weak Correlations:")
        for item in weak_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        print("\nModerate Correlations:")
        for item in moderate_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        print("\nStrong Correlations:")
        for item in strong_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        # Return None, as we're only printing the results
        return None


    def plot_histogram(self, df, column, bins=10, title=None, xlabel=None, ylabel='Frequency'):
        """
        Draw and show a histogram of one DataFrame column.

        Missing values are dropped before binning. The bars use the first
        colour of the palette with black edges.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data.
        column : str
            The column for which the histogram is to be plotted.
        bins : int, optional (default=10)
            Number of bins for the histogram.
        title : str, optional
            Title of the plot; defaults to 'Histogram of <column>'.
        xlabel : str, optional
            Label for the x-axis; defaults to the column name.
        ylabel : str, optional (default='Frequency')
            Label for the y-axis.
        """
        values = df[column].dropna()

        _, axis = plt.subplots(figsize=(8, 4))
        axis.hist(values, bins=bins, edgecolor='k', color=self.colors[0])
        axis.set_title(title or f'Histogram of {column}')
        axis.set_xlabel(xlabel or column)
        axis.set_ylabel(ylabel)
        plt.show()


    def plot_scatter(self, df, x_column, y_column, title=None, xlabel=None, ylabel=None):
        """
        Draw and show a scatter plot of two DataFrame columns.

        The points use the second colour of the palette.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data.
        x_column : str
            The column for the x-axis.
        y_column : str
            The column for the y-axis.
        title : str, optional
            Title of the plot; defaults to 'Scatter Plot of <x_column> vs <y_column>'.
        xlabel : str, optional
            Label for the x-axis; defaults to x_column.
        ylabel : str, optional
            Label for the y-axis; defaults to y_column.
        """
        _, axis = plt.subplots(figsize=(8, 4))
        axis.scatter(df[x_column], df[y_column], color=self.colors[1])
        axis.set_title(title or f'Scatter Plot of {x_column} vs {y_column}')
        axis.set_xlabel(xlabel or x_column)
        axis.set_ylabel(ylabel or y_column)
        plt.show()


    def plot_boxplot(self, df, column1, column2=None, title=None, xlabel=None, ylabel='Value'):
        """
        Plot a boxplot for one column or a comparison of two columns in the DataFrame.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data.
        column1 : str
            The primary column for which the boxplot is to be plotted.
        column2 : str, optional
            The secondary column to compare with the primary column (for grouped boxplot).
        title : str, optional
            Title of the plot.
        xlabel : str, optional
            Label for the x-axis. In the grouped plot it defaults to column2;
            in the single-column plot no x label is set unless one is given.
        ylabel : str, optional (default='Value')
            Label for the y-axis.
        """
        if column2 is None:
            # Single column boxplot
            plt.figure(figsize=(8, 4))
            sns.boxplot(y=df[column1], color=self.colors[3])
            plt.title(title if title else f'Boxplot of {column1}')
            if xlabel:
                # Previously the xlabel argument was accepted but never used.
                plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            plt.show()
        else:
            # Comparison of two columns
            plt.figure(figsize=(10, 6))
            sns.boxplot(x=df[column2], y=df[column1], palette=self.colors)
            plt.title(title if title else f'Boxplot of {column1} by {column2}')
            # Honour a caller-supplied label, as the other plot methods do.
            plt.xlabel(xlabel if xlabel else column2)
            plt.ylabel(ylabel)
            plt.show()


    def visualize_pairplot(self, dataframe, columns, hue=None, height=5):
        """
        Visualize pair plots for selected columns of the DataFrame.

        Parameters:
        -----------
        dataframe : pandas.DataFrame
            The DataFrame containing the data to be visualized.

        columns : list of str
            List of column names to be included in the pair plot. Any sequence
            of names is accepted.

        hue : str, optional, default: None
            Column name to be used for color encoding. It may or may not also
            be listed in ``columns``.

        height : int, optional, default: 5
            Height of each facet in inches.

        Returns:
        --------
        None

        Raises:
        -------
        ValueError
            If any name in ``columns``, or ``hue``, is not a column of the DataFrame.
        """
        columns = list(columns)  # allow tuples / Index objects as well as lists

        if not all(col in dataframe.columns for col in columns):
            raise ValueError("One or more columns are not in the DataFrame.")
        if hue and hue not in dataframe.columns:
            raise ValueError(f"The hue column '{hue}' is not in the DataFrame.")

        # Append the hue column only when it is not already selected; selecting
        # it twice would produce a frame with duplicated column labels.
        if hue and hue not in columns:
            plot_columns = columns + [hue]
        else:
            plot_columns = columns

        # Create the pairplot with the specified hue and color palette
        sns.pairplot(dataframe[plot_columns], hue=hue, palette=self.colors, height=height)

        plt.tight_layout()
        plt.show()

    def visualize_categorical_counts(self, dataframe, categorical_cols,
                                     rotate_labels_for=('JobRole', 'EducationField')):
        """
        Visualize count plots for categorical columns in the DataFrame.

        The plots are laid out on a grid with two columns. When the number of
        categorical columns is odd, the unused last cell is hidden.

        Parameters:
        -----------
        dataframe : pandas.DataFrame
            The DataFrame containing the data to be visualized.

        categorical_cols : list of str
            List of categorical column names to be included in the count plots.

        rotate_labels_for : collection of str, optional
            Columns whose tick labels are drawn vertically in a smaller font.
            Defaults to ('JobRole', 'EducationField').

        Returns:
        --------
        None

        Raises:
        -------
        ValueError
            If ``categorical_cols`` is empty.
        """
        categorical_cols = list(categorical_cols)
        num_cols = len(categorical_cols)
        if num_cols == 0:
            raise ValueError("The 'categorical_cols' list must contain at least one column.")

        num_rows = (num_cols + 1) // 2  # Calculate number of rows needed for subplots
        # squeeze=False always yields a 2-D array of axes, so one row and many
        # rows are handled the same way. Wrapping the one-row result in a list
        # made ax[0] an array of axes instead of a single axes.
        fig, ax = plt.subplots(num_rows, 2, figsize=(10, num_rows * 4), squeeze=False)
        fig.subplots_adjust(hspace=0.5)
        axes = ax.flatten()

        def count_plotter(ax, col, data, colors):
            counted = data[col].value_counts()
            palette = colors[:len(counted)]  # Use only as many colors as there are categories
            sns.barplot(ax=ax, x=counted.index, y=counted.values, width=0.9, palette=palette)
            ax.set_title(f"{col} count graph")
            if col in rotate_labels_for:
                ax.set_xticklabels(labels=counted.index, rotation=90, fontsize=6)
            else:
                ax.set_xticklabels(labels=counted.index, fontsize=8)

        for axis, category in zip(axes, categorical_cols):
            count_plotter(axis, category, data=dataframe, colors=self.colors)

        # Hide the grid cells that received no plot (odd number of columns).
        for unused_axis in axes[num_cols:]:
            unused_axis.axis('off')

        plt.show()

    def visualize_facet_grid(self, df, col_names, x_values, hue='Gender'):
        """
        Create FacetGrid plots for given categorical columns and numerical values.

        One figure is produced per (col_name, x_value) pair: a grid of KDE
        plots of ``x_value``, one facet per category of ``col_name``, wrapped
        every three facets.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data to be visualized.

        col_names : list of str
            List of categorical column names to be used for facets.

        x_values : list of str
            List of numerical column names to be plotted on the x-axis.

        hue : str, optional, default: 'Gender'
            Column name to be used for color encoding. Pass None to draw a
            single colour without a legend.

        Returns:
        --------
        None

        Raises:
        -------
        ValueError
            If ``col_names`` and ``x_values`` have different lengths.
        """
        if len(col_names) != len(x_values):
            raise ValueError("Length of col_names and x_values must be the same.")

        for col_name, x_value in zip(col_names, x_values):
            facet = sns.FacetGrid(df, col=col_name, hue=hue, aspect=1, palette=self.colors, col_wrap=3)
            facet.map(sns.kdeplot, x_value, fill=True)
            facet.set(xlim=[0, df[x_value].max()])
            if hue is not None:
                # Missing values do not form a hue level, so they are left out
                # of the legend order.
                facet.add_legend(label_order=df[hue].dropna().unique())
            plt.tight_layout()
            plt.show()

    def visualize_general_statistics(self, df, df_heatmap1, df_heatmap2):
        """
        Create a series of general statistics visualizations using subplots.

        The figure is a 3x2 grid: two boxplots, two heatmaps and two stacked
        histograms. ``df`` must contain the columns 'department',
        'total_working_years', 'education_field', 'age', 'gender',
        'percent_salary_hike' and 'years_at_company'.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data to be visualized.

        df_heatmap1 : pandas.DataFrame
            DataFrame for the first heatmap.

        df_heatmap2 : pandas.DataFrame
            DataFrame for the second heatmap.

        Returns:
        --------
        None
        """
        fig, ax = plt.subplots(3, 2, figsize=(15, 13))
        fig.suptitle('General Statistics')
        fig.subplots_adjust(wspace=0.4, hspace=0.5)

        # One colour per gender level, shared by every panel that uses the
        # gender hue so the same level always gets the same colour.
        gender_palette = self.colors[:max(df['gender'].nunique(), 1)]

        # Boxplot for Department vs. Total Working Years
        sns.boxplot(ax=ax[0, 0], data=df, y='department', x='total_working_years', hue='gender', palette=gender_palette)
        # The panel shows total working years, not ages; the title now says so.
        ax[0, 0].set_title('Total Working Years by Department', fontsize=14)

        # Boxplot for Education Field vs. Age
        sns.boxplot(ax=ax[0, 1], data=df, y='education_field', x='age', hue='gender', palette=gender_palette)
        ax[0, 1].set_title('Ages by Education Field', fontsize=14)

        # Heatmap for Job Role-Satisfaction Mapping
        sns.heatmap(ax=ax[1, 0], data=df_heatmap1, square=True, linewidth=1, cmap='Reds')
        ax[1, 0].set_title('Job Role-Satisfaction Mapping', fontsize=14)

        # Heatmap for Job Level-Involvement Mapping
        sns.heatmap(ax=ax[1, 1], data=df_heatmap2, square=True, linewidth=1, cmap='Blues')
        ax[1, 1].set_title('Job Level-Involvement Mapping', fontsize=14)

        # Histogram for Distribution of Salary Percent Hike
        sns.histplot(ax=ax[2, 0], data=df, x='percent_salary_hike', hue='gender', multiple='stack', palette=gender_palette)
        ax[2, 0].set_title('Distribution of Salary Percent Hike', fontsize=14)

        # Histogram for Distribution of Years at Company
        sns.histplot(ax=ax[2, 1], data=df, x='years_at_company', hue='gender', multiple='stack', palette=gender_palette)
        ax[2, 1].set_title('Distribution of Years at Company', fontsize=14)

        plt.show()


    def pieplot(self, data, columns, titles=None, explode_ratio=0.05):
        """
        Generates pie plots for specified columns in the dataframe and arranges them in a single row.

        Each column is expected to hold the levels 1 to 4, which are shown as
        'Bajo', 'Medio Bajo', 'Medio Alto' and 'Alto'. Any other value
        (including missing values) is left out of the pie.

        Parameters:
        - data: pd.DataFrame, the dataframe containing the data.
        - columns: list of str, the names of the columns to plot.
        - titles: list of str, optional, titles for each pie plot. When fewer
          titles than columns are given, the remaining plots get no title.
        - explode_ratio: float, optional, the fraction by which to offset each wedge.

        Returns:
        - None, displays the pie plots.

        Raises:
        - ValueError: if ``columns`` is empty.
        """
        # Display label for each level; unknown values map to None and are
        # therefore dropped by value_counts().
        level_labels = {1: 'Bajo', 2: 'Medio Bajo', 3: 'Medio Alto', 4: 'Alto'}

        def categorize_value(val):
            return level_labels.get(val)

        # Define a consistent color mapping for the categories
        color_mapping = {
            'Bajo': '#2146B2',
            'Medio Bajo': '#E0CA27',
            'Medio Alto': '#F8C895',
            'Alto': '#D98162'
        }

        columns = list(columns)
        num_columns = len(columns)

        if num_columns == 0:
            raise ValueError("The 'columns' list must contain at least one column.")

        # Pad the titles so zip() below cannot silently skip columns when the
        # caller supplies fewer titles than columns.
        titles = [] if titles is None else list(titles)
        titles.extend([None] * (num_columns - len(titles)))

        # squeeze=False always returns a 2-D array of axes, so a single column
        # needs no special case.
        _, axs = plt.subplots(1, num_columns, figsize=(num_columns * 5, 5), squeeze=False)

        for ax, col, title in zip(axs.ravel(), columns, titles):
            # Apply the categorization function to convert numeric values to categories
            categorized_data = data[col].apply(categorize_value).value_counts()

            # Create the explode configuration
            explode = [explode_ratio] * len(categorized_data)

            # Generate the pie plot
            ax.pie(
                categorized_data,
                labels=categorized_data.index,
                autopct='%1.1f%%',
                startangle=140,
                colors=[color_mapping[label] for label in categorized_data.index],
                explode=explode,
                shadow=True
            )

            # Set title if provided
            if title:
                ax.set_title(title, fontsize=16)

        plt.tight_layout()
        plt.show()


    def boxplot_distribution(self, data, category_column, value_column, title=None):
        """
        Draw and show a boxplot of a numerical column split by a categorical column.

        The axis labels are the capitalized column names and the category
        labels on the x-axis are rotated 45 degrees.

        Parameters:
        - data: pd.DataFrame, the dataframe containing the data.
        - category_column: str, the name of the categorical column (e.g., products).
        - value_column: str, the name of the numerical column (e.g., price).
        - title: str, optional, the title of the plot.

        Returns:
        - None, displays the boxplot.
        """
        categories = data[category_column]
        values = data[value_column]

        plt.figure(figsize=(12, 6))
        axis = sns.boxplot(x=categories, y=values, palette=self.colors)

        # The title is optional; the axis labels are always set.
        if title:
            axis.set_title(title, fontsize=16)
        axis.set_xlabel(category_column.capitalize())
        axis.set_ylabel(value_column.capitalize())

        plt.xticks(rotation=45)  # tilt the category labels for readability
        plt.tight_layout()
        plt.show()



In [2]:
# NOTE: despite its name, `df` is the AutoEDA helper instance, not a DataFrame.
df = AutoEDA()

In [5]:
# Load the bookings file, using its first column as the row index.
data = pd.read_csv("finanzas-hotel-bookings.csv", index_col=0)


In [8]:
# Preview the first five rows.
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0
0,Resort Hotel,False,342.0,2015.0,July,27.0,1.0,0.0,0.0,2.0,,0.0,BB,PRT,,Direct,0.0,,0.0,C,C,3.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01 00:00:00,
1,Resort Hotel,False,737.0,,July,27.0,1.0,0.0,0.0,2.0,,0.0,BB,,,Direct,0.0,0.0,0.0,,C,4.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01 00:00:00,
2,Resort Hotel,False,7.0,2015.0,July,27.0,1.0,0.0,1.0,1.0,0.0,0.0,BB,GBR,,Direct,0.0,0.0,0.0,A,C,0.0,,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02 00:00:00,
3,Resort Hotel,False,13.0,,July,27.0,1.0,0.0,1.0,1.0,,0.0,BB,GBR,Corporate,Corporate,0.0,0.0,0.0,A,A,0.0,304.0,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02 00:00:00,
4,Resort Hotel,False,14.0,,July,,1.0,0.0,2.0,2.0,,0.0,BB,,Online TA,TA/TO,0.0,0.0,0.0,A,A,0.0,240.0,,0.0,Transient,98.0,0.0,1.0,Check-Out,2015-07-03 00:00:00,


In [16]:
def eda_columnas_hotel(archivo):
    """
    Print an exploratory report for every column of a CSV file.

    The file is read with pandas' default options, so the first column is
    treated as ordinary data. For each column the report shows: the describe()
    summary, the number of nulls, the number of rows whose value repeats an
    earlier row, the number of distinct values, the frequency of each value
    and, when one exists, the most frequent value.

    Parameters:
    - archivo (str): Path to the CSV file.

    Returns:
    - None, the report is printed and displayed.
    """
    tabla = pd.read_csv(archivo)
    print("Archivo cargado exitosamente")

    separador = "-" * 30
    for nombre in tabla.columns:
        serie = tabla[nombre]
        print(f"Exploración de la columna: {nombre}")

        # Statistical summary
        print("Descripción estadística:")
        display(serie.describe(include='all'))

        # Null values
        print("Conteo de valores nulos:")
        print(serie.isnull().sum())

        # Rows whose value in this column already appeared in an earlier row
        print("Número de filas duplicadas:")
        display(tabla.duplicated(subset=[nombre]).sum())

        # Distinct values
        print("Conteo de valores únicos:")
        print(serie.nunique())

        # Frequency of each value
        print("Conteo de valores:")
        display(serie.value_counts())

        # Most frequent value (mode); skipped when the column has no values
        modas = serie.mode()
        if not modas.empty:
            print("Valor más frecuente (moda):")
            display(modas.iloc[0])

        print(separador)
# Call the function on the hotel bookings file
archivo = 'finanzas-hotel-bookings.csv'
eda_columnas_hotel(archivo)

Archivo cargado exitosamente
Exploración de la columna: Unnamed: 0
Descripción estadística:


count    182877.000000
mean      91438.000000
std       52792.186927
min           0.000000
25%       45719.000000
50%       91438.000000
75%      137157.000000
max      182876.000000
Name: Unnamed: 0, dtype: float64

Conteo de valores nulos:
0
Número de filas duplicadas:


0

Conteo de valores únicos:
182877
Conteo de valores:


Unnamed: 0
0         1
121933    1
121913    1
121914    1
121915    1
         ..
60961     1
60962     1
60963     1
60964     1
182876    1
Name: count, Length: 182877, dtype: int64

Valor más frecuente (moda):


0

------------------------------
Exploración de la columna: hotel
Descripción estadística:


count         119390
unique             2
top       City Hotel
freq           79330
Name: hotel, dtype: object

Conteo de valores nulos:
63487
Número de filas duplicadas:


182874

Conteo de valores únicos:
2
Conteo de valores:


hotel
City Hotel      79330
Resort Hotel    40060
Name: count, dtype: int64

Valor más frecuente (moda):


'City Hotel'

------------------------------
Exploración de la columna: is_canceled
Descripción estadística:


count     119390
unique         2
top        False
freq       75166
Name: is_canceled, dtype: object

Conteo de valores nulos:
63487
Número de filas duplicadas:


182874

Conteo de valores únicos:
2
Conteo de valores:


is_canceled
False    75166
True     44224
Name: count, dtype: int64

Valor más frecuente (moda):


False

------------------------------
Exploración de la columna: lead_time
Descripción estadística:


count    119490.000000
mean        104.172628
std         106.975949
min           0.000000
25%          18.000000
50%          69.000000
75%         161.000000
max         737.000000
Name: lead_time, dtype: float64

Conteo de valores nulos:
63387
Número de filas duplicadas:


182297

Conteo de valores únicos:
579
Conteo de valores:


lead_time
0.000000      6345
1.000000      3460
2.000000      2069
3.000000      1816
4.000000      1715
              ... 
359.478372       1
310.531535       1
207.269543       1
274.258844       1
341.238166       1
Name: count, Length: 579, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: arrival_date_year
Descripción estadística:


count    64829.000000
mean      2016.156196
std          0.706674
min       2015.000000
25%       2016.000000
50%       2016.000000
75%       2017.000000
max       2017.000000
Name: arrival_date_year, dtype: float64

Conteo de valores nulos:
118048
Número de filas duplicadas:


182873

Conteo de valores únicos:
3
Conteo de valores:


arrival_date_year
2016.0    30873
2017.0    22041
2015.0    11915
Name: count, dtype: int64

Valor más frecuente (moda):


2016.0

------------------------------
Exploración de la columna: arrival_date_month
Descripción estadística:


count     119390
unique        15
top       August
freq       13877
Name: arrival_date_month, dtype: object

Conteo de valores nulos:
63487
Número de filas duplicadas:


182861

Conteo de valores únicos:
15
Conteo de valores:


arrival_date_month
August       13877
July         12661
May          11791
October      11160
April        11089
June         10939
September    10508
November      6794
December      6780
March         5922
February      4898
3             3872
January       3581
2             3170
1             2348
Name: count, dtype: int64

Valor más frecuente (moda):


'August'

------------------------------
Exploración de la columna: arrival_date_week_number
Descripción estadística:


count    101004.000000
mean         27.175785
std          13.613871
min           1.000000
25%          16.000000
50%          28.000000
75%          38.000000
max          53.000000
Name: arrival_date_week_number, dtype: float64

Conteo de valores nulos:
81873
Número de filas duplicadas:


182823

Conteo de valores únicos:
53
Conteo de valores:


arrival_date_week_number
33.0    3051
30.0    2608
34.0    2587
32.0    2575
18.0    2503
28.0    2399
21.0    2394
17.0    2363
20.0    2354
29.0    2328
42.0    2328
31.0    2318
41.0    2281
38.0    2244
27.0    2238
15.0    2237
25.0    2236
35.0    2225
39.0    2212
23.0    2212
22.0    2173
24.0    2123
16.0    2044
19.0    2039
13.0    2029
40.0    2024
26.0    2004
43.0    1994
44.0    1916
14.0    1912
8.0     1869
37.0    1849
36.0    1831
7.0     1802
10.0    1798
9.0     1771
12.0    1763
11.0    1755
45.0    1672
53.0    1544
49.0    1486
47.0    1447
46.0    1330
48.0    1288
6.0     1285
4.0     1267
50.0    1266
5.0     1188
3.0     1115
2.0     1027
52.0     993
1.0      895
51.0     812
Name: count, dtype: int64

Valor más frecuente (moda):


33.0

------------------------------
Exploración de la columna: arrival_date_day_of_month
Descripción estadística:


count    119271.000000
mean         15.795977
std           8.780503
min           1.000000
25%           8.000000
50%          16.000000
75%          23.000000
max          31.000000
Name: arrival_date_day_of_month, dtype: float64

Conteo de valores nulos:
63606
Número de filas duplicadas:


182845

Conteo de valores únicos:
31
Conteo de valores:


arrival_date_day_of_month
17.0    4402
5.0     4315
15.0    4191
25.0    4155
26.0    4139
9.0     4094
12.0    4081
16.0    4072
2.0     4052
19.0    4047
20.0    4027
18.0    3998
24.0    3993
28.0    3940
8.0     3917
3.0     3852
30.0    3847
6.0     3832
14.0    3817
27.0    3797
4.0     3762
21.0    3758
13.0    3744
7.0     3665
1.0     3619
23.0    3610
11.0    3598
22.0    3594
29.0    3577
10.0    3572
31.0    2204
Name: count, dtype: int64

Valor más frecuente (moda):


17.0

------------------------------
Exploración de la columna: stays_in_weekend_nights
Descripción estadística:


count    119490.000000
mean          0.939461
std           1.082472
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max          19.914715
Name: stays_in_weekend_nights, dtype: float64

Conteo de valores nulos:
63387
Número de filas duplicadas:


182759

Conteo de valores únicos:
117
Conteo de valores:


stays_in_weekend_nights
0.000000     51998
2.000000     33308
1.000000     30626
4.000000      1855
3.000000      1259
             ...  
11.989378        1
10.800697        1
10.149552        1
19.672050        1
17.791486        1
Name: count, Length: 117, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: stays_in_week_nights
Descripción estadística:


count    119490.000000
mean          2.515068
std           1.976511
min           0.000000
25%           1.000000
50%           2.000000
75%           3.000000
max          50.000000
Name: stays_in_week_nights, dtype: float64

Conteo de valores nulos:
63387
Número de filas duplicadas:


182741

Conteo de valores únicos:
135
Conteo de valores:


stays_in_week_nights
2.000000     33684
1.000000     30310
3.000000     22258
5.000000     11077
4.000000      9563
             ...  
23.476129        1
19.361901        1
22.683324        1
16.604701        1
24.442690        1
Name: count, Length: 135, dtype: int64

Valor más frecuente (moda):


2.0

------------------------------
Exploración de la columna: adults
Descripción estadística:


count    119428.000000
mean          6.244423
std          14.574814
min           0.000000
25%           2.000000
50%           2.000000
75%           2.000000
max          59.000000
Name: adults, dtype: float64

Conteo de valores nulos:
63449
Número de filas duplicadas:


182846

Conteo de valores únicos:
30
Conteo de valores:


adults
2.0     82156
1.0     21094
3.0      5716
59.0     1035
54.0     1025
51.0     1011
58.0     1005
50.0      999
57.0      998
53.0      984
56.0      975
55.0      959
52.0      936
0.0       369
4.0        53
19.0       17
14.0       13
15.0       12
18.0       11
11.0       11
12.0       10
10.0        9
13.0        8
16.0        6
26.0        5
17.0        4
5.0         2
20.0        2
27.0        2
6.0         1
Name: count, dtype: int64

Valor más frecuente (moda):


2.0

------------------------------
Exploración de la columna: children
Descripción estadística:


count    69302.000000
mean         0.125162
std          0.688305
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         19.000000
Name: children, dtype: float64

Conteo de valores nulos:
113575
Número de filas duplicadas:


182862

Conteo de valores únicos:
14
Conteo de valores:


children
0.0     64221
1.0      2797
2.0      2138
3.0        45
18.0       17
10.0       15
14.0       15
15.0       10
11.0        9
16.0        9
17.0        8
19.0        7
13.0        6
12.0        5
Name: count, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: babies
Descripción estadística:


count    119431.000000
mean          0.019903
std           0.433366
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          19.000000
Name: babies, dtype: float64

Conteo de valores nulos:
63446
Número de filas duplicadas:


182862

Conteo de valores únicos:
14
Conteo de valores:


babies
0.0     118414
1.0        900
10.0        17
16.0        16
12.0        16
2.0         15
19.0        12
18.0         8
17.0         7
13.0         7
15.0         7
11.0         7
14.0         4
9.0          1
Name: count, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: meal
Descripción estadística:


count     119390
unique         5
top           BB
freq       92310
Name: meal, dtype: object

Conteo de valores nulos:
63487
Número de filas duplicadas:


182871

Conteo de valores únicos:
5
Conteo de valores:


meal
BB           92310
HB           14463
SC           10650
Undefined     1169
FB             798
Name: count, dtype: int64

Valor más frecuente (moda):


'BB'

------------------------------
Exploración de la columna: country
Descripción estadística:


count     65054
unique      163
top         PRT
freq      26577
Name: country, dtype: object

Conteo de valores nulos:
117823
Número de filas duplicadas:


182713

Conteo de valores únicos:
163
Conteo de valores:


country
PRT    26577
GBR     6585
FRA     5756
ESP     4712
DEU     3972
       ...  
LIE        1
NIC        1
GGY        1
FJI        1
UMI        1
Name: count, Length: 163, dtype: int64

Valor más frecuente (moda):


'PRT'

------------------------------
Exploración de la columna: market_segment
Descripción estadística:


count         59934
unique            8
top       Online TA
freq          28331
Name: market_segment, dtype: object

Conteo de valores nulos:
122943
Número de filas duplicadas:


182868

Conteo de valores únicos:
8
Conteo de valores:


market_segment
Online TA        28331
Offline TA/TO    12116
Groups           10008
Direct            6354
Corporate         2643
Complementary      363
Aviation           118
Undefined            1
Name: count, dtype: int64

Valor más frecuente (moda):


'Online TA'

------------------------------
Exploración de la columna: distribution_channel
Descripción estadística:


count     105899
unique         5
top        TA/TO
freq       86822
Name: distribution_channel, dtype: object

Conteo de valores nulos:
76978
Número de filas duplicadas:


182871

Conteo de valores únicos:
5
Conteo de valores:


distribution_channel
TA/TO        86822
Direct       12997
Corporate     5899
GDS            176
Undefined        5
Name: count, dtype: int64

Valor más frecuente (moda):


'TA/TO'

------------------------------
Exploración de la columna: is_repeated_guest
Descripción estadística:


count    114376.000000
mean          0.031877
std           0.175674
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: is_repeated_guest, dtype: float64

Conteo de valores nulos:
68501
Número de filas duplicadas:


182874

Conteo de valores únicos:
2
Conteo de valores:


is_repeated_guest
0.0    110730
1.0      3646
Name: count, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: previous_cancellations
Descripción estadística:


count    76228.000000
mean         0.109317
std          1.028093
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         26.000000
Name: previous_cancellations, dtype: float64

Conteo de valores nulos:
106649
Número de filas duplicadas:


182855

Conteo de valores únicos:
21
Conteo de valores:


previous_cancellations
0.0     71960
1.0      3878
2.0        76
3.0        41
11.0       38
24.0       33
19.0       23
13.0       23
4.0        20
14.0       19
25.0       18
26.0       18
5.0        14
17.0       13
6.0        12
12.0       11
18.0        9
15.0        8
16.0        7
10.0        6
21.0        1
Name: count, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: previous_bookings_not_canceled
Descripción estadística:


count    119390.000000
mean          0.137097
std           1.497437
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          72.000000
Name: previous_bookings_not_canceled, dtype: float64

Conteo de valores nulos:
63487
Número de filas duplicadas:


182803

Conteo de valores únicos:
73
Conteo de valores:


previous_bookings_not_canceled
0.0     115770
1.0       1542
2.0        580
3.0        333
4.0        229
         ...  
47.0         1
49.0         1
50.0         1
51.0         1
72.0         1
Name: count, Length: 73, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: reserved_room_type
Descripción estadística:


count     79275
unique       10
top           A
freq      57202
Name: reserved_room_type, dtype: object

Conteo de valores nulos:
103602
Número de filas duplicadas:


182866

Conteo de valores únicos:
10
Conteo de valores:


reserved_room_type
A    57202
D    12637
E     4345
F     1907
G     1375
B      746
C      625
H      428
P        6
L        4
Name: count, dtype: int64

Valor más frecuente (moda):


'A'

------------------------------
Exploración de la columna: assigned_room_type
Descripción estadística:


count     119390
unique        12
top            A
freq       74053
Name: assigned_room_type, dtype: object

Conteo de valores nulos:
63487
Número de filas duplicadas:


182864

Conteo de valores únicos:
12
Conteo de valores:


assigned_room_type
A    74053
D    25322
E     7806
F     3751
G     2553
C     2375
B     2163
H      712
I      363
K      279
P       12
L        1
Name: count, dtype: int64

Valor más frecuente (moda):


'A'

------------------------------
Exploración de la columna: booking_changes
Descripción estadística:


count    119418.000000
mean          0.232737
std           0.770795
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          21.000000
Name: booking_changes, dtype: float64

Conteo de valores nulos:
63459
Número de filas duplicadas:


182854

Conteo de valores únicos:
22
Conteo de valores:


booking_changes
0.0     101257
1.0      12691
2.0       3800
3.0        927
4.0        376
5.0        118
6.0         63
7.0         31
10.0        19
8.0         17
17.0        17
11.0        16
15.0        15
13.0        13
12.0        12
14.0        11
16.0        11
19.0         8
9.0          8
18.0         6
20.0         1
21.0         1
Name: count, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: agent
Descripción estadística:


count    103050.000000
mean         86.693382
std         110.774548
min           1.000000
25%           9.000000
50%          14.000000
75%         229.000000
max         535.000000
Name: agent, dtype: float64

Conteo de valores nulos:
79827
Número de filas duplicadas:


182543

Conteo de valores únicos:
333
Conteo de valores:


agent
9.0      31961
240.0    13922
1.0       7191
14.0      3640
7.0       3539
         ...  
289.0        1
432.0        1
265.0        1
93.0         1
304.0        1
Name: count, Length: 333, dtype: int64

Valor más frecuente (moda):


9.0

------------------------------
Exploración de la columna: company
Descripción estadística:


count    3870.000000
mean      188.828424
std       131.473039
min         6.000000
25%        62.000000
50%       178.000000
75%       270.000000
max       543.000000
Name: company, dtype: float64

Conteo de valores nulos:
179007
Número de filas duplicadas:


182566

Conteo de valores únicos:
310
Conteo de valores:


company
40.0     533
223.0    447
67.0     160
45.0     145
153.0    132
        ... 
225.0      1
518.0      1
126.0      1
54.0       1
282.0      1
Name: count, Length: 310, dtype: int64

Valor más frecuente (moda):


40.0

------------------------------
Exploración de la columna: days_in_waiting_list
Descripción estadística:


count    119390.000000
mean          2.321149
std          17.594721
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         391.000000
Name: days_in_waiting_list, dtype: float64

Conteo de valores nulos:
63487
Número de filas duplicadas:


182748

Conteo de valores únicos:
128
Conteo de valores:


days_in_waiting_list
0.0      115692
39.0        227
58.0        164
44.0        141
31.0        127
          ...  
116.0         1
109.0         1
37.0          1
89.0          1
36.0          1
Name: count, Length: 128, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: customer_type
Descripción estadística:


count         94199
unique            4
top       Transient
freq          70610
Name: customer_type, dtype: object

Conteo de valores nulos:
88678
Número de filas duplicadas:


182872

Conteo de valores únicos:
4
Conteo de valores:


customer_type
Transient          70610
Transient-Party    19839
Contract            3290
Group                460
Name: count, dtype: int64

Valor más frecuente (moda):


'Transient'

------------------------------
Exploración de la columna: adr
Descripción estadística:


count    119390.000000
mean        101.831122
std          50.535790
min          -6.380000
25%          69.290000
50%          94.575000
75%         126.000000
max        5400.000000
Name: adr, dtype: float64

Conteo de valores nulos:
63487
Número de filas duplicadas:


173997

Conteo de valores únicos:
8879
Conteo de valores:


adr
62.00     3754
75.00     2715
90.00     2473
65.00     2418
0.00      1959
          ... 
89.43        1
63.07        1
55.69        1
49.51        1
157.71       1
Name: count, Length: 8879, dtype: int64

Valor más frecuente (moda):


62.0

------------------------------
Exploración de la columna: required_car_parking_spaces
Descripción estadística:


count    119390.000000
mean          0.062518
std           0.245291
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           8.000000
Name: required_car_parking_spaces, dtype: float64

Conteo de valores nulos:
63487
Número de filas duplicadas:


182871

Conteo de valores únicos:
5
Conteo de valores:


required_car_parking_spaces
0.0    111974
1.0      7383
2.0        28
3.0         3
8.0         2
Name: count, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: total_of_special_requests
Descripción estadística:


count    119390.000000
mean          0.571363
std           0.792798
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           5.000000
Name: total_of_special_requests, dtype: float64

Conteo de valores nulos:
63487
Número de filas duplicadas:


182870

Conteo de valores únicos:
6
Conteo de valores:


total_of_special_requests
0.0    70318
1.0    33226
2.0    12969
3.0     2497
4.0      340
5.0       40
Name: count, dtype: int64

Valor más frecuente (moda):


0.0

------------------------------
Exploración de la columna: reservation_status
Descripción estadística:


count        119390
unique            3
top       Check-Out
freq          75166
Name: reservation_status, dtype: object

Conteo de valores nulos:
63487
Número de filas duplicadas:


182873

Conteo de valores únicos:
3
Conteo de valores:


reservation_status
Check-Out    75166
Canceled     43017
No-Show       1207
Name: count, dtype: int64

Valor más frecuente (moda):


'Check-Out'

------------------------------
Exploración de la columna: reservation_status_date
Descripción estadística:


count                  106514
unique                    991
top       2015-10-21 00:00:00
freq                     1298
Name: reservation_status_date, dtype: object

Conteo de valores nulos:
76363
Número de filas duplicadas:


181885

Conteo de valores únicos:
991
Conteo de valores:


reservation_status_date
2015-10-21 00:00:00    1298
2015-07-06 00:00:00     732
2016-11-25 00:00:00     692
2015-01-01 00:00:00     678
2016-01-18 00:00:00     562
                       ... 
2029-04-31                1
2022-09-31                1
2029-02-30                1
2029-06-31                1
2024-11-31                1
Name: count, Length: 991, dtype: int64

Valor más frecuente (moda):


'2015-10-21 00:00:00'

------------------------------
Exploración de la columna: 0
Descripción estadística:


count     42951
unique     1588
top         0.0
freq      12780
Name: 0, dtype: object

Conteo de valores nulos:
139926
Número de filas duplicadas:


181288

Conteo de valores únicos:
1588
Conteo de valores:


0
0.0        12780
2.0         2363
1.0         2121
A           1771
BB          1257
           ...  
48.2           1
149.5          1
68.07          1
153.07         1
4/02/17        1
Name: count, Length: 1588, dtype: int64

Valor más frecuente (moda):


'0.0'

------------------------------


In [46]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null count per column and each column's dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182877 entries, 0 to 182876
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  object 
 2   lead_time                       119490 non-null  float64
 3   arrival_date_year               64829 non-null   float64
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        101004 non-null  float64
 6   arrival_date_day_of_month       119271 non-null  float64
 7   stays_in_weekend_nights         119490 non-null  float64
 8   stays_in_week_nights            119490 non-null  float64
 9   adults                          119428 non-null  float64
 10  children                        69302 non-null   float64
 11  babies                          119431 non-null  float64
 12  meal                 

## DUPLICADOS

In [68]:
# Count the rows that are exact copies of an earlier row. With the default
# keep="first", the first occurrence of each duplicated row is not counted.
data.duplicated().sum()

63040

In [70]:
# Select every row that takes part in a duplicate group: keep=False flags
# all occurrences, including the first one, not only the repeats.
mascara_duplicados = data.duplicated(keep=False)
duplicated_rows = data[mascara_duplicados]
duplicated_rows

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0
2183,Resort Hotel,True,170.0,2015.0,October,41.0,4.0,4.0,8.0,2.0,,0.0,BB,CN,,TA/TO,0.0,,0.0,,A,0.0,15.0,,0.0,Transient,44.5,0.0,0.0,Canceled,2015-07-17 00:00:00,
2191,Resort Hotel,True,170.0,2015.0,October,41.0,4.0,4.0,8.0,2.0,,0.0,BB,CN,,TA/TO,0.0,,0.0,,A,0.0,15.0,,0.0,Transient,44.5,0.0,0.0,Canceled,2015-07-17 00:00:00,
2625,Resort Hotel,True,148.0,2015.0,October,,24.0,0.0,1.0,2.0,,0.0,BB,,,Direct,0.0,0.0,0.0,A,A,0.0,,,0.0,Transient,47.0,0.0,0.0,Canceled,2015-06-26 00:00:00,
2630,Resort Hotel,True,148.0,2015.0,October,,24.0,0.0,1.0,2.0,,0.0,BB,,,Direct,0.0,0.0,0.0,A,A,0.0,,,0.0,Transient,47.0,0.0,0.0,Canceled,2015-06-26 00:00:00,
2634,Resort Hotel,True,148.0,2015.0,October,43.0,24.0,0.0,1.0,1.0,0.0,0.0,BB,,,Direct,0.0,,0.0,A,A,0.0,,,0.0,Transient,39.0,0.0,0.0,Canceled,2015-06-26 00:00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182772,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182773,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182774,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182775,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [65]:
# List the rows in which every column is null. These rows were flagged by
# the author for removal from the dataset.
print("\nRows with all values as null:")
mascara_filas_vacias = data.isnull().all(axis=1)
all_null_rows = data[mascara_filas_vacias]
if not all_null_rows.empty:
    display(all_null_rows)


Rows with all values as null:


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0
119393,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119400,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119403,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119404,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119406,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182772,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182773,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182774,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182775,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## NULOS

In [47]:
# Percentage of null values per column, relative to the total number of rows
# and rounded to two decimals. Only columns that contain nulls are displayed.
null_percentage = (data.isnull().sum() / data.shape[0] * 100).round(2)
display(null_percentage[null_percentage > 0])

# Rows in which every single column is null.
print("\nRows with all values as null:")
all_null_rows = data[data.isnull().all(axis=1)]
if not all_null_rows.empty:
    display(all_null_rows)
else:
    print("There is no rows with all values as null.")

# The duplicate-count header used to sit inside the `else` branch and was
# never followed by a value. Report it unconditionally, together with the
# actual number of duplicated rows.
print("\nCount of duplicate rows:")
print(data.duplicated().sum())

hotel                             34.72
is_canceled                       34.72
lead_time                         34.66
arrival_date_year                 64.55
arrival_date_month                34.72
arrival_date_week_number          44.77
arrival_date_day_of_month         34.78
stays_in_weekend_nights           34.66
stays_in_week_nights              34.66
adults                            34.69
children                          62.10
babies                            34.69
meal                              34.72
country                           64.43
market_segment                    67.23
distribution_channel              42.09
is_repeated_guest                 37.46
previous_cancellations            58.32
previous_bookings_not_canceled    34.72
reserved_room_type                56.65
assigned_room_type                34.72
booking_changes                   34.70
agent                             43.65
company                           97.88
days_in_waiting_list              34.72



Rows with all values as null:


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0
119393,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119400,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119403,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119404,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119406,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182772,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182773,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182774,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
182775,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### VARIABLES NUMERICAS

In [50]:
data.describe()  # by default, describe() summarizes only the numeric columns

Unnamed: 0,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119490.0,64829.0,101004.0,119271.0,119490.0,119490.0,119428.0,69302.0,119431.0,114376.0,76228.0,119390.0,119418.0,103050.0,3870.0,119390.0,119390.0,119390.0,119390.0
mean,104.172628,2016.156196,27.175785,15.795977,0.939461,2.515068,6.244423,0.125162,0.019903,0.031877,0.109317,0.137097,0.232737,86.693382,188.828424,2.321149,101.831122,0.062518,0.571363
std,106.975949,0.706674,13.613871,8.780503,1.082472,1.976511,14.574814,0.688305,0.433366,0.175674,1.028093,1.497437,0.770795,110.774548,131.473039,17.594721,50.53579,0.245291,0.792798
min,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,178.0,0.0,94.575,0.0,0.0
75%,161.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,737.0,2017.0,53.0,31.0,19.914715,50.0,59.0,19.0,19.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [75]:
# Build the list of numeric columns that contain at least one null value:
# first keep the columns with any null, then filter those by numeric dtype.
columnas_con_nulos = data.columns[data.isnull().any()]
nulos_esta_num = data[columnas_con_nulos].select_dtypes(include=np.number).columns
print("Las columnas numéricas que tienen nulos son : \n ")
print(nulos_esta_num)

Las columnas numéricas que tienen nulos son : 
 
Index(['lead_time', 'arrival_date_year', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company',
       'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
       'total_of_special_requests'],
      dtype='object')


In [77]:
# Percentage of null values in each numeric column that contains nulls
# (null count divided by the total number of rows, times 100).
conteo_nulos_num = data[nulos_esta_num].isnull().sum()
conteo_nulos_num / data.shape[0] * 100

lead_time                         34.661002
arrival_date_year                 64.550490
arrival_date_week_number          44.769435
arrival_date_day_of_month         34.780754
stays_in_weekend_nights           34.661002
stays_in_week_nights              34.661002
adults                            34.694904
children                          62.104584
babies                            34.693264
is_repeated_guest                 37.457417
previous_cancellations            58.317339
previous_bookings_not_canceled    34.715683
booking_changes                   34.700372
agent                             43.650650
company                           97.883824
days_in_waiting_list              34.715683
adr                               34.715683
required_car_parking_spaces       34.715683
total_of_special_requests         34.715683
dtype: float64

In [78]:
# Names of the numeric columns of the DataFrame to inspect.
columnas_numericas = [
    "lead_time",
    "arrival_date_year",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "booking_changes",
    "agent",
    "company",
    "days_in_waiting_list",
    "adr",
    "required_car_parking_spaces",
    "total_of_special_requests",
]

# The header previously labelled these columns as "categóricas" (copied from
# the categorical cell); they are the numeric ones.
print(f"Las columnas del DataFrame de variables numéricas son {columnas_numericas}")
print("--------------------------------------------------------------------------------")
# For each column, print its distinct values and then how often each one occurs.
for columna in columnas_numericas:
    print(f"Los valores únicos de {columna} son: {data[columna].unique()}\n")
    print(f"Las frecuencias de los valores únicos de la columna: {data[columna].value_counts()} ")
    print("------------------------------------------------------------------------------------------------")

Las columnas del DataFrame de variables categóricas son ['lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']
--------------------------------------------------------------------------------
Los valores únicos de lead_time son: [342.         737.           7.          13.          14.
   0.           9.          85.          75.          23.
  35.          68.          18.          37.          12.
  72.         127.          78.          48.          60.
  77.          99.         118.          95.          96.
  69.          45.          40.          15.          36.
  43.          70.          16.         107.          47.
 113.          90.          50.   

### VARIABLES CATEGORICAS

In [53]:
data.describe(include=object).T  # summary of the object-dtype (categorical) columns, transposed to one row per column

Unnamed: 0,count,unique,top,freq
hotel,119390,2,City Hotel,79330
is_canceled,119390,2,False,75166
arrival_date_month,119390,15,August,13877
meal,119390,5,BB,92310
country,65054,163,PRT,26577
market_segment,59934,8,Online TA,28331
distribution_channel,105899,5,TA/TO,86822
reserved_room_type,79275,10,A,57202
assigned_room_type,119390,12,A,74053
customer_type,94199,4,Transient,70610


In [55]:
# Names of the categorical columns of the DataFrame to inspect.
columnas_categoricas = [
    "hotel",
    "is_canceled",
    "arrival_date_month",
    "meal",
    "country",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "assigned_room_type",
    "customer_type",
    "reservation_status",
    "reservation_status_date",
]

print(f"Las columnas del DataFrame de variables categóricas son {columnas_categoricas}")
print("--------------------------------------------------------------------------------")
# Walk through the columns, reporting the distinct values first and the
# frequency of each value afterwards.
for columna in columnas_categoricas:
    valores_unicos = data[columna].unique()
    print(f"Los valores únicos de {columna} son: {valores_unicos}\n")
    frecuencias = data[columna].value_counts()
    print(f"Las frecuencias de los valores únicos de la columna: {frecuencias} ")
    print("------------------------------------------------------------------------------------------------")

Las columnas del DataFrame de variables categóricas son ['hotel', 'is_canceled', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type', 'reservation_status', 'reservation_status_date']
--------------------------------------------------------------------------------
Los valores únicos de hotel son: ['Resort Hotel' 'City Hotel' nan]

Las frecuencias de los valores únicos de la columna: hotel
City Hotel      79330
Resort Hotel    40060
Name: count, dtype: int64 
------------------------------------------------------------------------------------------------
Los valores únicos de is_canceled son: [False True nan]

Las frecuencias de los valores únicos de la columna: is_canceled
False    75166
True     44224
Name: count, dtype: int64 
------------------------------------------------------------------------------------------------
Los valores únicos de arrival_date_month son: ['July' 'August' 'September'

In [72]:
# Build the list of categorical (object-dtype) columns that contain at least
# one null value: keep the columns with any null, then filter by dtype "O".
columnas_con_nulos = data.columns[data.isnull().any()]
nulos_esta_cat = data[columnas_con_nulos].select_dtypes(include="O").columns
print("Las columnas categóricas que tienen nulos son : \n ")
print(nulos_esta_cat)

Las columnas categóricas que tienen nulos son : 
 
Index(['hotel', 'is_canceled', 'arrival_date_month', 'meal', 'country',
       'market_segment', 'distribution_channel', 'reserved_room_type',
       'assigned_room_type', 'customer_type', 'reservation_status',
       'reservation_status_date', '0'],
      dtype='object')


In [73]:
# For each categorical column with missing values, show every category's share of ALL rows.
# The divisor is the total row count, so with value_counts() skipping nulls the shares need not add up to 1
for col in nulos_esta_cat:
    encabezado = f"La distribución de las categorías para la columna {col.upper()}"
    print(encabezado)
    proporciones = data[col].value_counts() / len(data)
    display(proporciones)  # display renders objects in a more readable way in Jupyter Notebooks and similar environments
    print("........................")

La distribución de las categorías para la columna HOTEL


hotel
City Hotel      0.433789
Resort Hotel    0.219054
Name: count, dtype: float64

........................
La distribución de las categorías para la columna IS_CANCELED


is_canceled
False    0.411019
True     0.241824
Name: count, dtype: float64

........................
La distribución de las categorías para la columna ARRIVAL_DATE_MONTH


arrival_date_month
August       0.075882
July         0.069232
May          0.064475
October      0.061025
April        0.060636
June         0.059816
September    0.057459
November     0.037151
December     0.037074
March        0.032382
February     0.026783
3            0.021173
January      0.019581
2            0.017334
1            0.012839
Name: count, dtype: float64

........................
La distribución de las categorías para la columna MEAL


meal
BB           0.504765
HB           0.079086
SC           0.058236
Undefined    0.006392
FB           0.004364
Name: count, dtype: float64

........................
La distribución de las categorías para la columna COUNTRY


country
PRT    0.145327
GBR    0.036008
FRA    0.031475
ESP    0.025766
DEU    0.021720
         ...   
LIE    0.000005
NIC    0.000005
GGY    0.000005
FJI    0.000005
UMI    0.000005
Name: count, Length: 163, dtype: float64

........................
La distribución de las categorías para la columna MARKET_SEGMENT


market_segment
Online TA        0.154918
Offline TA/TO    0.066252
Groups           0.054725
Direct           0.034745
Corporate        0.014452
Complementary    0.001985
Aviation         0.000645
Undefined        0.000005
Name: count, dtype: float64

........................
La distribución de las categorías para la columna DISTRIBUTION_CHANNEL


distribution_channel
TA/TO        0.474756
Direct       0.071070
Corporate    0.032257
GDS          0.000962
Undefined    0.000027
Name: count, dtype: float64

........................
La distribución de las categorías para la columna RESERVED_ROOM_TYPE


reserved_room_type
A    0.312789
D    0.069101
E    0.023759
F    0.010428
G    0.007519
B    0.004079
C    0.003418
H    0.002340
P    0.000033
L    0.000022
Name: count, dtype: float64

........................
La distribución de las categorías para la columna ASSIGNED_ROOM_TYPE


assigned_room_type
A    0.404933
D    0.138465
E    0.042684
F    0.020511
G    0.013960
C    0.012987
B    0.011828
H    0.003893
I    0.001985
K    0.001526
P    0.000066
L    0.000005
Name: count, dtype: float64

........................
La distribución de las categorías para la columna CUSTOMER_TYPE


customer_type
Transient          0.386107
Transient-Party    0.108483
Contract           0.017990
Group              0.002515
Name: count, dtype: float64

........................
La distribución de las categorías para la columna RESERVATION_STATUS


reservation_status
Check-Out    0.411019
Canceled     0.235224
No-Show      0.006600
Name: count, dtype: float64

........................
La distribución de las categorías para la columna RESERVATION_STATUS_DATE


reservation_status_date
2015-10-21 00:00:00    0.007098
2015-07-06 00:00:00    0.004003
2016-11-25 00:00:00    0.003784
2015-01-01 00:00:00    0.003707
2016-01-18 00:00:00    0.003073
                         ...   
2029-04-31             0.000005
2022-09-31             0.000005
2029-02-30             0.000005
2029-06-31             0.000005
2024-11-31             0.000005
Name: count, Length: 991, dtype: float64

........................
La distribución de las categorías para la columna 0


0
0.0        0.069883
2.0        0.012921
1.0        0.011598
A          0.009684
BB         0.006873
             ...   
48.2       0.000005
149.5      0.000005
68.07      0.000005
153.07     0.000005
4/02/17    0.000005
Name: count, Length: 1588, dtype: float64

........................
