In [2]:
import warnings
from itertools import combinations
from types import SimpleNamespace

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from scipy.stats import kstest, spearmanr, pearsonr

from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

# >> Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
# NOTE(review): this suppresses every warning raised after this point, including
# pandas/scikit-learn deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames

In [3]:
# Load the bookings CSV; the first column of the file becomes the DataFrame index.
data = pd.read_csv("finanzas-hotel-bookings.csv", index_col=0)

In [None]:
# Normalize the column names: lower-case them and replace spaces with underscores
data.columns = data.columns.str.lower().str.replace(' ', '_')


In [None]:
# Month number -> English month name. All twelve months are listed because
# Series.map() turns every value that is not a key of the dictionary into NaN.
month_names = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December',
}
columns_to_convert = ["arrival_date_month"]

# Apply the dictionary to each listed column.
# NOTE(review): this assumes the columns hold month numbers; values that are
# already month names are not keys of the dictionary and would become NaN,
# so this cell should not be run twice on the same data.
for col in columns_to_convert:
    data[col] = data[col].map(month_names)

data["arrival_date_month"].value_counts()

In [None]:
# Cast these columns to float so the data is homogeneous
# NOTE(review): the column names are empty-string placeholders — fill in the real names before running.
data[["", "",""]] = data[["", "",""]].astype(float)

In [None]:
# Drop a column
# NOTE(review): "" is a placeholder — replace it with the real column name.
# drop() returns a new DataFrame instead of modifying `data`, so the result
# has to be assigned back for the column to actually be removed.
data = data.drop("", axis=1)
data.head()

## NULOS

In [5]:
# Null-filling helpers: replace missing values with "Unknown", the mode, the mean or the median
def change_null_for_unknown(self, column_list):
    """Fill the nulls of the given columns with the literal string "Unknown".

    Intended for categorical variables that have no dominant category.

    Args:
        self: object exposing the DataFrame to modify as ``self.df``.
        column_list: iterable of column names to process.

    Returns:
        ``self.df``, with each existing listed column reassigned to its
        filled version. Names that are not columns of ``self.df`` are
        reported with a printed warning and skipped.
    """
    for column in column_list:
        # Guard clause: report unknown column names and move on.
        if column not in self.df.columns:
            print(f"Warning: The column '{column}' does not exist in the DataFrame.")
            continue
        self.df[column] = self.df[column].fillna("Unknown")
    return self.df
    
def change_null_for_mode(self, column_list):
    """Fill the nulls of the given columns with each column's mode.

    Intended for categorical variables that have a dominant category.

    Args:
        self: object exposing the DataFrame to modify as ``self.df``.
        column_list: iterable of column names to process.

    Returns:
        ``self.df``, with each processed column reassigned to its filled
        version. Names that are not columns of ``self.df``, and columns
        without any non-null value, are reported with a printed warning
        and left untouched.
    """
    for column in column_list:
        if column not in self.df.columns:
            print(f"Warning: The column '{column}' does not exist in the DataFrame.")
            continue

        modes = self.df[column].mode()
        # mode() ignores nulls, so a column holding only nulls yields an empty
        # result; indexing it with [0] would raise instead of skipping the column.
        if modes.empty:
            print(f"Warning: The column '{column}' has no non-null values; nulls were left unchanged.")
            continue

        # When several values tie, mode() returns them sorted; use the first one.
        self.df[column] = self.df[column].fillna(modes.iloc[0])
    return self.df
    
def change_null_for_mean(self, column_list):
    """Fill the nulls of the given columns with each column's mean.

    Intended for numerical variables with roughly 0-10% of nulls and a
    normal distribution.

    Args:
        self: object exposing the DataFrame to modify as ``self.df``.
        column_list: iterable of column names to process.

    Returns:
        ``self.df``, with each existing listed column reassigned to its
        filled version. Names that are not columns of ``self.df`` are
        reported with a printed warning and skipped.
    """
    for column in column_list:
        # Guard clause: report unknown column names and move on.
        if column not in self.df.columns:
            print(f"Warning: The column '{column}' does not exist in the DataFrame.")
            continue
        # Replace the nulls with the column's mean.
        self.df[column] = self.df[column].fillna(self.df[column].mean())
    return self.df
    
def change_null_for_median(self, column_list):
    """Fill the nulls of the given columns with each column's median.

    Intended for numerical variables with roughly 0-10% of nulls and an
    atypical (non-normal) distribution.

    Args:
        self: object exposing the DataFrame to modify as ``self.df``.
        column_list: iterable of column names to process.

    Returns:
        ``self.df``, with each existing listed column reassigned to its
        filled version. Names that are not columns of ``self.df`` are
        reported with a printed warning and skipped.
    """
    for column in column_list:
        # Guard clause: report unknown column names and move on.
        if column not in self.df.columns:
            print(f"Warning: The column '{column}' does not exist in the DataFrame.")
            continue
        # Replace the nulls with the column's median.
        self.df[column] = self.df[column].fillna(self.df[column].median())
    return self.df

In [None]:
# NOTE(review): the column names are empty-string placeholders — fill in the real names before running.
columns_modify = ["", "", ""]

# The helper takes the DataFrame holder as its first argument and reads it
# through `.df`, so `data` is wrapped in a namespace exposing it under that name.
# The helper assigns into that same DataFrame object, so `data` is updated.
change_null_for_unknown(SimpleNamespace(df=data), columns_modify) # replace nulls with "Unknown"

# Check whether any nulls remain in the columns that were just processed
print("Después del reemplazo usando fillna quedan los siguientes nulos")
data[columns_modify].isnull().sum()

In [6]:
def impute_with_knn(self, column_list, n_neighbors=5):
    """Impute the given columns with KNNImputer and store them as ``<column>_knn``.

    Intended for numerical variables with more than 10% of nulls. The source
    columns are left as they are; the imputed values go into new columns
    named after the original plus a ``_knn`` suffix.

    Args:
        self: object exposing the DataFrame to modify as ``self.df``.
        column_list: list of column names passed together to the imputer.
        n_neighbors: number of neighbours handed to ``KNNImputer``.

    Returns:
        ``self.df`` with one extra ``<column>_knn`` column per listed column.
    """
    imputer_knn = KNNImputer(n_neighbors=n_neighbors)

    # fit_transform returns a plain array, so the row labels are lost here.
    # NOTE(review): KNNImputer is documented to drop columns that are entirely
    # null by default, in which case the DataFrame construction below fails on
    # the column-count mismatch — confirm the inputs never hit that case.
    imputed_data = imputer_knn.fit_transform(self.df[column_list])

    # Rebuild the frame on the source index. Column assignment aligns on index
    # labels, so a default 0..n-1 index would misplace the values (or leave
    # NaN) whenever self.df is not indexed 0..n-1.
    imputed_df = pd.DataFrame(imputed_data, columns=column_list, index=self.df.index)

    for column in column_list:
        self.df[f"{column}_knn"] = imputed_df[column]

    return self.df