In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# >> Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
from scipy.stats import chi2_contingency
from IPython.display import display
from itertools import combinations
from scipy.stats import kstest, spearmanr, pearsonr
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)


In [None]:
# Load the dataset; the first CSV column becomes the row index.
data = pd.read_csv(filepath_or_buffer="spain.csv", index_col=0)
data.head(n=5)

In [None]:
def change_null_None(df, column_list):
    """Fill missing values with the literal string "None" in the given columns.

    ``df`` is modified in place. Every name in ``column_list`` that is not a
    column of ``df`` is skipped after printing a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filled.
    column_list : iterable
        Column labels to process.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after filling (a preview only).
    """
    for name in column_list:
        if name not in df.columns:
            print(f"Warning: The column '{name}' does not exist in the DataFrame.")
            continue
        filled = df[name].fillna("None")
        df[name] = filled
    return df.head()


def change_null_cuisine(df, column_list):
    """Fill missing values with the string "Spanish" in the given columns.

    ``df`` is modified in place. Names in ``column_list`` that are not columns
    of ``df`` are skipped after printing a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filled.
    column_list : iterable
        Column labels to process.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after filling (a preview only).
    """
    for name in column_list:
        if name not in df.columns:
            print(f"Warning: The column '{name}' does not exist in the DataFrame.")
            continue
        filled = df[name].fillna("Spanish")
        df[name] = filled
    return df.head()

def change_null_for_standar_number(df, column_list):
    """Fill missing values with the sentinel ``-1`` in the given columns.

    ``df`` is modified in place. Names in ``column_list`` that are not columns
    of ``df`` are skipped after printing a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filled.
    column_list : iterable
        Column labels to process.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after filling (a preview only).
    """
    for name in column_list:
        if name not in df.columns:
            print(f"Warning: The column '{name}' does not exist in the DataFrame.")
            continue
        filled = df[name].fillna(-1)
        df[name] = filled
    return df.head()

def change_null_for_unknown(df, column_list):
    """Fill missing values with the string "Unknown" in the given columns.

    Original author's note: meant for categorical variables that have no
    dominant category. ``df`` is modified in place. Names in ``column_list``
    that are not columns of ``df`` are skipped after printing a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filled.
    column_list : iterable
        Column labels to process.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after filling (a preview only).
    """
    for name in column_list:
        if name not in df.columns:
            print(f"Warning: The column '{name}' does not exist in the DataFrame.")
            continue
        filled = df[name].fillna("Unknown")
        df[name] = filled
    return df.head()
    
def change_null_for_mode(df, column_list):
    """Fill missing values with each column's most frequent value.

    Original author's note: meant for categorical variables that have a
    dominant category. ``df`` is modified in place. When several values tie
    for most frequent, the first one returned by ``Series.mode()`` is used.

    Names in ``column_list`` that are not columns of ``df`` are skipped with a
    warning. Columns without any non-null value are skipped with a warning
    too, because they have no mode to impute with.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filled.
    column_list : iterable
        Column labels to process.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after filling (a preview only).
    """
    for column in column_list:
        if column not in df.columns:
            print(f"Warning: The column '{column}' does not exist in the DataFrame.")
            continue

        modes = df[column].mode()
        # mode() ignores NaN, so an all-null column yields an empty Series and
        # indexing its first element would raise.
        if modes.empty:
            print(f"Warning: The column '{column}' has no non-null values; nothing to impute.")
            continue

        df[column] = df[column].fillna(modes.iloc[0])
    return df.head()
    
def change_null_for_mean(df, column_list):
    """Fill missing values with each column's mean.

    Original author's note: meant for numeric variables with 0-10% nulls and
    a normal distribution. ``df`` is modified in place. Names in
    ``column_list`` that are not columns of ``df`` are skipped after printing
    a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filled.
    column_list : iterable
        Column labels to process.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after filling (a preview only).
    """
    for name in column_list:
        if name not in df.columns:
            print(f"Warning: The column '{name}' does not exist in the DataFrame.")
            continue
        series = df[name]
        df[name] = series.fillna(series.mean())
    return df.head()
    
def change_null_for_median(df, column_list):
    """Fill missing values with each column's median.

    Original author's note: meant for numeric variables with 0-10% nulls and
    an atypical (non-normal) distribution. ``df`` is modified in place. Names
    in ``column_list`` that are not columns of ``df`` are skipped after
    printing a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filled.
    column_list : iterable
        Column labels to process.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after filling (a preview only).
    """
    for name in column_list:
        if name not in df.columns:
            print(f"Warning: The column '{name}' does not exist in the DataFrame.")
            continue
        series = df[name]
        df[name] = series.fillna(series.median())
    return df.head()

def impute_with_knn(df, column_list, n_neighbors=5):
    """Impute missing values with k-nearest-neighbours and store them in new columns.

    Original author's note: meant for numeric variables with more than 10%
    nulls. A ``KNNImputer`` is fitted on ``df[column_list]`` and, for every
    column ``c`` in ``column_list``, the imputed values are written to a new
    column ``f"{c}_knn"``. The original columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; modified in place (new ``*_knn`` columns are added).
    column_list : list
        Labels of the columns used for, and filled by, the imputation.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after adding the imputed columns (a preview only).

    Notes
    -----
    NOTE(review): with its default settings ``KNNImputer`` drops columns that
    are entirely missing, which would make the frame construction below fail
    on a shape mismatch -- confirm no column in ``column_list`` is all-null.
    """
    imputer_knn = KNNImputer(n_neighbors=n_neighbors)

    imputed_data = imputer_knn.fit_transform(df[column_list])

    # Reuse df's own index: column assignment aligns on index labels, so a
    # default RangeIndex would misalign (or blank out) the imputed values
    # whenever df is not indexed 0..n-1.
    imputed_df = pd.DataFrame(imputed_data, columns=column_list, index=df.index)

    for column in column_list:
        df[f"{column}_knn"] = imputed_df[column]

    return df.head()

def transform_to_integer(df, column_names):
    """Cast each listed column of ``df`` to ``int``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted.
    column_names : iterable
        Labels of the columns to cast with ``astype(int)``.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after the conversion (a preview only).
    """
    for name in column_names:
        as_int = df[name].astype(int)
        df[name] = as_int
    return df.head()

def drop_redundant_columns(df, columns):
    """Drop the given columns from ``df`` in place.

    Labels in ``columns`` that are not present in ``df`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    columns : list
        Labels of the columns to be dropped.

    Returns
    -------
    None
    """
    df.drop(labels=columns, axis="columns", errors="ignore", inplace=True)


def map_columns_yes_or_no(df, columns):
    """Translate "Y"/"N" flags to "Yes"/"No" in the given columns, in place.

    Values other than "Y" and "N" (including missing values and values that
    are already "Yes"/"No") are left as they are. This keeps the function
    idempotent: re-running the notebook cell does not turn already-converted
    values into NaN, which a plain ``Series.map`` with a dict would do.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are translated.
    columns : iterable
        Labels of the columns to translate.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after the translation (a preview only).
    """
    dicc = {"N": "No", "Y": "Yes"}

    for column in columns:
        original = df[column]
        # map() yields NaN for every value missing from the dict; fall back to
        # the original value in those positions.
        df[column] = original.map(dicc).fillna(original)

    return df.head()

def quick_check(df, column_names):
    """Print the name, dtype and null count of the given columns (testing aid).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column_names : str or iterable
        A single column label given as a string, or an iterable of labels.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` (a preview only).

    Raises
    ------
    KeyError
        If a requested column does not exist in ``df``.
    """
    # A bare string would otherwise be iterated character by character and
    # each character looked up as a column label.
    if isinstance(column_names, str):
        column_names = [column_names]

    for column in column_names:
        series = df[column]
        print(f"Column name: {column}")
        print(f"Data type: {series.dtype}")
        print(f"Null count: {series.isnull().sum()}")
        print("---------------------------------------------------------------------------------")
    return df.head()

def convert_price_range(df):
    """Fill ``price_level`` nulls and derive a readable ``price_range`` column.

    Missing ``price_level`` values are replaced with "€€-€€€". Each level
    symbol is then translated to a euro range and stored in ``price_range``;
    levels not present in the lookup table become NaN. ``df`` is modified in
    place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``price_level`` column.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after the conversion (a preview only).
    """
    range_by_level = {
        "€": "0 - 30€",
        "€€-€€€": "30 - 80€",
        "€€€€": "más de 80€",
    }

    levels = df["price_level"].fillna("€€-€€€")
    df["price_level"] = levels

    # Series.map replaces every value of the column through the lookup table.
    df["price_range"] = levels.map(range_by_level)

    return df.head()

       

# COMPROBACIONES

In [None]:
# Remove the columns that are not needed.
columns_to_drop = [
    "popularity_detailed",
    "special_diets",
    "default_language",
    "atmosphere",
    "keywords",
]
drop_redundant_columns(df=data, columns=columns_to_drop)
data.shape


In [None]:
# Replace nulls with the string "None" in these columns, then inspect them.
columns_none = [
    "awards",
    "top_tags",
    "features",
]
change_null_None(df=data, column_list=columns_none)
quick_check(df=data, column_names=columns_none)

In [None]:
# Replace nulls with the string "Unknown" in these columns, then inspect them.
# NOTE(review): latitude/longitude receive a string fill value here; if they
# are numeric and contain nulls they will become mixed-type columns -- confirm
# this is intended.
columns_unknown = [
    "latitude",
    "longitude",
    "claimed",
    "popularity_generic",
    "meals",
    "original_open_hours",
]
change_null_for_unknown(df=data, column_list=columns_unknown)
quick_check(df=data, column_names=columns_unknown)

In [None]:
# Replace nulls with the sentinel -1 in these columns, then inspect them.
columns_number = [
    "open_days_per_week",
    "open_hours_per_week",
    "working_shifts_per_week",
]
change_null_for_standar_number(df=data, column_list=columns_number)
quick_check(df=data, column_names=columns_number)

In [None]:
# Fill nulls in the cuisines column with "Spanish" (the dominant category,
# per the original author's note), then inspect it.
change_null_cuisine(df=data, column_list=["cuisines"])
quick_check(df=data, column_names=["cuisines"])


In [None]:
# Translate the "Y"/"N" flags of these columns to "Yes"/"No", then inspect them.
columns_yes_no = [
    "vegetarian_friendly",
    "vegan_options",
    "gluten_free",
]
map_columns_yes_or_no(df=data, columns=columns_yes_no)
quick_check(df=data, column_names=columns_yes_no)

In [None]:

# Fill price_level nulls and derive the price_range column from it.
convert_price_range(df=data)

In [None]:
columns_modify = [
    "avg_rating",
    "total_reviews_count",
    "reviews_count_in_default_language",
    "excellent", "very_good", "average", "poor", "terrible",
    "food", "service", "value",
]

# Print the mean and the median of every listed column.
for col in columns_modify:
    mean_value, median_value = data[col].mean(), data[col].median()

    print(
        f"Column: {col}",
        f"Mean: {mean_value}",
        f"Median: {median_value}",
        "----------------------------------------------------------------------------------------",
        sep="\n",
    )

In [None]:
# Impute nulls with the column mean.
columns_mean = ["avg_rating", "average","poor","terrible"]
change_null_for_mean(data, columns_mean)

# Print a summary of each imputed column for testing.
for column in columns_mean:
    # quick_check iterates over its second argument, so a bare string would be
    # walked character by character; pass a one-element list instead.
    quick_check(data, [column])
    print("----------------------------------------------")

In [None]:
# Impute nulls with the column median.
columns_median = ["total_reviews_count", "reviews_count_in_default_language", "excellent","very_good"]
# This cell previously called change_null_for_mean, contradicting its purpose.
change_null_for_median(data, columns_median)

# Print a summary of each imputed column for testing.
for column in columns_median:
    # quick_check iterates over its second argument, so a bare string would be
    # walked character by character; pass a one-element list instead.
    quick_check(data, [column])
    print("----------------------------------------------")



In [None]:
# Impute nulls with KNN.
columns_knn = ["food","service","value"]
impute_with_knn(data,columns_knn)

# impute_with_knn stores its result in "<column>_knn" and leaves the original
# columns untouched, so inspect both to see the effect of the imputation.
columns_knn_imputed = [f"{column}_knn" for column in columns_knn]
quick_check(data, columns_knn + columns_knn_imputed)

In [None]:
# Cast float columns to int. This runs last because nulls make the cast fail.
# The KNN-imputed values live in the "*_knn" columns; the original food /
# service / value columns are never filled, so casting them would raise while
# they still contain NaN.
# NOTE(review): astype(int) truncates decimals (for example in avg_rating and
# the imputed ratings) -- confirm that loss of precision is intended.
columnns_int = [
    "open_days_per_week",
    "open_hours_per_week",
    "working_shifts_per_week",
    "avg_rating",
    "total_reviews_count",
    "reviews_count_in_default_language",
    "excellent",
    "very_good",
    "average",
    "poor",
    "terrible",
    "food_knn",
    "service_knn",
    "value_knn",
]

transform_to_integer(data,columnns_int)