In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# >> Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
from scipy.stats import chi2_contingency
from IPython.display import display
from itertools import combinations
from scipy.stats import kstest, spearmanr, pearsonr
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv("spain.csv", index_col=0)
data.head()

Unnamed: 0,restaurant_link,restaurant_name,original_location,country,region,province,city,address,latitude,longitude,claimed,awards,popularity_detailed,popularity_generic,top_tags,price_level,price_range,meals,cuisines,special_diets,features,vegetarian_friendly,vegan_options,gluten_free,original_open_hours,open_days_per_week,open_hours_per_week,working_shifts_per_week,avg_rating,total_reviews_count,default_language,reviews_count_in_default_language,excellent,very_good,average,poor,terrible,food,service,value,atmosphere,keywords
320900,g10021880-d13763192,Taberna La Sacristia,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del Concejo 19, 41870 Aznalcollar Spain",37.51928,-6.26885,Unclaimed,,#4 of 5 Restaurants in Aznalcollar,#4 of 6 places to eat in Aznalcollar,Spanish,,,,Spanish,,,N,N,N,,,,,3.0,1.0,English,1.0,0.0,0.0,1.0,0.0,0.0,,,,,
320901,g10021880-d15758746,Tasca el Capricho,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del 28 de Febrero 9, 41870 Aznalcollar S...",37.52065,-6.26822,Unclaimed,,#3 of 5 Restaurants in Aznalcollar,#3 of 6 places to eat in Aznalcollar,Spanish,,,,Spanish,,,N,N,N,,,,,5.0,2.0,All languages,2.0,2.0,0.0,0.0,0.0,0.0,,,,,
320902,g10021880-d19332558,Bar Las Adelfas,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle Perdon N° 23 Capilla de La Cruz, 41870 A...",37.52428,-6.27144,Claimed,,#5 of 5 Restaurants in Aznalcollar,#5 of 6 places to eat in Aznalcollar,"Mediterranean, Spanish, Grill, Diner",,,"Breakfast, Lunch, Dinner, Brunch, Drinks","Mediterranean, Spanish, Grill, Diner, Dining bars",,,N,N,N,"{""Mon"": [], ""Tue"": [""19:30-23:45""], ""Wed"": [""1...",6.0,51.0,6.0,3.0,2.0,All languages,2.0,1.0,0.0,0.0,0.0,1.0,,,,,
320903,g10021880-d19468788,El Rincon nº 7,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del Concejo 60, 41870 Aznalcollar Spain",37.51714,-6.2686,Claimed,,#1 of 5 Restaurants in Aznalcollar,#1 of 6 places to eat in Aznalcollar,"Mid-range, Steakhouse, Cafe, Spanish",€€-€€€,€2-€18,"Lunch, Dinner, Drinks","Steakhouse, Cafe, Dining bars, Spanish",,,N,N,N,"{""Mon"": [], ""Tue"": [], ""Wed"": [], ""Thu"": [""20:...",4.0,23.0,7.0,5.0,18.0,All languages,18.0,17.0,1.0,0.0,0.0,0.0,,,,,
320904,g10021880-d19847377,Nuevo jacaranda,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del 28 de Febrero 2, 41870 Aznalcollar S...",37.52088,-6.26844,Claimed,,,,,,,,,,Reservations,N,N,N,,,,,,,,,,,,,,,,,,


In [3]:
def change_null_cuisine(df,column_list): # rellena con spanish (categoria dominante) para variable CUISINE.
     
        for column in column_list:
            if column in df.columns:

                df[column] = df[column].fillna("Spanish")
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return df

def change_null_for_standar_number(df, column_list): # rellena con -1 las columnas numéricas para los nulos.
     
        for column in column_list:
            if column in df.columns:

                df[column] = df[column].fillna(-1)
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return df

def change_null_for_unknown(df,column_list): # Cuando no existe una categoría dominante en una variable categórica
     
        for column in column_list:
            if column in df.columns:

                df[column] = df[column].fillna("Unknown")
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return df
    
def change_null_for_mode(df,column_list): # Cuando hay una categoría dominante en variables categóricas
        for column in column_list:
            if column in df.columns:
                mode = df[column].mode()[0]
          
                df[column] = df[column].fillna(mode)
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return df
    
def change_null_for_mean(df, column_list):  # Cuando tenemos un 0-10% de nulos en una categoría numérica y la distribución es normal
        for column in column_list:
            if column in df.columns:
                    mean= df[column].mean()
              
                    df[column] = df[column].fillna(mean)
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return df
    
def change_null_for_median(df, column_list):  # Cuando tenemos un 0-10% de nulos en una categoría numérica y la distribución es atípica
        for column in column_list:
            if column in df.columns:
                    median= df[column].median()
                
                    df[column] = df[column].fillna(median)
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return df

def impute_with_knn(df, column_list, n_neighbors=5): # Cuando tenemos variables numéricas con más 10% de nulos
   
        imputer_knn = KNNImputer(n_neighbors=n_neighbors)

        imputed_data = imputer_knn.fit_transform(df[column_list])

        imputed_df = pd.DataFrame(imputed_data, columns=column_list)

        for column in column_list:
            df[f"{column}_knn"] = imputed_df[column]

        return df

def transform_to_integer(df, column_name):
        """Transform floats to int"""
        df[column_name] = df[column_name].astype(int)

def drop_redundant_columns(df, columns):
        """Drops redundant columns . Param columns is a list of columns to de dropped"""
        df.drop(columns=columns, inplace=True, errors='ignore')


def map_column_yes_or_no(df, column):
    dicc = {"N": "No", "Y": "Yes"}

    # Reemplaza valores con los del diccionario para la columna que metamos en el atributo.
    df[column] =df[column].map(dicc)

def quick_check(df,column_name):
        """ This function is for testing purposes, to quicky check data type and unique values of a column"""
        print (f"Column name: {column_name}")
        print (f"Data type: {df[column_name].dtype}")
        print (f"Unique values: {df[column_name].unique()}")
        print (f"Null count: {df[column_name].isnull().sum()}")

def convert_price_range(df):
        # Primero reemplazados el concepto €€- €€€ para €€
        df["price_level"] =df["price_level"].replace({"€€-€€€": "€€"})
      
        
        conversion_dictionary = {
        "€": "0 - 20€",
        "€€" : "20 - 50€",
        "€€€" : "50 - 80€", # no hay en el listado estos valores que hacemos??? los nulos los modificamos aqui?
        "€€€€" : "más de 80€"
         }

        # Aplicar la conversión usando .map() para reemplazar todos los valores de la columna
        df["price_range"] = df["price_level"].map(conversion_dictionary)


       

# COMPROBACIONES

In [4]:
columns_modify = ["", ""]
change_null_for_mean(data, columns_modify)

# print for tasting
for column in columns_modify:
    quick_check(data, column)
    print("----------------------------------------------")

Column name: 


KeyError: ''

In [None]:
map_column_yes_or_no(data,"vegetarian_friendly")


In [None]:
data["vegetarian_friendly"].value_counts()

vegetarian_friendly
No     119740
Yes     37739
Name: count, dtype: int64

In [None]:
convert_price_range(data)

In [None]:
print(data["price_level"].unique())


[nan '€€' '€' '€€€€']


In [None]:
data["price_level"].value_counts()

price_level
€€      79406
€       33489
€€€€     3913
Name: count, dtype: int64

In [None]:
data["price_range"].value_counts()

price_range
20 - 50€      79406
0 - 20€       33489
más de 80€     3913
Name: count, dtype: int64