In [223]:
import pandas as pd
import numpy as np
import ast

pd.set_option('display.max_columns', None)

In [224]:
data = pd.read_csv("spain.csv", index_col=0)
data.head()

Unnamed: 0,restaurant_link,restaurant_name,original_location,country,region,province,city,address,latitude,longitude,claimed,awards,popularity_detailed,popularity_generic,top_tags,price_level,price_range,meals,cuisines,special_diets,features,vegetarian_friendly,vegan_options,gluten_free,original_open_hours,open_days_per_week,open_hours_per_week,working_shifts_per_week,avg_rating,total_reviews_count,default_language,reviews_count_in_default_language,excellent,very_good,average,poor,terrible,food,service,value,atmosphere,keywords
320900,g10021880-d13763192,Taberna La Sacristia,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del Concejo 19, 41870 Aznalcollar Spain",37.51928,-6.26885,Unclaimed,,#4 of 5 Restaurants in Aznalcollar,#4 of 6 places to eat in Aznalcollar,Spanish,,,,Spanish,,,N,N,N,,,,,3.0,1.0,English,1.0,0.0,0.0,1.0,0.0,0.0,,,,,
320901,g10021880-d15758746,Tasca el Capricho,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del 28 de Febrero 9, 41870 Aznalcollar S...",37.52065,-6.26822,Unclaimed,,#3 of 5 Restaurants in Aznalcollar,#3 of 6 places to eat in Aznalcollar,Spanish,,,,Spanish,,,N,N,N,,,,,5.0,2.0,All languages,2.0,2.0,0.0,0.0,0.0,0.0,,,,,
320902,g10021880-d19332558,Bar Las Adelfas,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle Perdon N° 23 Capilla de La Cruz, 41870 A...",37.52428,-6.27144,Claimed,,#5 of 5 Restaurants in Aznalcollar,#5 of 6 places to eat in Aznalcollar,"Mediterranean, Spanish, Grill, Diner",,,"Breakfast, Lunch, Dinner, Brunch, Drinks","Mediterranean, Spanish, Grill, Diner, Dining bars",,,N,N,N,"{""Mon"": [], ""Tue"": [""19:30-23:45""], ""Wed"": [""1...",6.0,51.0,6.0,3.0,2.0,All languages,2.0,1.0,0.0,0.0,0.0,1.0,,,,,
320903,g10021880-d19468788,El Rincon nº 7,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del Concejo 60, 41870 Aznalcollar Spain",37.51714,-6.2686,Claimed,,#1 of 5 Restaurants in Aznalcollar,#1 of 6 places to eat in Aznalcollar,"Mid-range, Steakhouse, Cafe, Spanish",€€-€€€,€2-€18,"Lunch, Dinner, Drinks","Steakhouse, Cafe, Dining bars, Spanish",,,N,N,N,"{""Mon"": [], ""Tue"": [], ""Wed"": [], ""Thu"": [""20:...",4.0,23.0,7.0,5.0,18.0,All languages,18.0,17.0,1.0,0.0,0.0,0.0,,,,,
320904,g10021880-d19847377,Nuevo jacaranda,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del 28 de Febrero 2, 41870 Aznalcollar S...",37.52088,-6.26844,Claimed,,,,,,,,,,Reservations,N,N,N,,,,,,,,,,,,,,,,,,


In [225]:
df = data.copy() # hace una copia

In [226]:
# Nan en la columna city antes de crear la función
df["city"].isna().sum()

102884

In [227]:
# Convertir las cadenas de texto de original_location en listas
df['original_location'] = df['original_location'].apply(lambda x: ast.literal_eval(x))

In [228]:
# Función para actualizar la ciudad si es NaN
def actualizar_city_si_nan(row):
    """
    Actualiza el valor de la columna 'city' en un DataFrame basado en el contenido de la columna 'original_location'.
    
    Si el valor de 'city' es NaN, la función intenta obtener el último elemento de la lista en la columna 'original_location'.
    La lista en 'original_location' debe tener al menos un elemento para que se pueda extraer el valor.
    
    Parámetros:
    row (pd.Series): Una fila del DataFrame proporcionada por `apply`. Contiene las columnas 'city' y 'original_location'.
    
    Retorna:
    str o np.nan: Devuelve el último elemento de la lista en 'original_location' si 'city' es NaN y la lista no está vacía.
                  Si 'city' no es NaN, devuelve el valor actual de 'city'. Si 'original_location' no es una lista o es vacía,
                  devuelve np.nan.
    """
    if pd.isna(row['city']):
        if isinstance(row['original_location'], list) and len(row['original_location']) > 0:
            return row['original_location'][-1]
    return row['city']

# Aplicar la función a cada fila del DataFrame
df['city'] = df.apply(actualizar_city_si_nan, axis=1)

In [229]:
# Nan en la columna city después de la función obtener ciudad
df["city"].isna().sum()

0

In [244]:
df["province"].isna().sum()

29570

In [245]:
def actualizar_provincia_si_nan(row):
    """
    Actualiza el valor de la columna 'province' en un DataFrame con el valor de la columna 'region' si 'province' es NaN.
    
    Parámetros:
    row (pd.Series): Una fila del DataFrame proporcionada por `apply`. Contiene las columnas 'province' y 'region'.
    
    Retorna:
    str o np.nan: Devuelve el valor de 'region' si 'province' es NaN, de lo contrario devuelve el valor actual de 'province'.
    """
    if pd.isna(row['province']):
        return row['region']
    return row['province']

# Aplicar la función a cada fila del DataFrame para actualizar la columna 'province'
df['province'] = df.apply(actualizar_provincia_si_nan, axis=1)

In [246]:
df["province"].isna().sum()

2

In [241]:
df[df["province"].isna()].sample(5)

Unnamed: 0,restaurant_link,restaurant_name,original_location,country,region,province,city,address,latitude,longitude,claimed,awards,popularity_detailed,popularity_generic,top_tags,price_level,price_range,meals,cuisines,special_diets,features,vegetarian_friendly,vegan_options,gluten_free,original_open_hours,open_days_per_week,open_hours_per_week,working_shifts_per_week,avg_rating,total_reviews_count,default_language,reviews_count_in_default_language,excellent,very_good,average,poor,terrible,food,service,value,atmosphere,keywords
413563,g187518-d12523261,Pintada,"[Europe, Spain, Region of Murcia, Murcia]",Spain,Region of Murcia,,Murcia,"Calle del Arenal 3, 30011 Murcia Spain",37.97711,-1.12307,Unclaimed,,#200 of 803 Restaurants in Murcia,#213 of 1003 places to eat in Murcia,"Mid-range, Italian, Pizza",€€-€€€,,"Lunch, Dinner","Italian, Pizza",,"Takeout, Seating, Table Service, Reservations",N,N,N,,,,,4.5,18.0,All languages,18.0,11.0,6.0,0.0,0.0,1.0,4.5,4.5,4.0,,
402990,g187514-d13167715,La Terraza de Chachi Piruli,"[Europe, Spain, Community of Madrid, Madrid]",Spain,Community of Madrid,,Madrid,"Avenida Monasterio de Silos 20B, 28034 Madrid ...",40.50353,-3.708708,Claimed,,#6176 of 12512 places to eat in Madrid,,"Mid-range, Spanish",€€-€€€,€4-€30,"Drinks, Lunch, Dinner",Spanish,,,N,N,N,"{""Mon"": [], ""Tue"": [""13:00-17:00"", ""20:00-23:0...",6.0,43.0,11.0,4.0,6.0,All languages,6.0,2.0,3.0,0.0,1.0,0.0,,,,,
369553,g187451-d14140446,La Comtienda Fomento,"[Europe, Spain, Asturias, Gijon]",Spain,Asturias,,Gijon,"Calle de Felipe Menendez 7 Haciendo Esquina, 3...",43.542553,-5.665965,Claimed,"Travellers' Choice, Certificate of Excellence ...",#53 of 716 Restaurants in Gijon,#54 of 913 places to eat in Gijon,"Mid-range, Mediterranean, Spanish",€€-€€€,€10-€20,"Lunch, Dinner","Mediterranean, Spanish",,,N,N,N,"{""Mon"": [""12:30-16:30"", ""19:30-23:59""], ""Tue"":...",7.0,67.883333,14.0,4.5,131.0,All languages,131.0,84.0,28.0,13.0,3.0,3.0,4.0,4.5,4.5,,
411445,g187514-d8529308,La Posada de Carmela,"[Europe, Spain, Community of Madrid, Madrid]",Spain,Community of Madrid,,Madrid,"Calle Federico Mompou 4 Zona Norte, Las Tablas...",40.511204,-3.679107,Claimed,,#3456 of 10193 Restaurants in Madrid,#3719 of 12512 places to eat in Madrid,"Mid-range, Mediterranean, Spanish, Venezuelan",€€-€€€,€10-€15,"Breakfast, Lunch, Brunch, Drinks","Spanish, Mediterranean, Venezuelan",,,N,N,N,"{""Mon"": [""06:30-17:00""], ""Tue"": [""06:30-17:00""...",6.0,68.25,6.0,4.5,34.0,English,1.0,0.0,0.0,1.0,0.0,0.0,4.5,4.5,4.5,,
409455,g187514-d5794486,Taberna de Argensola,"[Europe, Spain, Community of Madrid, Madrid]",Spain,Community of Madrid,,Madrid,"Calle Argensola 2, 28004 Madrid Spain",40.42469,-3.695343,Unclaimed,,#2568 of 10193 Restaurants in Madrid,#2750 of 12512 places to eat in Madrid,"Cheap Eats, Spanish",€,,"Breakfast, Lunch, Dinner",Spanish,,"Serves Alcohol, Seating",N,N,N,"{""Mon"": [""07:00-01:00""], ""Tue"": [""07:00-01:00""...",6.0,108.0,6.0,4.0,45.0,English,7.0,4.0,1.0,2.0,0.0,0.0,4.0,4.0,4.5,,


In [234]:
# Ver todo el contenido de la primera fila de la columna 'address'
print(df['address'].loc[474188])


Paseo Brusco 33, 39180 Noja Spain
