In [118]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import spacy
import torch
from google.cloud import bigquery
import os
import re

In [119]:
# Point the Google client libraries at the service-account key file
# (the path is relative to the notebook's working directory).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../ETL/credencial.json"
# BigQuery client used by the query helper defined in the next cell.
client = bigquery.Client()

In [120]:
def consulta(query):
    """Run a SQL query through the module-level BigQuery ``client`` and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text passed unchanged to ``client.query``.

    Returns
    -------
    pandas.DataFrame
        One row per result row; column names come from the first row's keys.
        When the query yields no rows an empty DataFrame is returned instead of
        ``None`` (its columns come from the result's ``schema`` attribute when
        one is exposed), so callers can chain ``.head()`` or ``len()`` safely.
    """
    results = client.query(query).result()
    rows = list(results)

    if not rows:
        # No rows to read keys from: fall back to the result schema, if any
        schema = getattr(results, "schema", None) or []
        return pd.DataFrame(columns=[field.name for field in schema])

    return pd.DataFrame(
        data=[list(row.values()) for row in rows],
        columns=list(rows[0].keys()),
    )

In [121]:
def df_unique(df):
    """Print the distinct values found across all columns of ``df`` except the first.

    The first column is skipped; the remaining columns are flattened into a
    single array and de-duplicated with ``pd.unique``. Nothing is returned.
    """
    # De-duplicate before printing anything, so a failure here emits no header
    distintos = pd.unique(df[df.columns[1:]].values.flatten())

    for linea in ("Valores únicos globales:", distintos):
        print(linea)

In [122]:
def norm_bools(df):
    """Collapse flag columns into one list-valued ``category`` column.

    Every column after the first is treated as a flag. For each row, the names
    of the columns whose value compares equal to ``True`` are collected into a
    list. The result has two columns: ``business_id`` (copied from ``df``) and
    ``category`` (the per-row list of column names).
    """
    columnas_flag = df.columns[1:]

    def _columnas_activas(fila):
        # Names of the flag columns that are set in this row
        return columnas_flag[fila == True].tolist()

    etiquetas = df.iloc[:, 1:].apply(_columnas_activas, axis=1)
    return pd.DataFrame({"business_id": df["business_id"], "category": etiquetas})

In [123]:
def norm_columns(columna):
    """Return a column name in lower case with underscores replaced by spaces."""
    return columna.lower().replace('_', ' ')

In [124]:
def convertir_a_minusculas(valor):
    """Lower-case ``valor`` when it is a string; return any other value unchanged."""
    return valor.lower() if isinstance(valor, str) else valor

In [125]:
def agregar_espacios_mayuscula(cadena):
    """Split a CamelCase name into lower-case words separated by spaces.

    A space is inserted before every upper-case letter that directly follows a
    lower-case letter; the whole string is then lower-cased. Runs of capitals
    and trailing digits are left attached (e.g. ``HasTV`` -> ``has tv``).
    """
    return re.sub(r'(?<=[a-z])([A-Z])', r' \1', cadena).lower()

In [126]:
# BigQuery dataset name and table names; they are interpolated into the
# SQL strings built in the cells below.
db = 'restaurant_staging_dataset'
tb_names = 'restaurants_dim'
tb_geo = 'geographical_data_dim'
# Category tables, one per source (Google / Yelp)
tb_cat_google ='category_google_dim'
tb_cat_yelp = 'category_yelp_dim'
# Attribute tables, one per source (Google / Yelp)
tb_atr_google = 'attributes_google_dim'
tb_atr_yelp = 'attributes_yelp_dim'

In [127]:
# Base frame: business id and name from the restaurants table, joined on
# business_id with latitude/longitude from the geographical table.
query = f"""
    SELECT
            name.business_id,
            name.name,
            geo.latitude,
            geo.longitude
    FROM
    `{db}.{tb_names}` AS name
    JOIN
    `{db}.{tb_geo}` AS geo
    ON
    name.business_id = geo.business_id;
    """

df_main = consulta(query)
df_main.head()

Unnamed: 0,business_id,name,latitude,longitude
0,0x88d8e21af668300b:0xea043d9812296a1f,$,26.348095,-80.084331
1,0x880fbbbb91b4b39f:0x2b65f95cb0a57126,.,42.138498,-87.96294
2,0x89c25809c94cf79d:0x2a27a2a91276498f,M,40.798193,-74.014895
3,0x549052beb33efd35:0xaaada7d2d2f2a33e,JW,47.337044,-122.593467
4,0x89c25bdf7f35ecaf:0x214868dca2152cbe,PT,40.712621,-73.962286


In [128]:
# Row count of the joined name/geo frame
len(df_main)

53490

***

In [129]:
# Load every column of the Google category table
query = f"""
SELECT *
FROM `{db}.{tb_cat_google}`;
"""
df_category_google = consulta(query)

In [130]:
# Preview the raw Google category flags
df_category_google.head()

Unnamed: 0,business_id,American_restaurant,Asian_restaurant,Barbecue_restaurant,Breakfast_restaurant,Caribbean_restaurant,Chicken_restaurant,Chicken_wings_restaurant,Chinese_restaurant,Dessert_restaurant,...,Peruvian_restaurant,Pizza_restaurant,Salvadoran_restaurant,Seafood_restaurant,Soul_food_restaurant,Sushi_restaurant,Taco_restaurant,Thai_restaurant,Vegan_restaurant,Vietnamese_restaurant
0,0x40771faa13e3c185:0xec402d308a3cbf48,,,,,,,,,,...,,,,,,,,,,
1,0x4cb555faaafddd59:0x5800723c79470569,,,,,,,,,,...,,,,,,,,,,
2,0x4cc988052259d771:0x7e8ad4ae56f776a,,,,,,,,,,...,,,,,,,,,,
3,0x4d352ae3b0e17efd:0xf0f6d5e6b27a7e4a,,,,,,,,,,...,,,,,,,,,,
4,0x4d4d1c7203ed6cbd:0x2799a2f45daace0c,,,,,,,,,,...,,,,,,,,,,


In [131]:
# Print the distinct values that appear in the flag columns (everything after business_id)
df_unique(df_category_google)

Valores únicos globales:
[nan  0.  1.]


In [132]:
# Normalise every column name (lower case, underscores -> spaces).
# This also turns 'business_id' into 'business id'; the next cell renames it back.
df_category_google.columns = df_category_google.columns.map(norm_columns)
df_category_google.head()

Unnamed: 0,business id,american restaurant,asian restaurant,barbecue restaurant,breakfast restaurant,caribbean restaurant,chicken restaurant,chicken wings restaurant,chinese restaurant,dessert restaurant,...,peruvian restaurant,pizza restaurant,salvadoran restaurant,seafood restaurant,soul food restaurant,sushi restaurant,taco restaurant,thai restaurant,vegan restaurant,vietnamese restaurant
0,0x40771faa13e3c185:0xec402d308a3cbf48,,,,,,,,,,...,,,,,,,,,,
1,0x4cb555faaafddd59:0x5800723c79470569,,,,,,,,,,...,,,,,,,,,,
2,0x4cc988052259d771:0x7e8ad4ae56f776a,,,,,,,,,,...,,,,,,,,,,
3,0x4d352ae3b0e17efd:0xf0f6d5e6b27a7e4a,,,,,,,,,,...,,,,,,,,,,
4,0x4d4d1c7203ed6cbd:0x2799a2f45daace0c,,,,,,,,,,...,,,,,,,,,,


In [133]:
# Restore the key column name that the column normalisation changed to 'business id'
df_category_google = df_category_google.rename(columns={'business id': 'business_id'})

In [134]:
# Collapse the flag columns into one list-valued 'category' column per business.
# NOTE(review): the name is rebound to a frame with a different layout, so this cell
# is not meant to be run twice in a row without re-running the cells above.
df_category_google = norm_bools(df_category_google)
df_category_google.head()

Unnamed: 0,business_id,category
0,0x40771faa13e3c185:0xec402d308a3cbf48,[]
1,0x4cb555faaafddd59:0x5800723c79470569,[]
2,0x4cc988052259d771:0x7e8ad4ae56f776a,[]
3,0x4d352ae3b0e17efd:0xf0f6d5e6b27a7e4a,[]
4,0x4d4d1c7203ed6cbd:0x2799a2f45daace0c,[]


***

In [135]:
# Load every column of the Yelp category table
query = f"""
SELECT *
FROM `{db}.{tb_cat_yelp}`;
"""
df_category_yelp = consulta(query)

In [136]:
# Preview the raw Yelp category flags
df_category_yelp.head()

Unnamed: 0,business_id,pizza,sandwiches,fast food,american traditional,mexican,nightlife,breakfast & brunch,burgers,bars,...,salad,cafes,delis,japanese,sushi bars,barbeque,diners,asian fusion,steakhouses,caterers
0,MTSW4McQd7CbVtyjqoe9mw,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9OG5YkX1g2GReZM0AskizA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tMkwHmWFUEXrC9ZduonpTg,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2xVsWBNFwZOxIOdd9Mwnww,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,wghnIlMb_i5U46HMBGx9ig,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [137]:
# Print the distinct values that appear in the Yelp flag columns.
# Uses the shared df_unique helper instead of an inline copy of its body;
# the printed output is the same.
df_unique(df_category_yelp)

Valores únicos globales:
[0 1]


In [138]:
# Row count of the Yelp category table
len(df_category_yelp)

15305

In [139]:
# Collapse the 0/1 flag columns into one list-valued 'category' column per business
df_category_yelp = norm_bools(df_category_yelp)
df_category_yelp.head()

Unnamed: 0,business_id,category
0,MTSW4McQd7CbVtyjqoe9mw,[coffee & tea]
1,9OG5YkX1g2GReZM0AskizA,[italian]
2,tMkwHmWFUEXrC9ZduonpTg,"[seafood, japanese]"
3,2xVsWBNFwZOxIOdd9Mwnww,[burgers]
4,wghnIlMb_i5U46HMBGx9ig,[chinese]


***

In [140]:
# Load every column of the Yelp attributes table
query = f"""
SELECT *
FROM `{db}.{tb_atr_yelp}`;
"""
df_attributes_yelp = consulta(query)

In [141]:
# Preview the raw Yelp attributes
df_attributes_yelp.head()

Unnamed: 0,business_id,RestaurantsDelivery,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsPriceRange2,RestaurantsTakeOut,Caters,RestaurantsGoodForGroups,NoiseLevel,GoodForKids,RestaurantsReservations,HasTV
0,VblfPP6EwV70ldrztBHA1A,True,True,True,1,True,True,True,2,True,True,True
1,fU05rrHw5TpBvmie20YVkA,True,True,True,1,True,True,True,1,True,True,True
2,IzpyHmR5eXkF2C6CWvG1Nw,True,False,True,1,True,True,True,sin datos,True,True,True
3,GgcDDkFQuCU5puBqq0lImQ,True,True,True,1,True,True,True,1,True,False,True
4,OBTuS99XYFfLgR0eKMAHcg,True,True,True,1,True,True,sin datos,sin datos,True,False,True


In [142]:
# Print the distinct values that appear in the attribute columns (everything after business_id)
df_unique(df_attributes_yelp)

Valores únicos globales:
['True' '1' '2' 'False' 'sin datos' '3' '4']


In [143]:
# Inspect the original CamelCase attribute names before they are normalised
df_attributes_yelp.columns

Index(['business_id', 'RestaurantsDelivery', 'OutdoorSeating',
       'BusinessAcceptsCreditCards', 'RestaurantsPriceRange2',
       'RestaurantsTakeOut', 'Caters', 'RestaurantsGoodForGroups',
       'NoiseLevel', 'GoodForKids', 'RestaurantsReservations', 'HasTV'],
      dtype='object')

In [144]:
# Turn CamelCase attribute names into lower-case, space-separated words.
# 'business_id' has no lower-to-upper transition, so it is left unchanged.
df_attributes_yelp = df_attributes_yelp.rename(columns=agregar_espacios_mayuscula)
df_attributes_yelp.head()

Unnamed: 0,business_id,restaurants delivery,outdoor seating,business accepts credit cards,restaurants price range2,restaurants take out,caters,restaurants good for groups,noise level,good for kids,restaurants reservations,has tv
0,VblfPP6EwV70ldrztBHA1A,True,True,True,1,True,True,True,2,True,True,True
1,fU05rrHw5TpBvmie20YVkA,True,True,True,1,True,True,True,1,True,True,True
2,IzpyHmR5eXkF2C6CWvG1Nw,True,False,True,1,True,True,True,sin datos,True,True,True
3,GgcDDkFQuCU5puBqq0lImQ,True,True,True,1,True,True,True,1,True,False,True
4,OBTuS99XYFfLgR0eKMAHcg,True,True,True,1,True,True,sin datos,sin datos,True,False,True


In [145]:
# Map the 'sin datos' placeholder to None.
# The dict form is used because the positional form replace('sin datos', None)
# is version-dependent: older pandas releases treated an explicit None as
# "value not given" and forward-filled instead of inserting None.
df_attributes_yelp = df_attributes_yelp.replace({'sin datos': None})

In [146]:
# Drop the two attributes whose sample values are numeric levels rather than True/False.
# NOTE(review): in-place drop; re-running this cell alone raises KeyError because the columns are already gone.
df_attributes_yelp.drop(['restaurants price range2','noise level'],axis=1,inplace=True)

In [147]:
# Preview the attributes after placeholder replacement and column removal
df_attributes_yelp.head()

Unnamed: 0,business_id,restaurants delivery,outdoor seating,business accepts credit cards,restaurants take out,caters,restaurants good for groups,good for kids,restaurants reservations,has tv
0,VblfPP6EwV70ldrztBHA1A,True,True,True,True,True,True,True,True,True
1,fU05rrHw5TpBvmie20YVkA,True,True,True,True,True,True,True,True,True
2,IzpyHmR5eXkF2C6CWvG1Nw,True,False,True,True,True,True,True,True,True
3,GgcDDkFQuCU5puBqq0lImQ,True,True,True,True,True,True,True,False,True
4,OBTuS99XYFfLgR0eKMAHcg,True,True,True,True,True,,True,False,True


In [148]:
# Check the Python type of a sample flag value (the recorded output is str)
type(df_attributes_yelp['restaurants delivery'][0])

str

In [149]:
# Convert text values to booleans: a string equal to 'true' (case-insensitive) -> True,
# any other string -> False, None -> None; the result is then cast with astype(bool).
# NOTE(review): astype(bool) presumably turns the remaining None values into False, and
# df_reemplazado is not referenced again in the visible cells — confirm it is still needed.
df_reemplazado = df_attributes_yelp[df_attributes_yelp.columns[1:]].applymap(lambda x: x.lower() == 'true' if x is not None else None).astype(bool)

In [150]:
# Split the frame into the first column (business_id) and the remaining attribute columns
primera_columna = df_attributes_yelp.iloc[:, 0]
columnas_restantes = df_attributes_yelp.iloc[:, 1:]

# Convert the text values in the attribute columns to booleans:
# 'true' (case-insensitive) -> True, other strings -> False, None stays None
columnas_restantes_booleanos = columnas_restantes.applymap(lambda x: x.lower() == 'true' if x is not None else None)

# Reassemble the frame: id column followed by the converted attribute columns
df_attributes_yelp = pd.concat([primera_columna, columnas_restantes_booleanos], axis=1)

In [151]:
# Preview the attributes after the boolean conversion
df_attributes_yelp.head()

Unnamed: 0,business_id,restaurants delivery,outdoor seating,business accepts credit cards,restaurants take out,caters,restaurants good for groups,good for kids,restaurants reservations,has tv
0,VblfPP6EwV70ldrztBHA1A,True,True,True,True,True,True,True,True,True
1,fU05rrHw5TpBvmie20YVkA,True,True,True,True,True,True,True,True,True
2,IzpyHmR5eXkF2C6CWvG1Nw,True,False,True,True,True,True,True,True,True
3,GgcDDkFQuCU5puBqq0lImQ,True,True,True,True,True,True,True,False,True
4,OBTuS99XYFfLgR0eKMAHcg,True,True,True,True,True,,True,False,True


In [152]:
# Collapse the boolean attribute columns into one list-valued 'category' column:
# each row keeps the names of the attributes that are True
df_attributes_yelp = norm_bools(df_attributes_yelp)
df_attributes_yelp.head()

Unnamed: 0,business_id,category
0,VblfPP6EwV70ldrztBHA1A,"[restaurants delivery, outdoor seating, busine..."
1,fU05rrHw5TpBvmie20YVkA,"[restaurants delivery, outdoor seating, busine..."
2,IzpyHmR5eXkF2C6CWvG1Nw,"[restaurants delivery, business accepts credit..."
3,GgcDDkFQuCU5puBqq0lImQ,"[restaurants delivery, outdoor seating, busine..."
4,OBTuS99XYFfLgR0eKMAHcg,"[restaurants delivery, outdoor seating, busine..."


***

In [153]:
# Load every column of the Google attributes table
query = f"""
SELECT *
FROM `{db}.{tb_atr_google}`;
"""
df_attributes_google = consulta(query)

In [154]:
# Preview the raw Google attributes (free-text cells, some comma-separated)
df_attributes_google.head()

Unnamed: 0,business_id,Health_safety,Amenities,From_the_business,Accessibility,Popular_for,Offerings,Dining_options,Atmosphere,Planning,Payments,Highlights
0,0x880e5d4f15f8c9e5:0xa0fc87d1c5e81dd9,,Wi-Fi,,"Wheelchair-accessible car park, Wheelchair-acc...",,,,,Quick visit,"Debit cards, NFC mobile payments, Credit cards","Great dessert, LGBTQ-friendly"
1,0x54950a7ba806f4db:0x6bd404e483f67929,,,,"Wheelchair accessible seating, Wheelchair acce...",,,,,,,
2,0x89d375b2164f9f05:0xa7a39c4548020acf,,Good for kids,,Wheelchair accessible seating,"Lunch, Dinner",,,,,,
3,0x8864883e41f62d9b:0x4cfbc311493fd612,,Restroom,,"Wheelchair accessible parking lot, Wheelchair ...",,Alcohol,,,,,
4,0x880e318d845e5723:0x73a42c9522ca4b46,,"Good for kids, High chairs, Toilets",,"Wheelchair-accessible car park, Wheelchair-acc...",,"Comfort food, Small plates","Lunch, Dinner",,,"Debit cards, NFC mobile payments, Credit cards","LGBTQ-friendly, Transgender safe space"


In [155]:
# Lower-case every string cell of the frame; non-string cells are left as they are
df_attributes_google = df_attributes_google.applymap(convertir_a_minusculas)
df_attributes_google.head()

Unnamed: 0,business_id,Health_safety,Amenities,From_the_business,Accessibility,Popular_for,Offerings,Dining_options,Atmosphere,Planning,Payments,Highlights
0,0x880e5d4f15f8c9e5:0xa0fc87d1c5e81dd9,,wi-fi,,"wheelchair-accessible car park, wheelchair-acc...",,,,,quick visit,"debit cards, nfc mobile payments, credit cards","great dessert, lgbtq-friendly"
1,0x54950a7ba806f4db:0x6bd404e483f67929,,,,"wheelchair accessible seating, wheelchair acce...",,,,,,,
2,0x89d375b2164f9f05:0xa7a39c4548020acf,,good for kids,,wheelchair accessible seating,"lunch, dinner",,,,,,
3,0x8864883e41f62d9b:0x4cfbc311493fd612,,restroom,,"wheelchair accessible parking lot, wheelchair ...",,alcohol,,,,,
4,0x880e318d845e5723:0x73a42c9522ca4b46,,"good for kids, high chairs, toilets",,"wheelchair-accessible car park, wheelchair-acc...",,"comfort food, small plates","lunch, dinner",,,"debit cards, nfc mobile payments, credit cards","lgbtq-friendly, transgender safe space"


In [156]:
# Build a 'category' column holding, for each row, the list of non-None values
# taken from every column after the first.
# NOTE(review): each cell is kept as one list item, so a comma-separated cell such as
# "lunch, dinner" would remain a single string — confirm whether it should be split.
df_attributes_google['category'] = df_attributes_google.apply(lambda row: [val for val in row[1:] if val is not None], axis=1)

# Keep only the 'business_id' and 'category' columns
df_attributes_google = df_attributes_google[['business_id', 'category']]
df_attributes_google.head()

Unnamed: 0,business_id,category
0,0x880e5d4f15f8c9e5:0xa0fc87d1c5e81dd9,"[wi-fi, wheelchair-accessible car park, wheelc..."
1,0x54950a7ba806f4db:0x6bd404e483f67929,"[wheelchair accessible seating, wheelchair acc..."
2,0x89d375b2164f9f05:0xa7a39c4548020acf,"[good for kids, wheelchair accessible seating,..."
3,0x8864883e41f62d9b:0x4cfbc311493fd612,"[restroom, wheelchair accessible parking lot, ..."
4,0x880e318d845e5723:0x73a42c9522ca4b46,"[good for kids, high chairs, toilets, wheelcha..."


***

In [157]:
# Rows of the Google category frame whose business_id also appears in the Yelp category frame
# (the recorded output is an empty frame, i.e. the two id sets do not overlap)
valores_repetidos = df_category_google[df_category_google['business_id'].isin(df_category_yelp['business_id'])]

print("Valores que se repiten en la columna de df1 en la columna de df2:")
print(valores_repetidos)

Valores que se repiten en la columna de df1 en la columna de df2:
Empty DataFrame
Columns: [business_id, category]
Index: []


In [158]:
# Stack the Google and Yelp category frames row-wise.
# Each source keeps its own index, so index labels repeat in the result.
df_concat_category = pd.concat([df_category_google, df_category_yelp], axis=0)

In [159]:
# Preview the combined category frame
df_concat_category.head()

Unnamed: 0,business_id,category
0,0x40771faa13e3c185:0xec402d308a3cbf48,[]
1,0x4cb555faaafddd59:0x5800723c79470569,[]
2,0x4cc988052259d771:0x7e8ad4ae56f776a,[]
3,0x4d352ae3b0e17efd:0xf0f6d5e6b27a7e4a,[]
4,0x4d4d1c7203ed6cbd:0x2799a2f45daace0c,[]


In [160]:
# Row count of the combined category frame
len(df_concat_category)

55552

***

In [161]:
# Rows of the Google attributes frame whose business_id also appears in the Yelp attributes frame
# (the recorded output is an empty frame, i.e. the two id sets do not overlap)
valores_repetidos = df_attributes_google[df_attributes_google['business_id'].isin(df_attributes_yelp['business_id'])]

print("Valores que se repiten en la columna de df1 en la columna de df2:")
print(valores_repetidos)

Valores que se repiten en la columna de df1 en la columna de df2:
Empty DataFrame
Columns: [business_id, category]
Index: []


In [162]:
# Stack the Google and Yelp attribute frames row-wise (both expose 'business_id' and 'category')
df_concat_atr = pd.concat([df_attributes_google, df_attributes_yelp], axis=0)
df_concat_atr.head()

Unnamed: 0,business_id,category
0,0x880e5d4f15f8c9e5:0xa0fc87d1c5e81dd9,"[wi-fi, wheelchair-accessible car park, wheelc..."
1,0x54950a7ba806f4db:0x6bd404e483f67929,"[wheelchair accessible seating, wheelchair acc..."
2,0x89d375b2164f9f05:0xa7a39c4548020acf,"[good for kids, wheelchair accessible seating,..."
3,0x8864883e41f62d9b:0x4cfbc311493fd612,"[restroom, wheelchair accessible parking lot, ..."
4,0x880e318d845e5723:0x73a42c9522ca4b46,"[good for kids, high chairs, toilets, wheelcha..."


In [163]:
# Rename the attribute list column so it does not clash with the category list column when both are merged
df_concat_atr = df_concat_atr.rename(columns={'category': 'category2'})

In [164]:
# Row count of the combined attributes frame
len(df_concat_atr)

55552

***

In [181]:
# Attach the category lists to the base frame; the left join keeps every row of df_main.
# NOTE(review): the recorded output already shows attribute labels inside 'category', which
# suggests the list objects were extended in place by an earlier run of the merge-helper
# cell below (execution counts also jump from 164 to 181) — verify with Restart & Run All.
df_final = pd.merge(df_main, df_concat_category, on='business_id', how='left')
df_final.head()

Unnamed: 0,business_id,name,latitude,longitude,category
0,0x88d8e21af668300b:0xea043d9812296a1f,$,26.348095,-80.084331,"[american restaurant, good for kids]"
1,0x880fbbbb91b4b39f:0x2b65f95cb0a57126,.,42.138498,-87.96294,"[mexican restaurant, takeout, delivery, dine-i..."
2,0x89c25809c94cf79d:0x2a27a2a91276498f,M,40.798193,-74.014895,"[wheelchair accessible entrance, late-night fo..."
3,0x549052beb33efd35:0xaaada7d2d2f2a33e,JW,47.337044,-122.593467,"[american restaurant, delivery]"
4,0x89c25bdf7f35ecaf:0x214868dca2152cbe,PT,40.712621,-73.962286,"[italian restaurant, delivery, bar onsite, res..."


In [182]:
# Attach the attribute lists ('category2') to the frame, again with a left join on business_id
df_final = pd.merge(df_final, df_concat_atr, on='business_id', how='left')
df_final.head()

Unnamed: 0,business_id,name,latitude,longitude,category,category2
0,0x88d8e21af668300b:0xea043d9812296a1f,$,26.348095,-80.084331,"[american restaurant, good for kids]",[good for kids]
1,0x880fbbbb91b4b39f:0x2b65f95cb0a57126,.,42.138498,-87.96294,"[mexican restaurant, takeout, delivery, dine-i...","[takeout, delivery, dine-in, good for kids, hi..."
2,0x89c25809c94cf79d:0x2a27a2a91276498f,M,40.798193,-74.014895,"[wheelchair accessible entrance, late-night fo...","[wheelchair accessible entrance, late-night fo..."
3,0x549052beb33efd35:0xaaada7d2d2f2a33e,JW,47.337044,-122.593467,"[american restaurant, delivery]",[delivery]
4,0x89c25bdf7f35ecaf:0x214868dca2152cbe,PT,40.712621,-73.962286,"[italian restaurant, delivery, bar onsite, res...","[delivery, bar onsite, restroom, dinner, solo ..."


In [183]:
def agregar_palabras_sin_repetir(x, y):
    """Append to ``x`` every item of ``y`` that ``x`` does not already contain.

    Parameters
    ----------
    x : list
        Target list. When it is a list it is extended **in place**, so the
        caller's list object also reflects the additions. ``None`` or a float
        NaN (what a left merge leaves for unmatched keys) is treated as an
        empty list, in which case a new list is created.
    y : iterable
        Items to add, in order. ``None`` or a float NaN is treated as empty.

    Returns
    -------
    list
        ``x`` (or the new list that replaced a missing ``x``) after the additions.
    """
    def _falta(valor):
        # True for None and for float NaN (NaN is the only value not equal to itself)
        return valor is None or (isinstance(valor, float) and valor != valor)

    if _falta(x):
        x = []
    if _falta(y):
        return x

    for palabra in y:
        if palabra not in x:
            x.append(palabra)
    return x

# Merge the attribute labels ('category2') into the category labels, row by row.
# The merged lists are written to 'category' explicitly rather than relying on
# the helper having extended the original list objects as a side effect.
# 'category_x' is still created because a later cell drops it by name.
categorias_unidas = df_final.apply(lambda row: agregar_palabras_sin_repetir(row['category'], row['category2']), axis=1)
df_final['category'] = categorias_unidas
df_final['category_x'] = categorias_unidas
df_final.head()


Unnamed: 0,business_id,name,latitude,longitude,category,category2,category_x
0,0x88d8e21af668300b:0xea043d9812296a1f,$,26.348095,-80.084331,"[american restaurant, good for kids]",[good for kids],"[american restaurant, good for kids]"
1,0x880fbbbb91b4b39f:0x2b65f95cb0a57126,.,42.138498,-87.96294,"[mexican restaurant, takeout, delivery, dine-i...","[takeout, delivery, dine-in, good for kids, hi...","[mexican restaurant, takeout, delivery, dine-i..."
2,0x89c25809c94cf79d:0x2a27a2a91276498f,M,40.798193,-74.014895,"[wheelchair accessible entrance, late-night fo...","[wheelchair accessible entrance, late-night fo...","[wheelchair accessible entrance, late-night fo..."
3,0x549052beb33efd35:0xaaada7d2d2f2a33e,JW,47.337044,-122.593467,"[american restaurant, delivery]",[delivery],"[american restaurant, delivery]"
4,0x89c25bdf7f35ecaf:0x214868dca2152cbe,PT,40.712621,-73.962286,"[italian restaurant, delivery, bar onsite, res...","[delivery, bar onsite, restroom, dinner, solo ...","[italian restaurant, delivery, bar onsite, res..."


In [184]:
# Drop the helper columns; 'category' is the column that is kept.
# NOTE(review): in-place drop; re-running this cell alone raises KeyError because the columns are already gone.
df_final.drop(['category_x','category2'],axis=1,inplace=True)
df_final.head()

Unnamed: 0,business_id,name,latitude,longitude,category
0,0x88d8e21af668300b:0xea043d9812296a1f,$,26.348095,-80.084331,"[american restaurant, good for kids]"
1,0x880fbbbb91b4b39f:0x2b65f95cb0a57126,.,42.138498,-87.96294,"[mexican restaurant, takeout, delivery, dine-i..."
2,0x89c25809c94cf79d:0x2a27a2a91276498f,M,40.798193,-74.014895,"[wheelchair accessible entrance, late-night fo..."
3,0x549052beb33efd35:0xaaada7d2d2f2a33e,JW,47.337044,-122.593467,"[american restaurant, delivery]"
4,0x89c25bdf7f35ecaf:0x214868dca2152cbe,PT,40.712621,-73.962286,"[italian restaurant, delivery, bar onsite, res..."


In [185]:
# Row count of the final frame (the recorded value matches len(df_main), so the left joins added no rows)
len(df_final)

53490

In [186]:
# Build one (latitude, longitude) tuple per row, each value rounded to 7 decimals
df_final['coord'] = list(zip(df_final['latitude'].round(7), df_final['longitude'].round(7)))
# The separate coordinate columns are redundant once 'coord' exists
df_final.drop(columns=['latitude', 'longitude'], inplace=True)
df_final.head()

Unnamed: 0,business_id,name,category,coord
0,0x88d8e21af668300b:0xea043d9812296a1f,$,"[american restaurant, good for kids]","(26.3480954, -80.0843315)"
1,0x880fbbbb91b4b39f:0x2b65f95cb0a57126,.,"[mexican restaurant, takeout, delivery, dine-i...","(42.1384984, -87.9629398)"
2,0x89c25809c94cf79d:0x2a27a2a91276498f,M,"[wheelchair accessible entrance, late-night fo...","(40.7981928, -74.0148954)"
3,0x549052beb33efd35:0xaaada7d2d2f2a33e,JW,"[american restaurant, delivery]","(47.3370444, -122.5934667)"
4,0x89c25bdf7f35ecaf:0x214868dca2152cbe,PT,"[italian restaurant, delivery, bar onsite, res...","(40.7126207, -73.9622863)"


In [187]:
# Persist the final frame as a pickle file (path is relative to the notebook's working directory).
# NOTE(review): the file name ends in 'MLl' — confirm the doubled 'l' is what downstream readers expect.
df_final.to_pickle('../Datasets_ML/Rest_final_MLl.pickle')