In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import spacy
import torch
from google.cloud import bigquery
import os
import re
from tqdm import tqdm

In [65]:
# Carga de credencial para acceder a googlecloud
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../ETL/credencial.json"
# Crea un instancia de la clase bigquery
client = bigquery.Client()

In [66]:
# Recibe una query y deveulve el resultado en un dataframe
def consulta(query):
    query_job = client.query(query)
    results = query_job.result()
    rows = []
    for row in results:
        rows.append(row)
    if rows:
        df = pd.DataFrame(data=[list(x.values()) for x in rows], columns=list(rows[0].keys()))
        return df

In [67]:
# Imprime los valores únicos globales
def df_unique(df):
    valores_unicos_globales = df[df.columns[1:]].values.flatten()
    valores_unicos_globales = pd.unique(valores_unicos_globales)
    print("Valores únicos globales:")
    print(valores_unicos_globales)

In [68]:
# Recibe un dataframe normalizado y devuelve otro dataframe con las columnas con valores verdaderos en una lista
def norm_bools(df):
    df_filtered = df.iloc[:, 1:].apply(lambda x: df.columns[1:][x == True].tolist(), axis=1)
    df_result = pd.DataFrame({
        "business_id": df["business_id"],
        "category": df_filtered
    })
    return df_result

In [69]:
# Transforma nombres de columna a minúsculas y reemplazar guiones bajos por espacios
def norm_columns(columna):
    columna = columna.lower()
    columna = columna.replace('_', ' ') 
    return columna

In [70]:
# Convierte a minuscualas un texto
def convertir_a_minusculas(valor):
    if isinstance(valor, str):
        return valor.lower()
    else:
        return valor

In [71]:
# Utilizar expresión regular para agregar espacio entre palabras que comienzan con mayúscula
def agregar_espacios_mayuscula(cadena):
    nueva_cadena = re.sub(r'(?<=[a-z])([A-Z])', r' \1', cadena)
    return nueva_cadena.lower()

In [72]:
# Guarda el schema, bases de datos y tablas en varables
db = 'restaurant_staging_dataset'
tb_names = 'restaurants_dim'
tb_geo = 'geographical_data_dim'
tb_cat_google ='category_google_dim'
tb_cat_yelp = 'category_yelp_dim'
tb_atr_google = 'attributes_google_dim'
tb_atr_yelp = 'attributes_yelp_dim'

In [73]:
# Query para googlecloud en sql
query = f"""
    SELECT
            name.business_id,
            name.name,
            geo.latitude,
            geo.longitude,
            geo.address,
            geo.city
    FROM
    `{db}.{tb_names}` AS name
    JOIN
    `{db}.{tb_geo}` AS geo
    ON
    name.business_id = geo.business_id;
    """

df_main = consulta(query)
df_main.head()

Unnamed: 0,business_id,name,latitude,longitude,address,city
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,35.446822,-112.052423,"CWWX+P2 Parks, AZ, USA",Parks
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,32.962694,-111.667508,"X87J+3X Casa Grande, AZ, USA",Casa Grande
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,42.477942,-70.966168,"F2HM+5G Lynn, MA, USA",Lynn
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,34.14657,-77.897356,"44W3+J3 Silver Lake, NC, USA",Silver Lake
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,36.211192,-115.050931,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor


In [74]:
len(df_main)

53490

In [75]:
# Conbina los valores de dos columnas en un array 
def combine_into_array(row):
    return [row['latitude'], row['longitude']]

# Aplica la funcion anterior
df_main['coord'] = df_main.apply(combine_into_array, axis=1)

In [76]:
# Elimina las columnas latitude y longitude
df_main.drop(['latitude', 'longitude'], axis=1, inplace=True)
df_main.head()

Unnamed: 0,business_id,name,address,city,coord
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,"CWWX+P2 Parks, AZ, USA",Parks,"[35.4468219, -112.0524234]"
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,"X87J+3X Casa Grande, AZ, USA",Casa Grande,"[32.9626935, -111.6675081]"
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,"F2HM+5G Lynn, MA, USA",Lynn,"[42.4779416, -70.966168]"
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,"44W3+J3 Silver Lake, NC, USA",Silver Lake,"[34.1465696, -77.8973559]"
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor,"[36.2111922, -115.050931]"


***

In [77]:
# Query para googlecloud en sql
query = f"""
SELECT *
FROM `{db}.{tb_cat_google}`;
"""
df_category_google = consulta(query)

In [78]:
df_category_google.head()

Unnamed: 0,business_id,American_restaurant,Asian_restaurant,Barbecue_restaurant,Breakfast_restaurant,Caribbean_restaurant,Chicken_restaurant,Chicken_wings_restaurant,Chinese_restaurant,Dessert_restaurant,...,Peruvian_restaurant,Pizza_restaurant,Salvadoran_restaurant,Seafood_restaurant,Soul_food_restaurant,Sushi_restaurant,Taco_restaurant,Thai_restaurant,Vegan_restaurant,Vietnamese_restaurant
0,0x40771faa13e3c185:0xec402d308a3cbf48,,,,,,,,,,...,,,,,,,,,,
1,0x4cb555faaafddd59:0x5800723c79470569,,,,,,,,,,...,,,,,,,,,,
2,0x4cc988052259d771:0x7e8ad4ae56f776a,,,,,,,,,,...,,,,,,,,,,
3,0x4d352ae3b0e17efd:0xf0f6d5e6b27a7e4a,,,,,,,,,,...,,,,,,,,,,
4,0x4d4d1c7203ed6cbd:0x2799a2f45daace0c,,,,,,,,,,...,,,,,,,,,,


In [79]:
df_unique(df_category_google)

Valores únicos globales:
[nan  0.  1.]


In [80]:
# Aplicar la función a los nombres de columna
df_category_google.columns = df_category_google.columns.map(norm_columns)
df_category_google.head()

Unnamed: 0,business id,american restaurant,asian restaurant,barbecue restaurant,breakfast restaurant,caribbean restaurant,chicken restaurant,chicken wings restaurant,chinese restaurant,dessert restaurant,...,peruvian restaurant,pizza restaurant,salvadoran restaurant,seafood restaurant,soul food restaurant,sushi restaurant,taco restaurant,thai restaurant,vegan restaurant,vietnamese restaurant
0,0x40771faa13e3c185:0xec402d308a3cbf48,,,,,,,,,,...,,,,,,,,,,
1,0x4cb555faaafddd59:0x5800723c79470569,,,,,,,,,,...,,,,,,,,,,
2,0x4cc988052259d771:0x7e8ad4ae56f776a,,,,,,,,,,...,,,,,,,,,,
3,0x4d352ae3b0e17efd:0xf0f6d5e6b27a7e4a,,,,,,,,,,...,,,,,,,,,,
4,0x4d4d1c7203ed6cbd:0x2799a2f45daace0c,,,,,,,,,,...,,,,,,,,,,


In [81]:
df_category_google = df_category_google.rename(columns={'business id': 'business_id'})

In [82]:
df_category_google = norm_bools(df_category_google)
df_category_google.head()

Unnamed: 0,business_id,category
0,0x40771faa13e3c185:0xec402d308a3cbf48,[]
1,0x4cb555faaafddd59:0x5800723c79470569,[]
2,0x4cc988052259d771:0x7e8ad4ae56f776a,[]
3,0x4d352ae3b0e17efd:0xf0f6d5e6b27a7e4a,[]
4,0x4d4d1c7203ed6cbd:0x2799a2f45daace0c,[]


***

In [83]:
# Query para googlecloud en sql
query = f"""
SELECT *
FROM `{db}.{tb_cat_yelp}`;
"""
df_category_yelp = consulta(query)

In [84]:
df_category_yelp.head()

Unnamed: 0,business_id,pizza,sandwiches,fast food,american traditional,mexican,nightlife,breakfast & brunch,burgers,bars,...,salad,cafes,delis,japanese,sushi bars,barbeque,diners,asian fusion,steakhouses,caterers
0,MTSW4McQd7CbVtyjqoe9mw,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9OG5YkX1g2GReZM0AskizA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tMkwHmWFUEXrC9ZduonpTg,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2xVsWBNFwZOxIOdd9Mwnww,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,wghnIlMb_i5U46HMBGx9ig,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
df_unique(df_category_yelp)

Valores únicos globales:
[0 1]


In [86]:
len(df_category_yelp)

15305

In [87]:
df_category_yelp = norm_bools(df_category_yelp)
df_category_yelp.head()

Unnamed: 0,business_id,category
0,MTSW4McQd7CbVtyjqoe9mw,[coffee & tea]
1,9OG5YkX1g2GReZM0AskizA,[italian]
2,tMkwHmWFUEXrC9ZduonpTg,"[seafood, japanese]"
3,2xVsWBNFwZOxIOdd9Mwnww,[burgers]
4,wghnIlMb_i5U46HMBGx9ig,[chinese]


***

In [88]:
# Query para googlecloud en sql
query = f"""
SELECT *
FROM `{db}.{tb_atr_yelp}`;
"""
df_attributes_yelp = consulta(query)

In [89]:
df_attributes_yelp.head()

Unnamed: 0,business_id,RestaurantsDelivery,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsPriceRange2,RestaurantsTakeOut,Caters,RestaurantsGoodForGroups,NoiseLevel,GoodForKids,RestaurantsReservations,HasTV
0,VblfPP6EwV70ldrztBHA1A,True,True,True,1,True,True,True,2,True,True,True
1,fU05rrHw5TpBvmie20YVkA,True,True,True,1,True,True,True,1,True,True,True
2,IzpyHmR5eXkF2C6CWvG1Nw,True,False,True,1,True,True,True,sin datos,True,True,True
3,GgcDDkFQuCU5puBqq0lImQ,True,True,True,1,True,True,True,1,True,False,True
4,OBTuS99XYFfLgR0eKMAHcg,True,True,True,1,True,True,sin datos,sin datos,True,False,True


In [90]:
df_unique(df_attributes_yelp)

Valores únicos globales:
['True' '1' '2' 'False' 'sin datos' '3' '4']


In [91]:
df_attributes_yelp.columns

Index(['business_id', 'RestaurantsDelivery', 'OutdoorSeating',
       'BusinessAcceptsCreditCards', 'RestaurantsPriceRange2',
       'RestaurantsTakeOut', 'Caters', 'RestaurantsGoodForGroups',
       'NoiseLevel', 'GoodForKids', 'RestaurantsReservations', 'HasTV'],
      dtype='object')

In [92]:
df_attributes_yelp = df_attributes_yelp.rename(columns=agregar_espacios_mayuscula)
df_attributes_yelp.head()

Unnamed: 0,business_id,restaurants delivery,outdoor seating,business accepts credit cards,restaurants price range2,restaurants take out,caters,restaurants good for groups,noise level,good for kids,restaurants reservations,has tv
0,VblfPP6EwV70ldrztBHA1A,True,True,True,1,True,True,True,2,True,True,True
1,fU05rrHw5TpBvmie20YVkA,True,True,True,1,True,True,True,1,True,True,True
2,IzpyHmR5eXkF2C6CWvG1Nw,True,False,True,1,True,True,True,sin datos,True,True,True
3,GgcDDkFQuCU5puBqq0lImQ,True,True,True,1,True,True,True,1,True,False,True
4,OBTuS99XYFfLgR0eKMAHcg,True,True,True,1,True,True,sin datos,sin datos,True,False,True


In [93]:
df_attributes_yelp = df_attributes_yelp.replace('sin datos', None)

In [94]:
df_attributes_yelp.drop(['restaurants price range2','noise level'],axis=1,inplace=True)

In [95]:
df_attributes_yelp.head()

Unnamed: 0,business_id,restaurants delivery,outdoor seating,business accepts credit cards,restaurants take out,caters,restaurants good for groups,good for kids,restaurants reservations,has tv
0,VblfPP6EwV70ldrztBHA1A,True,True,True,True,True,True,True,True,True
1,fU05rrHw5TpBvmie20YVkA,True,True,True,True,True,True,True,True,True
2,IzpyHmR5eXkF2C6CWvG1Nw,True,False,True,True,True,True,True,True,True
3,GgcDDkFQuCU5puBqq0lImQ,True,True,True,True,True,True,True,False,True
4,OBTuS99XYFfLgR0eKMAHcg,True,True,True,True,True,,True,False,True


In [96]:
type(df_attributes_yelp['restaurants delivery'][0])

str

In [97]:
# Transforma valores de texto a booleanos utilizando astype(bool)
df_reemplazado = df_attributes_yelp[df_attributes_yelp.columns[1:]].applymap(lambda x: x.lower() == 'true' if x is not None else None).astype(bool)

In [98]:
# Obtener la primera columna y las columnas restantes
primera_columna = df_attributes_yelp.iloc[:, 0]
columnas_restantes = df_attributes_yelp.iloc[:, 1:]

# Convertir valores de texto a booleanos en las columnas restantes
columnas_restantes_booleanos = columnas_restantes.applymap(lambda x: x.lower() == 'true' if x is not None else None)

# Concatenar la primera columna y las columnas convertidas
df_attributes_yelp = pd.concat([primera_columna, columnas_restantes_booleanos], axis=1)

In [99]:
df_attributes_yelp.head()

Unnamed: 0,business_id,restaurants delivery,outdoor seating,business accepts credit cards,restaurants take out,caters,restaurants good for groups,good for kids,restaurants reservations,has tv
0,VblfPP6EwV70ldrztBHA1A,True,True,True,True,True,True,True,True,True
1,fU05rrHw5TpBvmie20YVkA,True,True,True,True,True,True,True,True,True
2,IzpyHmR5eXkF2C6CWvG1Nw,True,False,True,True,True,True,True,True,True
3,GgcDDkFQuCU5puBqq0lImQ,True,True,True,True,True,True,True,False,True
4,OBTuS99XYFfLgR0eKMAHcg,True,True,True,True,True,,True,False,True


In [100]:
df_attributes_yelp = norm_bools(df_attributes_yelp)
df_attributes_yelp.head()

Unnamed: 0,business_id,category
0,VblfPP6EwV70ldrztBHA1A,"[restaurants delivery, outdoor seating, busine..."
1,fU05rrHw5TpBvmie20YVkA,"[restaurants delivery, outdoor seating, busine..."
2,IzpyHmR5eXkF2C6CWvG1Nw,"[restaurants delivery, business accepts credit..."
3,GgcDDkFQuCU5puBqq0lImQ,"[restaurants delivery, outdoor seating, busine..."
4,OBTuS99XYFfLgR0eKMAHcg,"[restaurants delivery, outdoor seating, busine..."


***

In [101]:
# Query para googlecloud en sql
query = f"""
SELECT *
FROM `{db}.{tb_atr_google}`;
"""
df_attributes_google = consulta(query)

In [102]:
df_attributes_google.head()

Unnamed: 0,business_id,Health_safety,Amenities,From_the_business,Accessibility,Popular_for,Offerings,Dining_options,Atmosphere,Planning,Payments,Highlights
0,0x880e5d4f15f8c9e5:0xa0fc87d1c5e81dd9,,Wi-Fi,,"Wheelchair-accessible car park, Wheelchair-acc...",,,,,Quick visit,"Debit cards, NFC mobile payments, Credit cards","Great dessert, LGBTQ-friendly"
1,0x54950a7ba806f4db:0x6bd404e483f67929,,,,"Wheelchair accessible seating, Wheelchair acce...",,,,,,,
2,0x89d375b2164f9f05:0xa7a39c4548020acf,,Good for kids,,Wheelchair accessible seating,"Lunch, Dinner",,,,,,
3,0x8864883e41f62d9b:0x4cfbc311493fd612,,Restroom,,"Wheelchair accessible parking lot, Wheelchair ...",,Alcohol,,,,,
4,0x880e318d845e5723:0x73a42c9522ca4b46,,"Good for kids, High chairs, Toilets",,"Wheelchair-accessible car park, Wheelchair-acc...",,"Comfort food, Small plates","Lunch, Dinner",,,"Debit cards, NFC mobile payments, Credit cards","LGBTQ-friendly, Transgender safe space"


In [103]:
# Aplicar la función a cada elemento del DataFrame
df_attributes_google = df_attributes_google.applymap(convertir_a_minusculas)
df_attributes_google.head()

Unnamed: 0,business_id,Health_safety,Amenities,From_the_business,Accessibility,Popular_for,Offerings,Dining_options,Atmosphere,Planning,Payments,Highlights
0,0x880e5d4f15f8c9e5:0xa0fc87d1c5e81dd9,,wi-fi,,"wheelchair-accessible car park, wheelchair-acc...",,,,,quick visit,"debit cards, nfc mobile payments, credit cards","great dessert, lgbtq-friendly"
1,0x54950a7ba806f4db:0x6bd404e483f67929,,,,"wheelchair accessible seating, wheelchair acce...",,,,,,,
2,0x89d375b2164f9f05:0xa7a39c4548020acf,,good for kids,,wheelchair accessible seating,"lunch, dinner",,,,,,
3,0x8864883e41f62d9b:0x4cfbc311493fd612,,restroom,,"wheelchair accessible parking lot, wheelchair ...",,alcohol,,,,,
4,0x880e318d845e5723:0x73a42c9522ca4b46,,"good for kids, high chairs, toilets",,"wheelchair-accessible car park, wheelchair-acc...",,"comfort food, small plates","lunch, dinner",,,"debit cards, nfc mobile payments, credit cards","lgbtq-friendly, transgender safe space"


In [104]:
# Crear una nueva columna que contenga una lista de los valores no None de las demás columnas
df_attributes_google['category'] = df_attributes_google.apply(lambda row: [val for val in row[1:] if val is not None], axis=1)

# Seleccionar solo las columnas 'id' y 'lista_valores'
df_attributes_google = df_attributes_google[['business_id', 'category']]
df_attributes_google.head()

Unnamed: 0,business_id,category
0,0x880e5d4f15f8c9e5:0xa0fc87d1c5e81dd9,"[wi-fi, wheelchair-accessible car park, wheelc..."
1,0x54950a7ba806f4db:0x6bd404e483f67929,"[wheelchair accessible seating, wheelchair acc..."
2,0x89d375b2164f9f05:0xa7a39c4548020acf,"[good for kids, wheelchair accessible seating,..."
3,0x8864883e41f62d9b:0x4cfbc311493fd612,"[restroom, wheelchair accessible parking lot, ..."
4,0x880e318d845e5723:0x73a42c9522ca4b46,"[good for kids, high chairs, toilets, wheelcha..."


***

In [105]:
# Encontrar los valores que se repiten en la columna de df1 en la columna de df2
valores_repetidos = df_category_google[df_category_google['business_id'].isin(df_category_yelp['business_id'])]

print("Valores que se repiten en la columna de df1 en la columna de df2:")
print(valores_repetidos)

Valores que se repiten en la columna de df1 en la columna de df2:
Empty DataFrame
Columns: [business_id, category]
Index: []


In [106]:
df_concat_category = pd.concat([df_category_google, df_category_yelp], axis=0)

In [107]:
df_concat_category.head()

Unnamed: 0,business_id,category
0,0x40771faa13e3c185:0xec402d308a3cbf48,[]
1,0x4cb555faaafddd59:0x5800723c79470569,[]
2,0x4cc988052259d771:0x7e8ad4ae56f776a,[]
3,0x4d352ae3b0e17efd:0xf0f6d5e6b27a7e4a,[]
4,0x4d4d1c7203ed6cbd:0x2799a2f45daace0c,[]


In [108]:
len(df_concat_category)

55552

***

In [109]:
# Encontrar los valores que se repiten en la columna de df1 en la columna de df2
valores_repetidos = df_attributes_google[df_attributes_google['business_id'].isin(df_attributes_yelp['business_id'])]

print("Valores que se repiten en la columna de df1 en la columna de df2:")
print(valores_repetidos)

Valores que se repiten en la columna de df1 en la columna de df2:
Empty DataFrame
Columns: [business_id, category]
Index: []


In [110]:
df_concat_atr = pd.concat([df_attributes_google, df_attributes_yelp], axis=0)
df_concat_atr.head()

Unnamed: 0,business_id,category
0,0x880e5d4f15f8c9e5:0xa0fc87d1c5e81dd9,"[wi-fi, wheelchair-accessible car park, wheelc..."
1,0x54950a7ba806f4db:0x6bd404e483f67929,"[wheelchair accessible seating, wheelchair acc..."
2,0x89d375b2164f9f05:0xa7a39c4548020acf,"[good for kids, wheelchair accessible seating,..."
3,0x8864883e41f62d9b:0x4cfbc311493fd612,"[restroom, wheelchair accessible parking lot, ..."
4,0x880e318d845e5723:0x73a42c9522ca4b46,"[good for kids, high chairs, toilets, wheelcha..."


In [111]:
df_concat_atr = df_concat_atr.rename(columns={'category': 'category2'})

In [112]:
len(df_concat_atr)

55552

***

In [113]:
df_final = pd.merge(df_main, df_concat_category, on='business_id', how='left')
df_final.head()

Unnamed: 0,business_id,name,address,city,coord,category
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,"CWWX+P2 Parks, AZ, USA",Parks,"[35.4468219, -112.0524234]",[american restaurant]
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,"X87J+3X Casa Grande, AZ, USA",Casa Grande,"[32.9626935, -111.6675081]",[barbecue restaurant]
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,"F2HM+5G Lynn, MA, USA",Lynn,"[42.4779416, -70.966168]",[latin american restaurant]
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,"44W3+J3 Silver Lake, NC, USA",Silver Lake,"[34.1465696, -77.8973559]",[fast food restaurant]
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor,"[36.2111922, -115.050931]",[hamburger restaurant]


In [114]:
df_final = pd.merge(df_final, df_concat_atr, on='business_id', how='left')
df_final.head()

Unnamed: 0,business_id,name,address,city,coord,category,category2
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,"CWWX+P2 Parks, AZ, USA",Parks,"[35.4468219, -112.0524234]",[american restaurant],"[takeout, delivery, good for kids, wheelchair ..."
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,"X87J+3X Casa Grande, AZ, USA",Casa Grande,"[32.9626935, -111.6675081]",[barbecue restaurant],"[delivery, good for kids, wheelchair accessibl..."
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,"F2HM+5G Lynn, MA, USA",Lynn,"[42.4779416, -70.966168]",[latin american restaurant],[]
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,"44W3+J3 Silver Lake, NC, USA",Silver Lake,"[34.1465696, -77.8973559]",[fast food restaurant],"[delivery, takeout, dine-in, good for kids, wh..."
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor,"[36.2111922, -115.050931]",[hamburger restaurant],"[outdoor seating, takeout, delivery, good for ..."


In [115]:
# Función para agregar palabras de 'y' a 'x' sin repetir
def agregar_palabras_sin_repetir(x, y):
    for palabra in y:
        if palabra not in x:
            x.append(palabra)
    return x

# Aplicar la función para agregar palabras de 'y' a 'x'
df_final['keywords'] = df_final.apply(lambda row: agregar_palabras_sin_repetir(row['category'], row['category2']), axis=1)
df_final.head()


Unnamed: 0,business_id,name,address,city,coord,category,category2,keywords
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,"CWWX+P2 Parks, AZ, USA",Parks,"[35.4468219, -112.0524234]","[american restaurant, takeout, delivery, good ...","[takeout, delivery, good for kids, wheelchair ...","[american restaurant, takeout, delivery, good ..."
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,"X87J+3X Casa Grande, AZ, USA",Casa Grande,"[32.9626935, -111.6675081]","[barbecue restaurant, delivery, good for kids,...","[delivery, good for kids, wheelchair accessibl...","[barbecue restaurant, delivery, good for kids,..."
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,"F2HM+5G Lynn, MA, USA",Lynn,"[42.4779416, -70.966168]",[latin american restaurant],[],[latin american restaurant]
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,"44W3+J3 Silver Lake, NC, USA",Silver Lake,"[34.1465696, -77.8973559]","[fast food restaurant, delivery, takeout, dine...","[delivery, takeout, dine-in, good for kids, wh...","[fast food restaurant, delivery, takeout, dine..."
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor,"[36.2111922, -115.050931]","[hamburger restaurant, outdoor seating, takeou...","[outdoor seating, takeout, delivery, good for ...","[hamburger restaurant, outdoor seating, takeou..."


In [116]:
df_final.drop(['category','category2'],axis=1,inplace=True)
df_final.head()

Unnamed: 0,business_id,name,address,city,coord,keywords
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,"CWWX+P2 Parks, AZ, USA",Parks,"[35.4468219, -112.0524234]","[american restaurant, takeout, delivery, good ..."
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,"X87J+3X Casa Grande, AZ, USA",Casa Grande,"[32.9626935, -111.6675081]","[barbecue restaurant, delivery, good for kids,..."
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,"F2HM+5G Lynn, MA, USA",Lynn,"[42.4779416, -70.966168]",[latin american restaurant]
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,"44W3+J3 Silver Lake, NC, USA",Silver Lake,"[34.1465696, -77.8973559]","[fast food restaurant, delivery, takeout, dine..."
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor,"[36.2111922, -115.050931]","[hamburger restaurant, outdoor seating, takeou..."


In [None]:
def norm_lista(lista):
    # Crear una nueva lista para almacenar las palabras divididas
    lista_palabras_divididas = []

    # Iterar sobre cada palabra en la lista
    for palabra in lista:
        # Verificar si la palabra contiene una coma
        if ',' in palabra:
            # Dividir la palabra en función de la coma y agregar las partes a la lista
            palabras_divididas = palabra.split(',')
            for palabra_div in palabras_divididas:
                lista_palabras_divididas.append(palabra_div.strip())
        else:
            # Si no contiene coma, agregar la palabra sin cambios a la lista
            lista_palabras_divididas.append(palabra.strip())
    return lista_palabras_divididas

In [None]:
df_final['keywords'] = df_final['keywords'].apply(norm_lista)
df_final.head()

In [117]:
len(df_final)

53490

In [118]:
df_final.to_pickle('../Datasets_ML/Rest_completo_sin_ML.pickle')

***

In [2]:
Rest_completo_sin_ML = pd.read_pickle('../Datasets_ML/Rest_completo_sin_ML.pickle')
Rest_completo_sin_ML.head()

Unnamed: 0,business_id,name,address,city,coord,keywords
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,"CWWX+P2 Parks, AZ, USA",Parks,"[35.4468219, -112.0524234]","[american restaurant, takeout, delivery, good ..."
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,"X87J+3X Casa Grande, AZ, USA",Casa Grande,"[32.9626935, -111.6675081]","[barbecue restaurant, delivery, good for kids,..."
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,"F2HM+5G Lynn, MA, USA",Lynn,"[42.4779416, -70.966168]",[latin american restaurant]
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,"44W3+J3 Silver Lake, NC, USA",Silver Lake,"[34.1465696, -77.8973559]","[fast food restaurant, delivery, takeout, dine..."
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor,"[36.2111922, -115.050931]","[hamburger restaurant, outdoor seating, takeou..."


In [3]:
model_name = "en_core_web_md"

def load_model(model_name): 
    try:
        nlp = spacy.load(model_name)
        print(f"Modelo {model_name} cargado exitosamente.")
        return nlp
    except OSError:
        print(f"El modelo {model_name} no está instalado. Descargando e instalando...")
        spacy.cli.download(model_name)
        nlp = spacy.load(model_name)
        return nlp
    
nlp_en = load_model(model_name)

Modelo en_core_web_md cargado exitosamente.


In [4]:
def df_embedding(lista):
    if isinstance(lista, list) and len(lista) > 0:
        embeddings_lista = np.array([nlp_en(palabra).vector for palabra in lista])
        if torch.cuda.is_available() and isinstance(embeddings_lista, torch.Tensor):
            embeddings_lista = embeddings_lista.cuda()

        return embeddings_lista
    else:
        return None

In [5]:
Rest_completo_sin_ML['embedding'] = Rest_completo_sin_ML['keywords'].apply(df_embedding)

In [6]:
Rest_completo_sin_ML.head()

Unnamed: 0,business_id,name,address,city,coord,keywords,embedding
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,"CWWX+P2 Parks, AZ, USA",Parks,"[35.4468219, -112.0524234]","[american restaurant, takeout, delivery, good ...","[[-0.99531996, -1.1494, -2.63405, -0.592645, 4..."
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,"X87J+3X Casa Grande, AZ, USA",Casa Grande,"[32.9626935, -111.6675081]","[barbecue restaurant, delivery, good for kids,...","[[0.21536501, -0.36992, -1.88865, 0.36057, 2.2..."
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,"F2HM+5G Lynn, MA, USA",Lynn,"[42.4779416, -70.966168]",[latin american restaurant],"[[-0.69983, -0.58922666, 0.42566672, -0.123566..."
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,"44W3+J3 Silver Lake, NC, USA",Silver Lake,"[34.1465696, -77.8973559]","[fast food restaurant, delivery, takeout, dine...","[[2.6815202, -1.0777999, -5.9206333, 0.7224250..."
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor,"[36.2111922, -115.050931]","[hamburger restaurant, outdoor seating, takeou...","[[0.94348, -0.88408, -2.66125, 0.18630001, 2.4..."


In [7]:
len(Rest_completo_sin_ML)

53490

In [8]:
Rest_completo_sin_ML.to_pickle('../Datasets_ML/Rest_completo_ML.pickle')

***

In [14]:
Rest_completo_ML = pd.read_pickle('../Datasets_ML/Rest_completo_ML.pickle')
Rest_completo_ML.head()

Unnamed: 0,business_id,name,address,city,coord,keywords,embedding
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,Paleo Brio Healthy Kitchen-Flagstaff,"CWWX+P2 Parks, AZ, USA",Parks,"[35.4468219, -112.0524234]","[american restaurant, takeout, delivery, good ...","[[-0.99531996, -1.1494, -2.63405, -0.592645, 4..."
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,Santan Barbecue & Grille,"X87J+3X Casa Grande, AZ, USA",Casa Grande,"[32.9626935, -111.6675081]","[barbecue restaurant, delivery, good for kids,...","[[0.21536501, -0.36992, -1.88865, 0.36057, 2.2..."
2,0x89e36d1b29148441:0xf577f42878da4838,El Pulgarcito,"F2HM+5G Lynn, MA, USA",Lynn,"[42.4779416, -70.966168]",[latin american restaurant],"[[-0.69983, -0.58922666, 0.42566672, -0.123566..."
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,Subway,"44W3+J3 Silver Lake, NC, USA",Silver Lake,"[34.1465696, -77.8973559]","[fast food restaurant, delivery, takeout, dine...","[[2.6815202, -1.0777999, -5.9206333, 0.7224250..."
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,Champion's Grille at Sunrise Vista,"6W6X+FJ Sunrise Manor, NV, USA",Sunrise Manor,"[36.2111922, -115.050931]","[hamburger restaurant, outdoor seating, takeou...","[[0.94348, -0.88408, -2.66125, 0.18630001, 2.4..."


In [10]:
Rest_completo_ML.drop(['address','name','coord','keywords','city'],axis=1,inplace=True)
Rest_completo_ML.head()


Unnamed: 0,business_id,embedding
0,0x872d8f6b3060731f:0xbab7d25cfd777e60,"[[-0.99531996, -1.1494, -2.63405, -0.592645, 4..."
1,0x872a4991146626e7:0x7d5c5ed7e1a58fa3,"[[0.21536501, -0.36992, -1.88865, 0.36057, 2.2..."
2,0x89e36d1b29148441:0xf577f42878da4838,"[[-0.69983, -0.58922666, 0.42566672, -0.123566..."
3,0x89a9f643802fd7bf:0xd4948847f210b7cf,"[[2.6815202, -1.0777999, -5.9206333, 0.7224250..."
4,0x80c8dcfe21b0ac85:0x1f1ba837d0fb5612,"[[0.94348, -0.88408, -2.66125, 0.18630001, 2.4..."


In [11]:
Rest_completo_ML.to_pickle('../Datasets_ML/Rest_solo_ML.pickle')

In [12]:
len(Rest_completo_ML)

53490

***