In [2]:
!pip install scikit-learn



In [5]:
import pandas as pd
import pandas_gbq
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors

In [6]:
PROJECT_ID: str = "proyectofinal-389001"
pandas_gbq.context.project = PROJECT_ID
pandas_gbq.context.dialect = "standard"

In [7]:
query: str = f"""--sql
SELECT
    *
FROM `{PROJECT_ID}.Google.Metadata`;
"""

gm_meta_df = pd.read_gbq(
    query=query,
    location="us"
)

Filter necessary columns

In [10]:
keep_cols = ["business_id", "avg_rating", "num_of_reviews", "states", "city", "category"]

In [11]:
dfMeta_filtrado = gm_meta_df[keep_cols]

Create a pipeline for numerical variables

In [12]:
numeric_pipe = Pipeline([
    ("scaler", StandardScaler())
])

Create a pipeline to process categorical variables

In [13]:
categorical_pipe = Pipeline([
    ("encoder", OneHotEncoder(drop = "first"))
])

Transform columns according to their type

In [14]:
col_transf = ColumnTransformer([
    ("numeric", numeric_pipe, dfMeta_filtrado._get_numeric_data().columns.tolist()),
    ("categoric", categorical_pipe, dfMeta_filtrado.select_dtypes("object").columns.tolist())
])

In [16]:
dfMeta_filtrado

Unnamed: 0,business_id,avg_rating,num_of_reviews,states,city,category
0,0x80c29bd311c23057:0x27ee44132468b0d,2.0,1,CA,Reseda,mexican restaurant
1,0x809ac645d5622a3d:0xe25d578f4047e0de,2.0,1,CA,Elk Grove,indian restaurant
2,0x80dbff942204d48b:0x2c3718b3a701035b,2.0,1,CA,San Diego,"lounge, bar, restaurant"
3,0x80c1f61f014fe231:0xbfdbef8b48977060,2.0,1,CA,Tehachapi,restaurant
4,0x80db1ba52db1f361:0x29a4af82b1053054,2.0,1,CA,Palm Springs,bar
...,...,...,...,...,...,...
111142,0x864e710688eec113:0xd7aa6e326088c468,1.8,67,TX,Fort Worth,"pizza delivery, delivery restaurant, takeout r..."
111143,0x8640d960088771b3:0xabbd86a1e912a3cd,1.8,4,TX,Houston,"convenience store, coffee shop, diesel fuel su..."
111144,0x8640dceb8a831567:0xcc96a1c7ffa8c73a,1.8,5,TX,Houston,"convenience store, coffee shop, diesel fuel su..."
111145,0x864e899597d177bd:0x145624ed5f3ee3b5,1.8,18,TX,Grand Prairie,"fast food restaurant, breakfast restaurant, bu..."


In [19]:
dfMeta_filtrado = dfMeta_filtrado.fillna(0)

Unnamed: 0,business_id,avg_rating,num_of_reviews,states,city,category
0,0x80c29bd311c23057:0x27ee44132468b0d,2.0,1,CA,Reseda,mexican restaurant
1,0x809ac645d5622a3d:0xe25d578f4047e0de,2.0,1,CA,Elk Grove,indian restaurant
2,0x80dbff942204d48b:0x2c3718b3a701035b,2.0,1,CA,San Diego,"lounge, bar, restaurant"
3,0x80c1f61f014fe231:0xbfdbef8b48977060,2.0,1,CA,Tehachapi,restaurant
4,0x80db1ba52db1f361:0x29a4af82b1053054,2.0,1,CA,Palm Springs,bar
...,...,...,...,...,...,...
111142,0x864e710688eec113:0xd7aa6e326088c468,1.8,67,TX,Fort Worth,"pizza delivery, delivery restaurant, takeout r..."
111143,0x8640d960088771b3:0xabbd86a1e912a3cd,1.8,4,TX,Houston,"convenience store, coffee shop, diesel fuel su..."
111144,0x8640dceb8a831567:0xcc96a1c7ffa8c73a,1.8,5,TX,Houston,"convenience store, coffee shop, diesel fuel su..."
111145,0x864e899597d177bd:0x145624ed5f3ee3b5,1.8,18,TX,Grand Prairie,"fast food restaurant, breakfast restaurant, bu..."


In [20]:
col_transf_fit = col_transf.fit(dfMeta_filtrado)
dfMeta_filtrado_transf = col_transf_fit.transform(dfMeta_filtrado)
dfMeta_filtrado_transf

<111147x137398 sparse matrix of type '<class 'numpy.float64'>'
	with 632735 stored elements in Compressed Sparse Row format>

Clasificación o regresión basada en vecinos cercanos

Algoritmo NearestNeighbors de Scikit-learn
Este parámetro especifica el número de vecinos más cercanos que se utilizarán para encontrar los puntos más cercanos a un punto de consulta

In [22]:
n_neighbors = 5
nneighbors = NearestNeighbors(
    n_neighbors=n_neighbors,
    metric="cosine"
).fit(dfMeta_filtrado_transf)

In [23]:
dif, ind = nneighbors.kneighbors(dfMeta_filtrado_transf[1])

In [29]:
print("Lugares que ha gustado")
print("="*80)
gm_meta_df .loc[ind[0][0], :]
print("Lugares recomendados")
print("="*80)
gm_meta_df .loc[ind[0][1:], :]

Lugares que ha gustado
Lugares recomendados


Unnamed: 0,local_name,business_id,latitude,longitude,category,avg_rating,num_of_reviews,url,states,city,main_category,platform
26973,Persis,0x808fec05a2d956b7:0x8de09b737825b8d4,37.704612,-121.911751,indian restaurant,1.5,4,https://www.google.com/maps/place//data=!4m2!3...,CA,Dublin,food services,google
111100,Amber Moon,0x808f9df11a395555:0xa487d7efd38992e3,37.577187,-122.348591,indian restaurant,1.8,5,https://www.google.com/maps/place//data=!4m2!3...,CA,Burlingame,food services,google
91394,Gunpowder cafe,0x89c6c7f9890be9ed:0xc8115809af9b0599,39.949131,-75.214187,indian restaurant,1.6,8,https://www.google.com/maps/place//data=!4m2!3...,PA,Philadelphia,food services,google
26798,Cafe Belle 2,0x80dbffb014ee5963:0x19cede01485cfed6,32.83408,-117.137015,coffee shop,1.0,1,https://www.google.com/maps/place//data=!4m2!3...,CA,San Diego,food services,google


In [30]:
print("Lugares que ha gustado")
print("="*80)
#gm_meta_df .loc[ind[0][0], :]
print("Lugares recomendados")
print("="*80)
gm_meta_df .loc[ind[0][1:], :]

Lugares que ha gustado
Lugares recomendados


Unnamed: 0,local_name,business_id,latitude,longitude,category,avg_rating,num_of_reviews,url,states,city,main_category,platform
26973,Persis,0x808fec05a2d956b7:0x8de09b737825b8d4,37.704612,-121.911751,indian restaurant,1.5,4,https://www.google.com/maps/place//data=!4m2!3...,CA,Dublin,food services,google
111100,Amber Moon,0x808f9df11a395555:0xa487d7efd38992e3,37.577187,-122.348591,indian restaurant,1.8,5,https://www.google.com/maps/place//data=!4m2!3...,CA,Burlingame,food services,google
91394,Gunpowder cafe,0x89c6c7f9890be9ed:0xc8115809af9b0599,39.949131,-75.214187,indian restaurant,1.6,8,https://www.google.com/maps/place//data=!4m2!3...,PA,Philadelphia,food services,google
26798,Cafe Belle 2,0x80dbffb014ee5963:0x19cede01485cfed6,32.83408,-117.137015,coffee shop,1.0,1,https://www.google.com/maps/place//data=!4m2!3...,CA,San Diego,food services,google
