In [28]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import math
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Guille\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guille\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
# Cargar un modelo de incrustaciones de palabras pre-entrenado (Word2Vec en este caso)
word2vec_model = api.load('word2vec-google-news-300')

In [36]:
df = pd.read_parquet('./Datasets_ML/Rest_google.parquet')
df.head()

Unnamed: 0,name,address,description,category,MISC,coord
0,Roux's Roadhouse 73,"Roux's Roadhouse 73, 784 WI-73, Nekoosa, WI 54457",,[Restaurant],{'Accessibility': ['Wheelchair accessible entr...,"[44.3069541, -89.8457834]"
1,Crepes n' Tacos Mexican Grill,"Crepes n' Tacos Mexican Grill, suit #5, 5390, ...",,"[Mexican restaurant, Crêperie, Ice cream shop,...","{'Accessibility': None, 'Amenities': ['Good fo...","[36.1646332, -115.0607804]"
2,The Big Eazy,"The Big Eazy, 2053 N Doctor M.L.K. Jr Dr, Milw...",,"[Cajun restaurant, Creole restaurant]","{'Accessibility': None, 'Amenities': ['Good fo...","[43.0576581, -87.914643]"
3,Subway,"Subway, 250 E Wolf Run, Mukwonago, WI 53149",Casual counter-serve chain for build-your-own ...,"[Sandwich shop, Caterer, Fast food restaurant,...","{'Accessibility': None, 'Amenities': None, 'At...","[42.8541679, -88.3142471]"
4,Papa Kelsey's,"Papa Kelsey's, 165 E 1400 N, Logan, UT 84341","Chain eatery serving baked subs, classic pizza...",[Pizza restaurant],{'Accessibility': ['Wheelchair accessible entr...,"[41.7580941, -111.8302904]"


In [39]:
df.rename(columns={'category': 'categories'}, inplace=True)

In [31]:
# Tokenización y limpieza de texto
def preprocess_input(input_text):
    tokens = word_tokenize(input_text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

In [43]:
preprocess_input("I want to eat barbecue meat")

['want', 'eat', 'barbecue', 'meat']

In [32]:
def calculate_distance(user_location, restaurant_location):
    # Extraer las coordenadas de latitud y longitud del usuario y del restaurante
    user_lat, user_lon = user_location
    restaurant_lat, restaurant_lon = restaurant_location

    # Radio de la Tierra en kilómetros
    R = 6371.0

    # Convertir latitud y longitud de grados a radianes
    user_lat = math.radians(user_lat)
    user_lon = math.radians(user_lon)
    restaurant_lat = math.radians(restaurant_lat)
    restaurant_lon = math.radians(restaurant_lon)

    # Diferencia en latitud y longitud
    dlon = restaurant_lon - user_lon
    dlat = restaurant_lat - user_lat

    # Fórmula de Haversine para calcular la distancia
    a = math.sin(dlat / 2)**2 + math.cos(user_lat) * math.cos(restaurant_lat) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distancia en kilómetros
    distance = R * c

    return distance

In [33]:
def recommend_restaurants(input_text, df, word2vec_model, user_location):
    input_keywords = preprocess_input(input_text)

    # Obtener los índices de las palabras que existen en el modelo
    existing_word_indices = [word2vec_model.key_to_index[word] for word in input_keywords if word in word2vec_model.key_to_index]

    # Obtener los vectores para las palabras que existen
    keyword_vectors = [word2vec_model.vectors[index] for index in existing_word_indices]

    # Calcular el vector promedio de las palabras clave de entrada
    if keyword_vectors:
        input_vector = np.mean(keyword_vectors, axis=0)

        # Calcular la similitud de coseno entre la entrada y las categorías de los restaurantes
        similarities = []
        for index, row in df.iterrows():
            category_words = [word for word in row['category'] if word in word2vec_model.key_to_index]
            if category_words:
                category_vector = np.mean([word2vec_model[word] for word in category_words], axis=0)
                similarity = cosine_similarity(input_vector.reshape(1, -1), category_vector.reshape(1, -1))[0][0]
                similarities.append(similarity)
            else:
                similarities.append(np.nan)
        
        # Actualizar la columna de similitud
        df['similarity'] = similarities

        # Calcular la distancia desde la ubicación del usuario
        df['distance'] = df['coord'].apply(lambda x: calculate_distance(user_location, x))

        # Ponderar similitud y distancia para obtener una puntuación final
        df['score'] = df['similarity'] + 0.5 * (1 - df['distance'])

        # Ordenar por puntuación
        recommended_restaurants = df.sort_values(by='score', ascending=False).head(5)
        return recommended_restaurants[['name', 'category', 'coord']]
    else:
        return "No se encontraron palabras clave válidas en la entrada."



In [34]:
# Ejemplo de cómo llamar a la función
input_text = "I want to eat barbecue meat"
user_location = 'UbicacionUsuario'  # Reemplaza con la ubicación real del usuario
recommended_restaurants = recommend_restaurants(input_text, df, word2vec_model, (44.3069541, -89.8457834))
recommended_restaurants

Unnamed: 0,name,category,coord
0,Roux's Roadhouse 73,[Restaurant],"[44.3069541, -89.8457834]"
641,Subway,"[Sandwich shop, Caterer, Fast food restaurant,...","[44.3062594, -89.810468]"
67,Jabberjawz Bar And Dining,"[Bar, Bar & grill, Pizza restaurant, Volleybal...","[44.3443605, -89.9069743]"
2263,Baker Street Grill,"[Bar & grill, Fine dining restaurant, Restaurant]","[44.3930556, -89.8052778]"
356,Loyal-Phant Market,"[Grocery store, Asian grocery store, Asian res...","[44.4077647, -89.7883925]"


***