In [2]:
import gspread
import time 
from datetime import datetime
import re
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [56]:
class ScrapearYelp:

    def __init__(self) -> None:
        
        self.driver = webdriver.Chrome()

    # Funcion para encontrar el business id
    def get_business_id (self, business):
        try:
            # Encontrar el elemento por su selector CSS
            element = business.driver.find_element(By.CSS_SELECTOR, 'h3.y-css-hcgwj4 > a.y-css-12ly5yx')

            # Obtener el atributo href del elemento
            href = element.get_attribute('href')
            
            # Usar una expresión regular para encontrar el valor de 'ad_business_id'
            match = re.search(r'ad_business_id=([^&]+)', href)

            if match:
                ad_business_id = match.group(1)
            else:
                ad_business_id = href.split('/biz/')[-1].split('?')[0]
            
            return ad_business_id
                
        except:

            return None
        
    # Funcion para encontrar el business name
    def get_business_name (self):

        try:
            # Por medio del Xpath halla el nombre del business y extrae el texto
            name = self.driver.find_element(By.XPATH, '/html/body/yelp-react-root/div[1]/div[4]/div[1]/div[1]/div/div/div[1]/h1').text
            return name
        except:
            return None
        
    # Funcion para extraer la ciudad
    def get_city (self):
        # Creamos una lista con las ciudades que vamos a usar
        ciudades = ['Orlando', 'Miami','Tampa','Clearwater', 'Saint Petesburg', 'Brandon', 'Largo', 'Palm Harbor', 'Dunedin', 'Pinellas Park']

        try:
            # Encontrar el elemento por su selector CSS
            element = self.driver.find_element(By.XPATH, '//*[@id="location-and-hours"]/section/div[2]/div[1]/div/div/div/div[1]/address/p[2]')
            # Obtener el texto del elemento
            text = element.text

            # Asumimos que la ciudad es la primera posicion
            city = text.split(',')[0]
            # Analisamos si se encuentra dentro de nuestro listado de ciudades ya que la informacion tiene dos posibles rutas en las que se puede encontrar
            if city not in ciudades:
                try:
                     
                    element = self.driver.find_element(By.XPATH, '//*[@id="location-and-hours"]/section/div[2]/div[1]/div/div/div/div[1]/address/p[3]')
                    # Obtener el texto del elemento
                    text = element.text

                    # Asumimos que la ciudad es la primera palabra
                    city = text.split(',')[0]
                except:
                    return None
            return city    
        
        except:
            return None

    # Funcion para obtener el estado   
    def get_state (self):
        # unicamente se retorna Florida por que en este caso unicamente se buscaran en ciudades que estan dentro de este estado
        return 'Florida'
    
    # Funcion para extraer la latitud y longitud
    def get_coord (self):
        try:
            # Encontrar el elemento por su selector CSS
            element = self.driver.find_element(By.CSS_SELECTOR, 'div.container__09f24__fZQnf.y-css-9q7a37 > img')

            # Obtener el atributo src del elemento
            src = element.get_attribute('src')

            # Usar una expresión regular para encontrar los valores de latitud y longitud en el parámetro 'center'
            match = re.search(r'center=([-+]?[0-9]*\.?[0-9]+)%2C([-+]?[0-9]*\.?[0-9]+)', src)

            # Hacemos el match y devolvemos como flotantes en dos variables distintas
            if match:
                latitude = match.group(1)
                longitude = match.group(2)

                return float(latitude), float(longitude)
            
        except:
            return None, None
        
    # Funcion para extraer el rating promedio de el business
    def get_stars (self):
        try:
            # Por medio del Xpath encuentra el dato y toma el texto
            stars = self.driver.find_element(By.XPATH, '/html/body/yelp-react-root/div[1]/div[4]/div[1]/div[1]/div/div/div[2]/div[2]/span[1]').text
            # Se transforma a formato float
            return float(stars)
        
        except:
            return None
        
    # Funcion oara extraer la cantidad de reviews por negocio
    def get_reviews_count (self):
        try:
            # Usa el Xpath para extraer el texto y luego usando 'Replace', se elimina lo que no es numerico
            reviews = self.driver.find_element(By.XPATH, '/html/body/yelp-react-root/div[1]/div[4]/div[1]/div[1]/div/div/div[2]/div[2]/span[2]').text
            reviews = reviews.replace('(', '').replace(' reviews)', '').replace(',','')
            # Se retorna en formato int
            return int(reviews)
        
        except:
            return None

    # Funcion para extraer la fecha de la reseña      
    def get_date (self, reseña):
        try:
            # Espera hasta que el elemento con la clase 'y-css-wfbtsu' esté presente
            element = reseña.find_element(By.CLASS_NAME, 'y-css-wfbtsu')

            # Extrae el texto del elemento
            date_text = element.text

            # Convierte el texto a un objeto datetime
            date_format = "%b %d, %Y"  # Formato de fecha en inglés: 'Jun 27, 2024'
            date_object = datetime.strptime(date_text, date_format)

            return date_object

        except:
            return None
    
    # Funcion para hacer el Scraping
    def scraping (self, link):

        #try:
        # Abrimos el link
        self.driver.get(link)

        time.sleep(3)
        # Hallamos el cuadro de texto para buscar la categoria que necesitamos
        category = self.driver.find_element(By.XPATH, '//*[@id="search_description"]')
        # Limpiamos y llenamos el espacio con la info necesaria
        category.clear()
        category.click()
        category.send_keys('Mexican Food')
        time.sleep(1)
        # Buscamos el recuadro para buscar la ubicacion deseada
        ciudad = self.driver.find_element(By.XPATH, '//*[@id="search_location"]')
        # Limpiamos y llenamos con la informacion necesaria
        ciudad.clear()
        ciudad.click()
        ciudad.send_keys('Orlando, FL, EEUU')
        time.sleep(1)
        # Hacemos Click en el boton de buscar
        buscar = self.driver.find_element(By.XPATH, '//*[@id="header_find_form"]/div[3]/button')

        buscar.click()

        siguiente = True

        while siguiente == True:

            # Listas de las columnas para el DF de sitios
            business_id = []
            name = []
            city = []
            state = []
            latitude = []
            longitude = []
            stars = []
            reviews = []

            # Listas para el DF de reviews

            user_id_review = []
            business_id2 = []
            stars_review = []
            date_review = []

            sitios_df = pd.read_parquet('../../Data/Parquet/Sitios_nuevo_yelp.parquet')
            reviews_df = pd.read_parquet('../../Data/Parquet/Reviews_nuevo_yelp.parquet')

            # Hacemos una lista de los restaurantes en la pagina actual
            restaurantes = self.driver.find_elements(By.CSS_SELECTOR, 'div.y-css-1he6azc[data-testid="serp-ia-card"]')
            cantidad = len(restaurantes)
            try:
                for i in range(2, cantidad-1):

                    restaurante = restaurantes[i]

                    element = restaurante.find_element(By.CSS_SELECTOR, 'h3.y-css-hcgwj4 > a.y-css-12ly5yx')

                    # Obtener el atributo href del elemento
                    href = element.get_attribute('href')
                    
                    # Usar una expresión regular para encontrar el valor de 'ad_business_id'
                    match = re.search(r'ad_business_id=([^&]+)', href)

                    if match:
                        idrestaurant = match.group(1)
                    else:
                        idrestaurant = href.split('/biz/')[-1].split('?')[0]

                    if idrestaurant not in sitios_df['business_id'].values:

                        restaurante2 = restaurante.find_element(By.CLASS_NAME, 'y-css-hcgwj4')
                        restaurante2.click()
                        time.sleep(4)

                        self.driver.switch_to.window(self.driver.window_handles[1])

                        try:
                            baner = self.driver.find_element(By.XPATH, '//*[@id="modal-portal-container"]/div[1]/div/div/div')
                            cerrar = baner.find_element(By.XPATH, '//*[@id="modal-portal-container"]/div[1]/div/div/div/div[1]/button/span')
                            cerrar.click()
                            time.sleep(2)
                        except:
                            None
                        
                        name_tmp = self.get_business_name()
                        city_tmp = self.get_city()
                        state_tmp = self.get_state()
                        latitud_tmp, longitud_tmp = self.get_coord()
                        rate_tmp = self.get_stars()
                        reviews_tmp = self.get_reviews_count()

                        business_id.append(idrestaurant)
                        name.append(name_tmp)
                        city.append(city_tmp)
                        state.append(state_tmp)
                        latitude.append(latitud_tmp)
                        longitude.append(longitud_tmp)
                        stars.append(rate_tmp)
                        reviews.append(reviews_tmp)

                        time.sleep(5)
                        lista_reseñas = self.driver.find_element(By.XPATH, '//*[@id="reviews"]/section/div[2]/ul')
                        reseñas = lista_reseñas.find_elements(By.CLASS_NAME, 'y-css-1jp2syp')
                        cantidad_reseñas = len(reseñas)

                        for i in range(0,cantidad_reseñas):
                            # Por medio de exprecion regular extraemos la fecha del review
                            date = reseñas[i].find_element(By.CLASS_NAME, 'y-css-wfbtsu').text
                            date_format = "%b %d, %Y"
                            date_object = datetime.strptime(date, date_format)
                            date_object = date_object.date()
                            # Extraemos por medio de un atributo la calificacion de la review
                            calificacion = reseñas[i].find_element(By.CLASS_NAME, 'y-css-9tnml4')
                            calificacion = calificacion.get_attribute('aria-label')
                            star_rating = int(calificacion.split(' ')[0])
                            # Extraemos el user ID
                            userid = reseñas[i].find_element(By.CLASS_NAME, 'y-css-12ly5yx')
                            userid2 = userid.get_attribute('href')
                            user_id = userid2.split('userid=')[1]

                            user_id_review.append(user_id)
                            business_id2.append(idrestaurant)
                            stars_review.append(star_rating)
                            date_review.append(date_object)

                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                        time.sleep(1)

            except:
                for i in range(3, cantidad-1):

                    restaurante = restaurantes[i]
                    idrestaurant = self.get_business_id()
                    if idrestaurant not in sitios_df['business_id'].values:
                        restaurante2 = restaurante.find_element(By.CLASS_NAME, 'y-css-hcgwj4')
                        restaurante2.click()
                        restaurante2.click()
                        time.sleep(4)

                        self.driver.switch_to.window(self.driver.window_handles[1])
                        
                        name_tmp = self.get_business_name()
                        city_tmp = self.get_city()
                        state_tmp = self.get_state()
                        latitud_tmp, longitud_tmp = self.get_coord()
                        rate_tmp = self.get_stars()
                        reviews_tmp = self.get_reviews_count()

                        business_id.append(idrestaurant)
                        name.append(name_tmp)
                        city.append(city_tmp)
                        state.append(state_tmp)
                        latitude.append(latitud_tmp)
                        longitude.append(longitud_tmp)
                        stars.append(rate_tmp)
                        reviews.append(reviews_tmp)

                        time.sleep(5)
                        lista_reseñas = self.driver.find_element(By.XPATH, '//*[@id="reviews"]/section/div[2]/ul')
                        reseñas = lista_reseñas.find_elements(By.CLASS_NAME, 'y-css-1jp2syp')
                        cantidad_reseñas = len(reseñas)

                        for i in range(0,cantidad_reseñas):
                            # Por medio de exprecion regular extraemos la fecha del review
                            date = reseñas[i].find_element(By.CLASS_NAME, 'y-css-wfbtsu').text
                            date_format = "%b %d, %Y"
                            date_object = datetime.strptime(date, date_format)
                            date_object = date_object.date()
                            # Extraemos por medio de un atributo la calificacion de la review
                            calificacion = reseñas[i].find_element(By.CLASS_NAME, 'y-css-9tnml4')
                            calificacion = calificacion.get_attribute('aria-label')
                            star_rating = int(calificacion.split(' ')[0])
                            # Extraemos el user ID
                            userid = reseñas[i].find_element(By.CLASS_NAME, 'y-css-12ly5yx')
                            userid2 = userid.get_attribute('href')
                            user_id = userid2.split('userid=')[1]

                            user_id_review.append(user_id)
                            business_id2.append(idrestaurant)
                            stars_review.append(star_rating)
                            date_review.append(date_object)

                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                        time.sleep(1)
            finally:
                sitios_extraido = pd.DataFrame({'business_id': business_id,
                                'name': name,
                                'city':city,
                                'state':state,
                                'latitude':latitude,
                                'longitude':longitude,
                                'stars': stars,
                                'reviews':reviews})
                sitios_cocatenados = sitios_extraido, sitios_df
                sitios_ultimo = pd.concat(sitios_cocatenados)
                
                reviews_extraido = pd.DataFrame({'user_id': user_id_review,
                                        'business_id':business_id2,
                                        'stars_review': stars_review,
                                        'date': date_review})
                reviews_concatenados = reviews_extraido, reviews_df
                reviews_ultimo = pd.concat(reviews_concatenados)
                
                sitios_ultimo.to_parquet('../../Data/Parquet/Sitios_nuevo_yelp.parquet')
                reviews_ultimo.to_parquet('../../Data/Parquet/Reviews_nuevo_yelp.parquet')
                
                try:
                    siguiente_pagina = self.driver.find_element(By.XPATH, '//*[@id="main-content"]/ul/li[22]/div/div/div[11]')
                    siguiente_pagina.click()
                    time.sleep(7)
                except:
                    siguiente = False  

        #except:
         #   print('Se produjo un error en la extraccion de datos')


In [67]:
link = 'https://www.yelp.com/'
scraping = ScrapearYelp()
scraping.scraping(link)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="search_description"]"}
  (Session info: chrome=126.0.6478.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7D97C3E32+31618]
	(No symbol) [0x00007FF7D973B099]
	(No symbol) [0x00007FF7D95F888A]
	(No symbol) [0x00007FF7D9648524]
	(No symbol) [0x00007FF7D964862C]
	(No symbol) [0x00007FF7D968F787]
	(No symbol) [0x00007FF7D966D14F]
	(No symbol) [0x00007FF7D968CA80]
	(No symbol) [0x00007FF7D966CEB3]
	(No symbol) [0x00007FF7D963A46B]
	(No symbol) [0x00007FF7D963B001]
	GetHandleVerifier [0x00007FF7D9AC9FFD+3202381]
	GetHandleVerifier [0x00007FF7D9B16A1D+3516269]
	GetHandleVerifier [0x00007FF7D9B0C490+3473888]
	GetHandleVerifier [0x00007FF7D9875D36+760454]
	(No symbol) [0x00007FF7D9746B3F]
	(No symbol) [0x00007FF7D9741CD4]
	(No symbol) [0x00007FF7D9741E62]
	(No symbol) [0x00007FF7D973120F]
	BaseThreadInitThunk [0x00007FFAA2F0257D+29]
	RtlUserThreadStart [0x00007FFAA3CEAF28+40]


In [16]:
ciudades = ['Orlando, FL, EEUU',
'Miami, FL, EEUU',
'Tampa, FL, EEUU',
'Clearwater, FL, EEUU',
'Saint Petesburg, FL, EEUU',
'Brandon, FL, EEUU',
'Largo, FL, EEUU',
'Palm Harbor, FL, EEUU',
'Dunedin, FL, EEUU',
'Pinellas Park, FL, EEUU']
categoria = 'Mexican'

In [20]:
driver = webdriver.Chrome()
link = 'https://www.yelp.com/'
driver.get(link)

time.sleep(2)
category = driver.find_element(By.XPATH, '//*[@id="search_description"]')

category.clear()
category.click()
category.send_keys('Mexican Food')
time.sleep(1)

ciudad = driver.find_element(By.XPATH, '//*[@id="search_location"]')

ciudad.clear()
ciudad.click()
ciudad.send_keys(ciudades[0])
time.sleep(1)

buscar = driver.find_element(By.XPATH, '//*[@id="header_find_form"]/div[3]/button')

buscar.click()

restaurantes = driver.find_elements(By.CLASS_NAME, 'y-css-cxcdjj')
restaurante = restaurantes[3]
# Encontrar el elemento por su selector CSS
element = restaurante.find_element(By.CSS_SELECTOR, 'h3.y-css-hcgwj4 > a.y-css-12ly5yx')

# Obtener el atributo href del elemento
href = element.get_attribute('href')

# Usar una expresión regular para encontrar el valor de 'ad_business_id'
match = re.search(r'ad_business_id=([^&]+)', href)

if match:
    ad_business_id = match.group(1)
    print(ad_business_id)
else:
    ad_business_id = href.split('/biz/')[-1].split('?')[0]
    print(ad_business_id)
    
siguiente_pagina = driver.find_element(By.XPATH, '//*[@id="main-content"]/ul/li[21]/div/button')  
siguiente_pagina.click()
#driver.close()



el-burro-loco-orlando-2


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="main-content"]/ul/li[21]/div/button"}
  (Session info: chrome=126.0.6478.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7D97C3E32+31618]
	(No symbol) [0x00007FF7D973B099]
	(No symbol) [0x00007FF7D95F888A]
	(No symbol) [0x00007FF7D9648524]
	(No symbol) [0x00007FF7D964862C]
	(No symbol) [0x00007FF7D968F787]
	(No symbol) [0x00007FF7D966D14F]
	(No symbol) [0x00007FF7D968CA80]
	(No symbol) [0x00007FF7D966CEB3]
	(No symbol) [0x00007FF7D963A46B]
	(No symbol) [0x00007FF7D963B001]
	GetHandleVerifier [0x00007FF7D9AC9FFD+3202381]
	GetHandleVerifier [0x00007FF7D9B16A1D+3516269]
	GetHandleVerifier [0x00007FF7D9B0C490+3473888]
	GetHandleVerifier [0x00007FF7D9875D36+760454]
	(No symbol) [0x00007FF7D9746B3F]
	(No symbol) [0x00007FF7D9741CD4]
	(No symbol) [0x00007FF7D9741E62]
	(No symbol) [0x00007FF7D973120F]
	BaseThreadInitThunk [0x00007FFAA2F0257D+29]
	RtlUserThreadStart [0x00007FFAA3CEAF28+40]


In [17]:
restaurantes

[<selenium.webdriver.remote.webelement.WebElement (session="c9186dfac09027afac1a3bf8d77370cb", element="f.A5F08F6424A779D0D78EEDD2869760B5.d.00DFEF65E483DE847F955DBB1C6B26C3.e.156")>,
 <selenium.webdriver.remote.webelement.WebElement (session="c9186dfac09027afac1a3bf8d77370cb", element="f.A5F08F6424A779D0D78EEDD2869760B5.d.00DFEF65E483DE847F955DBB1C6B26C3.e.157")>,
 <selenium.webdriver.remote.webelement.WebElement (session="c9186dfac09027afac1a3bf8d77370cb", element="f.A5F08F6424A779D0D78EEDD2869760B5.d.00DFEF65E483DE847F955DBB1C6B26C3.e.158")>,
 <selenium.webdriver.remote.webelement.WebElement (session="c9186dfac09027afac1a3bf8d77370cb", element="f.A5F08F6424A779D0D78EEDD2869760B5.d.00DFEF65E483DE847F955DBB1C6B26C3.e.159")>,
 <selenium.webdriver.remote.webelement.WebElement (session="c9186dfac09027afac1a3bf8d77370cb", element="f.A5F08F6424A779D0D78EEDD2869760B5.d.00DFEF65E483DE847F955DBB1C6B26C3.e.160")>,
 <selenium.webdriver.remote.webelement.WebElement (session="c9186dfac09027afac1a

In [38]:
restaurantes[3].click()

In [110]:
name = driver.find_element(By.XPATH, '/html/body/yelp-react-root/div[1]/div[4]/div[1]/div[1]/div/div/div[1]/h1').text

name

'El Patron Restaurante Mexicano'

In [109]:
city = driver.find_element(By.XPATH, '/html/body/yelp-react-root/div[1]/div[4]/div[1]/div[1]/div/div/div[2]/div[2]/span[2]').text
city.replace('(', '').replace(' reviews)', '')

'2,378'

In [35]:
city.replace('(', '').replace(' reviews)', '')

'544'

In [5]:
lista = driver.find_element(By.XPATH, '//*[@id="reviews"]/section/div[2]/ul')

In [14]:
reseñas = lista.find_elements(By.CLASS_NAME, 'y-css-1jp2syp')
reseña = reseñas[1]

In [27]:
userid = reseña.find_element(By.CLASS_NAME, 'y-css-12ly5yx')
userid2 = userid.get_attribute('href')
user_id = userid2.split('userid=')[1]
print(user_id)

BKSwbMBKA6rOuPPmRbBZCg


In [28]:
resultados = driver.find_element(By.CLASS_NAME, 'y-css-16jkyte')
restaurantes = driver.find_elements(By.CSS_SELECTOR, 'div.y-css-1he6azc[data-testid="serp-ia-card"]')

In [29]:
restaurantes

[]

In [136]:
driver.switch_to.window(driver.window_handles[0])

In [8]:
sitios = pd.read_parquet('../../Data/Parquet/Sitios_nuevo_yelp.parquet')
sitios

Unnamed: 0,business_id,name,city,state,latitude,longitude,stars,reviews
0,talkin-tacos-orlando-orlando-2,Talkin' Tacos SoDo,Orlando,Florida,28.509031,-81.375099,4.4,132
1,el-potro-orlando,El Potro,Orlando,Florida,28.494208,-81.46127,3.9,267
2,catrinas-mexican-fusion-orlando,Catrinas Mexican Fusion,Orlando,Florida,28.558537,-81.310429,4.4,71
3,cilantros-taqueria-orlando-3,Cilantro's Taqueria,Orlando,Florida,28.524506,-81.351408,4.4,293
4,tacos-el-rancho-orlando,Tacos El Rancho,Orlando,Florida,28.481712,-81.330285,3.8,190
5,torchys-tacos-orlando,Torchy's Tacos,Orlando,Florida,28.394608,-81.48372,4.3,27
6,antojitos-authentic-mexican-food-orlando,Antojitos Authentic Mexican Food,Orlando,Florida,28.472188,-81.465734,3.3,1861
7,las-cazuelas-orlando,Las Cazuelas,Orlando,Florida,28.494098,-81.331272,4.5,131
8,the-mexican-camel-orlando,The Mexican Camel,Orlando,Florida,28.461396,-81.458011,4.6,794
9,tortas-el-rey-orlando,Tortas El Rey,Orlando,Florida,28.467363,-81.396391,4.2,423


In [5]:
sitios.dropna(inplace = True)

sitios['reviews'] = sitios['reviews'].astype(int)

In [6]:
sitios

Unnamed: 0,business_id,name,city,state,latitude,longitude,stars,reviews
0,talkin-tacos-orlando-orlando-2,Talkin' Tacos SoDo,Orlando,Florida,28.509031,-81.375099,4.4,132
1,el-potro-orlando,El Potro,Orlando,Florida,28.494208,-81.46127,3.9,267
2,catrinas-mexican-fusion-orlando,Catrinas Mexican Fusion,Orlando,Florida,28.558537,-81.310429,4.4,71
3,cilantros-taqueria-orlando-3,Cilantro's Taqueria,Orlando,Florida,28.524506,-81.351408,4.4,293
4,tacos-el-rancho-orlando,Tacos El Rancho,Orlando,Florida,28.481712,-81.330285,3.8,190
6,torchys-tacos-orlando,Torchy's Tacos,Orlando,Florida,28.394608,-81.48372,4.3,27
7,antojitos-authentic-mexican-food-orlando,Antojitos Authentic Mexican Food,Orlando,Florida,28.472188,-81.465734,3.3,1861
8,las-cazuelas-orlando,Las Cazuelas,Orlando,Florida,28.494098,-81.331272,4.5,131
9,the-mexican-camel-orlando,The Mexican Camel,Orlando,Florida,28.461396,-81.458011,4.6,794
10,tortas-el-rey-orlando,Tortas El Rey,Orlando,Florida,28.467363,-81.396391,4.2,423


In [7]:
sitios.to_parquet('../../Data/Parquet/Sitios_nuevo_yelp.parquet', index=False)

In [15]:
reviews = pd.read_parquet('../../Data/Parquet/Reviews_nuevo_yelp.parquet')
reviews

Unnamed: 0,user_id,business_id,stars_review,date
0,qRUj9Bz80KvfeR6ivOsJaA,talkin-tacos-orlando-orlando-2,5.0,2024-06-22
1,2vxZPEQYKHz0-zXQlLGDJw,talkin-tacos-orlando-orlando-2,4.0,2024-06-26
2,Mv7HN5mxDAQNv1g-xpi8bw,talkin-tacos-orlando-orlando-2,5.0,2024-05-27
3,ucybK-7qH0WenKPHor3stQ,talkin-tacos-orlando-orlando-2,3.0,2024-05-06
4,iTMhjOeJIMXD4YqBWI_X2A,talkin-tacos-orlando-orlando-2,5.0,2024-06-10
...,...,...,...,...
246,8gwagXTkoqoBoy_i_W0BsQ,solita-tacos-and-margaritas-orlando-3,5.0,2024-01-27
247,95UzXqjjS8K_mpI-i1Gi_w,solita-tacos-and-margaritas-orlando-3,4.0,2024-04-14
248,oZP2b7XcDseEWckI5bhsTQ,solita-tacos-and-margaritas-orlando-3,5.0,2024-05-22
249,VDLQnEfd4-0YPyBih-tWuA,solita-tacos-and-margaritas-orlando-3,4.0,2023-11-28


In [10]:
reviews.dropna(inplace=True)
reviews

Unnamed: 0,user_id,business_id,stars_review,date
0,qRUj9Bz80KvfeR6ivOsJaA,talkin-tacos-orlando-orlando-2,5.0,2024-06-22
1,2vxZPEQYKHz0-zXQlLGDJw,talkin-tacos-orlando-orlando-2,4.0,2024-06-26
2,Mv7HN5mxDAQNv1g-xpi8bw,talkin-tacos-orlando-orlando-2,5.0,2024-05-27
3,ucybK-7qH0WenKPHor3stQ,talkin-tacos-orlando-orlando-2,3.0,2024-05-06
4,iTMhjOeJIMXD4YqBWI_X2A,talkin-tacos-orlando-orlando-2,5.0,2024-06-10
...,...,...,...,...
96,8gwagXTkoqoBoy_i_W0BsQ,solita-tacos-and-margaritas-orlando-3,5.0,2024-01-27
97,95UzXqjjS8K_mpI-i1Gi_w,solita-tacos-and-margaritas-orlando-3,4.0,2024-04-14
98,oZP2b7XcDseEWckI5bhsTQ,solita-tacos-and-margaritas-orlando-3,5.0,2024-05-22
99,VDLQnEfd4-0YPyBih-tWuA,solita-tacos-and-margaritas-orlando-3,4.0,2023-11-28


In [19]:
reviews.reset_index(inplace= True)
reviews.rename(columns={'index': 'review_id'}, inplace= True)

In [22]:
reviews['review_id'] = reviews['review_id'].astype(str)

In [25]:
reviews.to_parquet('../../Data/Parquet/Reviews_nuevo_yelp.parquet', index=False)