In [14]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from google.cloud import storage
import os


In [9]:
sites_atributos = pd.read_parquet("../Data/dfgy_attributes.parquet")

In [10]:
# Se observan los atributos.
sites_atributos["attributes"].unique()

array(['RestaurantsDelivery', 'OutdoorSeating',
       'BusinessAcceptsCreditCards', 'BusinessParking', 'BikeParking',
       'RestaurantsPriceRange2', 'RestaurantsTakeOut',
       'ByAppointmentOnly', 'WiFi', 'Alcohol', 'Caters',
       'WheelchairAccessible', 'GoodForKids', 'RestaurantsAttire',
       'RestaurantsReservations', 'Ambience', 'CoatCheck', 'DogsAllowed',
       'RestaurantsTableService', 'RestaurantsGoodForGroups', 'HasTV',
       'HappyHour', 'DriveThru', 'NoiseLevel', 'GoodForMeal',
       'BusinessAcceptsBitcoin', 'Smoking', 'Music', 'GoodForDancing',
       'BestNights', 'BYOB', 'Corkage', 'BYOBCorkage', 'AcceptsInsurance',
       'RestaurantsCounterService', 'Open24Hours', 'AgesAllowed',
       'DietaryRestrictions', 'HairSpecializesIn', 'Good for kids',
       'Restroom', 'Bar onsite', 'Wi-Fi', 'Gender-neutral restroom',
       'Public restroom', 'High chairs', 'Restaurant', 'Golf course',
       'Mechanic', 'Air conditioning', 'Bar on site', 'Toilets',
       'Gen

In [11]:
# Se seleccionan los atributos que van a ser unificados o corregidos a través de un diccionario de mapeo.
reemplazo_atributos = {
    "Accessible entrance":['Wheelchair accessible entrance','Wheelchair-accessible entrance',"WheelchairAccessible"],
    "Accessible elevator":["Wheelchair accessible elevator","Wheelchair-accessible lift"],
    "Accessible seating":['Wheelchair accessible seating','Wheelchair-accessible seating'],
    "Accessible restroom":['Wheelchair accessible restroom','Wheelchair-accessible toilet'],
    "Accessible parking":['Wheelchair accessible parking lot','Wheelchair-accessible car park'],
    "Wifi":['WiFi','Wi-Fi'],
    "Delivery&TakeOut":["RestaurantsDelivery",'RestaurantsTakeOut'],
    "Outdoor Seating":['OutdoorSeating'],
    "Accepts Cards":['BusinessAcceptsCreditCards',"BusinessAcceptsBitcoin"],
    "Parking":["BusinessParking",'BikeParking'],
    "Appointment Only":['ByAppointmentOnly'],
    "Reservations":['RestaurantsReservations'],
    "HappyHour":['HappyHour','BestNights'],
    "BYOB":['BYOB', 'Corkage', 'BYOBCorkage'],
    'Good for kids':['Good for kids','GoodForKids'],
    "Sells Alcohol":['Alcohol','Bar onsite','Bar on site'],
    "Restroom":['Restroom','Gender-neutral restroom','Public restroom','Toilets','Gender-neutral toilets','Public toilet'],
    'Baggage storage':['Baggage storage','CoatCheck']

}

# Se seleccionan los atributos que no proporcionan información útil para el proceso de análisis.
eliminar_atributos = ['Caters','RestaurantsPriceRange2','Ambience','RestaurantsTableService','NoiseLevel',
                      'Music','AcceptsInsurance','AgesAllowed','HairSpecializesIn','High chairs', 'Restaurant','Golf course',
                      'Mechanic','Swimming pool', 'All-inclusive', 'Stadium seating']

In [12]:
# Se invierte el diccionario de mapeo para usarlo en el método "replace".
reverse_mapping = {v: k for k, values in reemplazo_atributos.items() for v in values}

# Se reeemplazan los valores en la columna 'atributos' usando el diccionario invertido
sites_atributos['attributes'] = sites_atributos['attributes'].replace(reverse_mapping)

# Se filtra y eliminan las filas que contienen atributos en eliminar_atributos
sites_atributos = sites_atributos[~sites_atributos['attributes'].str.contains('|'.join(eliminar_atributos))]

In [13]:
# Se observan las categorías limpias.
sites_atributos["attributes"].unique()

array(['Delivery&TakeOut', 'Outdoor Seating', 'Accepts Cards', 'Parking',
       'Appointment Only', 'Wifi', 'Sells Alcohol', 'Accessible entrance',
       'Good for kids', 'Reservations', 'Baggage storage', 'DogsAllowed',
       'HasTV', 'HappyHour', 'DriveThru', 'GoodForMeal', 'Smoking',
       'GoodForDancing', 'BYOB', 'Open24Hours', 'DietaryRestrictions',
       'Restroom', 'Air conditioning', 'Accessible elevator',
       'Accessible seating', 'Accessible restroom', 'Accessible parking',
       'Assisted listening devices'], dtype=object)

In [15]:
# Subida del dataframe a Google Cloud 

# Configura tus credenciales de Google Cloud
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/to/your/credentials.json'

# Función que sube el archivo a Google Cloud Storage
def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Sube un archivo a un bucket de Google Cloud Storage."""
    # Inicializa el cliente de GCS
    storage_client = storage.Client()
    # Obtén el bucket
    bucket = storage_client.bucket(bucket_name)
    # Crea un blob en el bucket
    blob = bucket.blob(destination_blob_name)
    # Sube el archivo al blob
    blob.upload_from_filename(source_file_name)
    print(f"Archivo {source_file_name} subido a {destination_blob_name}.")

# Función que guarda el dataframe en un archivo temporal de parquet y lo sube
def dataframe_to_parquet_and_upload(df, bucket_name, destination_blob_name):
    # Guarda el DataFrame como archivo Parquet
    table = pa.Table.from_pandas(df)
    pq.write_table(table, 'temp.parquet')

    # Sube el archivo Parquet a Google Cloud Storage
    upload_to_gcs(bucket_name, 'temp.parquet', destination_blob_name)

    # Elimina el archivo temporal
    os.remove('temp.parquet')

In [17]:
# Se sube el dataframe de atributos

# Nombre del bucket de GCS y del archivo destino
bucket_name = 'scripts-python-proyecto-henry'
destination_blob_name = 'dfgy_attributes.parquet'

dataframe_to_parquet_and_upload(sites_atributos,bucket_name,destination_blob_name)

Archivo temp.parquet subido a dfgy_attributes.parquet.
