In [2]:
%pip install gcsfs

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
%pip install fsspec

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Importamos librerías
import pandas as pd
import numpy as np

# Trabajamos con el dataset 'business'

In [2]:

gcs_url = 'gs://data_cruda/Yelp/business.json'

# Leer el archivo JSON en un DataFrame de Pandas
df_business = pd.read_json(gcs_url, lines=True)

In [7]:
df_business.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object

In [3]:
df_business.head


<bound method NDFrame.head of                    business_id                      name  \
0       Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1       mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2       tUFrWirKiKi_TAnsVWINQQ                    Target   
3       MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4       mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   
...                        ...                       ...   
150341  IUQopTMmYQG-qRtBk-8QnA              Binh's Nails   
150342  c8GjPIOTGVmIemT7j5_SyQ      Wild Birds Unlimited   
150343  _QAMST-NrQobXduilWEqSw         Claire's Boutique   
150344  mtGm22y5c2UHNXDFAjaPNw  Cyclery & Fitness Center   
150345  jV_XOycEzSlTx-65W906pg                   Sic Ink   

                                address           city state postal_code  \
0                1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1       87 Grasso Plaza Shopping Center         Affton    MO       63123   
2                  52

## Filtramos por categorías de rubro gastronómico

In [3]:
target_categories = [
'restaurant', 'coffee', 'rice', 'paan', 'ice cream',
    'tortilla', 'tofu', 'pie', 'soup',
    'cheese', 'cupcake', 'pasta', 'cookie', 'chocolate',
    'frozen yogurt', 'salad', 'cake', 'donut',
    'sandwich', 'chicken', 'pizza', 'burguer', 'hot dog'
]
target_categories_lower = [category.lower() for category in target_categories]

# Filtrar los registros que no tienen valores nulos en la columna 'Categories'
df_business_filtered = df_business.dropna(subset=['categories'])

# Filtrar los registros que contienen alguna de las categorías objetivo en la columna "Categories"
df_filteredy = df_business_filtered[df_business_filtered['categories'].str.lower().str.contains('|'.join(target_categories_lower))]

# Mostrar el resultado
print(df_filteredy.shape[0])  # Cantidad de registros filtrados


57190


## Filtramos, dejando solo los tres estados con los que trabajaremos

In [4]:
estados_interes = ['FL', 'PA', 'CA']
df_filtrado = df_filteredy[df_filteredy['state'].isin(estados_interes)]

df_filtrado.shape[0]

24645

Eliminamos duplicados

In [5]:
columns_to_check = [
    'business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude'
]
deduplicated_df = df_filtrado.drop_duplicates(subset=columns_to_check)

original_count = df_filtrado.shape[0]  
deduplicated_count = deduplicated_df.shape[0]

print("Número de filas originales:", original_count)
print("Número de filas después de la deduplicación:", deduplicated_count)

Número de filas originales: 24645
Número de filas después de la deduplicación: 24645


## Creamos un nuevo DataFrame con la columna 'attributes'

In [6]:

df_attributes = deduplicated_df[['business_id']].join(deduplicated_df['attributes'].apply(lambda x: pd.Series(x, dtype='object')))

# Mostrar el DataFrame resultante
df_attributes.head()



Unnamed: 0,business_id,RestaurantsDelivery,OutdoorSeating,BusinessAcceptsCreditCards,BusinessParking,BikeParking,RestaurantsPriceRange2,RestaurantsTakeOut,ByAppointmentOnly,WiFi,...,BestNights,DriveThru,Corkage,BYOBCorkage,Open24Hours,AgesAllowed,AcceptsInsurance,DietaryRestrictions,RestaurantsCounterService,HairSpecializesIn
3,MTSW4McQd7CbVtyjqoe9mw,False,False,False,"{'garage': False, 'street': True, 'validated':...",True,1.0,True,False,u'free',...,,,,,,,,,,
11,eEOYSgkmpB90uNA7lDOMRA,,,,"{'garage': False, 'street': False, 'validated'...",,,,,,...,,,,,,,,,,
14,0bPLkL0QhhPO5kt1_EXmNQ,True,False,True,"{'garage': False, 'street': False, 'validated'...",True,1.0,True,,u'no',...,,,,,,,,,,
15,MUTTqe8uqyMdBl186RmNeA,True,True,True,"{u'valet': False, u'garage': None, u'street': ...",,2.0,True,,'free',...,,,,,,,,,,
19,ROeacJQwBeh05Rqg7F6TCg,,,True,"{'garage': False, 'street': True, 'validated':...",True,1.0,True,,u'no',...,,,,,,,,,,


In [7]:
# Lista de columnas a eliminar
columns_to_drop = ['Ambience', 'BusinessParking', 'Music', 'GoodForMeal', 'AcceptsInsurance', 'AgesAllowed',
                  'BYOBCorkage', 'DietaryRestrictions', 'HairSpecializesIn', 'Open24Hours', 'RestaurantsCounterService', 'BestNights']

# Eliminar las columnas del DataFrame
df_attributes.drop(columns=columns_to_drop, inplace=True)

# Mostrar los tipos de datos resultantes
print(df_attributes.dtypes)


business_id                   object
RestaurantsDelivery           object
OutdoorSeating                object
BusinessAcceptsCreditCards    object
BikeParking                   object
RestaurantsPriceRange2        object
RestaurantsTakeOut            object
ByAppointmentOnly             object
WiFi                          object
Alcohol                       object
Caters                        object
RestaurantsReservations       object
RestaurantsGoodForGroups      object
RestaurantsAttire             object
NoiseLevel                    object
GoodForKids                   object
BusinessAcceptsBitcoin        object
RestaurantsTableService       object
HasTV                         object
WheelchairAccessible          object
DogsAllowed                   object
HappyHour                     object
Smoking                       object
GoodForDancing                object
CoatCheck                     object
BYOB                          object
DriveThru                     object
C

In [8]:
# Arreglamos cadenas unicode
for column in df_attributes.columns:
    if column != 'business_id' and df_attributes[column].dtype == 'object':
        df_attributes[column] = df_attributes[column].apply(lambda x: x.lstrip("u'").lstrip("'").rstrip("'") if isinstance(x, str) else x)


df_attributes.head()

Unnamed: 0,business_id,RestaurantsDelivery,OutdoorSeating,BusinessAcceptsCreditCards,BikeParking,RestaurantsPriceRange2,RestaurantsTakeOut,ByAppointmentOnly,WiFi,Alcohol,...,HasTV,WheelchairAccessible,DogsAllowed,HappyHour,Smoking,GoodForDancing,CoatCheck,BYOB,DriveThru,Corkage
3,MTSW4McQd7CbVtyjqoe9mw,False,False,False,True,1.0,True,False,free,none,...,,,,,,,,,,
11,eEOYSgkmpB90uNA7lDOMRA,,,,,,,,,none,...,,,,,,,,,,
14,0bPLkL0QhhPO5kt1_EXmNQ,True,False,True,True,1.0,True,,no,none,...,True,True,,,,,,,,
15,MUTTqe8uqyMdBl186RmNeA,True,True,True,,2.0,True,,free,full_bar,...,False,True,False,True,,,,,,
19,ROeacJQwBeh05Rqg7F6TCg,,,True,True,1.0,True,,no,none,...,True,,,,,,,,,


In [9]:
def convert_to_int(value):
    if value == 'None':
        return 0
    return int(value)

In [10]:
df_attributes.fillna(0, inplace=True)
columns_to_bool = ['BYOB', 'BikeParking', 'BusinessAcceptsBitcoin', 'BusinessAcceptsCreditCards', 'ByAppointmentOnly', 'Caters', \
                     'CoatCheck', 'Corkage', 'DogsAllowed', 'DriveThru', 'GoodForDancing', 'GoodForKids', 'HappyHour', 'HasTV', \
                     'OutdoorSeating', 'RestaurantsDelivery', 'RestaurantsGoodForGroups', 'RestaurantsReservations', \
                     'RestaurantsTableService', 'RestaurantsTakeOut', 'WheelchairAccessible']
columns_to_int = ['RestaurantsPriceRange2']
columns_to_str = ['business_id', 'Alcohol', 'NoiseLevel', 'RestaurantsAttire', 'Smoking', 'WiFi']

df_attributes[columns_to_bool] = df_attributes[columns_to_bool].astype(str)
df_attributes[columns_to_bool]=np.where(df_attributes[columns_to_bool] == 'True', True, False)
df_attributes[columns_to_int] = df_attributes[columns_to_int].fillna(0)
df_attributes[columns_to_int] = df_attributes[columns_to_int].applymap(convert_to_int)
df_attributes[columns_to_int] = df_attributes[columns_to_int].astype(int)

df_attributes[columns_to_int] = df_attributes[columns_to_int].fillna('',)
df_attributes[columns_to_str] = df_attributes[columns_to_str].astype(str)

df_attributes.dtypes

business_id                   object
RestaurantsDelivery             bool
OutdoorSeating                  bool
BusinessAcceptsCreditCards      bool
BikeParking                     bool
RestaurantsPriceRange2         int64
RestaurantsTakeOut              bool
ByAppointmentOnly               bool
WiFi                          object
Alcohol                       object
Caters                          bool
RestaurantsReservations         bool
RestaurantsGoodForGroups        bool
RestaurantsAttire             object
NoiseLevel                    object
GoodForKids                     bool
BusinessAcceptsBitcoin          bool
RestaurantsTableService         bool
HasTV                           bool
WheelchairAccessible            bool
DogsAllowed                     bool
HappyHour                       bool
Smoking                       object
GoodForDancing                  bool
CoatCheck                       bool
BYOB                            bool
DriveThru                       bool
C

In [11]:
df_attributes.head()

Unnamed: 0,business_id,RestaurantsDelivery,OutdoorSeating,BusinessAcceptsCreditCards,BikeParking,RestaurantsPriceRange2,RestaurantsTakeOut,ByAppointmentOnly,WiFi,Alcohol,...,HasTV,WheelchairAccessible,DogsAllowed,HappyHour,Smoking,GoodForDancing,CoatCheck,BYOB,DriveThru,Corkage
3,MTSW4McQd7CbVtyjqoe9mw,False,False,False,True,1,True,False,free,none,...,False,False,False,False,0,False,False,False,False,False
11,eEOYSgkmpB90uNA7lDOMRA,False,False,False,False,0,False,False,0,none,...,False,False,False,False,0,False,False,False,False,False
14,0bPLkL0QhhPO5kt1_EXmNQ,True,False,True,True,1,True,False,no,none,...,True,True,False,False,0,False,False,False,False,False
15,MUTTqe8uqyMdBl186RmNeA,True,True,True,False,2,True,False,free,full_bar,...,False,True,False,True,0,False,False,False,False,False
19,ROeacJQwBeh05Rqg7F6TCg,False,False,True,True,1,True,False,no,none,...,True,False,False,False,0,False,False,False,False,False


In [12]:
# Guardar como archivo Parquet
parquet_file_path = 'gs://data_limpia/Yelp/attributes.parquet'
df_attributes.to_parquet(parquet_file_path)

## Creamos un nuevo dataset con la columna 'hours'

In [6]:

df_hours = deduplicated_df[['business_id']].join(deduplicated_df['hours'].apply(lambda x: pd.Series(x, dtype='object')))

df_hours.head()

Unnamed: 0,business_id,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
3,MTSW4McQd7CbVtyjqoe9mw,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-21:0
11,eEOYSgkmpB90uNA7lDOMRA,11:0-14:0,11:0-14:0,11:0-14:0,11:0-14:0,11:0-14:0,5:0-10:0,15:0-18:0
14,0bPLkL0QhhPO5kt1_EXmNQ,10:0-18:0,10:0-20:0,10:0-20:0,10:0-20:0,10:0-20:0,10:0-20:0,
15,MUTTqe8uqyMdBl186RmNeA,,13:30-22:0,13:30-22:0,13:30-22:0,13:30-23:0,13:30-23:0,13:30-22:0
19,ROeacJQwBeh05Rqg7F6TCg,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,


In [7]:
df_hours.fillna('', inplace=True)

In [33]:
df_hours.columns

Index(['business_id', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
       'Saturday', 'Sunday'],
      dtype='object')

In [8]:
# Guardamos hours como archivo Parquet
parquet_file_path = 'gs://data_limpia/Yelp/hours.parquet'
df_hours.to_parquet(parquet_file_path)

## Creamos un nuevo dataset con la columna 'Categories' 

In [9]:
flat_categories_list = deduplicated_df['categories'].str.split(', ').explode()

category_counts = flat_categories_list.value_counts()

categories_with_50_or_more_occurrences = category_counts[category_counts >= 100]

selected_categories_df = categories_with_50_or_more_occurrences.reset_index()
selected_categories_df.columns = ['category', 'count']

selected_categories_df['category_id'] = range(1, len(selected_categories_df) + 1)

df_categories = selected_categories_df[['category_id', 'category']]

df_categories.head()


Unnamed: 0,category_id,category
0,1,Restaurants
1,2,Food
2,3,Sandwiches
3,4,Nightlife
4,5,Bars


In [10]:
# Guardamos categories como archivo Parquet
parquet_file_path = 'gs://data_limpia/Yelp/categories.parquet'
df_categories.to_parquet(parquet_file_path)

## Creamos una linked table entre 'business' y 'categories'

In [11]:
dataframes_list = []

for index, row in deduplicated_df.iterrows():
    business_id = row['business_id']
    categories = row['categories'].split(', ')

    for category in categories:
        matching_category = df_categories[df_categories['category'] == category]

        if not matching_category.empty:
            category_id = matching_category['category_id'].values[0]
            dataframes_list.append(pd.DataFrame({'business_id': [business_id], 'category_id': [category_id]}))

df_bus_cat = pd.concat(dataframes_list, ignore_index=True)

df_bus_cat.head()


Unnamed: 0,business_id,category_id
0,MTSW4McQd7CbVtyjqoe9mw,1
1,MTSW4McQd7CbVtyjqoe9mw,2
2,MTSW4McQd7CbVtyjqoe9mw,79
3,MTSW4McQd7CbVtyjqoe9mw,8
4,MTSW4McQd7CbVtyjqoe9mw,24


In [12]:
df_bus_cat.drop_duplicates(inplace=True)

In [13]:
# Guardamos bus_cat como archivo Parquet
parquet_file_path = 'gs://data_limpia/Yelp/bus_cat.parquet'
df_bus_cat.to_parquet(parquet_file_path)

In [14]:
deduplicated_df.drop(columns=['postal_code', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours'], inplace=True)

In [15]:
deduplicated_df.dtypes


business_id     object
name            object
address         object
city            object
state           object
latitude       float64
longitude      float64
dtype: object

In [16]:
# Guardamos business como archivo Parquet
parquet_file_path = 'gs://data_limpia/Yelp/business.parquet'
deduplicated_df.to_parquet(parquet_file_path)