##  EDA (Check-In Yelp)

Dado el tamaño del DataSet hemos optado por trabajar este archivo con Pandas

In [26]:
# Importamos las librerías necesarias
import pandas as pd

In [27]:
# Cargamos el dataset
df = pd.read_parquet('C:\Escritorio\PF\checkin\checkin.parquet')

In [28]:
df

Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012..."
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014..."
...,...,...
131925,zznJox6-nmXlGYNWgTDwQQ,"2013-03-23 16:22:47, 2013-04-07 02:03:12, 2013..."
131926,zznZqH9CiAznbkV6fXyHWA,2021-06-12 01:16:12
131927,zzu6_r3DxBJuXcjnOYVdTw,"2011-05-24 01:35:13, 2012-01-01 23:44:33, 2012..."
131928,zzw66H6hVjXQEt0Js3Mo4A,"2016-12-03 23:33:26, 2018-12-02 19:08:45"


Procesamos el DataFrame para generar nuevas columnas a partir de los valores individuales de la columna 'date'. El objetivo es identificar el año, mes, día y hora más comunes en las marcas de 'checkin' para cada 'business_id' en el conjunto de datos

In [29]:
# Definimos una función que procesará cada fila individualmente
def process_row(row):
    dates = row['date'].split(', ')
    years = [int(date[:4]) for date in dates]
    most_common_year = max(set(years), key=years.count)
    
    # Obtener el mes más común
    months = [int(date[5:7]) for date in dates]
    most_common_month = max(set(months), key=months.count)
    
    # Obtener el día más común
    days = [int(date[8:10]) for date in dates]
    most_common_day = max(set(days), key=days.count)
    
    # Obtener la hora más común
    hours = [int(date[11:13]) for date in dates]
    most_common_hour = max(set(hours), key=hours.count)
    
    return pd.Series([dates, most_common_year, most_common_month, most_common_day, most_common_hour, len(dates)], 
                     index=['date_list', 'most_common_year', 'most_common_month', 'most_common_day', 'most_common_hour', 'total'])

# Aplicamos la función a cada fila
df = df.merge(df.apply(process_row, axis=1), left_index=True, right_index=True)

In [30]:
# Mostramos los resultados
df

Unnamed: 0,business_id,date,date_list,most_common_year,most_common_month,most_common_day,most_common_hour,total
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020...","[2020-03-13 21:10:56, 2020-06-02 22:18:06, 202...",2021,10,2,21,11
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011...","[2010-09-13 21:43:09, 2011-05-04 23:08:15, 201...",2013,9,13,0,10
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22","[2013-06-14 23:29:17, 2014-08-13 23:20:22]",2013,8,13,23,2
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012...","[2011-02-15 17:12:00, 2011-07-28 02:46:10, 201...",2012,4,24,2,10
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014...","[2014-04-21 20:42:11, 2014-04-28 21:04:46, 201...",2016,4,21,12,26
...,...,...,...,...,...,...,...,...
131925,zznJox6-nmXlGYNWgTDwQQ,"2013-03-23 16:22:47, 2013-04-07 02:03:12, 2013...","[2013-03-23 16:22:47, 2013-04-07 02:03:12, 201...",2014,9,12,11,67
131926,zznZqH9CiAznbkV6fXyHWA,2021-06-12 01:16:12,[2021-06-12 01:16:12],2021,6,12,1,1
131927,zzu6_r3DxBJuXcjnOYVdTw,"2011-05-24 01:35:13, 2012-01-01 23:44:33, 2012...","[2011-05-24 01:35:13, 2012-01-01 23:44:33, 201...",2013,4,9,0,23
131928,zzw66H6hVjXQEt0Js3Mo4A,"2016-12-03 23:33:26, 2018-12-02 19:08:45","[2016-12-03 23:33:26, 2018-12-02 19:08:45]",2016,12,2,19,2


Realizamos una descripción de los datos obtenidos con el cual ya observamos que no hay valores outliers en los años, meses, días y horas

In [38]:
# Describimos los datos
df.describe().round()

Unnamed: 0,most_common_year,most_common_month,most_common_day,most_common_hour,total
count,131930.0,131930.0,131930.0,131930.0,131930.0
mean,2015.0,6.0,13.0,14.0,101.0
std,3.0,3.0,9.0,8.0,417.0
min,2010.0,1.0,1.0,0.0,1.0
25%,2013.0,3.0,5.0,11.0,6.0
50%,2015.0,6.0,12.0,17.0,20.0
75%,2018.0,9.0,20.0,20.0,72.0
max,2022.0,12.0,31.0,23.0,52144.0


In [32]:
# Comprobamos valores nulos
df.isnull().sum()

business_id          0
date                 0
date_list            0
most_common_year     0
most_common_month    0
most_common_day      0
most_common_hour     0
total                0
dtype: int64

Comprobamos que el largo del DataSet coincida con el de valores únicos para resolver que no tenemos valores duplicados 

In [33]:
# Comprobamos valores duplicados
print(len(df))
print(len(df.business_id.unique()))

131930
131930


In [34]:
# Mostramos los tipos de datos
df.dtypes

business_id          object
date                 object
date_list            object
most_common_year      int64
most_common_month     int64
most_common_day       int64
most_common_hour      int64
total                 int64
dtype: object

Por último se extrae el archivo resultante para poder cargarlo a la base de datos

In [35]:
ruta_archivo_parquet = "C:/Escritorio/PF/checkin/checkin_yelp.parquet"

# Guardamos el DataFrame como un archivo Parquet
df.to_parquet(ruta_archivo_parquet, index=False)