# Extracción de datos Datasets Google Maps y YELP

### Importaciones

In [1]:
import pandas as pd
import numpy as np
import json
import datetime
import os

### Útiles

In [5]:

# USPS two-letter code -> full name for the 50 U.S. states.
state_dictionary = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
}

# The codes alone, in the same order as the mapping above.
state_abreviations = list(state_dictionary.keys())

In [6]:
import os

generated_dir = "Generated"

# Create Generated/Google and Generated/Yelp.
# exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError
# as soon as the folder already exists. Building the full path also avoids the
# chdir round trip, which left the working directory inside Generated whenever
# one of the intermediate steps failed.
for source_name in ('Google', 'Yelp'):
    os.makedirs(os.path.join(generated_dir, source_name), exist_ok=True)

os.getcwd()

'z:\\PF_DATAPT07'

# 1. Extracción  con los datasets de Google Maps

## 1.1 Metadata de Sitios

Recorremos todo el directorio archivo por archivo y línea por línea, ya que los archivos no se pueden abrir directamente por su tamaño y porque no tienen formato de array, sino que contienen un registro por línea.
Durante la lectura filtramos los que incluyan <code>Restaurant</code> en el campo de categoría, para alivianar el dataset final.

In [7]:
# Average run time: about 51 seconds.
lineas_json = []

# The metadata is split into 11 files named 1.json ... 11.json.
for i in range(1, 12):
    path = f'Datasets/Google Maps/metadata-sitios/{i}.json'
    # Explicit UTF-8 so decoding does not depend on the platform default
    # (the other dataset files in this notebook are opened the same way).
    with open(path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            try:
                linea_j = json.loads(raw_line)
            except json.JSONDecodeError:
                # Blank or malformed line: skip it, but only for this reason.
                continue
            if not isinstance(linea_j, dict):
                continue
            try:
                category_text = " ".join(linea_j.get('category')).lower()
            except TypeError:
                # Missing / null category, or entries that are not strings:
                # the record cannot be classified, so it is left out.
                continue
            # Keep only places whose category names mention "restaurant".
            if 'restaurant' in category_text:
                lineas_json.append(linea_j)

df = pd.DataFrame(lineas_json)

df.head(3)

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.29213,[Korean restaurant],4.4,18,,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...","{'Service options': ['Takeout', 'Dine-in', 'De...",Open ⋅ Closes 6PM,"[0x80c2c78249aba68f:0x35bf16ce61be751d, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
1,Vons Chicken,"Vons Chicken, 12740 La Mirada Blvd, La Mirada,...",0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,,33.916402,-118.010855,[Restaurant],4.5,18,,"[[Thursday, 11AM–9:30PM], [Friday, 11AM–9:30PM...","{'Service options': ['Outdoor seating', 'Curbs...",Open ⋅ Closes 9:30PM,,https://www.google.com/maps/place//data=!4m2!3...
2,"Sweet Rewards Gluten Free Bakery, LLC","Sweet Rewards Gluten Free Bakery, LLC, 85 NE D...",0x87ec235c54d25b31:0x3b75fb5facc602f,,41.616079,-93.865487,"[Bakery, Health food restaurant]",4.7,21,,"[[Thursday, 10AM–5:30PM], [Friday, 10AM–5:30PM...",{'Service options': ['Delivery']},Permanently closed,"[0x87ee974869295555:0x95f310d065882c9b, 0x87ec...",https://www.google.com/maps/place//data=!4m2!3...


In [8]:
df.loc[5, 'address']

'Cape Seafood Shack, 603 Del Prado Blvd S, Cape Coral, FL 33990'

Exportamos a formato Parquet

In [11]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
df.to_parquet('Generated/Google/metada_sitios.parquet')

Tamaño Directorio <code>metadata-sitios</code>: 2.76 Gb

Tamaño Archivo <code>metada_sitios.parquet</code>: 60.43 Mb


In [None]:
df.info()

Dimensiones: 212.014 filas x 15 Columnas

## 1.1.1 Obtención de información de Estados

En base al campo <code>Address</code> obtenemos el estado donde se encuentra el negocio. Nos servirá para luego seleccionar los estados con más restaurantes.

In [12]:
def get_state_ab(st, valid_states=None):
    """Return the two-letter state code found at the end of an address.

    The code is read as the first space-separated token of the last
    ", "-separated chunk, e.g. "..., Cape Coral, FL 33990" -> "FL".

    Parameters
    ----------
    st : str
        Full address. Any non-string value (None, NaN, ...) yields NaN.
    valid_states : collection of str, optional
        Codes accepted as a state. Defaults to the notebook-level
        ``state_abreviations`` list.

    Returns
    -------
    str or float
        The state code, or ``np.nan`` when the address is not a string or the
        extracted token is not an accepted code.
    """
    # Explicit type check instead of a bare `except`, so real errors
    # (e.g. an interrupted kernel) are no longer swallowed.
    if not isinstance(st, str):
        return np.nan
    if valid_states is None:
        valid_states = state_abreviations
    state = st.split(', ')[-1].split(' ')[0]
    return state if state in valid_states else np.nan
    
df['state_ab'] = df['address'].apply(get_state_ab)

De esta manera conseguimos el top 5 de los estados con más restaurantes

In [13]:
top_5 = df['state_ab'].value_counts().head(5).index.to_list()

In [14]:
top_5

['CA', 'TX', 'NY', 'FL', 'PA']

Completamos el campo estado que es más descriptivo

In [15]:
df['us_state'] = df['state_ab'].map(state_dictionary)

In [16]:
df['us_state'].head(5)

0      California
1      California
2            Iowa
3    Pennsylvania
4          Hawaii
Name: us_state, dtype: object

Obtenemos un arreglo de URLs de los archivos correspondientes para cada estado del top 5, con el fin de extraer los datos en un bucle.

In [17]:
# One review folder per top-5 state; folder names use underscores instead of
# spaces (e.g. "New York" -> "review-New_York").
top_5_url = []
for state_code in top_5:
    folder_state = state_dictionary[state_code].replace(' ', '_')
    top_5_url.append(f"Datasets/Google Maps/reviews-estados/review-{folder_state}/")

top_5_url


['Datasets/Google Maps/reviews-estados/review-California/',
 'Datasets/Google Maps/reviews-estados/review-Texas/',
 'Datasets/Google Maps/reviews-estados/review-New_York/',
 'Datasets/Google Maps/reviews-estados/review-Florida/',
 'Datasets/Google Maps/reviews-estados/review-Pennsylvania/']

Creamos un diccionario con la cantidad de archivos por cada directorio de estados, con el fin de utilizarlo en un bucle en la extracción de datos.

In [18]:
cantidad_archivos = {}

for state_dir in top_5_url:
    # Count only the .json parts that sit directly inside the state folder.
    # os.walk also descends into sub-folders and the entry was overwritten on
    # every visited level, and any non-JSON file inflated the count that the
    # extraction loop later uses to build the "<n>.json" file names.
    # A missing folder now raises FileNotFoundError here instead of surfacing
    # later as a KeyError.
    cantidad_archivos[state_dir] = sum(
        1 for file_name in os.listdir(state_dir) if file_name.endswith('.json')
    )

cantidad_archivos

{'Datasets/Google Maps/reviews-estados/review-California/': 18,
 'Datasets/Google Maps/reviews-estados/review-Texas/': 14,
 'Datasets/Google Maps/reviews-estados/review-New_York/': 18,
 'Datasets/Google Maps/reviews-estados/review-Florida/': 19,
 'Datasets/Google Maps/reviews-estados/review-Pennsylvania/': 15}

Comprobación

In [19]:
for i in os.walk('Datasets/Google Maps/reviews-estados/review-Pennsylvania'):
    print(len(i[2]))

15


In [20]:
cantidad_archivos['Datasets/Google Maps/reviews-estados/review-Pennsylvania/']

15

## 1.2 Reviews Estados

Ya con los estados elegidos estamos en condiciones de ingestar los datos de las carpetas correspondientes dentro del directorio <code>reviews-estados</code>.
Es información masiva, lo que genera un archivo de grandes dimensiones; por eso filtramos previamente por el rango de años <code>2017-2019</code> valiéndonos del campo <code>time</code>, que es un <code>timestamp</code> en milisegundos (3 dígitos más que el usado por <code>datetime</code> de Python). Le agregamos el campo <code>estado</code>, que es más descriptivo.

In [22]:
# Run time varies: roughly 7 min 40 s up to 11 min.

lineas_json_revs_google = []
google_review_years = {2017, 2018, 2019}

for i in top_5_url:
    # ".../review-New_York/" -> "New_York". Constant per folder, so it is
    # computed once instead of once per review line.
    estado = i.split('-')[-1][:-1]
    for c in range(1, cantidad_archivos[i] + 1):
        with open(str(i) + str(c) + ".json", 'r', encoding='utf-8') as f:
            for s in f:
                linea = json.loads(s)
                # `time` is an epoch value in milliseconds. The year is taken
                # in UTC so the filter does not depend on the timezone of the
                # machine running the notebook and agrees with the UTC-based
                # date parts derived later from the same field.
                anio = datetime.datetime.fromtimestamp(
                    linea['time'] / 1000, tz=datetime.timezone.utc
                ).year

                if anio in google_review_years:
                    linea['anio'] = anio
                    linea['estado'] = estado
                    lineas_json_revs_google.append(linea)

df_revs_google = pd.DataFrame(lineas_json_revs_google)

df_revs_google.head(3)

In [None]:
# Inner join: attach the place metadata to every review through the shared gmap_id key.
merge_site_reviews = df_revs_google.merge(df, on='gmap_id')

In [None]:
merge_site_reviews

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
merge_site_reviews.to_parquet('Generated/Google/merge_site_reviews.parquet')

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
df_revs_google.to_parquet('Generated/Google/reviews-estados.parquet')

Tamaño archivo: 760 Mb

Tamaño dataset: 24.3 Gb

In [None]:
df_revs_google.info()

Tamaño 8.339.179 filas x 10 Columnas.

# 2. Extracción de los Dataset de YELP

### 2.1 Business

Contiene los datos de las entidades negocios de Yelp, a un primer vistazo tiene las columnas duplicadas, por lo que hay que hacer un recorte, ya que la segunda mitad tiene datos vacíos en su inmensa mayoría.

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
url_business = 'Datasets/Yelp/business.pkl'

# NOTE: unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
df_business = pd.read_pickle(url_business)

# Drop the trailing block of repeated columns.
# NOTE(review): assumes the repeated block is exactly the last 14 columns —
# confirm against df_business.columns before relying on it.
n_duplicated_columns = 14
df_business = df_business.iloc[:, :-n_duplicated_columns]

In [None]:
df_business.sample(3)

In [None]:
df_business.shape

Luego con la ayuda del campo <code>state</code> filtramos los negocios que se encuentran en los estados seleccionados en nuestro análisis.

In [None]:
# Keep only the businesses located in one of the selected top-5 states.
in_selected_state = df_business['state'].isin(top_5)
df_business = df_business.loc[in_selected_state]

In [None]:
df_business.info()

Seguimos filtrando a través del campo <code>categories</code>, para obtener los negocios que son restaurantes.

In [None]:
def is_restaurant(st):
    """Tell whether a ``categories`` value mentions a restaurant.

    Parameters
    ----------
    st : str or iterable of str
        Either one string with the category names or a collection of them.
        Anything else (None, NaN, collections with non-string items) is
        treated as "not a restaurant".

    Returns
    -------
    bool
        True when the lower-cased text contains "restaurant".
    """
    if isinstance(st, str):
        text = st
    else:
        try:
            # Join with a space so two neighbouring names can never fuse into
            # an accidental "restaurant" match.
            text = " ".join(st)
        except TypeError:
            # Not iterable, or some item is not a string. Catching only this
            # error replaces the previous bare `except`.
            return False
    return 'restaurant' in text.lower()

df_business = df_business[df_business['categories'].apply(is_restaurant)]

In [None]:
df_business.info()

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
df_business.to_parquet('Generated/Yelp/bussines.parquet')

### 2.2 Checkin

In [None]:
lineas_json = []
# Forward slashes resolve on every OS; the backslash form only works on Windows.
path_checkin = 'Datasets/Yelp/checkin.json'
checkin_years = ('2017', '2018', '2019')

with open(path_checkin, 'r', encoding='utf-8') as file:
    for raw_line in file:
        try:
            linea_j = json.loads(raw_line)
        except json.JSONDecodeError:
            # Blank or malformed line: skip it, but only for this reason.
            continue
        fechas = linea_j.get('date') if isinstance(linea_j, dict) else None
        if not isinstance(fechas, str):
            # Record without a usable date string.
            continue
        # NOTE(review): in the public Yelp dataset `date` in checkin.json is
        # documented as a comma-separated list of timestamps — confirm for
        # this copy. Looking only at the first 4 characters would then test
        # just the earliest check-in, so every timestamp is inspected. For a
        # single-timestamp string the result is the same as before.
        if any(stamp.strip()[:4] in checkin_years for stamp in fechas.split(',')):
            lineas_json.append(linea_j)

df_checkin = pd.DataFrame(lineas_json)

In [None]:
df_checkin

In [None]:
# Inner join: attach the check-in record to each selected business through business_id.
merge_business_checkin = df_business.merge(df_checkin, on='business_id')

In [None]:
merge_business_checkin.info()

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
df_checkin.to_parquet('Generated/Yelp/checkin.parquet')
# The output folder is created as "Yelp"; the previous "YELP" spelling only
# resolved on case-insensitive file systems.
merge_business_checkin.to_parquet('Generated/Yelp/business_checkin.parquet')

### 2.3 Tips 

Realizamos la extracción de los datos y filtramos por año según nuestro análisis.

In [None]:
lineas_json = []
# Forward slashes resolve on every OS; the backslash form only works on Windows.
path_tip = 'Datasets/Yelp/tip.json'
tip_years = ('2017', '2018', '2019')

with open(path_tip, 'r', encoding='utf-8') as file:
    for raw_line in file:
        try:
            linea_j = json.loads(raw_line)
        except json.JSONDecodeError:
            # Blank or malformed line: skip it, but only for this reason.
            continue
        fecha = linea_j.get('date') if isinstance(linea_j, dict) else None
        # The first four characters of the date string are the year; records
        # without a date string are left out.
        if isinstance(fecha, str) and fecha[:4] in tip_years:
            lineas_json.append(linea_j)

df_tip = pd.DataFrame(lineas_json)


In [None]:
df_tip.sample(5)

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
df_tip.to_parquet('Generated/Yelp/tip.parquet')

Unimos el dataframe de tips con el de negocios

In [None]:
# Inner join: attach the business data to each tip through business_id.
tips_merged = df_tip.merge(df_business, on='business_id')

In [None]:
tips_merged.sort_values('business_id').head(3)

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
tips_merged.to_parquet('Generated/Yelp/business_tip.parquet')

### 2.4 Review

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
df_reviews_url = 'Datasets/Yelp/review.json'

Usamos el mismo método de lectura línea por línea, y en el proceso filtramos por año y por las reseñas que han sido votadas como útiles.

In [None]:
# Run time: about 44 seconds.

lineas_json_review = []
review_years = ('2017', '2018', '2019')

with open(df_reviews_url, 'r', encoding='utf-8') as f:
    for raw_line in f:
        linea = json.loads(raw_line)
        # The first four characters of `date` are the year.
        # NOTE(review): `== 1` keeps only reviews whose `useful` value is
        # exactly 1. If `useful` is a vote count and the goal is "voted useful
        # at least once", this should be `>= 1` — confirm before changing,
        # because a later cell drops the `useful` column.
        if linea['date'][:4] in review_years and linea['useful'] == 1:
            lineas_json_review.append(linea)


df_reviews = pd.DataFrame(lineas_json_review)

In [None]:
df_reviews.sample(10)

In [None]:
df_reviews.info()

Aligeramos el dataset con unos downgrades de tipo de variables.

In [None]:
# Shrink the numeric columns to the smallest integer dtype that can hold every
# value. A fixed int8 cast only covers -128..127, which a vote counter can
# exceed; to_numeric(downcast=...) picks a wider dtype when needed.
for column in ('funny', 'stars', 'cool'):
    df_reviews[column] = pd.to_numeric(df_reviews[column], downcast='integer')

# Reassign instead of mutating in place; errors='ignore' keeps the cell
# re-runnable once the column is already gone.
df_reviews = df_reviews.drop(columns='useful', errors='ignore')


In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
df_reviews.to_parquet('Generated/Yelp/review.parquet')

### 2.5 Users Yelp

In [None]:
import pyarrow.parquet as pq


# Forward slashes resolve on every OS; the backslash form only works on Windows.
parquet_file = pq.ParquetFile('Datasets/Yelp/user.parquet')

arr_df = []

# Process the file one record batch at a time.
for batch in parquet_file.iter_batches():
    batch_df = batch.to_pandas()
    # "2017,2018" -> ["2017", "2018"]. The vectorised .str methods leave a
    # missing value as NaN instead of raising like the per-row lambda did.
    batch_df['elite'] = batch_df['elite'].str.split(',')
    batch_df['elite_len'] = batch_df['elite'].str.len()
    # NOTE(review): `> 1` drops users with no elite year ("" splits into one
    # empty item) and also users with exactly one elite year — confirm that
    # excluding single-year elites is intended.
    arr_df.append(batch_df[batch_df['elite_len'] > 1])

df_users = pd.concat(arr_df)

In [None]:
df_users.sample(5)

In [None]:
df_users.info()

In [None]:
# Renumber the rows 0..n-1 and discard the old per-batch index in one step,
# without mutating the frame in place.
df_users = df_users.reset_index(drop=True)

In [None]:
# Forward slashes resolve on every OS; the backslash form only works on Windows.
df_users.to_parquet("Generated/Yelp/users_extracted.parquet")

## NORMALIZACIÓN DE DATOS

In [3]:
import pandas as pd
from datetime import datetime

### **GOOGLE**

#### *1. MERGE_SITE_REVIEWS*

In [18]:
# The previous non-raw literal contained "\G" and "\m", which are invalid
# escape sequences (a warning today, an error in future Python versions).
# Forward slashes avoid escaping altogether and resolve on every OS.
df_siterev = pd.read_parquet("Generated/Google/merge_site_reviews.parquet")

In [19]:
df_siterev.sample(3)

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,anio,estado,...,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url,state_ab,us_state
348493,107942381515148867431,Sandy P,1529050068250,5,,,,0x8085612890558ddf:0x125ed0ca2f98a892,2018,California,...,3.6,1078,₩,"[[Wednesday, 5AM–2AM], [Thursday, 5AM–12AM], [...",{'Accessibility': ['Wheelchair-accessible car ...,Open ⋅ Closes 2AM,"[0x808561288074c9d9:0x129e8b6ca0ddc163, 0x8085...",https://www.google.com/maps/place//data=!4m2!3...,,
1972544,108973483426689098114,j b,1552163444809,4,,,,0x88e77010a00e128d:0x761d451cbcaed653,2019,Florida,...,4.5,1772,₩₩,"[[Saturday, 7AM–5PM], [Sunday, 8AM–5PM], [Mond...",{'Accessibility': ['Wheelchair-accessible car ...,Open ⋅ Closes 5PM,"[0x88e7701a0633b12d:0xd328ea95e4f2b0fc, 0x88e7...",https://www.google.com/maps/place//data=!4m2!3...,,
365299,117154565760944188399,MICHAEL Bansuelo,1517295312556,5,,,,0x80c2c0aeee51ffb5:0xb2f24102f8a608c8,2018,California,...,4.4,688,₩₩,"[[Wednesday, 9:30AM–10:30PM], [Thursday, 9:30A...",{'Accessibility': ['Wheelchair-accessible entr...,Closed ⋅ Opens 9:30AM,"[0x80c2c12c56569857:0xb2307fc2d0d613c, 0x80c2b...",https://www.google.com/maps/place//data=!4m2!3...,,


In [20]:
df_siterev.columns.values

array(['user_id', 'name_x', 'time', 'rating', 'text', 'pics', 'resp',
       'gmap_id', 'anio', 'estado', 'name_y', 'address', 'description',
       'latitude', 'longitude', 'category', 'avg_rating',
       'num_of_reviews', 'price', 'hours', 'MISC', 'state',
       'relative_results', 'url', 'state_ab', 'us_state'], dtype=object)

In [26]:
df_siterev.category

0                                        [Korean restaurant]
1                                        [Korean restaurant]
2                                        [Korean restaurant]
3                                        [Korean restaurant]
4                                        [Korean restaurant]
                                 ...                        
2393447    [Coffee shop, Bagel shop, Bakery, Breakfast re...
2393448    [Coffee shop, Bagel shop, Bakery, Breakfast re...
2393449    [Coffee shop, Bagel shop, Bakery, Breakfast re...
2393450    [Coffee shop, Bagel shop, Bakery, Breakfast re...
2393451    [Coffee shop, Bagel shop, Bakery, Breakfast re...
Name: category, Length: 2393452, dtype: object

In [8]:
df_siterev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2393452 entries, 0 to 2393451
Data columns (total 26 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   name_x            object 
 2   time              int64  
 3   rating            int64  
 4   text              object 
 5   pics              object 
 6   resp              object 
 7   gmap_id           object 
 8   anio              int64  
 9   estado            object 
 10  name_y            object 
 11  address           object 
 12  description       object 
 13  latitude          float64
 14  longitude         float64
 15  category          object 
 16  avg_rating        float64
 17  num_of_reviews    int64  
 18  price             object 
 19  hours             object 
 20  MISC              object 
 21  state             object 
 22  relative_results  object 
 23  url               object 
 24  state_ab          object 
 25  us_state          object 
dtypes: float64(3),

In [7]:
df_siterev[~df_siterev.pics.isnull()].sample(3)

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,anio,estado,...,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url,state_ab,us_state
366219,115191459158252243116,Sarah Bal,1572469393340,4,I have been wanting to review this place for a...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x808e34cc915bd159:0x659ec9ee6650efc6,2019,California,...,4.4,1004,₩₩,"[[Wednesday, 11AM–8PM], [Thursday, 11AM–8PM], ...",{'Accessibility': ['Wheelchair-accessible car ...,Closed ⋅ Opens 11AM,"[0x808e352a573a1a49:0xe805950681af6c8d, 0x808e...",https://www.google.com/maps/place//data=!4m2!3...,,
1463632,118251272936885295934,andricci romero,1566164123252,1,"(Translated by Google) Super dissatisfied, it ...",[{'url': ['https://lh5.googleusercontent.com/p...,,0x88e77d85a70512f1:0xf1404b79022cfbca,2019,Florida,...,2.7,48,,"[[Monday, Closed], [Tuesday, 5PM–12AM], [Wedne...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 5PM Tue,"[0x88e77c4c9e301503:0x9c555cd78c12d447, 0x88e7...",https://www.google.com/maps/place//data=!4m2!3...,FL,Florida
1532735,114771237847844057444,Ron Kish,1570223395559,4,Tried the pizza 1st trip.Good stuff.Will be back,[{'url': ['https://lh5.googleusercontent.com/p...,{'text': 'We're looking forward to that 2nd vi...,0x88c29402cebd3b25:0xe05511b81b2b8fe9,2019,Florida,...,4.2,468,$$,"[[Saturday, 11AM–8PM], [Sunday, 11AM–8PM], [Mo...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 11AM,"[0x88c293f7b0e06b11:0x155d18d0ebde0896, 0x88c2...",https://www.google.com/maps/place//data=!4m2!3...,FL,Florida


In [27]:
num_nulls_pics = df_siterev.pics.isnull().sum()
num_nulls_pics

2307136

In [28]:
num_nulls_resp = df_siterev.resp.isnull().sum()
num_nulls_resp

2178044

In [31]:
# `time` holds epoch milliseconds. A single vectorised conversion yields the
# same naive UTC timestamps as calling datetime.utcfromtimestamp row by row,
# without the per-row Python call and without relying on a function that is
# deprecated since Python 3.12.
df_siterev['dtfmt'] = pd.to_datetime(df_siterev['time'], unit='ms')
df_siterev['mes'] = df_siterev['dtfmt'].dt.month
df_siterev['dia'] = df_siterev['dtfmt'].dt.day
df_siterev['hora'] = df_siterev['dtfmt'].dt.hour
# Normalise text casing: reviewer names in Title Case, review text in lower case.
df_siterev['name_x'] = df_siterev['name_x'].str.title()
df_siterev['text'] = df_siterev['text'].str.lower()
df_siterev.sample(3)

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,anio,estado,...,MISC,state,relative_results,url,state_ab,us_state,mes,dia,hora,dtfmt
1183263,113424582563873154993,Dharmdeo Singh,1535308630716,4,,,{'text': 'Dharmdeo Singh Thanks so much for vi...,0x89c259abcaeb40c1:0x55103d15febe0abf,2018,New_York,...,{'Accessibility': ['Wheelchair-accessible entr...,Closed ⋅ Opens 8AM,"[0x89c259acd1ac34a5:0x339cd110ed2d757c, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...,,,8,26,18,2018-08-26 18:37:10.716
2094206,113095237064971138450,Bryant Filomeno,1577718286252,5,spent the day in orlando. came across this gem...,,,0x88dd8304e39de7f3:0x2bbad0853a4b45c2,2019,Florida,...,{'Accessibility': ['Wheelchair-accessible car ...,Closed ⋅ Opens 6AM Wed,"[0x88dd83d646087305:0x5dd6aa576a740dd1, 0x88dd...",https://www.google.com/maps/place//data=!4m2!3...,,,12,30,15,2019-12-30 15:04:46.252
257567,107745563584556050187,Christy Richmond,1577131085961,4,good food and service. parking still very chal...,,,0x80c2bbfbf52c5fc3:0xe17c36c86833ef1f,2019,California,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 11AM,"[0x80c2bc07bdc9b387:0x69bc949e4ffaef57, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...,CA,California,12,23,19,2019-12-23 19:58:05.961


#### ***<u>COMENTARIO</u>***
Las columnas **pics** y **resp** deberían eliminarse porque la cantidad de datos nulos sobrepasa el 90%.

*Obtenemos las columnas de año, mes, día y hora para posteriores análisis*