##  EDA (Reviews de States de Google)

In [1]:
# Import the libraries needed to bootstrap Spark.
# findspark.init() is called before `import pyspark`; presumably this is what makes
# the pyspark package importable in this environment, so keep this order.
import findspark
findspark.init()
import pyspark
# NOTE(review): the return value of findspark.find() is discarded (it is not the
# last expression of the cell), so this call displays nothing.
findspark.find()
import os

In [2]:
# Import the PySpark helpers and create the SparkContext and SparkSession used by the notebook
from functools import reduce
from pyspark.sql import functions as F
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Build (or reuse) the Spark context and session for this notebook.
# getOrCreate() keeps the cell re-runnable: constructing a second SparkContext in the
# same kernel raises "Cannot run multiple SparkContexts at once".
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext.getOrCreate(conf=conf)
# The builder picks up the context created above instead of starting a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
# Display the Spark session object (this cell only shows it; it does not create it)
spark

In [4]:
# Compatibility shim: expose the abstract base classes from `collections.abc`
# directly on the `collections` module under the same names.
# NOTE(review): presumably required by the Koalas import that follows — confirm.
import collections.abc
for _abc_name in ('Iterable', 'Mapping', 'MutableSet', 'MutableMapping', 'Callable'):
    setattr(collections, _abc_name, getattr(collections.abc, _abc_name))
del _abc_name  # do not leave the loop variable in the notebook namespace

import databricks.koalas as ks

In [5]:
# Se define una función para leer archivos Parquet y convertirlos a DataFrames de Koalas
def read_parquet_files(directory, estado):
    """Load every ``.parquet`` entry of ``directory`` into one Koalas DataFrame.

    Each entry is read with the notebook-level ``spark`` session, wrapped in a
    Koalas DataFrame and given a constant ``state`` column; the per-file frames
    are then concatenated with a fresh index.

    Args:
        directory: Folder to scan (only its direct entries are considered).
        estado: Value written to the ``state`` column of every row.

    Returns:
        The concatenation of all per-file Koalas DataFrames.

    Raises:
        ValueError: If ``directory`` has no entry ending in ``.parquet``.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # concatenation order (and so the resulting index) the same on every run.
    parquet_names = sorted(
        nombre for nombre in os.listdir(directory) if nombre.endswith('.parquet')
    )
    if not parquet_names:
        # Fail early and name the folder instead of surfacing an opaque concat error.
        raise ValueError(f"No .parquet files found in directory: {directory}")

    dataframes_koalas = []
    for nombre in parquet_names:
        ruta_archivo = os.path.join(directory, nombre)
        df_koalas = ks.DataFrame(spark.read.parquet(ruta_archivo))
        df_koalas['state'] = estado  # tag every row with the state it belongs to
        dataframes_koalas.append(df_koalas)

    return ks.concat(dataframes_koalas, ignore_index=True)

# Source folders, one per state
directorio_California = r'C:\Escritorio\PF\states_reviews\states_reviews\review-California'
directorio_Florida = r'C:\Escritorio\PF\states_reviews\states_reviews\review-Florida'
directorio_Illinois = r'C:\Escritorio\PF\states_reviews\states_reviews\review-Illinois'
directorio_New_York = r'C:\Escritorio\PF\states_reviews\states_reviews\review-New_York'
directorio_Texas = r'C:\Escritorio\PF\states_reviews\states_reviews\review-Texas'

# Load each folder into its own DataFrame, labelled with the state name.
# The generator is consumed in order, so the folders are read one after another.
(
    df_California,
    df_Florida,
    df_Illinois,
    df_New_York,
    df_Texas,
) = (
    read_parquet_files(ruta_estado, nombre_estado)
    for ruta_estado, nombre_estado in (
        (directorio_California, 'California'),
        (directorio_Florida, 'Florida'),
        (directorio_Illinois, 'Illinois'),
        (directorio_New_York, 'New York'),
        (directorio_Texas, 'Texas'),
    )
)

In [6]:
# Stack the five per-state DataFrames into a single one and preview the first rows
frames_por_estado = [
    df_California,
    df_Florida,
    df_Illinois,
    df_New_York,
    df_Texas,
]
df_reviews_top_5 = ks.concat(frames_por_estado)
df_reviews_top_5.head()

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id,state
0,108991152262655788985,Song Ro,1609909927056,5,Love there korean rice cake.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California
1,111290322219796215751,Rafa Robles,1612849648663,5,Good very good,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California
2,112640357449611959087,David Han,1583643882296,4,They make Korean traditional food very properly.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California
3,117440349723823658676,Anthony Kim,1551938216355,5,Short ribs are very delicious.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California
4,100580770836123539210,Mario Marzouk,1494910901933,5,Great food and prices the portions are large,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California


In [7]:
# Number of rows in the combined DataFrame
df_reviews_top_5.shape[0]

11746824

In [8]:
# Count the missing values in each column of the combined DataFrame
df_reviews_top_5.isna().sum()

user_id           0
name              0
time              0
rating            0
text        4870400
pics       11348752
resp       10246105
gmap_id           0
state             0
Name: 0, dtype: int64

In [9]:
# Build a new DataFrame without the text, pics and resp columns
df_reviews_top_5_clean = df_reviews_top_5.drop(['text', 'pics', 'resp'], axis=1)

In [10]:
# Preview the first five rows of the reduced DataFrame
df_reviews_top_5_clean.head(5)

Unnamed: 0,user_id,name,time,rating,gmap_id,state
0,108991152262655788985,Song Ro,1609909927056,5,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California
1,111290322219796215751,Rafa Robles,1612849648663,5,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California
2,112640357449611959087,David Han,1583643882296,4,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California
3,117440349723823658676,Anthony Kim,1551938216355,5,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California
4,100580770836123539210,Mario Marzouk,1494910901933,5,0x80c2c778e3b73d33:0xbdc58662a4a97d49,California


In [11]:
# Remove fully duplicated rows (the first occurrence is kept) and preview the result
df_reviews_google_top_5 = df_reviews_top_5_clean.drop_duplicates(keep='first')
df_reviews_google_top_5.head(5)

Unnamed: 0,user_id,name,time,rating,gmap_id,state
1812280,100000019059943251257,Jim Hajek,1532172902488,5,0x880fd36b093a9a07:0x940cc06f90294db,Illinois
1418443,100000019059943251257,Jim Hajek,1542748194590,5,0x880e4cae16c03f93:0x8ad15acaf8ffd496,Illinois
466029,100000020623254171349,Zachary Kular,1498657367260,4,0x880fab1b8261c301:0xc13854f55b0feec4,Illinois
1792773,100000020623254171349,Zachary Kular,1558493360838,5,0x880e2de4633757b5:0x4679b7845e3ee752,Illinois
1735594,100000029979508209337,Raychel Perez,1593401857411,5,0x80eacb93b18677bb:0x6eb86c000772bf33,California


In [12]:
# Number of rows left after removing duplicates
df_reviews_google_top_5.shape[0]

11392427

In [13]:
# Collect the distinct rating values so unexpected (outlier) ratings can be spotted
unique_ratings = (
    df_reviews_google_top_5["rating"]
    .unique()
    .to_list()
)
print(unique_ratings)

[5, 1, 3, 2, 4]


In [14]:
# Destination path for the exported Parquet data
ruta_exportacion = 'C:/Escritorio/PF/states_reviews/reviews_estados_elegidos.parquet'

# Export the deduplicated Koalas DataFrame as Parquet (this is the Koalas/Spark
# writer, not pandas). The pandas-only `engine` argument is dropped: Koalas forwards
# unknown keywords to Spark as writer options, where it has no effect.
# mode='overwrite' is spelled out so re-running the notebook replaces the previous export.
# NOTE(review): Spark normally writes a directory of part files at this path rather
# than a single .parquet file — confirm downstream readers expect that.
df_reviews_google_top_5.to_parquet(ruta_exportacion, mode='overwrite', compression='snappy')