## Yelp Reviews EDA

In [1]:
# Locate the Spark installation
import findspark
# Keep this call ahead of `import pyspark` — presumably it is what makes the
# import resolvable on this machine (TODO confirm).
findspark.init()
import pyspark
# Last expression of the cell: displays the Spark location that was found.
findspark.find()

'C:\\Spark'

In [2]:
# Import the libraries needed for Spark and define aliases (Koalas is imported in a later cell)
import os
from functools import reduce
import pyspark
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Configure Spark so that large files can be processed locally.
# NOTE(review): the master is plain 'local'; the executor.* and
# dynamicAllocation settings below are presumably not effective in that mode —
# confirm, and consider 'local[4]' if four worker threads were intended.
conf = (
    SparkConf()
    .setAppName('appName')
    .setMaster('local')
    .set("spark.network.timeout", "600s")
    .set("spark.driver.memory", "12g")
    .set("spark.executor.memory", "10g")
    .set("spark.executor.cores", "4")
    .set("spark.dynamicAllocation.maxExecutors", "2")
    .set("spark.jars", r"C:\mysql-connector-j-8.1.0\mysql-connector-j-8.1.0.jar")
)

# getOrCreate() reuses an already-running session, so re-running this cell no
# longer fails with "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for any code that expects the SparkContext handle.
sc = spark.sparkContext

In [3]:
# Display the SparkSession object created in the previous cell
spark

In [4]:
import databricks.koalas as ks
import os



In [5]:
# Read the Yelp reviews Parquet file with Spark and expose it as a Koalas DataFrame.
# Parquet files carry their own schema, so the `inferSchema` option (a CSV/JSON
# reader option) is dropped and the dedicated parquet reader is used.
df_spark = spark.read.parquet(r'D:\Proyecto Integrador Parquet\Yelp\review-002.parquet')
df = df_spark.to_koalas()

In [6]:
# Column information: names, non-null counts and dtypes
df.info()

  for name, col in reset_index.iteritems():
  [


<class 'databricks.koalas.frame.DataFrame'>
Int64Index: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   review_id    6990280 non-null  object 
 1   user_id      6990280 non-null  object 
 2   business_id  6990280 non-null  object 
 3   stars        6990280 non-null  float64
 4   useful       6990280 non-null  int64  
 5   funny        6990280 non-null  int64  
 6   cool         6990280 non-null  int64  
 7   text         6990280 non-null  object 
 8   date         6990280 non-null  object 
dtypes: float64(1), int64(3), object(5)

In [7]:
# Duplicate check: number of rows flagged as duplicates
df.duplicated().sum()

0

In [8]:
# Preview the dataframe with Spark
df_spark.show()

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

In [9]:
# Column names and types as seen by Spark
df_spark.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- text: string (nullable = true)
 |-- date: string (nullable = true)



In [10]:
# Descriptive statistics of the dataframe, computed with Spark
df_spark.describe().show()

+-------+--------------------+--------------------+--------------------+------------------+------------------+-------------------+------------------+----------------------+-------------------+
|summary|           review_id|             user_id|         business_id|             stars|            useful|              funny|              cool|                  text|               date|
+-------+--------------------+--------------------+--------------------+------------------+------------------+-------------------+------------------+----------------------+-------------------+
|  count|             6990280|             6990280|             6990280|           6990280|           6990280|            6990280|           6990280|               6990280|            6990280|
|   mean|                null|                null|                null|  3.74858374771826|1.1846089140921394|0.32655959417934616|0.4986175088837643|                  null|               null|
| stddev|                null|     

In [11]:
# Number of rows in the dataframe
df_spark.count()

6990280

## Analisis por columna

Columna "business_id"

In [12]:
# Number of distinct businesses that have reviews.
# nunique() counts the distinct non-null ids directly, instead of building the
# Series of unique values only to take its length (which would also count a
# null id as a business).
df.business_id.nunique()

150346

In [13]:
# Top 20 businesses with the most reviews
top20_business_reviews = df.business_id.value_counts().head(20)
top20_business_reviews

_ab50qdWOk0DdB6XOrBitw    7673
ac1AeYqs8Z4_e2X5M3if2A    7516
GXFMD0Z4jEVZBCsbPf4CTQ    6160
ytynqOUb3hjKeJfRj5Tshw    5778
oBNrLz4EDhiscSlbOl8uAw    5264
iSRTaT9WngzB8JJ2YKJUig    5254
VQcCL9PiNL_wkGf-uF3fjg    5146
_C7QiQQc47AOEv4PE3Kong    4969
GBTPC53ZrG1ZBY3DT8Mbcw    4661
6a4gLLFSgr-Q6CZXDLzBGQ    4480
PP3BBaVxZLcJU54uP_wL6Q    4293
1b5mnK8bMnnju_cvU65GqQ    4247
I_3LMZ_1m2mzR0oLIOePIg    4093
VaO-VW3e1kARkU9bP1E7Fw    4034
qb28j-FNX1_6xm7u372TZA    3971
gTC8IQ_i8zXytWSly3Ttvg    3917
yPSejq3_erxo9zdVYTBnZA    3889
wz8ZPfySQczcPgSyd33-HQ    3634
VVH6k9-ycttH3TV_lk5WfQ    3633
IkY2ticzHEn4QFn8hQLSWg    3428
Name: business_id, dtype: int64

Columna 'review_id'

In [14]:
# Number of distinct reviews.
# nunique() counts the distinct non-null ids directly, instead of building the
# Series of unique values only to take its length.
df.review_id.nunique()

6990280

Columna 'user_id'

In [15]:
# Number of distinct users.
# nunique() counts the distinct non-null ids directly, instead of building the
# Series of unique values only to take its length.
df.user_id.nunique()

1987929

In [16]:
# Top 20 users who left the most reviews
df.user_id.value_counts().head(20)

_BcWyKQL16ndpBdggh2kNA    3048
Xw7ZjaGfr0WNVt6s_5KZfA    1840
0Igx-a1wAstiBDerGxXk2A    1747
-G7Zkl1wIWBBmD0KRy_sCw    1682
ET8n-r7glWYqZhuR6GcdNw    1653
bYENop4BuQepBjM1-BI3fA    1578
1HM81n6n4iPIFU5d2Lokhw    1554
fr1Hz2acAb3OaL3l6DyKNg    1447
wXdbkFZsfDR7utJvbWElyA    1396
Um5bfs5DH6eizgjH3xZsvg    1391
qjfMBIZpQT9DDtw_BWCopQ    1324
VL12EhEdT4OWqGq0nIqkzw    1308
bJ5FtCtZX3ZZacz2_2PJjA    1298
pou3BbKsIozfH50rxmnMew    1247
ouODopBKF3AqfCkuQEnrDg    1129
B-s-8VUnuBjGTP3d01jsyw    1087
-kLVfaJytOJY2-QdQoCcNQ    1076
vHc-UrI9yfL_pnnc6nJtyQ    1071
CfX4sTIFFNaRchNswqhVfg    1047
AHRrG3T1gJpHvtpZ-K0G_g    1041
Name: user_id, dtype: int64

In [17]:
# Average number of reviews per user
df.user_id.value_counts().mean()

3.51636300894046

In [18]:
# Standard deviation of the number of reviews per user
df.user_id.value_counts().std()

12.77087790086078

Columna 'date'

In [19]:
from datetime import datetime

In [20]:
# Convert the 'date' column to a datetime type.
# NOTE(review): this overwrites df.date in place, so the original string
# values are no longer available on `df` after this cell runs.
df.date = df.date.astype(datetime)

In [21]:
# Find the oldest review
df.date.min()

  series = series.astype(t, copy=False)


Timestamp('2005-02-16 03:23:22')

In [22]:
# Find the most recent review
df.date.max()

  series = series.astype(t, copy=False)


Timestamp('2022-01-19 19:48:45')

Columna 'stars'

In [23]:
# Number of reviews per star rating
df.stars.value_counts()

5.0    3231627
4.0    1452918
1.0    1069561
3.0     691934
2.0     544240
Name: stars, dtype: int64

Columna 'useful'

In [24]:
# The 20 most frequent 'useful' vote counts across reviews
df.useful.value_counts().head(20)

0     3840492
1     1539953
2      687425
3      343742
4      186984
5      112204
6       71214
7       47679
8       34000
9       24783
10      18475
11      14319
12      11103
13       8751
14       7112
15       5633
16       4688
17       3874
18       3287
19       2780
Name: useful, dtype: int64

Columna 'funny'

In [25]:
# The 20 most frequent 'funny' vote counts across reviews
df.funny.value_counts().head(20)

0     5894117
1      691994
2      195290
3       82111
4       42254
5       24723
6       15545
7       10178
8        7147
9        5223
10       3739
11       2992
12       2367
13       1896
14       1419
15       1274
16       1014
17        781
18        694
19        599
Name: funny, dtype: int64

Columna 'cool'

In [26]:
# The 20 most frequent 'cool' vote counts across reviews
df.cool.value_counts().head(20)

0     5377964
1     1016736
2      296999
3      114763
4       56609
5       32352
6       21530
7       15010
8       11028
9        8085
10       6349
11       4981
12       4011
13       3125
14       2549
15       2105
16       1800
17       1458
18       1270
19       1092
Name: cool, dtype: int64

Columna 'text'

In [27]:
# Number of distinct review texts.
# nunique() counts the distinct non-null texts directly, instead of building
# the Series of unique values only to take its length.
df.text.nunique()

6974127

In [28]:
from pyspark.sql.functions import udf, split, size
from pyspark.sql.types import IntegerType

def count_words(text):
    """Count the whitespace-separated words in ``text``.

    Args:
        text: The string to measure, or None.

    Returns:
        The number of tokens produced by ``text.split()``; 0 when ``text``
        is None.
    """
    # Guard clause: a missing text contributes no words.
    if text is None:
        return 0
    words = text.split()
    return len(words)

# Wrap count_words as a Spark UDF returning integers, then attach the
# per-review word count to the Spark dataframe as a 'word_count' column.
count_words_udf = udf(count_words, returnType=IntegerType())

text_column = df_spark['text']
df_spark = df_spark.withColumn('word_count', count_words_udf(text_column))

In [29]:
# Descriptive statistics of the number of words per review
df_spark.select('word_count').summary().show()

+-------+------------------+
|summary|        word_count|
+-------+------------------+
|  count|           6990280|
|   mean|104.77632326602082|
| stddev| 97.92226559475209|
|    min|                 1|
|    25%|                42|
|    50%|                75|
|    75%|               133|
|    max|              1070|
+-------+------------------+



In [30]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

In [31]:
# Download the 'vader_lexicon' NLTK package (presumably required by
# SentimentIntensityAnalyzer — TODO confirm).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\Matías
[nltk_data]     Tejerina\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [32]:
# Single analyzer instance, referenced by get_sentiment_score below
sia = SentimentIntensityAnalyzer()

In [33]:
def get_sentiment_score(text):
    """Return the 'compound' polarity score of ``text``.

    Args:
        text: Review text, or None when the row has no text.

    Returns:
        The 'compound' entry of ``sia.polarity_scores(text)``, or None when
        ``text`` is None.
    """
    # The 'text' column is nullable: a missing review has no sentiment, so
    # propagate None (a null in the resulting column) instead of handing a
    # non-string value to the analyzer.
    if text is None:
        return None
    return sia.polarity_scores(text)['compound']

# Expose get_sentiment_score as a Spark UDF that returns doubles
udf_get_sentiment_score = udf(get_sentiment_score, DoubleType())

In [34]:
# Add a 'sentiment_score' column by applying the sentiment UDF to the 'text' column
df_final = df_spark.withColumn('sentiment_score', udf_get_sentiment_score('text'))

In [35]:
# Preview the dataframe with the added word_count and sentiment_score columns
df_final.show()

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+----------+---------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|word_count|sentiment_score|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+----------+---------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|       101|         0.8597|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|       151|         0.9858|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|        55|         0.9201|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQ

In [55]:
from pyspark.sql.functions import col, to_timestamp
# Parse the string 'date' column into a timestamp using the
# 'yyyy-MM-dd HH:mm:ss' pattern; the column is replaced under the same name.
df_final = df_final.withColumn('date', to_timestamp(col('date'), 'yyyy-MM-dd HH:mm:ss'))

In [56]:
# Check the resulting schema ('date' should now be a timestamp)
df_final.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- text: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- word_count: integer (nullable = true)
 |-- sentiment_score: double (nullable = true)

