### Importamos las librerías necesarias para ejecutar nuestro modelo de análisis de sentimientos:

In [1]:
from pyspark.sql.functions import udf, when, col, concat
from pyspark.sql.types import *
from pyspark.sql import SparkSession

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# Download the VADER lexicon that SentimentIntensityAnalyzer loads.
nltk.download('vader_lexicon')

In [3]:
# Single analyzer instance, reused by get_sentiment_score for every review.
sia = SentimentIntensityAnalyzer()

In [4]:
def get_sentiment_score(text):
    """Return the VADER compound polarity score for a review text.

    Args:
        text: Review text to score. May be ``None`` (Spark hands SQL NULLs
            to Python UDFs as ``None``), empty, or whitespace-only.

    Returns:
        float: The ``'compound'`` entry of ``sia.polarity_scores(text)``,
        or ``0.0`` when there is no text to analyse.
    """
    # A missing or blank review carries no sentiment: score it as 0.0
    # instead of letting the analyzer raise on a None input and abort the
    # whole Spark job.
    if not text or not text.strip():
        return 0.0
    return sia.polarity_scores(text)['compound']

In [5]:
# Wrap the Python scorer as a Spark UDF. The scorer returns a float, so the
# UDF is declared with DoubleType rather than StringType: sentiment_score is
# then a numeric column instead of a stringified number that relies on
# implicit casts in later comparisons.
udf_get_sentiment_score = udf(get_sentiment_score, DoubleType())

In [7]:
# Schema applied when reading the reviews: one (column name, Spark type)
# pair per column, in column order.
schema = StructType([
    StructField(name, dtype)
    for name, dtype in (
        ("review_id", StringType()),
        ("user_id", StringType()),
        ("gmap_id", StringType()),
        ("rating", IntegerType()),
        ("text", StringType()),
        ("date", DateType()),
    )
])

### Cargamos las reseñas desde la capa gold:

In [8]:
# Glob matching everything under the gold-layer reviews folder.
path_reviews_gold = "abfss://datumtech@datumlake.dfs.core.windows.net/gold/GoogleMapsgold/reviews-estados-gold/*"
# Read the Parquet data using the explicit schema defined above.
df_reviews = spark.read.schema(schema).parquet(path_reviews_gold)

In [9]:
# Number of reviews loaded.
df_reviews.count()

In [10]:
# Column names and Spark types of the loaded reviews.
df_reviews.dtypes

In [11]:
# Score every review text, then bucket the score into a sentiment label:
#   score >= 0.3        -> 'positive'
#   -0.3 < score < 0.3  -> 'neutral'
#   anything else       -> 'negative'
df_reviews = (
    df_reviews
    .withColumn('sentiment_score', udf_get_sentiment_score(col('text')))
    .withColumn(
        'sentiment',
        when(col('sentiment_score') >= 0.3, 'positive')
        .when((col('sentiment_score') < 0.3) & (col('sentiment_score') > -0.3), 'neutral')
        .otherwise('negative'),
    )
)

In [12]:
# Keep only the review key and the two sentiment columns.
df_sentiment = df_reviews.select(
    col("review_id"),
    col("sentiment_score"),
    col("sentiment"),
)

In [14]:
# Inspect the column types before sentiment_score is cast to double.
df_sentiment.dtypes

In [15]:
# Make sure the score is stored as a double-precision number.
df_sentiment = df_sentiment.withColumn(
    "sentiment_score",
    df_sentiment["sentiment_score"].cast(DoubleType()),
)

In [16]:
# Destination folder for the sentiment table in the gold layer.
path_sentiment_gold = "abfss://datumtech@datumlake.dfs.core.windows.net/gold/GoogleMapsgold/sentiment-gold/review-sentiment"
# Overwrite any previous output: the default save mode raises when the path
# already exists, which makes the notebook fail on every re-run.
df_sentiment.write.format("parquet").mode("overwrite").save(path_sentiment_gold)

#### Para comprobar que todo se haya ejecutado de manera correcta, podemos traer cualquier archivo de la tabla gold y hacer algunas verificaciones.

In [17]:
# Read back the Parquet output that was just written.
df = spark.read.parquet(path_sentiment_gold)

In [19]:
# Preview the first 5 rows that were read back.
df.show(5)

In [20]:
# Row count of the data read back from path_sentiment_gold.
df.count()

In [21]:
# Disabled check: number of distinct review_id values in df_reviews
# (despite the name, this is a count, not a boolean).
#is_unique = df_reviews.select('review_id').distinct().count()