In [0]:
from pyspark.sql import functions as F

# Locations of the Bronze-layer Delta tables
bronze_business_path = "s3a://lakehouseyelp/bronze/business"
bronze_review_path = "s3a://lakehouseyelp/bronze/review"

# Maximum number of reviews kept for the downstream steps.
# NOTE: limit() without an ordering keeps an arbitrary subset of rows, so
# different runs may keep different reviews.
REVIEW_SAMPLE_SIZE = 5000

# Read the Bronze Delta tables
df_business = spark.read.format("delta").load(bronze_business_path)
df_review = spark.read.format("delta").load(bronze_review_path)

# Initial cleaning: drop businesses without a name and reviews without text
df_business_clean = df_business.where(F.col("name").isNotNull())
df_review_clean = df_review.where(F.col("text").isNotNull()).limit(REVIEW_SAMPLE_SIZE)

In [0]:
%sql
-- Preview five rows of the Bronze review table
SELECT *
FROM delta.`s3a://lakehouseyelp/bronze/review`
LIMIT 5

business_id,cool,date,funny,review_id,stars,text,useful,user_id,ingestion_timestamp
2oav5QoWgnvTI2gO5xFMjw,1,2014-05-10 15:01:11,0,Ztbn5JcWKOv8ZUjYw_hrbQ,5.0,"Having stayed here now countless times on business trips, the Monaco continues to be my go-to in Philadelphia. As one who travels over 100K miles globally each year, the Monaco is quite possibly my favorite hotel worldwide. In short, the Monaco boasts 1) large, clean, and beautifully-appointed rooms, 2) an excellent location near the best of Philly, and 3) most important, a motivated and extremely helpful staff. From Brad at the front door to Matthew at the front desk, this is a team of hospitality professionals. Quite literally, every single person I encounter in this hotel -- from the room service personnel to those serving wine at 5 PM -- is a genuine pleasure. I don't know who manages this property, but he or she is doing something very well indeed. I look forward to more future visits. There are many hotels that could learn a lot from the Monaco.",0,LZkKrVrSJ3L_B-bddIalag,2025-12-31T15:03:20.564Z
bjhCtlYHrkgA5Ku8l-rB3g,0,2013-05-13 18:11:16,0,XI6jtuTDemM0sHBUiSQU_w,3.0,"Breakfast was good. Offering gluten free bread is a plus. Not sure if Sustainable, Organic or Local. Probably not a cooperative or not sure how employee beneficial it is. No sign of accepting alternative currencies. The search continues. :)",1,P5T4eBDKKUiic9ZqRa_PJQ,2025-12-31T15:03:20.564Z
EpREWeEpmR8f1qLHzzF0AA,0,2017-03-06 18:51:30,0,NiD0iSS5_SCbaPH0OWPh_w,1.0,"I really liked the oatmeal stout, the ambiance, activity. We were there celebrating my wife's birthday along with friends, folks were ordering food, we ordered, 30-40 minutes later waitress comes by and says the order got lost did we still want our order, by now some friends were leaving, so we left in disgust.",0,QIvC4ATKjsFhCORvOeo29A,2025-12-31T15:03:20.564Z
W0vdz23JQtVQX5vJkiCj3g,1,2014-07-17 03:57:48,0,oDVYH3dDRtpXPCaNXVEVdg,4.0,"All right! Great drinks, great service & surprisingly good food. Tonight C&A really delivered. Thank you for a great night out. We'll be back!",0,YEety1WutYuODfNPvgqL6A,2025-12-31T15:03:20.564Z
SuQgcpL-aZeuyRjBdA0pHw,0,2015-08-11 12:20:26,0,FNg_F8PAf_jOTE7EkQiiGQ,5.0,"Samurai, has the best Sushi in Villanova. I am never disappointed when I eat there, there rolls are fresh, and unique. Their house salad the ""Ginger Dressing"" is always a perfect starter to the meal. My normal meal is the Deluxe Sushi Sampler, or a Bento Box both of which are amazing, I am not a tofu lover, but their miso soup is superb. The service is also great, very kind and it is very fast service. A small and quaint little restaurant, with amazing food and service . I have had Herosame, and it is nothing compared to Samurai. And I believe Samurai Sushi actually had better sushi then Mori Moto. Thank you Samurai!",0,ux4Jtc_FEeKod-x2hhuu2w,2025-12-31T15:03:20.564Z


In [0]:
%pip install transformers torch

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting torch
  Downloading torch-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl.metadata (30 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (4.1 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-

In [0]:
import os
from typing import Iterator

import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("string")
def predict_sentiment_udf(texts: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Label each review text with the sentiment predicted by DistilBERT (SST-2).

    Implemented as an Iterator-of-Series pandas UDF so that the Hugging Face
    pipeline is built once per invocation of this function and reused for every
    batch it receives, instead of being reloaded for each batch.

    Args:
        texts: Iterator over batches of review texts (one pandas Series per batch).

    Yields:
        One pandas Series per input batch, of the same length, holding the
        pipeline's ``label`` value for each text. Null inputs yield a null
        label instead of being passed to the model.
    """
    # 1. Configure the cache location BEFORE importing transformers so the
    #    library picks it up from the environment (no cache_dir argument needed).
    cache_dir = '/tmp/huggingface_cache'
    os.environ['HF_HOME'] = cache_dir
    os.environ['TRANSFORMERS_CACHE'] = cache_dir

    # exist_ok=True already covers the "directory exists" case.
    os.makedirs(cache_dir, exist_ok=True)

    # 2. Import lazily: the library only needs to exist where the UDF runs.
    from transformers import pipeline

    # 3. Build the classifier once; every batch below reuses it.
    classifier = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )

    # 4. Score batch by batch, keeping the output aligned with the input rows.
    for batch in texts:
        has_text = batch.notna()
        if has_text.any():
            results = classifier(batch[has_text].tolist(), truncation=True)
        else:
            # Empty or all-null batch: nothing to send to the model.
            results = []
        predicted = iter(res['label'] for res in results)
        yield pd.Series(
            [next(predicted) if present else None for present in has_text],
            dtype="object",
        )

# Attach the model's sentiment label to every cleaned review
sentiment_column = predict_sentiment_udf(df_review_clean["text"])
df_review_enriched = df_review_clean.withColumn("ai_sentiment", sentiment_column)

# Action that triggers the computation: show a small preview
sentiment_preview = df_review_enriched.select("text", "ai_sentiment").limit(5)
display(sentiment_preview)

text,ai_sentiment
"If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. The food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.",POSITIVE
"I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycle. From the nice, clean space and amazing bikes, to the welcoming and motivating instructors, every class is a top notch work out. For anyone who struggles to fit workouts in, the online scheduling system makes it easy to plan ahead (and there's no need to line up way in advanced like many gyms make you do). There is no way I can write this review without giving Russell, the owner of Body Cycle, a shout out. Russell's passion for fitness and cycling is so evident, as is his desire for all of his clients to succeed. He is always dropping in to classes to check in/provide encouragement, and is open to ideas and recommendations from anyone. Russell always wears a smile on his face, even when he's kicking your butt in class!",POSITIVE
"Family diner. Had the buffet. Eclectic assortment: a large chicken leg, fried jalapeño, tamale, two rolled grape leaves, fresh melon. All good. Lots of Mexican choices there. Also has a menu with breakfast served all day long. Friendly, attentive staff. Good place for a casual relaxed meal with no expectations. Next to the Clarion Hotel.",POSITIVE
"Wow! Yummy, different, delicious. Our favorite is the lamb curry and korma. With 10 different kinds of naan!!! Don't let the outside deter you (because we almost changed our minds)...go in and try something new! You'll be glad you did!",POSITIVE
"Cute interior and owner (?) gave us tour of upcoming patio/rooftop area which will be great on beautiful days like today. Cheese curds were very good and very filling. Really like that sandwiches come w salad, esp after eating too many curds! Had the onion, gruyere, tomato sandwich. Wasn't too much cheese which I liked. Needed something else...pepper jelly maybe. Would like to see more menu options added such as salads w fun cheeses. Lots of beer and wine as well as limited cocktails. Next time I will try one of the draft wines.",POSITIVE


In [0]:
# Count how many reviews received each AI-generated sentiment label
sentiment_counts = (
    df_review_enriched
    .groupBy("ai_sentiment")
    .count()
)

# Show the aggregated result
display(sentiment_counts)



com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:139)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:139)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:136)
	at scala.collection.immutable.Range.foreach(Range.scala:192)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:721)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:441)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:441)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
silver_base_path = "s3a://lakehouseyelp/silver/"

# Silver outputs, written in this order: cleaned businesses first,
# then the AI-enriched reviews.
silver_outputs = {
    "business_clean": df_business_clean,
    "review_enriched": df_review_enriched,
}

# Each DataFrame overwrites its Delta table under the Silver base path
for table_name, frame in silver_outputs.items():
    frame.write.format("delta").mode("overwrite").save(silver_base_path + table_name)

print("¡Capa Silver completada y datos enriquecidos con IA!")

¡Capa Silver completada y datos enriquecidos con IA!
