In [1]:
import string

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from textblob import TextBlob

# Load the raw reviews dataset from the Parquet file in the working directory
df = pd.read_parquet(path='reviews_dataset.parquet')

# Keep only rows that have both a review text and a business name, then
# restrict to businesses with more than 5 such reviews.
df_restaurants = df[df['text'].notnull() & df['business_name'].notnull()]
restaurant_counts = df_restaurants['business_name'].value_counts()
filtered_restaurants = restaurant_counts[restaurant_counts > 5].index
# .copy() detaches the result from `df`, so the new columns added further down
# are written to an independent frame instead of a slice of the original
# (which is what raises pandas' SettingWithCopyWarning).
df_restaurants = df_restaurants[
    df_restaurants['business_name'].isin(filtered_restaurants)
].copy()

# Basic text cleaning
def clean_text(text):
    """Normalize a review into lowercase, alphabetic, non-stop-word tokens.

    Each whitespace-separated token has leading/trailing ASCII punctuation
    removed and is lower-cased *before* filtering, so that:

    * capitalized stop words ("The", "And") are recognized and dropped, and
    * words that merely touch punctuation ("delicious!", "clean,") are kept
      rather than being rejected by the ``isalpha()`` check.

    Tokens that still contain non-letters after stripping (e.g. "10/10",
    "don't") are discarded.

    Args:
        text: The raw review string.

    Returns:
        str: The surviving tokens joined by single spaces ("" if none survive).
    """
    normalized = (raw.strip(string.punctuation).lower() for raw in text.split())
    return " ".join(
        token
        for token in normalized
        if token.isalpha() and token not in ENGLISH_STOP_WORDS
    )

# Store the normalized version of every review alongside the raw text
df_restaurants['cleaned_text'] = df_restaurants['text'].map(clean_text)

# Sentiment-analysis helper
def analyze_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    The value is read from ``TextBlob(text).sentiment.polarity``; it runs from
    -1 (negative) to 1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# Score sentiment on the raw review text, not on 'cleaned_text': the cleaning
# step removes stop words, and scikit-learn's English list includes negations
# and intensifiers ("not", "no", "never", "very"), so "not good" would
# otherwise be scored as plain "good".
df_restaurants['sentiment'] = df_restaurants['text'].apply(analyze_sentiment)

# Keywords that signal each review characteristic
characteristics = {
    "Cleanliness": [
        "clean",
        "dirty",
        "spotless",
        "hygiene",
    ],
    "Customer Service": [
        "service",
        "staff",
        "friendly",
        "rude",
    ],
    "Food Quality": [
        "taste",
        "delicious",
        "bland",
        "flavor",
    ],
    "Presentation": [
        "presentation",
        "look",
        "beautiful",
        "messy",
    ],
    "Wait Time": [
        "wait",
        "slow",
        "quick",
        "fast",
    ],
}

# Scoring helper
def score_characteristics(text, characteristics):
    """Count, per characteristic, how many of its keywords occur in ``text``.

    Keywords are matched as whole whitespace-delimited words (or word
    sequences, for multi-word phrases), not as substrings, so "fast" does not
    fire on "breakfast" and "look" does not fire on "overlooked". The
    trade-off is that inflected forms ("waited", "cleaned") no longer match
    implicitly; add them to the keyword lists if they should count.

    Args:
        text: Whitespace-separated, already-normalized review text. Matching
            is case-sensitive and punctuation is not stripped here.
        characteristics: Mapping of characteristic name to an iterable of
            keywords or phrases.

    Returns:
        dict: Characteristic name -> number of entries from its keyword list
        that appear in ``text``. Each entry counts at most once, however many
        times it occurs.
    """
    # Collapse runs of whitespace and pad both ends so that " keyword " can
    # only match on word boundaries, including at the start and end of text.
    collapsed = " ".join(text.split())
    padded = f" {collapsed} "
    return {
        name: sum(1 for keyword in keywords if f" {keyword} " in padded)
        for name, keywords in characteristics.items()
    }

# Attach the per-characteristic keyword counts (one dict per review)
df_restaurants['scores'] = df_restaurants['cleaned_text'].apply(
    score_characteristics, args=(characteristics,)
)

In [None]:
# Preview only the first rows instead of emitting the whole frame as cell output
df_restaurants.head()