In [6]:
import pandas as pd
# Load the dataset stored in 'df_modelo2.parquet' into the `data` DataFrame.
data = pd.read_parquet('df_modelo2.parquet')

## SENTIMENT ANALYSIS + Modelo de clasificación
*El propósito del modelo es predecir el sentimiento (positivo, negativo o neutral) de una nueva reseña basada en su contenido textual mediante un modelo de clasificación. Esto puede ayudar a un restaurante a monitorear y responder proactivamente a las opiniones de los clientes.*

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load the data from the parquet file
data = pd.read_parquet('df_modelo2.parquet')

# Replace None/NaN reviews with an empty string
data['review_text'] = data['review_text'].fillna('')

# Lowercase the text
data['review_text'] = data['review_text'].str.lower()

# Score every review with TextBlob's polarity.
# The scores are kept in their own variable so the `sentiment` column only
# ever holds the final string labels.
from textblob import TextBlob
polarity_scores = data['review_text'].apply(lambda x: TextBlob(x).sentiment.polarity)


def label_sentiment(polarity):
    """Map a polarity score to 'positive', 'negative' or 'neutral'.

    Returns the string 'None' when the score compares neither greater than,
    less than, nor equal to zero (e.g. NaN).
    """
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    if polarity == 0:
        return 'neutral'
    return 'None'


# Assign the sentiment labels.
# NOTE: these labels come from TextBlob, so the classifier below is trained
# to reproduce TextBlob's output from the review text.
data['sentiment'] = polarity_scores.apply(label_sentiment)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data['review_text'], data['sentiment'], test_size=0.2, random_state=42)

# TF-IDF vectorizer that turns the text into numeric vectors for the model.
# It is fitted on the training split only and then applied to the test split.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Multinomial Naive Bayes classifier
model = MultinomialNB()

# Train the classifier
model.fit(X_train_vectorized, y_train)

# Predict on the test set
y_pred = model.predict(X_test_vectorized)

# Build and print the classification report.
# The result is stored under its own name so that the imported
# `classification_report` function is not overwritten by a string and stays
# callable from later cells.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.91      0.06      0.12      2050
     neutral       0.50      0.00      0.00       516
    positive       0.86      1.00      0.92     14557

    accuracy                           0.86     17123
   macro avg       0.75      0.35      0.35     17123
weighted avg       0.85      0.86      0.80     17123



In [8]:
# Show the `data` DataFrame as the cell output.
data

Unnamed: 0,business_id,business_name,city,latitude,longitude,business_rating,review_text,review_stars,review_date,sentiment
0,aNtKyc2rr-uK5cqzY9TVQQ,Chipotle Mexican Grill,Largo,27.894167,-82.779866,3.0,the new chipotle mexican grill conveniently lo...,4,2017-08-30,positive
1,aNtKyc2rr-uK5cqzY9TVQQ,Chipotle Mexican Grill,Largo,27.894167,-82.779866,3.0,i am a chipotle enthusiast but from the first ...,5,2018-01-18,positive
2,aNtKyc2rr-uK5cqzY9TVQQ,Chipotle Mexican Grill,Largo,27.894167,-82.779866,3.0,went into this location about an hour before t...,5,2018-09-08,positive
3,aNtKyc2rr-uK5cqzY9TVQQ,Chipotle Mexican Grill,Largo,27.894167,-82.779866,3.0,i come to this chipotle often and it's one of ...,5,2019-06-24,positive
4,aNtKyc2rr-uK5cqzY9TVQQ,Chipotle Mexican Grill,Largo,27.894167,-82.779866,3.0,"the food is good, as per usual at any chipotle...",2,2018-06-20,positive
...,...,...,...,...,...,...,...,...,...,...
85608,,Taco Bus Miami,Miami,,,4.4,(translated by google) the best taco bus foodt...,5,2021-03-13,positive
85609,,Taco Bus Miami,Miami,,,4.4,(translated by google) very rich everything\n\...,5,2021-02-23,positive
85610,,Taco Bus Miami,Miami,,,4.4,(translated by google) little padding in the t...,3,2021-03-03,negative
85611,,Taco Bus Miami,Miami,,,4.4,(translated by google) excellent food in the b...,5,2020-09-11,positive


In [12]:
# Save the `data` DataFrame to a parquet file named df_modelo2_NLP.parquet
data.to_parquet('df_modelo2_NLP.parquet')

In [13]:
# Save the `data` DataFrame to a CSV file named df_modelo2_NLP.csv
data.to_csv('df_modelo2_NLP.csv')

In [None]:
# Score a couple of unseen reviews with the already-fitted vectorizer and model.
# The text gets the same preprocessing as the training data: missing values
# become empty strings and everything is lowercased.
new_data = pd.DataFrame({'review_text': ['This restaurant is amazing!','terrible' ]}).assign(
    review_text=lambda frame: frame['review_text'].fillna('').str.lower()
)

# Reuse the fitted TF-IDF vocabulary (transform only, no re-fitting).
new_data_vectorized = vectorizer.transform(new_data['review_text'])

# Predict one label per review and show them.
new_predictions = model.predict(new_data_vectorized)
print(new_predictions)

In [9]:
# Show the first 10 review texts whose sentiment label is 'negative'.
data.loc[data['sentiment'] == 'negative', 'review_text'].head(10)

6      the kids working here may still be "in trainin...
10     i always get disappointed when i go to chipotl...
14     this was a terrible experience i had ordered 2...
15     not great at all! cashier touched my food whil...
17     this chipotle location is the worst!  it's the...
77     for $8.30 (including $1 tip) i had two very ti...
128    i hate to be outnumbered here  but this place ...
146    in one sentence: a taco truck that's now in a ...
173    i have been living close to this taco joint fo...
183    the atmosphere here is neat, but i was put off...
Name: review_text, dtype: object

### RESULTADOS DE SENTIMENT ANALYSIS CON TEXTBLOB

In [10]:
# For the business_name 'Taco Bell', count how many reviews carry each
# sentiment label (positive, negative, neutral); the percentages are derived
# from these counts afterwards.
is_taco_bell = data['business_name'] == 'Taco Bell'
taco_bell_data = data.loc[is_taco_bell]
sentiment_counts = taco_bell_data['sentiment'].value_counts()
sentiment_counts

sentiment
positive    4270
negative    1693
neutral      679
Name: count, dtype: int64

In [11]:
# Express each sentiment count as a percentage of all counted reviews.
total_reviews = sentiment_counts.sum()
sentiment_percentages = sentiment_counts / total_reviews * 100
print(sentiment_percentages)

sentiment
positive    64.287865
negative    25.489310
neutral     10.222824
Name: count, dtype: float64
