In [1]:
import pandas as pd
import numpy as np
import re
import ast

import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [2]:
# Load the cleaned reviews and the raw per-star summary for one restaurant
processed_data_path = '../data/processed/'
raw_data_path = '../data/raw/'

name = 'Oceana Grill'

# File names are derived from the restaurant name
reviews_file = f'{processed_data_path}{name}_reviews.csv'
resumme_file = f'{raw_data_path}resumme_{name}.csv'

reviews_pro = pd.read_csv(reviews_file)
resumme_raw = pd.read_csv(resumme_file)

# Work on copies so the frames as loaded stay available for comparison
reviews, resumme = reviews_pro.copy(), resumme_raw.copy()

# Quick look at what was loaded
display(resumme_raw)
display(reviews_pro.sample(5))

Unnamed: 0,stars,reviews
0,5,4012
1,4,1853
2,3,788
3,2,464
4,1,399


Unnamed: 0,rating_score,date,review,local_guide_reviews,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,recommendations_list,avg_price_per_person
2121,5,2014-10-21,Best place ever. Came here for the first time ...,1,Ate there,Dinner,20-30 €,5,4,5,[''],30.0
3005,5,2018-10-03,Wow we had an amazing waitress so kind and fri...,21,Ate there,,,5,4,5,[''],
5486,4,2018-11-18,I love Oceana and make it a must every time I ...,28,Take Away,Dinner,20-30 €,3,2,2,[''],30.0
6633,4,2011-07-12,Cool place to grab small bite and cold beer! W...,56,Ate there,,10-20 €,4,2,3,[''],20.0
5981,5,2019-09-14,Delicious meal and fun atmosphere! Thank you t...,22,Take Away,,,5,4,4,[''],


### First draft summary plots 

In [3]:
# Mean of each category score over all reviews
average_food, average_service, average_atmosphere = (
    reviews[score_column].mean()
    for score_column in ('food_score', 'service_score', 'atmosphere_score')
)
# Overall rating: star value weighted by the number of reviews with that star value
average_reviews = (resumme_raw['stars'] * resumme_raw['reviews']).sum() / resumme_raw['reviews'].sum()

# One row, three panels: headline number, review counts per star, category averages
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "xy"}, {"type": "bar"}, {"type": "bar"}]],
    subplot_titles=("Average Score", "Number of Reviews", "Categories"),
)

# Panel 1: the average rating rendered as large text on an otherwise empty axis
headline = go.Scatter(
    x=[0],
    y=[0],
    text=[f"{average_reviews:.2f}"],
    mode="text",
    textfont=dict(size=120),
)
fig.add_trace(headline, row=1, col=1)

blank_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
fig.update_xaxes(row=1, col=1, **blank_axis)
fig.update_yaxes(row=1, col=1, **blank_axis)

# Panel 2: horizontal bars with the number of reviews for each star value
reviews_per_star = go.Bar(
    x=resumme_raw['reviews'],
    y=resumme_raw['stars'],
    marker=dict(color='lightskyblue'),
    text=resumme_raw['reviews'],
    textposition='auto',
    name="Reviews",
    orientation='h',
)
fig.add_trace(reviews_per_star, row=1, col=2)

# Panel 3: horizontal bars with the average of each category score
category_values = [average_food, average_service, average_atmosphere]
category_bars = go.Bar(
    x=category_values,
    y=['Food', 'Service', 'Atmosphere'],
    marker=dict(color='lightgreen'),
    text=[f"{value:.2f}" for value in category_values],
    textposition='auto',
    orientation='h',
    name="Categories",
)
fig.add_trace(category_bars, row=1, col=3)

fig.update_layout(height=500, width=1200,  plot_bgcolor="white", paper_bgcolor="white", showlegend=False)
fig.show()

In [4]:
# Analysis windows. The subplot titles below are built from these values so the
# captions always describe the data that is actually plotted.
n_months, n_years, n_weeks = 12, 8, 5
score_columns = ['rating_score', 'food_score', 'service_score', 'atmosphere_score']

# Convert date column to datetime format (unparseable values become NaT)
reviews['date'] = pd.to_datetime(reviews['date'], errors='coerce')
reviews['month'] = reviews['date'].dt.to_period('M')
reviews['year'] = reviews['date'].dt.year
# Identify each week by the date of its Monday, formatted as YYYY-MM-DD
week_start = reviews['date'] - pd.to_timedelta(reviews['date'].dt.weekday, unit='d')
reviews['week'] = week_start.dt.strftime('%Y-%m-%d')

# Filter data for the last periods, counted back from the most recent review
# (not from today) so an older dataset still produces non-empty plots
limit_date = reviews['date'].max()
last_months = reviews[reviews['date'] >= limit_date - pd.DateOffset(months=n_months)]
last_years = reviews[reviews['date'] >= limit_date - pd.DateOffset(years=n_years)]
last_weeks = reviews[reviews['date'] >= limit_date - pd.DateOffset(weeks=n_weeks)]

# Compute averages for the required periods
monthly_avg_scores = last_months.groupby('month')[score_columns].mean()
yearly_avg_scores = last_years.groupby('year')[['rating_score']].mean()
weekly_avg_scores = last_weeks.groupby('week')[score_columns].mean()

# Readable trace names for each score column
label_mapping = {
    'rating_score': 'Rating',
    'food_score': 'Food',
    'service_score': 'Service',
    'atmosphere_score': 'Atmosphere'
}

# Create a figure with subplots using the Z-layout:
# one wide plot on the first row, two smaller plots on the second
fig = make_subplots(rows=2, cols=2,
                    specs=[[{"colspan": 2}, None],
                           [{}, {}]],
                    subplot_titles=(f"Monthly Score Trends (Last {n_months} Months)",
                                    f"Annual Rating Score Trends (Last {n_years} Years)",
                                    f"Weekly Score Trends (Last {n_weeks} Weeks)"))

# First colour (overall rating) is stronger; the category scores share a lighter tone
colors = ['#1f77b4', '#aec7e8', '#aec7e8', '#aec7e8']

# Add monthly score trends to the first row
for i, column in enumerate(monthly_avg_scores.columns):
    label = label_mapping[column]
    fig.add_trace(
        go.Scatter(x=monthly_avg_scores.index.astype(str), y=monthly_avg_scores[column],
                   mode='lines+markers', name=label,
                   text=[f"{label} - {val:.2f}" for val in monthly_avg_scores[column]],
                   hoverinfo="text", line=dict(color=colors[i])),
        row=1, col=1)

# Add yearly score trends to the second row (left).
# The year column is float when any date is NaT; cast to int so labels read "2017", not "2017.0".
yearly_labels = yearly_avg_scores.index.astype(int).astype(str)
fig.add_trace(
    go.Scatter(x=yearly_labels, y=yearly_avg_scores['rating_score'],
               mode='lines+markers', name="Rating", line=dict(color='#1f77b4', width=4),
               text=[f"Rating - {val:.2f}" for val in yearly_avg_scores['rating_score']],
               hoverinfo="text"),
    row=2, col=1)

# Add weekly score trends to the second row (right)
for i, column in enumerate(weekly_avg_scores.columns):
    label = label_mapping[column]
    fig.add_trace(
        go.Scatter(x=weekly_avg_scores.index.astype(str), y=weekly_avg_scores[column],
                   mode='lines+markers', name=label,
                   text=[f"{label} - {val:.2f}" for val in weekly_avg_scores[column]],
                   hoverinfo="text", line=dict(color=colors[i])),
        row=2, col=2)

# Enhance presentation: hide the legend, set size and margins, keep only horizontal gridlines
fig.update_layout(showlegend=False,
                  title="Score Trends Analysis",
                  title_font=dict(size=28),
                  margin=dict(l=50, r=50, t=100, b=50),
                  paper_bgcolor="white",
                  height=800, width=1200)
fig.update_xaxes(showline=False, showgrid=False)
fig.update_yaxes(showline=False, showgrid=True)

# Customize x-axes formatting: year only for the yearly graph, day and month for the weekly graph
fig.update_xaxes(
    tickformat="%Y",
    row=2, col=1
)

fig.update_xaxes(
    tickformat="%d-%b",
    row=2, col=2
)

# Annotate key points of the monthly rating line. They are computed from the data
# (instead of being pinned to fixed months) so they stay correct when the dataset changes.
monthly_rating = monthly_avg_scores['rating_score'].dropna()
annotation_style = dict(showarrow=True, arrowhead=2, ax=0, row=1, col=1, font=dict(size=14))
if not monthly_rating.empty:
    best_month = monthly_rating.idxmax()
    fig.add_annotation(x=str(best_month), y=float(monthly_rating[best_month]),
                       text="Highest Score", ay=80, **annotation_style)

    # Highlight the two steepest month-over-month declines, if any
    monthly_change = monthly_rating.diff()
    biggest_drops = monthly_change[monthly_change < 0].nsmallest(2)
    for drop_month in biggest_drops.index:
        fig.add_annotation(x=str(drop_month), y=float(monthly_rating[drop_month]),
                           text=f"Drop in {drop_month.strftime('%B')}", ay=-40, **annotation_style)

fig.update_traces(marker=dict(size=8), selector=dict(name="Rating"))
fig.update_layout(plot_bgcolor="white", paper_bgcolor="white")
fig.show()


### Cleaning and preprocessing

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

from tqdm import tqdm

# Download the NLTK resources used below: the stopword lists (used by clean_text)
# and the VADER lexicon (used by SentimentIntensityAnalyzer)
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Load spaCy Spanish model (small pipeline), used for tokenization and lemmatization
# NOTE(review): the cleaned samples printed by the next cells look like English text
# with mangled lemmas ("placir", "wherar") and the top-word lists are dominated by
# English stopwords ("the", "and") — presumably the Spanish model and stopword list
# are being applied to English reviews. Confirm the review language and, if needed,
# switch the model here and the stopword language in clean_text together.
nlp = spacy.load('es_core_news_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
# Clean text, remove stopwords and lemmatize tokens
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='spanish'):
    """Return the NLTK stopword list for `language` as a frozenset, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_text(text):
    """Normalize one review into a space-separated string of lemmas.

    The text is lower-cased, every character other than ASCII letters, the
    Spanish accented letters (á é í ó ú ñ ü), digits and whitespace is removed,
    and the remainder is run through the module-level spaCy pipeline `nlp`.
    Tokens that are Spanish stopwords, punctuation or whitespace are dropped;
    the lemma of each remaining token is kept.

    Parameters
    ----------
    text : str
        Raw review text. Non-string values (e.g. None or NaN) are treated as
        an empty review.

    Returns
    -------
    str
        The cleaned review; '' when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'[^a-záéíóúñü0-9\s]', '', text.lower())
    # Nothing to lemmatize: skip the (comparatively expensive) spaCy call
    if not text.strip():
        return ''

    doc = nlp(text)
    # The stopword list is cached; rebuilding it for every review is wasted work
    stop_words = _get_stop_words('spanish')
    tokens = [token.lemma_ for token in doc
              if token.text not in stop_words and not token.is_punct and not token.is_space]
    return ' '.join(tokens)

In [9]:
# Register tqdm's progress_apply on pandas objects, then clean every review
tqdm.pandas(desc="Cleaning Reviews")
raw_review_text = reviews['review'].fillna('')  # missing reviews are cleaned as empty strings
reviews['cleaned_review'] = raw_review_text.progress_apply(clean_text)

# Spot-check a few raw/cleaned pairs
review_comparison = reviews[['review', 'cleaned_review']]
display(review_comparison.sample(5))

Cleaning Reviews: 100%|██████████| 7516/7516 [01:48<00:00, 69.37it/s] 


Unnamed: 0,review,cleaned_review
2917,We took a bunch of our college students to exp...,we took bunch of our college students to exper...
6962,The food tastes Very chain restaurant-like. Ch...,the food tast very chain restaurantlike chilis...
28,"Has an amazing oyster, delicious food and grea...",an amazing oyster delicious food and great var...
7166,Where do I start?!!!! DISAPPOINTING!!!!! Pleas...,wherar do i start disappointing please dont co...
1457,Absolutely love to visit this place every time...,absolutely lovir to visit this placir every ti...


### Embeddings and sentiment analysis

In [10]:
# Embeddings and Sentiment
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch

# Word Clouds and Visualization
from wordcloud import WordCloud


#### Analyze sentiment

In [11]:
# Score each review with VADER and derive a sentiment label
def analyzeSentiment(df):
    """Add VADER scores and a sentiment label to a reviews frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'cleaned_review' (text) and 'rating_score'.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` (the input is not modified) with two extra columns:
        'vader_sentiment', the VADER compound score of 'cleaned_review', and
        'sentiment_label', one of 'positive', 'neutral' or 'negative'.

    The label is driven by the star rating first: >= 4 is positive, <= 2 is
    negative. Only for the remaining rows does the VADER compound score decide:
    > 0.05 positive, < -0.05 negative, otherwise neutral.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect
    df = df.copy()

    # Initialize VADER sentiment analyzer
    sia = SentimentIntensityAnalyzer()

    # Missing reviews are scored as empty text instead of crashing the analyzer
    texts = df['cleaned_review'].fillna('').astype(str)
    df['vader_sentiment'] = texts.map(lambda text: sia.polarity_scores(text)['compound'])

    rating = df['rating_score']
    vader = df['vader_sentiment']
    # np.select picks the first matching condition, mirroring an if/elif chain
    conditions = [rating >= 4, rating <= 2, vader > 0.05, vader < -0.05]
    choices = ['positive', 'negative', 'positive', 'negative']
    df['sentiment_label'] = np.select(conditions, choices, default='neutral')

    return df

# Shared implementation for the word and n-gram frequency helpers below
def _topNgramCounts(df, sentiment_label, ngram_range, top_n):
    """Return the `top_n` most frequent n-grams among reviews with `sentiment_label`.

    Counts are total occurrences across the selected 'cleaned_review' texts,
    returned as a list of (ngram, count) tuples sorted by descending count.
    An empty list is returned when the label has no usable text.
    """
    selected = df.loc[df['sentiment_label'] == sentiment_label, 'cleaned_review']
    documents = selected.fillna('').astype(str).tolist()

    # CountVectorizer raises on an empty corpus; report "no terms" instead
    if not any(document.strip() for document in documents):
        return []

    vectorizer = CountVectorizer(ngram_range=ngram_range)
    try:
        count_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # Raised when no token survives tokenization (empty vocabulary)
        return []

    totals = np.asarray(count_matrix.sum(axis=0)).ravel()
    frequencies = [(term, int(totals[idx])) for term, idx in vectorizer.vocabulary_.items()]
    return sorted(frequencies, key=lambda pair: pair[1], reverse=True)[:top_n]


# Extract most common words for a selected sentiment
def extractCommonWords(df, sentiment_label='positive', n=10):
    """Return the `n` most frequent single words for one sentiment label.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'sentiment_label' and 'cleaned_review'.
    sentiment_label : str
        Label whose reviews are analysed.
    n : int
        Number of words to return.

    Returns
    -------
    list of (str, int)
        (word, count) pairs, most frequent first; [] if there is no text.
    """
    return _topNgramCounts(df, sentiment_label, ngram_range=(1, 1), top_n=n)


# Extract most common n-grams for a selected sentiment
def extractCommonNgrams(df, sentiment_label='positive', n=2, top_n=10):
    """Return the `top_n` most frequent word n-grams for one sentiment label.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'sentiment_label' and 'cleaned_review'.
    sentiment_label : str
        Label whose reviews are analysed.
    n : int
        Size of the n-grams (2 = bigrams).
    top_n : int
        Number of n-grams to return.

    Returns
    -------
    list of (str, int)
        (ngram, count) pairs, most frequent first; [] if there is no text.
    """
    return _topNgramCounts(df, sentiment_label, ngram_range=(n, n), top_n=top_n)

# Score every review with VADER and attach a sentiment label
reviews = analyzeSentiment(reviews)

# Ten most frequent single words for each polarity
common_positive_words, common_negative_words = (
    extractCommonWords(reviews, sentiment_label=polarity, n=10)
    for polarity in ('positive', 'negative')
)

print("Top Positive Words:", common_positive_words)
print("Top Negative Words:", common_negative_words)

# Ten most frequent bigrams for each polarity
common_positive_bigrams, common_negative_bigrams = (
    extractCommonNgrams(reviews, sentiment_label=polarity, n=2, top_n=10)
    for polarity in ('positive', 'negative')
)

print("Top Positive Bigrams:", common_positive_bigrams)
print("Top Negative Bigrams:", common_negative_bigrams)

Top Positive Words: [('the', 26133), ('and', 18364), ('was', 15166), ('to', 8506), ('we', 7159), ('of', 6398), ('it', 6260), ('in', 5176), ('for', 5075), ('had', 4958)]
Top Negative Words: [('the', 5250), ('and', 3015), ('was', 2797), ('to', 2113), ('it', 1443), ('of', 1306), ('we', 1209), ('in', 1078), ('food', 1003), ('for', 992)]
Top Positive Bigrams: [('it was', 2441), ('and the', 1936), ('the food', 1907), ('had the', 1505), ('new orleans', 1455), ('this placir', 1374), ('of the', 1299), ('food was', 1297), ('in the', 1149), ('the best', 1133)]
Top Negative Bigrams: [('it was', 471), ('the food', 399), ('this placir', 311), ('and the', 296), ('in the', 295), ('of the', 276), ('on the', 223), ('food was', 219), ('new orleans', 186), ('we were', 180)]


In [13]:
# Plot how the share of each sentiment label evolves month by month
def plotSentimentTrend(df, years_limit = 2):
    """Show a stacked area chart of monthly sentiment percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'date' and 'sentiment_label'. The frame is not modified.
    years_limit : int
        Length of the window, in years, counted back from the most recent date.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`. If no row has a valid date,
        a message is printed and nothing is plotted.
    """
    # Parse dates on a derived frame so the caller's DataFrame is left untouched
    dated = df.assign(date=pd.to_datetime(df['date'], errors='coerce')).dropna(subset=['date'])
    if dated.empty:
        print("No reviews with a valid date to plot.")
        return

    # Keep only the last `years_limit` years
    window_start = dated['date'].max() - pd.DateOffset(years=years_limit)
    recent = dated[dated['date'] >= window_start]

    # Bucket by calendar month, labelled with the month-end date. Going through
    # periods avoids the resample alias that changed name between pandas versions.
    month_end = recent['date'].dt.to_period('M').dt.to_timestamp(how='end').dt.normalize()
    sentiment_counts = pd.crosstab(month_end, recent['sentiment_label'])

    # Calculate the percentage for each sentiment type
    sentiment_percentage = sentiment_counts.div(sentiment_counts.sum(axis=1), axis=0) * 100
    sentiment_percentage = sentiment_percentage.round(2)
    sentiment_percentage = sentiment_percentage.reset_index().melt(id_vars=['date'], value_name='percentage', var_name='sentiment_label')

    # The title reflects the window that was actually requested
    year_word = 'Year' if years_limit == 1 else 'Years'

    # Plot sentiment percentage evolution
    fig = px.area(
        sentiment_percentage,
        x='date',
        y='percentage',
        color='sentiment_label',
        title=f'Sentiment Percentage Over the Last {years_limit} {year_word}',
        labels={'date': '', 'percentage': 'Percentage of Reviews (%)', 'sentiment_label': 'Sentiment'},
        template='plotly_white',
    )

    # Customize layout
    fig.update_layout(
        title=dict(x=0.5, xanchor='center', font=dict(size=18, color='black')),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=True, title='Percentage of Reviews', ticksuffix='%'),
        legend=dict(title='', orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
        margin=dict(l=20, r=20, t=50, b=20),
        plot_bgcolor='rgba(0,0,0,0)',
        hovermode='x unified',
        width=1200,
        height=400,
    )

    # Customize color for sentiment categories (grey fallback for unexpected labels)
    color_map = {
        'positive': 'rgba(102, 194, 165, 0.7)',
        'neutral': 'rgba(141, 160, 203, 0.7)',
        'negative': 'rgba(252, 141, 98, 0.7)'
    }
    fig.for_each_trace(lambda trace: trace.update(line=dict(width=0, shape='spline'), fill='tonexty', fillcolor=color_map.get(trace.name, 'rgba(150, 150, 150, 0.5)')))

    # Remove the plot frame and keep the visualization as clean as possible
    fig.update_xaxes(showline=False)
    fig.update_yaxes(showline=False, range=[0, 100])  # Percentage scale from 0 to 100

    fig.show()

# Render the monthly sentiment-share chart for all reviews, using the default years_limit
plotSentimentTrend(reviews)


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



In [15]:

# Extract the most and least mentioned recommendations
def analyzeRecommendations(df):
    """Count how often each dish appears in the 'recommendations_list' column.

    Each non-null cell is expected to hold a list of dish names, either as a
    real list/tuple or as its string representation (e.g. "['a', 'b']").
    Cells that cannot be parsed, entries that are not strings and blank names
    are ignored. Dish names are stripped of surrounding whitespace before counting.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column 'recommendations_list'.

    Returns
    -------
    tuple (list, list)
        - the three most mentioned dishes as (dish, count) pairs, and
        - the names of every dish tied for the lowest mention count.
        Both lists are empty when no dish is found.
    """
    all_dishes = []

    for item in df['recommendations_list'].dropna():
        if isinstance(item, str):
            try:
                # literal_eval only evaluates Python literals, never arbitrary code
                dishes = ast.literal_eval(item)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Not a valid literal: skip this cell rather than abort the analysis
                continue
        else:
            dishes = item

        if not isinstance(dishes, (list, tuple)):
            continue

        for dish in dishes:
            # Guard against non-text entries, which have no .strip()
            if isinstance(dish, str) and dish.strip():
                all_dishes.append(dish.strip())

    # Count the frequency of each dish
    dish_counts = Counter(all_dishes)
    if not dish_counts:
        return [], []

    # Most mentioned dishes, and all dishes tied for the fewest mentions
    most_common_dishes = dish_counts.most_common(3)
    min_count = min(dish_counts.values())
    worst_dishes = [dish for dish, count in dish_counts.items() if count == min_count]

    return most_common_dishes, worst_dishes

# Summarise the recommended dishes across all reviews.
# The second list holds every dish tied for the fewest mentions.
most_recommended, less_recommended = analyzeRecommendations(reviews)
print("Top Most Recommended:", most_recommended)
print("Least Recommended :", less_recommended)

Top Most Recommended: []
Least Recommended : []


#### Calculate embeddings

In [None]:
# Turn one cleaned review into a fixed-size vector
def get_embedding(text):
    """Embed `text` by mean-pooling the model's last hidden state.

    Uses the module-level `tokenizer` and `model`. The text is truncated to at
    most 512 tokens, the forward pass runs without gradient tracking, and the
    token vectors are averaged over the sequence dimension (dim=1).

    Returns a NumPy array with the singleton dimensions squeezed out.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Load the pretrained multilingual BERT tokenizer and model (same checkpoint for both)
bert_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Embed every cleaned review, with a progress bar
tqdm.pandas(desc="Generating Embeddings")
cleaned_reviews = reviews['cleaned_review']
reviews['embedding'] = cleaned_reviews.progress_apply(get_embedding)

Generating Embeddings:   5%|▍         | 357/7516 [01:18<19:15,  6.19it/s]  

#### Analyze embeddings

In [13]:
from sklearn.decomposition import PCA

# Project the review embeddings to 2-D with PCA and plot them
def visualizeEmbeddingsPCA(df):
    """Scatter-plot a 2-component PCA of the 'embedding' column, coloured by rating.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'embedding' and 'rating_score'. If a 'review_id'
        column exists it is shown on hover; otherwise a running number is used.

    Returns
    -------
    numpy.ndarray
        The 2-D PCA coordinates, one row per review.
    """
    # Stack the per-review vectors into one matrix
    embedding_matrix = np.array(df['embedding'].tolist())

    # Reduce to two principal components
    projector = PCA(n_components=2)
    reduced_embeddings = projector.fit_transform(embedding_matrix)

    # Share of variance captured by each component, in percent
    var1, var2 = projector.explained_variance_ratio_ * 100

    # Frame consumed by Plotly
    plot_df = pd.DataFrame({
        'PCA Component 1': reduced_embeddings[:, 0],
        'PCA Component 2': reduced_embeddings[:, 1],
        'Rating Score': df['rating_score'],
        'Review ID': df.get('review_id', range(len(df)))  # Optional identifier
    })

    axis_labels = {
        'PCA Component 1': f'PCA 1 ({var1:.1f}% variance)',
        'PCA Component 2': f'PCA 2 ({var2:.1f}% variance)',
        'Rating Score': 'Rating Score'
    }

    # Interactive scatter plot
    fig = px.scatter(
        plot_df,
        x='PCA Component 1',
        y='PCA Component 2',
        color='Rating Score',
        color_continuous_scale='Viridis',
        hover_data=['Review ID', 'Rating Score'],
        title=f'Embeddings by Rating Score (PCA 1: {var1:.1f}%, PCA 2: {var2:.1f}%)',
        labels=axis_labels
    )

    # Layout tweaks for readability
    colorbar_options = dict(title='Rating Score', tickmode='linear')
    fig.update_layout(
        template='plotly_white',
        coloraxis_colorbar=colorbar_options,
        hovermode='closest'
    )

    fig.show()
    return reduced_embeddings

# Plot the PCA projection and keep the 2-D coordinates it returns
embeddings_pca = visualizeEmbeddingsPCA(reviews)

import umap.umap_ as umap

# UMAP Embeddings Visualization
def visualizeEmbeddingsUMAP(df):
    """Scatter-plot a 2-D UMAP projection of the 'embedding' column, coloured by sentiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'embedding' and 'sentiment_label'.

    Returns
    -------
    numpy.ndarray
        The 2-D UMAP coordinates, one row per review, in the row order of `df`.
    """
    embeddings = np.array(df['embedding'].tolist())

    # Reduce dimensionality with UMAP (fixed random_state for repeatable layouts)
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    reduced_embeddings = reducer.fit_transform(embeddings)

    # Create DataFrame for visualization
    viz_df = pd.DataFrame(reduced_embeddings, columns=['x', 'y'])
    # Attach labels by position: viz_df has a fresh 0..n-1 index, so assigning the
    # Series directly would align on index and mislabel rows when df's index differs
    viz_df['sentiment_label'] = df['sentiment_label'].to_numpy()

    # Scatter plot with Plotly for interactive visualization
    fig = px.scatter(
        viz_df,
        x='x',
        y='y',
        color='sentiment_label',
        title='Embedding Visualization with UMAP',
        labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'},
        color_discrete_map={'positive': 'green', 'neutral': 'gray', 'negative': 'red'},
        opacity=0.7
    )
    fig.update_layout(showlegend=True, legend=dict(title='Sentiment'), margin=dict(l=10, r=10, t=40, b=10))
    fig.show()

    return reduced_embeddings

# Plot the UMAP projection and keep the 2-D coordinates it returns
embeddings_umap = visualizeEmbeddingsUMAP(reviews)




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [14]:
from sklearn.neighbors import NearestNeighbors

# Plot the k-distance curve used to choose DBSCAN's eps
def plotKdistance(reduced_embeddings, k=5, method='PCA'):
    """Draw the sorted k-distance curve of a set of points.

    Parameters
    ----------
    reduced_embeddings : array-like
        Point coordinates, one row per point.
    k : int
        Number of neighbours requested from the nearest-neighbour search.
    method : str
        Name shown in the plot title (which projection the points come from).

    The figure is displayed; nothing is returned.
    """
    # Query the fitted points against themselves and keep the last of the k distances.
    # NOTE(review): because the query set is the fitted set, each point presumably
    # appears as its own first neighbour at distance 0 — confirm this is intended.
    knn = NearestNeighbors(n_neighbors=k).fit(reduced_embeddings)
    distances, _ = knn.kneighbors(reduced_embeddings)
    k_distances = np.sort(distances[:, k-1])

    # Single line: rank of the point (1-based) against its k-distance
    point_ranks = np.arange(1, len(k_distances) + 1)
    curve = go.Scatter(
        x=point_ranks,
        y=k_distances,
        mode='lines',
        line=dict(color='blue'),
        name='k-distance'
    )
    fig = go.Figure()
    fig.add_trace(curve)

    # Titles and hover behaviour
    fig.update_layout(
        title=f'k-Distance Graph for {method}',
        xaxis_title='Points sorted by distance',
        yaxis_title=f'Distance to {k}th Nearest Neighbor',
        template='plotly_white',
        hovermode='x unified'
    )

    # Light grid lines on both axes
    grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgrey')
    fig.update_xaxes(**grid_style)
    fig.update_yaxes(**grid_style)

    fig.show()

# k-distance curves for both projections; `method` only sets the plot title,
# so it must name the projection that is actually passed in
plotKdistance(embeddings_umap, k= 10, method='UMAP')
plotKdistance(embeddings_pca, k= 10, method='PCA')

In [15]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Cluster 2-D points with DBSCAN after standardising them
def apply_dbscan(reduced_embeddings, eps=0.6, min_samples=5):
    """Standardise the coordinates with StandardScaler and return DBSCAN labels.

    `eps` and `min_samples` are passed straight to DBSCAN; because the data is
    scaled first, `eps` is expressed in scaled units rather than in the original
    coordinate units.
    """
    standardized = StandardScaler().fit_transform(reduced_embeddings)
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    return clusterer.fit_predict(standardized)

# PCA Visualization with DBSCAN
def visualizeEmbeddingsPCA_with_DBSCAN(df, eps=0.55, min_samples=10):
    """Project embeddings with PCA, cluster the 2-D points with DBSCAN and plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'embedding' and 'rating_score'. If a 'review_id'
        column exists it is reused; otherwise a running number is generated.
    eps, min_samples
        Passed to `apply_dbscan`.

    Returns
    -------
    pandas.DataFrame
        One row per review, in the row order of `df`, with the columns
        'pca_component_1', 'pca_component_2', 'rating_score', 'pca_cluster'
        and 'review_id'.
    """
    embeddings = np.array(df['embedding'].tolist())
    ratings = df['rating_score']

    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)
    # Share of variance captured by each component, in percent
    var1, var2 = pca.explained_variance_ratio_ * 100

    clusters = apply_dbscan(reduced, eps, min_samples)

    plot_df = pd.DataFrame({
        'pca_component_1': reduced[:, 0],
        'pca_component_2': reduced[:, 1],
        'rating_score': ratings,
        'pca_cluster': clusters,
        'review_id': df.get('review_id', range(len(df)))
    })

    fig = px.scatter(
        plot_df,
        x='pca_component_1',
        y='pca_component_2',
        color='pca_cluster',
        color_continuous_scale='Viridis',
        hover_data=['review_id', 'rating_score'],
        title=f'PCA with DBSCAN (PCA1: {var1:.1f}%, PCA2: {var2:.1f}%)',
        # `labels` maps column name -> display text (keys must be the column names)
        labels={
            'pca_component_1': f'PCA 1 ({var1:.1f}% variance)',
            'pca_component_2': f'PCA 2 ({var2:.1f}% variance)',
            'pca_cluster': 'Cluster'
        }
    )

    fig.update_layout(
        template='plotly_white',
        coloraxis_colorbar=dict(title='Cluster'),
        hovermode='closest'
    )

    fig.show()
    return plot_df

# UMAP Visualization with DBSCAN
def visualizeEmbeddingsUMAP_with_DBSCAN(df, eps=0.7, min_samples=10):
    """Project embeddings with UMAP, cluster the 2-D points with DBSCAN and plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'embedding' and 'sentiment_label'. If a 'review_id'
        column exists it is reused; otherwise a running number is generated.
    eps, min_samples
        Passed to `apply_dbscan`.

    Returns
    -------
    pandas.DataFrame
        One row per review, in the row order of `df`, with the columns
        'umap_component_1', 'umap_component_2', 'sentiment', 'umap_cluster'
        and 'review_id'.
    """
    embeddings = np.array(df['embedding'].tolist())
    sentiment = df['sentiment_label']

    # Fixed random_state for repeatable layouts
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    reduced = reducer.fit_transform(embeddings)

    clusters = apply_dbscan(reduced, eps, min_samples)

    plot_df = pd.DataFrame({
        'umap_component_1': reduced[:, 0],
        'umap_component_2': reduced[:, 1],
        'sentiment': sentiment,
        'umap_cluster': clusters,
        'review_id': df.get('review_id', range(len(df)))
    })

    fig = px.scatter(
        plot_df,
        x='umap_component_1',
        y='umap_component_2',
        color='umap_cluster',
        color_continuous_scale='Viridis',
        hover_data=['sentiment', 'umap_cluster'],
        title='UMAP with DBSCAN',
        # `labels` maps column name -> display text (keys must be the column names)
        labels={
            'umap_component_1': 'UMAP 1',
            'umap_component_2': 'UMAP 2',
            'umap_cluster': 'Cluster'
        },
        opacity=0.7
    )

    fig.update_layout(
        showlegend=True,
        legend=dict(title='umap_cluster'),
        margin=dict(l=10, r=10, t=40, b=10)
    )

    fig.show()
    return plot_df

# Visualize with DBSCAN clusters and keep the per-review cluster tables.
# eps and min_samples are passed explicitly here, overriding the function defaults.
pca_clusters = visualizeEmbeddingsPCA_with_DBSCAN(reviews, eps=0.5, min_samples=5)
umap_clusters = visualizeEmbeddingsUMAP_with_DBSCAN(reviews, eps=0.5, min_samples=5)




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [16]:
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Plot reviews by communities, using embeddings, cosine similarity and the Girvan-Newman algorithm
def plotCommunities(reviews):
    """Draw a similarity graph of the reviews, coloured by detected community.

    Steps:
    1. Build a graph with one node per review and an edge between every pair
       whose embedding cosine similarity is at least 0.75.
    2. Take the first split produced by the Girvan-Newman algorithm as the communities.
    3. Describe each community with up to three TF-IDF terms (English stop
       words removed); a community with fewer than two usable texts is
       described by the cleaned text of its first review instead.
    4. Plot nodes at spring-layout positions, coloured per community.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Needs the columns 'embedding', 'sentiment_label' and 'cleaned_review'.

    The figure is displayed; nothing is returned.
    """
    if len(reviews) == 0:
        print("No reviews to plot.")
        return

    # Load embeddings from reviews
    ebm_reviews = np.array(reviews['embedding'].tolist())

    # Calculate cosine similarity matrix between all pairs of embeddings
    similarity_matrix = cosine_similarity(ebm_reviews)
    similarity_threshold = 0.75

    G_sparser = nx.Graph()

    # Add nodes representing each review (node id = row position)
    G_sparser.add_nodes_from(
        (position, {'sentiment_label': label})
        for position, label in enumerate(reviews['sentiment_label'].tolist())
    )

    # Add an edge for every pair above the threshold. Only the strict upper
    # triangle is scanned so each pair is considered once and self-pairs are skipped;
    # doing it in NumPy avoids a Python loop over every pair of reviews.
    rows, cols = np.nonzero(np.triu(similarity_matrix >= similarity_threshold, k=1))
    G_sparser.add_weighted_edges_from(
        zip(rows.tolist(), cols.tolist(), similarity_matrix[rows, cols].tolist())
    )

    # Use Girvan-Newman algorithm to detect communities (first level of the hierarchy)
    comp = nx.algorithms.community.girvan_newman(G_sparser)
    communities_sparser = tuple(sorted(c) for c in next(comp))

    # Extract key terms from each community using TF-IDF
    vectorizer = TfidfVectorizer(max_features=3, stop_words='english')
    # The analyzer applies tokenization AND stop-word removal, so a text passes
    # the filter below only if it contributes at least one vocabulary term
    analyze = vectorizer.build_analyzer()
    community_keywords = []

    for community in communities_sparser:
        members = list(community)
        reviews_text = reviews.iloc[members]['cleaned_review'].astype(str).tolist()
        usable_texts = [text for text in reviews_text if len(analyze(text)) > 0]
        if len(usable_texts) > 1:
            vectorizer.fit(usable_texts)
            community_keywords.append(", ".join(vectorizer.get_feature_names_out()))
        else:
            community_keywords.append(reviews.iloc[members[0]]['cleaned_review'])

    # Prepare data for Plotly interactive visualization
    node_x = []
    node_y = []
    node_color = []
    node_text = []

    pos = nx.spring_layout(G_sparser, seed=42)
    colors = px.colors.qualitative.Set1  # A set of distinct colors, reused cyclically

    # Extract node positions, colors, and labels for Plotly
    for i, community in enumerate(communities_sparser):
        for node in community:
            x, y = pos[node]
            node_x.append(x)
            node_y.append(y)
            node_color.append(colors[i % len(colors)])
            node_text.append(f"{community_keywords[i]}")

    # Create edge traces; None separates the segments so they are drawn as individual lines
    edge_x = []
    edge_y = []

    for source, target in G_sparser.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Create the Plotly figure
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='gray'),
        hoverinfo='none',
        mode='lines')

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            size=10,
            line_width=2,
            color=node_color
        )
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # title.font replaces the deprecated layout.titlefont attribute
                        title=dict(text='Reviews by Communities', font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    fig.show()

# Build and display the community graph for all reviews
plotCommunities(reviews)

In [17]:
### Join PCA and UMAP clusters info to reviews
# Both cluster tables were built from `reviews` and keep its row order, so the
# labels are attached by position. A key-less merge is not usable here: it joins
# on whatever columns the frames share, which fails when `reviews` has no
# 'review_id' column and joins on the cluster labels themselves if this cell is re-run.
assert len(pca_clusters) == len(reviews) == len(umap_clusters), \
    "cluster tables are out of sync with reviews; re-run the DBSCAN cell"
reviews = reviews.assign(
    pca_cluster=pca_clusters['pca_cluster'].to_numpy(),
    umap_cluster=umap_clusters['umap_cluster'].to_numpy(),
)

#### Topics

In [18]:
from gensim import corpora
from gensim.models import LdaModel

# Fit an LDA topic model on the cleaned reviews
def analyzeTopicsLDA(df, number_of_topics = 5):
    """Train an LDA model on 'cleaned_review' and print its topics.

    Reviews are split on whitespace; null and blank reviews are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column 'cleaned_review'.
    number_of_topics : int
        Number of topics to fit.

    Returns
    -------
    tuple
        (lda_model, topics), where topics is the result of
        `print_topics(num_words=5)`. When there is nothing to train on, or
        training raises ValueError, a message is printed and (None, []) is returned.
    """
    # Whitespace-tokenize every non-blank review
    tokenized_reviews = [
        review.split()
        for review in df['cleaned_review'].dropna().tolist()
        if isinstance(review, str) and review.strip() != ''
    ]
    if len(tokenized_reviews) == 0:
        print("No valid reviews to process.")
        return None, []

    # Map each distinct token to an id
    dictionary = corpora.Dictionary(tokenized_reviews)
    if len(dictionary) == 0:
        print("Dictionary is empty after tokenization.")
        return None, []

    # Bag-of-words representation of every review
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_reviews]
    if not any(corpus):
        print("Corpus is empty. No terms found in any document.")
        return None, []

    # Train LDA model (fixed random_state for repeatable topics)
    lda_settings = dict(
        num_topics=number_of_topics,
        id2word=dictionary,
        passes=10,
        random_state=42
    )
    try:
        lda_model = LdaModel(corpus, **lda_settings)
    except ValueError as e:
        print(f"LDA Model training failed: {e}")
        return None, []

    # Print every topic with its five top words
    topics = lda_model.print_topics(num_words=5)
    for topic in topics:
        topic_id, topic_terms = topic[0], topic[1]
        print(f"Topic {topic_id}: {topic_terms}")
    return lda_model, topics

# Topics over the whole review set, using the default number of topics
print('=== General topics ===')
lda_model, topics = analyzeTopicsLDA(reviews)

=== General topics ===
Topic 0: 0.024*"hamburguesa" + 0.016*"bien" + 0.013*"buena" + 0.011*"lugar" + 0.010*"bonito"
Topic 1: 0.018*"hamburguesa" + 0.009*"poder" + 0.008*"hacer" + 0.008*"lugar" + 0.008*"genial"
Topic 2: 0.023*"buen" + 0.019*"hamburguesa" + 0.014*"servicio" + 0.013*"comida" + 0.011*"carta"
Topic 3: 0.018*"hamburguesa" + 0.018*"buen" + 0.011*"sitio" + 0.011*"excelente" + 0.010*"mejor"
Topic 4: 0.032*"buen" + 0.022*"comida" + 0.021*"lugar" + 0.020*"hamburguesa" + 0.012*"ambiente"


In [19]:
# Generate topics for all selected columns in group columns
def generateTopicsbyColumn(reviews, group_columns):
    """Run analyzeTopicsLDA on every value of every grouping column.

    Parameters
    ----------
    reviews : DataFrame holding the grouping columns (and whatever
        analyzeTopicsLDA reads from it).
    group_columns : iterable of column names to group by.

    Returns
    -------
    dict mapping each grouping column to a dict {group value: [topic strings]}.
    Groups with fewer than 5 rows, or for which no topics were produced, are
    left out of the inner dict.
    """
    # One (initially empty) result dict per grouping column
    topics_dict = {}
    for column in group_columns:
        topics_dict[column] = {}

    for group_col in group_columns:
        print(f"\n=== Topics by {group_col} ===")

        for group_val in reviews[group_col].dropna().unique():
            subset = reviews[reviews[group_col] == group_val]
            print(f"\n--- {group_col} = {group_val} ---")

            # Too few reviews to train a topic model on
            if len(subset) < 5:
                print("Not enough data to train LDA.")
                continue

            lda_model, topics = analyzeTopicsLDA(subset)
            if lda_model is None or not topics:
                print("No topics generated for this group.\n")
                continue

            # Keep only the textual description of each topic
            topics_dict[group_col][group_val] = [topic[1] for topic in topics]

    return topics_dict

# Columns whose distinct values define the review subsets that get their own topic model.
group_columns = ['pca_cluster', 'umap_cluster', 'sentiment_label']
topics_dict = generateTopicsbyColumn(reviews, group_columns)


=== Topics by pca_cluster ===

--- pca_cluster = 0 ---
Topic 0: 0.024*"hamburguesa" + 0.012*"mejor" + 0.011*"sitio" + 0.009*"bien" + 0.008*"madrid"
Topic 1: 0.030*"buen" + 0.020*"comida" + 0.019*"hamburguesa" + 0.015*"sitio" + 0.013*"precio"
Topic 2: 0.015*"bien" + 0.010*"buen" + 0.009*"hamburguesa" + 0.007*"servicio" + 0.006*"súper"
Topic 3: 0.027*"buen" + 0.023*"hamburguesa" + 0.022*"lugar" + 0.014*"comida" + 0.013*"servicio"
Topic 4: 0.010*"amplio" + 0.010*"ir" + 0.008*"servicio" + 0.007*"pedir" + 0.007*"lugar"

--- pca_cluster = 1 ---
Topic 0: 0.250*"hamburguesa" + 0.250*"buena" + 0.250*"buen" + 0.250*"grande"
Topic 1: 0.471*"hamburguesa" + 0.469*"buena" + 0.031*"buen" + 0.029*"grande"
Topic 2: 0.250*"hamburguesa" + 0.250*"buen" + 0.250*"buena" + 0.250*"grande"
Topic 3: 0.256*"hamburguesa" + 0.255*"buena" + 0.244*"buen" + 0.244*"grande"
Topic 4: 0.471*"hamburguesa" + 0.323*"buen" + 0.177*"grande" + 0.030*"buena"

--- pca_cluster = 2 ---
No valid reviews to process.
No topics gener

#### Extract the periods with the worst ratings and process their reviews

In [20]:
# Extract the periods with less score and the reviews of each period
def analyzeLowScores(df, score_column, time_period='month', num_periods=1, last_periods = 12, negative_threshold=3):
    """Find the recent periods with the lowest average score and their negative reviews.

    A period counts as "low" when its average score is more than one standard
    deviation below the mean of the per-period averages. Among those, the
    `num_periods` periods with the lowest averages are selected.

    Parameters
    ----------
    df : DataFrame with a 'date' column, the `time_period` column and `score_column`.
        NOTE(review): assumes df['date'] is datetime-like so it can be compared
        with a Timestamp -- confirm against the loading/cleaning cells.
    score_column : name of the score column to analyse.
    time_period : name of the column holding the period each review belongs to.
    num_periods : maximum number of low-score periods to keep.
    last_periods : size, in months, of the trailing window (ending today) that is analysed.
    negative_threshold : a review is kept when its score is <= this value.

    Returns
    -------
    (period_reviews, low_score_periods)
        period_reviews : reviews of `df` that fall in the selected periods and are
            negative, without the 'embedding' column, with an extra
            'low_score_period' column, sorted by that column.
        low_score_periods : index of the selected periods, in chronological order.
    """
    # Restrict the analysis to the trailing window. A separate name is used so the
    # integer `last_periods` argument is not overwritten by a DataFrame.
    window_start = pd.to_datetime('today') - pd.DateOffset(months=last_periods)
    recent_reviews = df[df['date'] >= window_start]

    # Average score per period inside the window
    period_avg_scores = recent_reviews.groupby(time_period)[score_column].mean()

    # Low-score threshold: one standard deviation below the mean of the period averages
    threshold = period_avg_scores.mean() - period_avg_scores.std()
    low_scores = period_avg_scores[period_avg_scores < threshold]

    # Rank by average score so the *lowest* periods are selected (a plain slice of
    # the index would pick the earliest ones instead). The stable sort resolves ties
    # chronologically; the final index is returned in chronological order.
    worst_periods = low_scores.sort_values(kind='stable').head(num_periods)
    low_score_periods = worst_periods.index.sort_values()

    # Negative reviews written during the selected periods
    in_low_period = df[time_period].isin(low_score_periods)
    is_negative = df[score_column] <= negative_threshold
    period_reviews = (
        df[in_low_period & is_negative]
        # Drop the 'embedding' column if it exists to avoid issues with non-hashable types
        .drop(columns=['embedding'], errors='ignore')
        # assign() works on a new frame, so no SettingWithCopyWarning is raised
        .assign(low_score_period=lambda frame: frame[time_period])
        .sort_values('low_score_period')
    )

    return period_reviews, low_score_periods

In [21]:
# Usage
# `time_period` must name a column of `reviews`: analyzeLowScores groups and filters on it.
time_period = 'month'  # Change to 'week', 'year', etc. to analyze different periods
num_periods = 3  # Number of periods with the lowest average score to select

# Analyze for each score type
# Only the rating-based call keeps its list of low-score periods; for the other score
# types the list is discarded and only the filtered reviews are kept.
negative_periods_rating_reviews, low_score_periods = analyzeLowScores(reviews, 'rating_score', time_period, num_periods)
negative_periods_food_reviews, _ = analyzeLowScores(reviews, 'food_score', time_period, num_periods)
negative_periods_service_reviews, _ = analyzeLowScores(reviews, 'service_score', time_period, num_periods)
negative_periods_atmosphere_reviews, _ = analyzeLowScores(reviews, 'atmosphere_score', time_period, num_periods)

In [22]:
# Calculate topics for each low_score_period and concatenate results
def generateTopicsPerPeriod(df, score_column, number_of_topics=1):
    """Compute LDA topics separately for every low-score period.

    Parameters
    ----------
    df : DataFrame with 'review' and 'low_score_period' columns (plus whatever
        analyzeTopicsLDA reads from it).
    score_column : label used as the single top-level key of the result.
    number_of_topics : number of topics requested for each period.

    Returns
    -------
    dict of the form {score_column: {period: topics}}, where `topics` is the
    second value returned by analyzeTopicsLDA for that period's reviews.
    """
    # Rows without review text are left out
    with_text = df.loc[df['review'].notna()]

    topics_by_period = {}
    for period in with_text['low_score_period'].unique():
        in_period = with_text['low_score_period'] == period
        # Only the topics (second return value) are kept; the model is discarded
        result = analyzeTopicsLDA(with_text[in_period], number_of_topics=number_of_topics)
        topics_by_period[period] = result[1]

    return {score_column: topics_by_period}

# One call per score type; each call uses the default of a single topic per period
# and returns a dict with one top-level key (the score column name).
negative_periods_rating_topics = generateTopicsPerPeriod(negative_periods_rating_reviews, 'rating_score')
negative_periods_food_topics = generateTopicsPerPeriod(negative_periods_food_reviews, 'food_score')
negative_periods_service_topics = generateTopicsPerPeriod(negative_periods_service_reviews, 'service_score')
negative_periods_atmosphere_topics = generateTopicsPerPeriod(negative_periods_atmosphere_reviews, 'atmosphere_score')

# Merge the four single-key dicts; the keys differ, so nothing is overwritten.
negative_periods_topics = {**negative_periods_rating_topics, **negative_periods_food_topics, **negative_periods_service_topics, **negative_periods_atmosphere_topics}

Topic 0: 0.081*"mala" + 0.054*"pena" + 0.054*"pro" + 0.054*"pedir" + 0.054*"merecer"
Topic 0: 0.059*"ir" + 0.039*"gin" + 0.039*"infernal" + 0.039*"q" + 0.039*"ruido"
Topic 0: 0.058*"cómodo" + 0.058*"mesa" + 0.038*"parecer" + 0.038*"mínimo" + 0.038*"podía"
Topic 0: 0.021*"cafetería" + 0.021*"tarde" + 0.021*"aquel" + 0.021*"entonces" + 0.021*"casa"
Topic 0: 0.038*"mesa" + 0.038*"bastante" + 0.038*"cómodo" + 0.038*"textura" + 0.025*"verdad"
Topic 0: 0.081*"mala" + 0.054*"pena" + 0.054*"pro" + 0.054*"pedir" + 0.054*"merecer"
Topic 0: 0.016*"tarde" + 0.016*"poder" + 0.016*"creer" + 0.016*"casa" + 0.016*"aquel"
Topic 0: 0.071*"ruido" + 0.071*"tónica" + 0.071*"q" + 0.071*"siempre" + 0.071*"ir"
Topic 0: 0.037*"cómodo" + 0.037*"mesa" + 0.037*"rico" + 0.025*"mantel" + 0.025*"cartón"


#### Extract outliers and pain points

In [23]:
import json
import numpy as np

# Format arrays of words in json format
def format_words(words_list):
    """Turn an iterable of (word, weight) pairs into a {word: weight} dict.

    Keys are converted with str(); integer weights (Python or NumPy) are
    converted to plain int, any other weight is stored unchanged. When a word
    appears more than once, the last weight wins.
    """
    formatted = {}
    for word, weight in words_list:
        if isinstance(weight, (int, np.integer)):
            weight = int(weight)
        formatted[str(word)] = weight
    return formatted

# Join all the available information
# NOTE(review): the four common_* lists are not defined in this part of the notebook;
# presumably they come from an earlier cell as iterables of (term, weight) pairs.
words_dict = {
    "common_positive_words": format_words(common_positive_words),
    "common_negative_words": format_words(common_negative_words),
    "common_positive_bigrams": format_words(common_positive_bigrams),
    "common_negative_bigrams": format_words(common_negative_bigrams)
}
print(words_dict)

# Topics per group (keyed by grouping column) plus the word/bigram counts in a single dict
reviews_summary_dict = {**topics_dict, **words_dict}
print(reviews_summary_dict)

{'common_positive_words': {'hamburguesa': 128, 'buen': 118, 'comida': 66, 'lugar': 65, 'servicio': 48, 'sitio': 46, 'bien': 44, 'mejor': 42, 'precio': 41, 'ambiente': 36}, 'common_negative_words': {'ir': 10, 'café': 10, 'vez': 8, 'ver': 8, 'si': 8, 'pedir': 8, 'sitio': 7, 'servicio': 7, 'comida': 7, 'parecer': 6}, 'common_positive_bigrams': {'comida buen': 11, 'buen ambiente': 11, 'mejor hamburguesa': 11, 'buen comida': 10, 'buen servicio': 9, 'buen lugar': 8, 'aro cebolla': 7, 'hamburguesa rico': 7, 'precio razonable': 7, 'hamburguesa italiano': 6}, 'common_negative_bigrams': {'ver foto': 3, 'merecer pena': 3, 'nunca tanto': 2, 'si solo': 2, 'último vez': 2, 'huevo revuelto': 2, 'dar yo': 2, 'acompañar hamburguesa': 2, 'pan mohoso': 2, 'gofre frío': 1}}
{'pca_cluster': {0: ['0.024*"hamburguesa" + 0.012*"mejor" + 0.011*"sitio" + 0.009*"bien" + 0.008*"madrid"', '0.030*"buen" + 0.020*"comida" + 0.019*"hamburguesa" + 0.015*"sitio" + 0.013*"precio"', '0.015*"bien" + 0.010*"buen" + 0.009*"h

#### Extract review samples

In [24]:
# Calculate total score using the three main scores:
# total_score = rating_score + average of the three sub-scores, each divided by 5.
reviews_score = reviews.copy()

# Column means, rounded and divided by 5 (same rescaling as the sub-scores below)
food_score_mean = np.round(reviews_score['food_score'].mean(), 2) / 5
service_score_mean = np.round(reviews_score['service_score'].mean(), 2) / 5
atmosphere_score_mean = np.round(reviews_score['atmosphere_score'].mean(), 2) / 5

# Fill missing sub-scores with the column mean on the columns' own scale.
# The *_score_mean values are already divided by 5 and the columns are divided by 5
# again in the total below, so filling with them directly would make every imputed
# sub-score count five times too low. Multiplying back by 5 undoes that.
reviews_score['food_score'] = reviews_score['food_score'].fillna(food_score_mean * 5)
reviews_score['service_score'] = reviews_score['service_score'].fillna(service_score_mean * 5)
reviews_score['atmosphere_score'] = reviews_score['atmosphere_score'].fillna(atmosphere_score_mean * 5)

reviews_score['total_score'] = np.round(
    reviews_score['rating_score'] +
    (reviews_score['food_score']/5 + reviews_score['service_score']/5 + reviews_score['atmosphere_score']/5) / 3, 2)

In [25]:
# Filter not null reviews
valid_reviews = reviews_score[reviews_score['review'].notna()]

# Select the best and worst reviews in general
# NOTE(review): total_score is rating_score plus a sub-score term, so "> 5" presumably
# means a top rating with a positive sub-score term -- assumes ratings top out at 5; confirm.
best_reviews = valid_reviews[valid_reviews['total_score'] > 5]
worst_reviews = valid_reviews[valid_reviews['total_score'] < 2.5]

# Five most recent reviews of each kind
recent_best_reviews = best_reviews.sort_values(by='date', ascending=False).head(5)
print('last_positive_reviews')
print(recent_best_reviews.review)
recent_worst_reviews = worst_reviews.sort_values(by='date', ascending=False).head(5)
print('\nlast_negative_reviews')
print(recent_worst_reviews.review)

# Five most extreme reviews of each kind by total_score
best_reviews_sample = best_reviews.sort_values(by='total_score', ascending=False).head(5)
print('\nbest_reviews_sample')
print(best_reviews_sample.review)
worst_reviews_sample = worst_reviews.sort_values(by='total_score', ascending=True).head(5)
print('\nworst_reviews_sample')
print(worst_reviews_sample.review)

# Negative reviews with text from the low-score periods found for rating_score
low_score_reviews = negative_periods_rating_reviews[negative_periods_rating_reviews['review'].notna()][['month','review','rating_score']]
print('\nlow_score_reviews')
display(low_score_reviews)
print(low_score_periods)

last_positive_reviews
2      Estuve cenando en familia el martes y además d...
40     El martes fui a cenar con mis hijos y la pasam...
0      Además de que da un vibra de un bar de los 50’...
58     Muy buena la comida y la atención al cliente d...
110    Gran servicio, debo darle las felicitaciones a...
Name: review, dtype: object

last_negative_reviews
136    Paré para tomar un café y estaba rico. El siti...
31     El servicio encabezado por el encargado machis...
15     La comida me resultó bastante mala. Los canelo...
88     La cafetería despertó mi interés por su diseño...
157    De esos sitios que tienen malas maneras los em...
Name: review, dtype: object

best_reviews_sample
0      Además de que da un vibra de un bar de los 50’...
2      Estuve cenando en familia el martes y además d...
141    Un lugar ideal para tomar una cerveza 🍻 o verm...
140    Llevo años pidiendo la famosa egg-burger.\nInc...
133    Somos clientes habituales, la comida está riqu...
Name: review, dtype: ob

Unnamed: 0,month,review,rating_score
157,2024-02,De esos sitios que tienen malas maneras los em...,1.0
83,2024-03,Se nos está yendo ya la olla con los previos e...,3.0
153,2024-03,"Ya no es lo que era, vamos q hasta he encontra...",3.0
15,2024-08,La comida me resultó bastante mala. Los canelo...,1.0


PeriodIndex(['2024-02', '2024-03', '2024-08'], dtype='period[M]', name='month')


### ChatGPT API init

In [26]:
import sys
import os
# Add the parent of the current working directory to the import path so that modules
# located there can be imported (presumably where openai_setup lives -- confirm).
project_root = os.path.abspath("..")
sys.path.append(project_root)

In [27]:
# Init ChatGPT client
from openai import OpenAI
import openai_setup

# Credentials are read from the `conf` mapping of the local openai_setup module
# instead of being written in the notebook.
organization = openai_setup.conf['organization']
project = openai_setup.conf['project']
key = openai_setup.conf['key']

# Client object passed to the API helper functions
client = OpenAI(
    api_key=key,
    organization=organization,
    project=project
)

In [28]:
# Clean json outputs
def extract_json_string(input_string):
    """Extract and parse the JSON object embedded in a text answer.

    The span from the first '{' to the last '}' is taken (so text around the
    object, such as Markdown code fences, is ignored) and parsed as JSON. If it
    is not strict JSON, it is parsed as a Python literal instead, which accepts
    single-quoted dicts.

    Parameters
    ----------
    input_string : text that may contain a JSON object. Non-string values
        (e.g. None) are accepted and yield None.

    Returns
    -------
    The parsed dict, or None when no braces are found or the content cannot
    be parsed as a dict.
    """
    if not isinstance(input_string, str):
        return None

    # Greedy match with DOTALL: everything between the first '{' and the last '}'
    json_match = re.search(r'\{.*\}', input_string, re.DOTALL)
    if json_match is None:
        return None
    json_string = json_match.group(0)

    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        pass  # not strict JSON; try the Python-literal form below

    # literal_eval only evaluates literals, so it is safe on untrusted text
    try:
        parsed = ast.literal_eval(json_string)
    except (ValueError, SyntaxError, TypeError):
        return None
    # A braces-only match could also be a set literal; only dicts are valid answers
    return parsed if isinstance(parsed, dict) else None


In [29]:
# Extract main insights from API
def extractInsightsWithAI(info_dict, prompt, client):
    """Send the prompt plus the stringified data to the chat model and parse the answer.

    Parameters
    ----------
    info_dict : data appended to the prompt as str(info_dict).
    prompt : instruction text placed before the data in the user message.
    client : object exposing `chat.completions.create(...)`.

    Returns
    -------
    Whatever extract_json_string returns for the content of the first choice.
    """
    system_message = {
        "role": "system",
        "content": "You are a system expert in extracting value from reviews analysed using ML and NLP techniques, to provide valuable and actionable insights to stakeholders in an automated BI tool using AI.",
    }
    user_message = {"role": "user", "content": prompt + str(info_dict)}

    # Single request to the gpt-4o-mini model
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
    )

    # Only the first choice is used; its text is cleaned into a dict
    raw_answer = completion.choices[0].message.content
    return extract_json_string(raw_answer)

#### Extract main insights

In [30]:
# Prompt used to extract the general insights automatically.
# The data dict is appended after the final "The information:" line by extractInsightsWithAI.
# NOTE(review): the response template in the prompt is not strict JSON (unquoted keys,
# single quotes), so the model may answer in that form.
general_insights_prompt = (
    "I have this information extracted from LDA topics using clustering and sentiment analysis, including positive and negative terms, in JSON format.\n"
    "I want you to extract:\n"
    "- 3 positive points\n"
    "- 3 negative points\n"
    "- 3 improvement suggestions based on the negative points\n"
    "\n"
    "Each point should be a logical, simple, and concise sentence that provides value. Do not name specific terms or topics, but focus on delivering direct value to business stakeholders without ambiguity. If you mention something that didn't go well, give examples based on the information.\n"
    "Return the result in English in JSON format, ensuring it is easy to read in a notebook and standardized as follows:\n"
    "\n"
    "{best:['','',''], worst:['','',''], improve:['','','']}\n"
    "\n"
    "Ensure there are no contradictions between positive, negative, and improvement points.\n"
    "The information:\n"
)
# Show the data that will be sent along with the prompt
print(reviews_summary_dict)

{'pca_cluster': {0: ['0.024*"hamburguesa" + 0.012*"mejor" + 0.011*"sitio" + 0.009*"bien" + 0.008*"madrid"', '0.030*"buen" + 0.020*"comida" + 0.019*"hamburguesa" + 0.015*"sitio" + 0.013*"precio"', '0.015*"bien" + 0.010*"buen" + 0.009*"hamburguesa" + 0.007*"servicio" + 0.006*"súper"', '0.027*"buen" + 0.023*"hamburguesa" + 0.022*"lugar" + 0.014*"comida" + 0.013*"servicio"', '0.010*"amplio" + 0.010*"ir" + 0.008*"servicio" + 0.007*"pedir" + 0.007*"lugar"'], 1: ['0.250*"hamburguesa" + 0.250*"buena" + 0.250*"buen" + 0.250*"grande"', '0.471*"hamburguesa" + 0.469*"buena" + 0.031*"buen" + 0.029*"grande"', '0.250*"hamburguesa" + 0.250*"buen" + 0.250*"buena" + 0.250*"grande"', '0.256*"hamburguesa" + 0.255*"buena" + 0.244*"buen" + 0.244*"grande"', '0.471*"hamburguesa" + 0.323*"buen" + 0.177*"grande" + 0.030*"buena"']}, 'umap_cluster': {0: ['0.024*"hamburguesa" + 0.016*"bien" + 0.013*"buena" + 0.011*"lugar" + 0.010*"bonito"', '0.018*"hamburguesa" + 0.009*"poder" + 0.008*"hacer" + 0.008*"lugar" + 0.0

In [31]:
# Request the general insights from the topic and word summary.
# NOTE(review): the name `insigths_summary_dict` (sic) is assigned again by the
# pain-moments cell further down, so this result is lost once that cell runs.
insigths_summary_dict = extractInsightsWithAI(reviews_summary_dict, general_insights_prompt, client)
print(insigths_summary_dict)

{'best': ['The quality of the hamburgers is highly praised, indicating a strong appeal among customers.', 'The service is noted to be good, contributing positively to the overall dining experience.', 'Customers appreciate the reasonable prices, which enhance the perceived value of the meals offered.'], 'worst': ['Some customers have reported issues with the consistency of service, leading to dissatisfaction.', 'Complaints regarding the freshness of certain menu items, like coffee and brunch options, have been mentioned.', 'Several reviews highlighted that the ambiance did not meet expectations, suggesting a need for improvement.'], 'improve': ['Enhance staff training to ensure consistent and attentive service at all times to improve customer satisfaction.', 'Evaluate and improve the quality and freshness of menu items, especially during peak hours to meet customer expectations.', 'Invest in the aesthetic aspects of the dining environment to create a more appealing atmosphere for diners

#### Extract pain moments

In [32]:
# Prompt used to extract problems and improvement suggestions per low-score period.
# The data dict is appended after the final "The information:" line by extractInsightsWithAI.
# NOTE(review): the response template in the prompt is not strict JSON (unquoted keys).
negative_periods_insights_prompt = (
    "I have this information extracted from LDA topics using clustering and sentiment analysis, including positive and negative terms at specific moments, in JSON format.\n"
    "\n"
    "I want you to extract:\n"
    "- For each date:\n"
    "- N negative points\n"
    "- N improvement suggestions based on the negative points\n"
    "\n"
    "Each point should be a logical, simple, and concise sentence that provides value. Do not mention specific terms or topics, but focus on delivering direct value to business stakeholders without ambiguity. If you mention something that didn't go well, provide examples based on the information.\n"
    "Return the result in English in JSON format, ensuring it is easy to read in a notebook and standardized as follows:\n"
    "\n"
    "{date: {problems:[problem, problem...], improve:[improve,improve...]}, date:{problems:[problem, problem...], improve:[improve,improve...]}, ...}\n"
    "\n"
    "Make sure there are no contradictions between the points.\n"
    "\n"
    "The information:\n"
)
# Show the per-period topics that will be sent along with the prompt
print(negative_periods_topics)

{'rating_score': {Period('2024-02', 'M'): [(0, '0.081*"mala" + 0.054*"pena" + 0.054*"pro" + 0.054*"pedir" + 0.054*"merecer"')], Period('2024-03', 'M'): [(0, '0.059*"ir" + 0.039*"gin" + 0.039*"infernal" + 0.039*"q" + 0.039*"ruido"')], Period('2024-08', 'M'): [(0, '0.058*"cómodo" + 0.058*"mesa" + 0.038*"parecer" + 0.038*"mínimo" + 0.038*"podía"')]}, 'food_score': {Period('2023-11', 'M'): [(0, '0.021*"cafetería" + 0.021*"tarde" + 0.021*"aquel" + 0.021*"entonces" + 0.021*"casa"')], Period('2024-08', 'M'): [(0, '0.038*"mesa" + 0.038*"bastante" + 0.038*"cómodo" + 0.038*"textura" + 0.025*"verdad"')]}, 'service_score': {Period('2024-02', 'M'): [(0, '0.081*"mala" + 0.054*"pena" + 0.054*"pro" + 0.054*"pedir" + 0.054*"merecer"')]}, 'atmosphere_score': {Period('2023-11', 'M'): [(0, '0.016*"tarde" + 0.016*"poder" + 0.016*"creer" + 0.016*"casa" + 0.016*"aquel"')], Period('2024-03', 'M'): [(0, '0.071*"ruido" + 0.071*"tónica" + 0.071*"q" + 0.071*"siempre" + 0.071*"ir"')], Period('2024-08', 'M'): [(0, 

In [33]:
# Request problems and improvements per low-score period.
# NOTE(review): this reuses the name `insigths_summary_dict` (sic) from the general
# insights cell above, replacing the value stored there.
insigths_summary_dict = extractInsightsWithAI(negative_periods_topics, negative_periods_insights_prompt, client)
print(insigths_summary_dict)

{'2023-11': {'problems': ['There were complaints about the atmosphere being disruptive during the afternoon.', 'Several customers felt the environment was not conducive for a pleasant experience.'], 'improve': ['Consider implementing noise reduction strategies to enhance the ambiance.', 'Review the layout to create a more inviting and comfortable atmosphere.']}, '2024-02': {'problems': ['Service quality received negative feedback, with reports of poor interactions.', 'Customers expressed dissatisfaction with the attentiveness and response time of the staff.'], 'improve': ['Implement staff training programs focused on improving customer service skills.', 'Establish a monitoring system to ensure timely assistance during high traffic periods.']}, '2024-03': {'problems': ['Patrons noted excessive noise levels that detracted from the dining experience.', 'There were mentions of a chaotic environment which affected customer enjoyment.'], 'improve': ['Introduce acoustic treatments to help man

In [34]:
# NOTE(review): most_recommended and less_recommended are not defined in this part of
# the notebook; presumably they are computed in an earlier cell.
print("Top Most Recommended:", most_recommended)
print("Least Recommended :", less_recommended)

Top Most Recommended: [('Hamburguesa Completa Con Huevo', 9), ('Hamburguesa Americana', 8), ('Hamburguesa Italiana', 6)]
Least Recommended : ['Plaza de aparcamiento', 'Flan de Queso', 'Gofre Con Nocilla', 'Grandburguer HD', 'Hamburger', 'Hamburguesa Con Huevo', 'Tarta de Manzana', 'Tataki de Atún', 'Cheesecake', 'Hamburguesa Gallega', 'Brunch Con Baggel Salmón']
