In [96]:
import pandas as pd
import numpy as np
import re
import ast

import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [97]:
# Load the processed and cleaned data
# Both inputs are located relative to the notebook and keyed by restaurant name.
processed_data_path = '../data/processed/'
raw_data_path = '../data/raw/'

name = 'Oceana Grill'

reviews_file = processed_data_path + name + '_reviews.csv'
resumme_file = raw_data_path + 'resumme_' + name + '.csv'

reviews_pro = pd.read_csv(reviews_file)
resumme_raw = pd.read_csv(resumme_file)

# Quick visual check: the star-count summary and a random handful of reviews
display(resumme_raw)
display(reviews_pro.sample(5))

# Working copies, so the frames exactly as loaded stay available for reference
reviews = reviews_pro.copy().reset_index(drop=True)
resumme = resumme_raw.copy()

Unnamed: 0,stars,reviews
0,5,4012
1,4,1853
2,3,788
3,2,464
4,1,399


Unnamed: 0,rating_score,date,review,local_guide_reviews,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,recommendations_list,avg_price_per_person
5639,4,2019-07-10,I rated the grill a four versus a five because...,80,Ate there,Luch,20-30 €,4,4,3,[''],30.0
7120,5,2019-09-28,Visiting New Orleans and definitely recommend ...,20,,Dinner,1-10 €,5,4,5,[''],10.0
2393,2,2015-07-24,I'll start by saying that the service here was...,51,Ate there,,10-20 €,1,3,1,[''],20.0
729,5,2013-09-09,The food and service was excellent here. All s...,50,,,10-20 €,4,5,5,[''],20.0
1696,4,2017-03-08,Fantastic service and tasty affordable grinds....,37,Take Away,,,3,3,4,[''],


### First draft summary plots 

In [98]:
# Calculate the average for each score
average_food, average_service, average_atmosphere = (
    reviews[score_column].mean()
    for score_column in ('food_score', 'service_score', 'atmosphere_score')
)

# Overall rating = star value weighted by how many reviews gave that many stars
star_values = resumme_raw['stars']
star_counts = resumme_raw['reviews']
average_reviews = (star_values * star_counts).sum() / star_counts.sum()

# Create a figure with horizontal subplots
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "xy"}, {"type": "bar"}, {"type": "bar"}]],
    subplot_titles=("Average Score", "Number of Reviews", "Categories"),
)

# First subplot: the overall average drawn as one large text label on hidden axes
headline_trace = go.Scatter(
    x=[0],
    y=[0],
    text=[f"{average_reviews:.2f}"],
    mode="text",
    textfont=dict(size=120),
)
fig.add_trace(headline_trace, row=1, col=1)

hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
fig.update_xaxes(row=1, col=1, **hidden_axis)
fig.update_yaxes(row=1, col=1, **hidden_axis)

# Second subplot: horizontal bars with the number of reviews per star value
reviews_trace = go.Bar(
    x=star_counts,
    y=star_values,
    marker=dict(color='lightskyblue'),
    text=star_counts,
    textposition='auto',
    name="Reviews",
    orientation='h',
)
fig.add_trace(reviews_trace, row=1, col=2)

# Third subplot: horizontal bars with the mean Food / Service / Atmosphere score
category_means = [average_food, average_service, average_atmosphere]
categories_trace = go.Bar(
    x=category_means,
    y=['Food', 'Service', 'Atmosphere'],
    marker=dict(color='lightgreen'),
    text=[f"{value:.2f}" for value in category_means],
    textposition='auto',
    orientation='h',
    name="Categories",
)
fig.add_trace(categories_trace, row=1, col=3)

fig.update_layout(height=500, width=1200,  plot_bgcolor="white", paper_bgcolor="white", showlegend=False)
fig.show()

In [99]:
# Look-back windows for the three panels. The subplot titles further down are
# built from these values so they always describe the data actually plotted.
months_window = 12
years_window = 8
weeks_window = 5
score_columns = ['rating_score', 'food_score', 'service_score', 'atmosphere_score']

# Convert date column to datetime format (unparseable dates become NaT)
reviews['date'] = pd.to_datetime(reviews['date'], errors='coerce')
reviews['month'] = reviews['date'].dt.to_period('M')
reviews['year'] = reviews['date'].dt.year
# Week key: the Monday of the review's week, stored as a 'YYYY-MM-DD' string
week_start = reviews['date'] - pd.to_timedelta(reviews['date'].dt.weekday, unit='d')
reviews['week'] = week_start.dt.strftime('%Y-%m-%d')

# Filter data for the last periods (months, years, weeks).
# The windows are anchored on the most recent review, not on today's date.
limit_date = reviews['date'].max()
last_months = reviews[reviews['date'] >= limit_date - pd.DateOffset(months=months_window)]
last_years = reviews[reviews['date'] >= limit_date - pd.DateOffset(years=years_window)]
last_weeks = reviews[reviews['date'] >= limit_date - pd.DateOffset(weeks=weeks_window)]

# Compute averages for the required periods
monthly_avg_scores = last_months.groupby('month')[score_columns].mean()
yearly_avg_scores = last_years.groupby('year')[['rating_score']].mean()
weekly_avg_scores = last_weeks.groupby('week')[score_columns].mean()

# Update the axis labels for each score to be more readable
label_mapping = {
    'rating_score': 'Rating',
    'food_score': 'Food',
    'service_score': 'Service',
    'atmosphere_score': 'Atmosphere'
}

# Create a figure with subplots using the Z-layout
fig = make_subplots(rows=2, cols=2,
                    specs=[[{"colspan": 2}, None],
                           [{}, {}]],  # 1 large plot on the first row, 2 smaller plots on the second
                    subplot_titles=(f"Monthly Score Trends (Last {months_window} Months)",
                                    f"Annual Rating Score Trends (Last {years_window} Years)",
                                    f"Weekly Score Trends (Last {weeks_window} Weeks)"))

# Add monthly score trends to the first row (rating_score in stronger color)
colors = ['#1f77b4', '#aec7e8', '#aec7e8', '#aec7e8']
for i, column in enumerate(monthly_avg_scores.columns):
    label = label_mapping[column]
    fig.add_trace(
        go.Scatter(x=monthly_avg_scores.index.astype(str), y=monthly_avg_scores[column],
                   mode='lines+markers', name=label,
                   text=[f"{label} - {val:.2f}" for val in monthly_avg_scores[column]],
                   hoverinfo="text", line=dict(color=colors[i])),
        row=1, col=1)

# Add yearly score trends to the second row (left)
fig.add_trace(
    go.Scatter(x=yearly_avg_scores.index.astype(str), y=yearly_avg_scores['rating_score'],
               mode='lines+markers', name="Rating", line=dict(color='#1f77b4', width=4),
               text=[f"Rating - {val:.2f}" for val in yearly_avg_scores['rating_score']],
               hoverinfo="text"),
    row=2, col=1)

# Add weekly score trends to the second row (right, weaker colors)
for i, column in enumerate(weekly_avg_scores.columns):
    label = label_mapping[column]  # Get the readable label
    fig.add_trace(
        go.Scatter(x=weekly_avg_scores.index.astype(str), y=weekly_avg_scores[column],
                   mode='lines+markers', name=label,
                   text=[f"{label} - {val:.2f}" for val in weekly_avg_scores[column]],
                   hoverinfo="text", line=dict(color=colors[i])),
        row=2, col=2)

# Enhance presentation: remove gridlines and borders, increase size, and remove legend
fig.update_layout(showlegend=False,
                  title="Score Trends Analysis",
                  title_font=dict(size=28),
                  margin=dict(l=50, r=50, t=100, b=50),
                  paper_bgcolor="white",
                  height=800, width=1200)
fig.update_xaxes(showline=False, showgrid=False)
fig.update_yaxes(showline=False, showgrid=True)

# Customize x-axes formatting: show only the year for yearly data, and only day and month for weekly data
fig.update_xaxes(
    tickformat="%Y",  # Only show the year for the yearly graph
    row=2, col=1
)

fig.update_xaxes(
    tickformat="%d-%b",  # Show only the day and month for weekly graph
    row=2, col=2
)

# Add annotations to highlight key points.
# NOTE(review): these months/heights were chosen by hand for one dataset; each
# one is only drawn when that month is actually present on the monthly axis.
plotted_months = set(monthly_avg_scores.index.astype(str))
month_annotations = [
    # (month label, y position, caption, vertical arrow offset)
    ('2024-06', 4.8, "Highest Score", 80),
    ('2024-03', 4.5, "Drop in March", -40),
    ('2024-08', 4.5, "Drop in August", -40),
]
for month_label, y_position, caption, arrow_offset in month_annotations:
    if month_label not in plotted_months:
        continue
    fig.add_annotation(x=month_label, y=y_position,
                       text=caption,
                       showarrow=True, arrowhead=2,
                       ax=0, ay=arrow_offset, row=1, col=1, font=dict(size=14))

fig.update_traces(marker=dict(size=8), selector=dict(name="Rating"))
fig.update_layout(plot_bgcolor="white", paper_bgcolor="white")
fig.show()


### Cleaning and preprocessing

In [100]:
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

from tqdm import tqdm

# Download NLTK stopwords and lexicon
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Load spaCy Spanish model
# NOTE(review): the sample reviews displayed earlier in this notebook are in
# English, and the cleaned output further down contains tokens such as
# "placir"/"finir". That looks like Spanish lemmatisation applied to English
# text; confirm whether an English pipeline should be loaded for this dataset.
nlp = spacy.load('es_core_news_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [101]:
# Clean text, remove stopwords and lemmatize the remaining tokens

# Stop-word sets keyed by NLTK language name. Filled on first use so the list
# is read from the NLTK corpus once per language instead of once per review.
_STOPWORDS_BY_LANGUAGE = {}


def clean_text(text, language='spanish'):
    """Normalise one review into a space-separated string of lemmas.

    The text is lower-cased, every character other than a-z, the accented
    letters áéíóúñü, digits and whitespace is removed, and the result is run
    through the module-level spaCy pipeline ``nlp``. Tokens whose text is an
    NLTK stop word for ``language``, punctuation tokens and whitespace tokens
    are dropped; the lemmas of the remaining tokens are joined with spaces.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value yields an empty string.
    language : str, default 'spanish'
        NLTK stop-word list to use. The default keeps the previous behaviour.
        NOTE(review): the reviews shown in this notebook are English, so
        'english' (with a matching spaCy model) is probably what is wanted.

    Returns
    -------
    str
        The cleaned review.
    """
    if not isinstance(text, str):
        return ''

    stop_words = _STOPWORDS_BY_LANGUAGE.get(language)
    if stop_words is None:
        stop_words = set(stopwords.words(language))
        _STOPWORDS_BY_LANGUAGE[language] = stop_words

    text = text.lower()
    text = re.sub(r'[^a-záéíóúñü0-9\s]', '', text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc
              if token.text not in stop_words and not token.is_punct and not token.is_space]
    return ' '.join(tokens)

In [102]:
# Clean every review with a progress bar; missing reviews are treated as ''.
tqdm.pandas(desc="Cleaning Reviews")
raw_review_text = reviews['review'].fillna('')
reviews['cleaned_review'] = raw_review_text.progress_apply(clean_text)

# Side-by-side sample of original and cleaned text for a visual sanity check
display(reviews[['review', 'cleaned_review']].sample(5))

Cleaning Reviews: 100%|██████████| 7516/7516 [01:26<00:00, 87.14it/s] 


Unnamed: 0,review,cleaned_review
3845,My friends were telling how good this place is...,my friends were telling how good this placir i...
2368,"Eh, this place was fine. The food was decent, ...",eh this placir was finir the food was decent a...
5013,Took a weekend trip to NOLA and saw how highly...,took weekend trip to nola and saw how highly r...
3572,We were told to go here by a family member who...,we were told to go here by family member who p...
6271,So as I was in New Orleans for Mardi Gras I ha...,so as i was in new orleans for mardi gra i had...


### Embeddings and sentiment analysis

In [103]:
# Embeddings and Sentiment
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch

# Word Clouds and Visualization
from wordcloud import WordCloud


#### Analyze sentiment

In [104]:
# Extract sentiment for each review using VADER combined with the star rating
def analyzeSentiment(df):
    """Add VADER scores and a three-way sentiment label to ``df``.

    Two columns are written to ``df`` (the frame is modified in place and also
    returned):

    * ``vader_sentiment``: VADER compound score of ``cleaned_review``.
    * ``sentiment_label``: 'positive', 'negative' or 'neutral'.

    The label is decided by the first rule that matches:

    1. ``rating_score`` >= 4 -> 'positive'
    2. ``rating_score`` <= 2 -> 'negative'
    3. ``vader_sentiment`` > 0.05 -> 'positive'
    4. ``vader_sentiment`` < -0.05 -> 'negative'
    5. otherwise -> 'neutral'

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cleaned_review`` and ``rating_score``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    # Initialize VADER sentiment analyzer
    sia = SentimentIntensityAnalyzer()

    # Missing reviews are scored as empty text instead of crashing VADER
    review_text = df['cleaned_review'].fillna('').astype(str)
    df['vader_sentiment'] = review_text.apply(lambda text: sia.polarity_scores(text)['compound'])

    # np.select takes the first condition that is true for each row, which
    # reproduces the if/elif precedence without a Python-level row loop.
    rating = df['rating_score']
    vader = df['vader_sentiment']
    conditions = [rating >= 4, rating <= 2, vader > 0.05, vader < -0.05]
    outcomes = ['positive', 'negative', 'positive', 'negative']
    df['sentiment_label'] = np.select(conditions, outcomes, default='neutral')

    return df

# Extract most common words for a selected sentiment
def extractCommonWords(df, sentiment_label='positive', n=10):
    """Return the ``n`` most frequent single words for one sentiment label.

    This is the unigram case of ``extractCommonNgrams`` and delegates to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sentiment_label`` and ``cleaned_review``.
    sentiment_label : str, default 'positive'
        Only rows with this label are counted.
    n : int, default 10
        Number of words to return.

    Returns
    -------
    list[tuple[str, int]]
        ``(word, count)`` pairs, most frequent first; empty when no text is
        available for the label.
    """
    return extractCommonNgrams(df, sentiment_label=sentiment_label, n=1, top_n=n)


# Extract most common n-grams for a selected sentiment
def extractCommonNgrams(df, sentiment_label='positive', n=2, top_n=10):
    """Return the ``top_n`` most frequent n-grams for one sentiment label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sentiment_label`` and ``cleaned_review``.
    sentiment_label : str, default 'positive'
        Only rows with this label are counted.
    n : int, default 2
        Size of the n-grams (1 = words, 2 = bigrams, ...).
    top_n : int, default 10
        Number of n-grams to return.

    Returns
    -------
    list[tuple[str, int]]
        ``(ngram, count)`` pairs, most frequent first; empty when there is no
        text for the label or no n-gram could be extracted from it.
    """
    # Filter reviews by sentiment label, dropping missing/blank texts
    selected = df.loc[df['sentiment_label'] == sentiment_label, 'cleaned_review']
    documents = [text for text in selected.fillna('').astype(str) if text.strip()]
    if not documents:
        return []

    # Learn the vocabulary and count in a single pass over the documents
    vectorizer = CountVectorizer(ngram_range=(n, n))
    try:
        count_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # CountVectorizer raises ValueError when it ends up with an empty vocabulary
        return []

    # Total occurrences of each vocabulary entry across all documents
    totals = np.asarray(count_matrix.sum(axis=0)).ravel()
    ngram_freq = [(term, int(totals[idx])) for term, idx in vectorizer.vocabulary_.items()]
    return sorted(ngram_freq, key=lambda pair: pair[1], reverse=True)[:top_n]

# Analyze sentiment with VADER
reviews = analyzeSentiment(reviews)

# Ten most frequent single words among positive and among negative reviews
common_positive_words, common_negative_words = (
    extractCommonWords(reviews, sentiment_label=polarity, n=10)
    for polarity in ('positive', 'negative')
)

print("Top Positive Words:", common_positive_words)
print("Top Negative Words:", common_negative_words)

# Ten most frequent bigrams among positive and among negative reviews
common_positive_bigrams, common_negative_bigrams = (
    extractCommonNgrams(reviews, sentiment_label=polarity, n=2, top_n=10)
    for polarity in ('positive', 'negative')
)

print("Top Positive Bigrams:", common_positive_bigrams)
print("Top Negative Bigrams:", common_negative_bigrams)

Top Positive Words: [('the', 26133), ('and', 18364), ('was', 15166), ('to', 8506), ('we', 7159), ('of', 6398), ('it', 6260), ('in', 5176), ('for', 5075), ('had', 4958)]
Top Negative Words: [('the', 5250), ('and', 3015), ('was', 2797), ('to', 2113), ('it', 1443), ('of', 1306), ('we', 1209), ('in', 1078), ('food', 1003), ('for', 992)]
Top Positive Bigrams: [('it was', 2441), ('and the', 1936), ('the food', 1907), ('had the', 1505), ('new orleans', 1455), ('this placir', 1374), ('of the', 1299), ('food was', 1297), ('in the', 1149), ('the best', 1133)]
Top Negative Bigrams: [('it was', 471), ('the food', 399), ('this placir', 311), ('and the', 296), ('in the', 295), ('of the', 276), ('on the', 223), ('food was', 219), ('new orleans', 186), ('we were', 180)]


In [105]:
# Plot the evolution of distribution of reviews on time based on sentiments
def plotSentimentTrend(df, years_limit = 2):
    """Show a stacked area chart of the monthly share of each sentiment label.

    Only reviews dated within ``years_limit`` years of the most recent review
    are used. For every calendar month that has reviews, the percentage of
    reviews carrying each ``sentiment_label`` is computed and plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date`` and ``sentiment_label``. It is not modified.
    years_limit : int, default 2
        Length of the look-back window in years.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Work on a private copy so the caller's frame is never touched
    data = df[['date', 'sentiment_label']].copy()
    data['date'] = pd.to_datetime(data['date'], errors='coerce')
    data = data.dropna(subset=['date'])
    if data.empty:
        print("No dated reviews available to plot.")
        return

    # Keep only the requested look-back window
    window_start = data['date'].max() - pd.DateOffset(years=years_limit)
    data = data[data['date'] >= window_start]

    # Count sentiments per calendar month. Grouping by monthly period avoids
    # the deprecated 'M' resample alias; each bucket is then labelled with the
    # last day of its month, as the previous resample did.
    month = data['date'].dt.to_period('M')
    sentiment_counts = (
        data.groupby(month)['sentiment_label']
        .value_counts()
        .unstack()
        .fillna(0)
    )
    sentiment_counts.index = sentiment_counts.index.to_timestamp(how='end').normalize()
    sentiment_counts.index.name = 'date'

    # Calculate the percentage for each sentiment type
    sentiment_percentage = sentiment_counts.div(sentiment_counts.sum(axis=1), axis=0) * 100
    sentiment_percentage = sentiment_percentage.round(2)
    sentiment_percentage = sentiment_percentage.reset_index().melt(id_vars=['date'], value_name='percentage', var_name='sentiment_label')

    # Plot sentiment percentage evolution; the title reflects the real window
    year_word = 'Year' if years_limit == 1 else 'Years'
    fig = px.area(
        sentiment_percentage,
        x='date',
        y='percentage',
        color='sentiment_label',
        title=f'Sentiment Percentage Over the Last {years_limit} {year_word}',
        labels={'date': '', 'percentage': 'Percentage of Reviews (%)', 'sentiment_label': 'Sentiment'},
        template='plotly_white',
    )

    # Customize layout
    fig.update_layout(
        title=dict(x=0.5, xanchor='center', font=dict(size=18, color='black')),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=True, title='Percentage of Reviews', ticksuffix='%'),
        legend=dict(title='', orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
        margin=dict(l=20, r=20, t=50, b=20),
        plot_bgcolor='rgba(0,0,0,0)',
        hovermode='x unified',
        width=1200,
        height=400,
    )

    # Customize color for sentiment categories
    color_map = {
        'positive': 'rgba(102, 194, 165, 0.7)',
        'neutral': 'rgba(141, 160, 203, 0.7)',
        'negative': 'rgba(252, 141, 98, 0.7)'
    }
    fig.for_each_trace(lambda trace: trace.update(line=dict(width=0, shape='spline'), fill='tonexty', fillcolor=color_map.get(trace.name, 'rgba(150, 150, 150, 0.5)')))

    # Remove the plot frame and keep the visualization as clean as possible
    fig.update_xaxes(showline=False)
    fig.update_yaxes(showline=False, range=[0, 100])  # Percentage scale from 0 to 100

    fig.show()

plotSentimentTrend(reviews)


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



In [106]:

# Extract most and least recommendations mentioned
def analyzeRecommendations(df):
    """Count how often each dish appears in ``recommendations_list``.

    Each non-null cell is expected to hold a list of dish names, either as an
    actual list or as its string representation (e.g. ``"['Gumbo', 'Oysters']"``).
    Cells that cannot be parsed, or that do not evaluate to a list, are skipped.
    Non-string and blank entries inside a list are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``recommendations_list`` column.

    Returns
    -------
    tuple[list[tuple[str, int]], list[str]]
        ``(most_common, least_common)``: the three most mentioned dishes with
        their counts, and every dish tied for the lowest count. Both lists
        are empty when no dish was found.
    """
    all_dishes = []

    # Convert string representation of lists to actual lists and extend all_dishes
    for item in df['recommendations_list'].dropna():
        if isinstance(item, list):
            dishes = item
        else:
            try:
                dishes = ast.literal_eval(item)
            except (ValueError, SyntaxError, TypeError):
                # Not a valid Python literal: skip this cell only
                continue
        if isinstance(dishes, list):
            all_dishes.extend(dishes)

    # Filter out empty values and anything that is not a dish name
    all_dishes = [dish for dish in all_dishes if isinstance(dish, str) and dish.strip() != '']

    # Count the frequency of each dish
    dish_counts = Counter(all_dishes)
    if not dish_counts:
        return [], []

    # Most mentioned dishes, and all dishes sharing the lowest mention count
    most_common_dishes = dish_counts.most_common(3)
    min_count = min(dish_counts.values())
    worst_dishes = [dish for dish, count in dish_counts.items() if count == min_count]

    return most_common_dishes, worst_dishes

# Report the most mentioned dishes and those tied for the fewest mentions
recommendation_summary = analyzeRecommendations(reviews)
most_recommended, less_recommended = recommendation_summary
print("Top Most Recommended:", most_recommended)
print("Least Recommended :", less_recommended)

Top Most Recommended: []
Least Recommended : []


#### Calculate embeddings

In [107]:
# Extract the embeddings for each cleaned review
def get_embedding(text):
    """Embed one text as the mean of the model's last hidden states.

    The text is tokenised with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), passed through the module-level ``model`` with gradient
    tracking disabled, and the last hidden states are averaged over dim 1.

    Returns the squeezed result as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Import Bert model and tokenizer (both from the same pretrained checkpoint)
bert_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Embed every cleaned review, showing a progress bar
tqdm.pandas(desc="Generating Embeddings")
cleaned_reviews = reviews['cleaned_review']
reviews['embedding'] = cleaned_reviews.progress_apply(get_embedding)

Generating Embeddings: 100%|██████████| 7516/7516 [20:47<00:00,  6.03it/s] 


#### Analyze embeddings

In [108]:
from sklearn.decomposition import PCA

# PCA Embeddings Visualization
def visualizeEmbeddingsPCA(df):
    """Project the review embeddings to 2-D with PCA and plot them by rating.

    Reads ``df['embedding']`` and ``df['rating_score']`` (and ``review_id`` if
    present, otherwise a running number), shows an interactive scatter plot
    coloured by rating, and returns the 2-D coordinates.
    """
    # Stack the per-review vectors into one matrix and reduce it to 2 components
    embedding_matrix = np.array(df['embedding'].tolist())
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(embedding_matrix)

    # Share of variance (in %) captured by each of the two components
    var1, var2 = pca.explained_variance_ratio_ * 100

    # Assemble the plotting frame
    review_ids = df.get('review_id', range(len(df)))  # Optional identifier
    plot_df = pd.DataFrame({
        'PCA Component 1': reduced_embeddings[:, 0],
        'PCA Component 2': reduced_embeddings[:, 1],
        'Rating Score': df['rating_score'],
        'Review ID': review_ids
    })

    # Axis/legend captions, including the explained variance
    axis_labels = {
        'PCA Component 1': f'PCA 1 ({var1:.1f}% variance)',
        'PCA Component 2': f'PCA 2 ({var2:.1f}% variance)',
        'Rating Score': 'Rating Score'
    }

    # Create interactive scatter plot
    fig = px.scatter(
        plot_df,
        x='PCA Component 1',
        y='PCA Component 2',
        color='Rating Score',
        color_continuous_scale='Viridis',
        hover_data=['Review ID', 'Rating Score'],
        title=f'Embeddings by Rating Score (PCA 1: {var1:.1f}%, PCA 2: {var2:.1f}%)',
        labels=axis_labels
    )

    # Enhance layout for clarity
    colorbar_style = dict(title='Rating Score', tickmode='linear')
    fig.update_layout(
        template='plotly_white',
        coloraxis_colorbar=colorbar_style,
        hovermode='closest'
    )

    fig.show()
    return reduced_embeddings

embeddings_pca = visualizeEmbeddingsPCA(reviews)

import umap.umap_ as umap

# UMAP Embeddings Visualization
def visualizeEmbeddingsUMAP(df):
    """Project the review embeddings to 2-D with UMAP and plot them by sentiment.

    Reads ``df['embedding']`` and ``df['sentiment_label']``, shows an
    interactive scatter plot coloured by sentiment, and returns the 2-D
    coordinates produced by UMAP (one row per row of ``df``, in order).
    """
    embeddings = np.array(df['embedding'].tolist())

    # Reduce dimensionality with UMAP (fixed seed for repeatable layouts)
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    reduced_embeddings = reducer.fit_transform(embeddings)

    # Create DataFrame for visualization.
    # The labels are attached by position (to_numpy) rather than by index:
    # viz_df has a fresh 0..n-1 index, so assigning the Series directly would
    # align on index labels and produce NaN whenever df is not indexed 0..n-1.
    viz_df = pd.DataFrame(reduced_embeddings, columns=['x', 'y'])
    viz_df['sentiment_label'] = df['sentiment_label'].to_numpy()

    # Scatter plot with Plotly for interactive visualization
    fig = px.scatter(
        viz_df,
        x='x',
        y='y',
        color='sentiment_label',
        title='Embedding Visualization with UMAP',
        labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'},
        color_discrete_map={'positive': 'green', 'neutral': 'gray', 'negative': 'red'},
        opacity=0.7
    )
    fig.update_layout(showlegend=True, legend=dict(title='Sentiment'), margin=dict(l=10, r=10, t=40, b=10))
    fig.show()

    return reduced_embeddings

embeddings_umap = visualizeEmbeddingsUMAP(reviews)




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [109]:
from sklearn.neighbors import NearestNeighbors

# Plot K distance for DBSCAN eps adjustment
def plotKdistance(reduced_embeddings, k=5, method='PCA'):
    """Show the sorted k-distance curve of a set of points.

    A nearest-neighbour index is fitted on ``reduced_embeddings`` and queried
    with the same points; column ``k-1`` of the returned distances is sorted in
    ascending order and drawn as a line. ``method`` is only used in the title.
    """
    # Distances from every point to its k nearest neighbours in the fitted set
    knn = NearestNeighbors(n_neighbors=k).fit(reduced_embeddings)
    neighbor_distances = knn.kneighbors(reduced_embeddings)[0]

    # Keep the last (k-th) column and sort it ascending
    k_distances = np.sort(neighbor_distances[:, k-1])
    point_ranks = np.arange(1, len(k_distances) + 1)

    # Create interactive line plot
    curve = go.Scatter(
        x=point_ranks,
        y=k_distances,
        mode='lines',
        line=dict(color='blue'),
        name='k-distance'
    )
    fig = go.Figure()
    fig.add_trace(curve)

    # Update layout for clarity
    fig.update_layout(
        title=f'k-Distance Graph for {method}',
        xaxis_title='Points sorted by distance',
        yaxis_title=f'Distance to {k}th Nearest Neighbor',
        template='plotly_white',
        hovermode='x unified'
    )

    # Add light grid lines on both axes
    grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgrey')
    fig.update_xaxes(**grid_style)
    fig.update_yaxes(**grid_style)

    fig.show()

# The `method` label must name the projection that is actually passed in;
# it ends up in the figure title.
plotKdistance(embeddings_umap, k= 10, method='UMAP')
plotKdistance(embeddings_pca, k= 10, method='PCA')

In [110]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Function to apply DBSCAN
def apply_dbscan(reduced_embeddings, eps=0.6, min_samples=5):
    """Standardise the points with StandardScaler, then cluster them with DBSCAN.

    Returns the array of cluster labels produced by ``DBSCAN.fit_predict``.
    """
    standardized = StandardScaler().fit_transform(reduced_embeddings)
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    return clusterer.fit_predict(standardized)

# PCA Visualization with DBSCAN
def visualizeEmbeddingsPCA_with_DBSCAN(df, eps=0.55, min_samples=10):
    """Reduce the embeddings with PCA, cluster them with DBSCAN and plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``embedding`` and ``rating_score``; ``review_id`` is used
        when present, otherwise a running number is generated.
    eps, min_samples
        Passed through to ``apply_dbscan``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pca_component_1``, ``pca_component_2``, ``rating_score``,
        ``pca_cluster`` and ``review_id``.
    """
    embeddings = np.array(df['embedding'].tolist())
    ratings = df['rating_score']

    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)
    var1, var2 = pca.explained_variance_ratio_ * 100

    clusters = apply_dbscan(reduced, eps, min_samples)

    plot_df = pd.DataFrame({
        'pca_component_1': reduced[:, 0],
        'pca_component_2': reduced[:, 1],
        'rating_score': ratings,
        'pca_cluster': clusters,
        'review_id': df.get('review_id', range(len(df)))
    })

    fig = px.scatter(
        plot_df,
        x='pca_component_1',
        y='pca_component_2',
        color='pca_cluster',
        color_continuous_scale='Viridis',
        hover_data=['review_id', 'rating_score'],
        title=f'PCA with DBSCAN (PCA1: {var1:.1f}%, PCA2: {var2:.1f}%)',
        # `labels` maps column name -> displayed caption, so the keys must be
        # the actual column names of plot_df for the captions to take effect.
        labels={
            'pca_component_1': f'PCA 1 ({var1:.1f}% variance)',
            'pca_component_2': f'PCA 2 ({var2:.1f}% variance)',
            'pca_cluster': 'Cluster'
        }
    )

    fig.update_layout(
        template='plotly_white',
        coloraxis_colorbar=dict(title='pca_cluster'),
        hovermode='closest'
    )

    fig.show()
    return plot_df

# UMAP Visualization with DBSCAN
def visualizeEmbeddingsUMAP_with_DBSCAN(df, eps=0.7, min_samples=10):
    """Reduce the embeddings with UMAP, cluster them with DBSCAN and plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``embedding`` and ``sentiment_label``; ``review_id`` is
        used when present, otherwise a running number is generated.
    eps, min_samples
        Passed through to ``apply_dbscan``.

    Returns
    -------
    pandas.DataFrame
        Columns ``umap_component_1``, ``umap_component_2``, ``sentiment``,
        ``umap_cluster`` and ``review_id``.
    """
    embeddings = np.array(df['embedding'].tolist())
    sentiment = df['sentiment_label']

    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    reduced = reducer.fit_transform(embeddings)

    clusters = apply_dbscan(reduced, eps, min_samples)

    plot_df = pd.DataFrame({
        'umap_component_1': reduced[:, 0],
        'umap_component_2': reduced[:, 1],
        'sentiment': sentiment,
        'umap_cluster': clusters,
        'review_id': df.get('review_id', range(len(df)))
    })

    fig = px.scatter(
        plot_df,
        x='umap_component_1',
        y='umap_component_2',
        color='umap_cluster',
        color_continuous_scale='Viridis',
        hover_data=['sentiment', 'umap_cluster'],
        title='UMAP with DBSCAN',
        # `labels` maps column name -> displayed caption, so the keys must be
        # the actual column names of plot_df for the captions to take effect.
        labels={
            'umap_component_1': 'UMAP 1',
            'umap_component_2': 'UMAP 2',
            'umap_cluster': 'Cluster'
        },
        opacity=0.7
    )

    fig.update_layout(
        showlegend=True,
        legend=dict(title='umap_cluster'),
        margin=dict(l=10, r=10, t=40, b=10)
    )

    fig.show()
    return plot_df

# Visualize with DBSCAN clusters, using the same DBSCAN settings for both projections
dbscan_settings = dict(eps=0.5, min_samples=5)
pca_clusters = visualizeEmbeddingsPCA_with_DBSCAN(reviews, **dbscan_settings)
umap_clusters = visualizeEmbeddingsUMAP_with_DBSCAN(reviews, **dbscan_settings)




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [111]:
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Plot reviews by communities, using embeddings, cosine similarity and the Girvan-Newman algorithm
def plotCommunities(reviews, similarity_threshold=0.75):
    """Draw a similarity graph of the reviews, coloured by detected community.

    Every review becomes a node. Two reviews are connected when the cosine
    similarity of their embeddings is at least ``similarity_threshold``. The
    first split produced by the Girvan-Newman algorithm defines the
    communities, and each community is described by up to three TF-IDF terms
    (or by the text of its first review when no terms can be extracted).

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain ``embedding``, ``sentiment_label`` and ``cleaned_review``.
    similarity_threshold : float, default 0.75
        Minimum cosine similarity for an edge between two reviews.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Load embeddings from reviews
    ebm_reviews = np.array(reviews['embedding'].tolist())

    # Calculate cosine similarity matrix between all pairs of embeddings
    similarity_matrix = cosine_similarity(ebm_reviews)

    G_sparser = nx.Graph()

    # Add nodes representing each review
    for i, label in enumerate(reviews['sentiment_label'].tolist()):
        G_sparser.add_node(i, sentiment_label=label)

    # Add edges for every pair (i < j) at or above the threshold. The pairs are
    # located with a vectorised mask over the upper triangle instead of a
    # nested Python loop over all n*(n-1)/2 combinations.
    above_threshold = np.triu(similarity_matrix >= similarity_threshold, k=1)
    rows, cols = np.nonzero(above_threshold)
    weights = similarity_matrix[rows, cols]
    for i, j, weight in zip(rows.tolist(), cols.tolist(), weights.tolist()):
        G_sparser.add_edge(i, j, weight=weight)

    # Use Girvan-Newman algorithm to detect communities (first split only)
    comp = nx.algorithms.community.girvan_newman(G_sparser)
    communities_sparser = tuple(sorted(c) for c in next(comp))

    # Extract key terms from each community using TF-IDF
    vectorizer = TfidfVectorizer(max_features=3, stop_words='english')
    tokenize = vectorizer.build_tokenizer()
    community_keywords = []

    for community in communities_sparser:
        members = list(community)
        reviews_text = reviews.iloc[members]['cleaned_review'].astype(str).tolist()
        # Drop texts that produce no tokens at all
        filtered_reviews_text = [text for text in reviews_text if len(tokenize(text)) > 0]

        keywords = None
        if len(filtered_reviews_text) > 1:
            try:
                vectorizer.fit(filtered_reviews_text)
                keywords = ", ".join(vectorizer.get_feature_names_out())
            except ValueError:
                # Raised when only stop words remain (empty vocabulary)
                keywords = None

        if keywords is None:
            # Fall back to the text of the community's first review
            keywords = reviews.iloc[members[0]]['cleaned_review']
        community_keywords.append(keywords)

    # Prepare data for Plotly interactive visualization
    node_x = []
    node_y = []
    node_color = []
    node_text = []

    pos = nx.spring_layout(G_sparser, seed=42)
    colors = px.colors.qualitative.Set1  # A set of distinct colors for different communities

    # Extract node positions, colors, and labels for Plotly
    for i, community in enumerate(communities_sparser):
        for node in community:
            x, y = pos[node]
            node_x.append(x)
            node_y.append(y)
            node_color.append(colors[i % len(colors)])
            node_text.append(f"{community_keywords[i]}")

    # Create edge traces (None separates the individual line segments)
    edge_x = []
    edge_y = []

    for edge in G_sparser.edges(data=True):
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Create the Plotly figure
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='gray'),
        hoverinfo='none',
        mode='lines')

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            size=10,
            line_width=2,
            color=node_color
        )
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # title.font replaces the deprecated `titlefont_size`
                        title=dict(text='Reviews by Communities', font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    fig.show()

# Deliberately disabled in the original notebook; left off here as well.
if False:
    plotCommunities(reviews)

In [112]:
### Join PCA and UMAP clusters info to reviews
# Written so the cell can be re-run safely: `review_id` is only created once,
# and cluster columns left over from a previous run are dropped before merging
# so the join key stays unambiguous.
if 'review_id' not in reviews.columns:
    reviews = reviews.reset_index().rename(columns={'index':'review_id'})
reviews = (
    reviews
    .drop(columns=['pca_cluster', 'umap_cluster'], errors='ignore')
    .merge(pca_clusters[['review_id','pca_cluster']], on='review_id')
    .merge(umap_clusters[['review_id','umap_cluster']], on='review_id')
)

In [113]:
# Write the enriched reviews next to the other processed files
ml_reviews_path = processed_data_path + name + '_ml_processed_reviews.csv'
reviews.to_csv(ml_reviews_path, index=False)
print('OK! -> processed sample reviews saved at', ml_reviews_path)

OK! -> processed sample reviews saved at ../data/processed/Oceana Grill_ml_processed_reviews.csv


#### Topics

In [114]:
from gensim import corpora
from gensim.models import LdaModel

# Extract topics using LDA model
def analyzeTopicsLDA(df, number_of_topics = 5):
    """Fit an LDA topic model on ``df['cleaned_review']`` and print its topics.

    Reviews are split on whitespace; missing, non-string and blank reviews are
    ignored. The model is trained with 10 passes and a fixed random state.

    Returns
    -------
    tuple
        ``(lda_model, topics)`` where ``topics`` is the result of
        ``print_topics(num_words=5)``. ``(None, [])`` is returned, after
        printing a message, when there is nothing to train on or when training
        raises ``ValueError``.
    """
    # Prepare corpus for LDA: one token list per usable review
    tokenized_reviews = []
    for review in df['cleaned_review'].dropna().tolist():
        if isinstance(review, str) and review.strip() != '':
            tokenized_reviews.append(review.split())

    if len(tokenized_reviews) == 0:
        print("No valid reviews to process.")
        return None, []

    dictionary = corpora.Dictionary(tokenized_reviews)
    if len(dictionary) == 0:
        print("Dictionary is empty after tokenization.")
        return None, []

    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_reviews]
    if all(len(bag_of_words) == 0 for bag_of_words in corpus):
        print("Corpus is empty. No terms found in any document.")
        return None, []

    # Train LDA model
    try:
        lda_model = LdaModel(
            corpus,
            num_topics=number_of_topics,
            id2word=dictionary,
            passes=10,
            random_state=42
        )
    except ValueError as e:
        print(f"LDA Model training failed: {e}")
        return None, []

    # Extract topics and print them as "Topic <id>: <terms>"
    topics = lda_model.print_topics(num_words=5)
    for topic_id, topic_terms in ((topic[0], topic[1]) for topic in topics):
        print(f"Topic {topic_id}: {topic_terms}")
    return lda_model, topics

# Topics over the full set of reviews
print('=== General topics ===')
general_topics_result = analyzeTopicsLDA(reviews)
lda_model, topics = general_topics_result

=== General topics ===
Topic 0: 0.073*"the" + 0.050*"and" + 0.044*"was" + 0.018*"i" + 0.018*"had"
Topic 1: 0.100*"breakfast" + 0.017*"eggs" + 0.017*"bloody" + 0.016*"omelette" + 0.016*"grits"
Topic 2: 0.050*"the" + 0.048*"and" + 0.042*"was" + 0.033*"we" + 0.025*"to"
Topic 3: 0.057*"i" + 0.042*"the" + 0.031*"to" + 0.026*"it" + 0.021*"was"
Topic 4: 0.056*"the" + 0.035*"is" + 0.026*"and" + 0.021*"you" + 0.020*"food"


In [115]:
# Generate topics for all selected columns in group columns
def generateTopicsbyColumn(reviews, group_columns):
    """Run ``analyzeTopicsLDA`` separately for every value of each grouping column.

    For each column in ``group_columns`` and each distinct non-null value in
    that column, the matching reviews are passed to ``analyzeTopicsLDA``.
    Groups with fewer than 5 reviews are skipped. Progress is printed.

    Returns
    -------
    dict
        ``{column: {group_value: [topic_string, ...]}}``; a group only appears
        when topics were produced for it.
    """
    # One (initially empty) result dictionary per grouping column
    topics_dict = dict((column, {}) for column in group_columns)

    for group_col in group_columns:
        print(f"\n=== Topics by {group_col} ===")

        for group_val in reviews[group_col].dropna().unique():
            subset = reviews[reviews[group_col] == group_val]
            print(f"\n--- {group_col} = {group_val} ---")

            # Check if there are enough reviews to train LDA
            if len(subset) < 5:
                print("Not enough data to train LDA.")
                continue

            # Generate topics for the current subset
            lda_model, topics = analyzeTopicsLDA(subset)
            if lda_model is None or not topics:
                print("No topics generated for this group.\n")
                continue

            # Store only the topic strings (second element of each topic tuple)
            topics_dict[group_col][group_val] = [topic[1] for topic in topics]

    return topics_dict

# Columns whose distinct values define the review groups to model separately.
group_columns = ['pca_cluster', 'umap_cluster', 'sentiment_label']
topics_dict = generateTopicsbyColumn(reviews, group_columns)


=== Topics by pca_cluster ===

--- pca_cluster = 0 ---
Topic 0: 0.073*"the" + 0.051*"and" + 0.045*"was" + 0.018*"had" + 0.017*"i"
Topic 1: 0.106*"breakfast" + 0.018*"eggs" + 0.017*"omelette" + 0.015*"toast" + 0.015*"bloody"
Topic 2: 0.050*"the" + 0.046*"and" + 0.041*"was" + 0.033*"we" + 0.025*"to"
Topic 3: 0.058*"i" + 0.043*"the" + 0.031*"to" + 0.027*"it" + 0.021*"was"
Topic 4: 0.056*"the" + 0.035*"is" + 0.026*"and" + 0.021*"you" + 0.020*"food"

--- pca_cluster = -1 ---
Not enough data to train LDA.

=== Topics by umap_cluster ===

--- umap_cluster = 0 ---
Topic 0: 0.068*"the" + 0.042*"was" + 0.042*"and" + 0.027*"i" + 0.020*"it"
Topic 1: 0.047*"the" + 0.030*"i" + 0.027*"and" + 0.026*"to" + 0.023*"is"
Topic 2: 0.005*"glutar" + 0.005*"free" + 0.004*"owner" + 0.003*"sweetest" + 0.003*"nightmar"
Topic 3: 0.057*"and" + 0.051*"was" + 0.051*"the" + 0.024*"great" + 0.023*"food"
Topic 4: 0.042*"the" + 0.036*"to" + 0.020*"and" + 0.017*"we" + 0.015*"food"

--- umap_cluster = 1 ---
Topic 0: 0.048

#### Extract the periods with the worst ratings and process their reviews

In [116]:
# Extract the lowest-scoring periods and the negative reviews of each one
def analyzeLowScores(df, score_column, time_period='month', num_periods=1, last_periods = 12):
    """Find the worst-scoring recent periods and return their negative reviews.

    Per-period average scores are computed over the trailing window only. A
    period is "low" when its average is below ``mean - std`` of those averages.
    Among the low periods, the ``num_periods`` with the lowest average are kept.

    Parameters
    ----------
    df : DataFrame with a ``date`` column that supports subtracting a
        ``pd.DateOffset``, a ``time_period`` column and ``score_column``.
    score_column : name of the score column to analyse.
    time_period : name of the column holding the period each review belongs to.
    num_periods : maximum number of low periods to select.
    last_periods : size of the trailing window, always in months (independent
        of ``time_period``), counted back from the latest date in ``df``.

    Returns
    -------
    (period_reviews, low_score_periods)
        ``period_reviews``: copy of the rows of ``df`` in the selected periods
        with ``score_column <= 3``, without the ``embedding`` column, plus a
        ``low_score_period`` column, sorted by that column.
        ``low_score_periods``: index of the selected periods in ascending
        period order.
    """
    # Restrict the baseline to the trailing window
    window_start = df['date'].max() - pd.DateOffset(months=last_periods)
    recent_reviews = df[df['date'] >= window_start]

    # Average score of every period inside the window
    period_avg_scores = recent_reviews.groupby(time_period)[score_column].mean()

    # Define a threshold for low scores: one standard deviation below the mean
    threshold = period_avg_scores.mean() - period_avg_scores.std()
    low_scores = period_avg_scores[period_avg_scores < threshold]

    # Keep the periods with the LOWEST averages (not merely the earliest ones),
    # then restore chronological order for stable downstream output
    low_score_periods = low_scores.nsmallest(num_periods).index.sort_values()

    # Filter negative reviews for the selected periods; copy so the column
    # added below never writes into a view of the caller's frame
    in_low_period = df[time_period].isin(low_score_periods)
    is_negative = df[score_column] <= 3
    period_reviews = df[in_low_period & is_negative].copy()

    # Drop the 'embedding' column if it exists to avoid issues with non-hashable types
    if 'embedding' in period_reviews.columns:
        period_reviews = period_reviews.drop(columns=['embedding'])

    # Add a column indicating the period with the lowest score for easier filtering
    period_reviews['low_score_period'] = period_reviews[time_period]
    period_reviews = period_reviews.sort_values('low_score_period')

    return period_reviews, low_score_periods

In [117]:
# Usage
time_period = 'month'  # Change to 'week', 'year', etc. to analyze different periods
num_periods = 3  # Number of periods with the lowest average score to select

# Analyze for each score type
# `last_periods` is not passed, so every call uses the function default (12).
# Only the rating call keeps the selected periods (`low_score_periods`); the
# other three discard that second return value.
negative_periods_rating_reviews, low_score_periods = analyzeLowScores(reviews, 'rating_score', time_period, num_periods)
negative_periods_food_reviews, _ = analyzeLowScores(reviews, 'food_score', time_period, num_periods)
negative_periods_service_reviews, _ = analyzeLowScores(reviews, 'service_score', time_period, num_periods)
negative_periods_atmosphere_reviews, _ = analyzeLowScores(reviews, 'atmosphere_score', time_period, num_periods)

In [118]:
# Calculate topics for each low_score_period and concatenate results
def generateTopicsPerPeriod(df, score_column, number_of_topics=1):
    """Run LDA once per distinct ``low_score_period`` value.

    Rows whose ``review`` is null are ignored. The result is
    ``{score_column: {period: topics}}`` where ``topics`` is the second value
    returned by ``analyzeTopicsLDA`` for that period's reviews.
    """
    with_text = df.loc[df['review'].notna()]

    topics_by_period = {}
    for period in with_text['low_score_period'].unique():
        in_period = with_text['low_score_period'] == period
        # Only the topic list is needed; the fitted model is discarded
        topics_by_period[period] = analyzeTopicsLDA(
            with_text[in_period], number_of_topics=number_of_topics
        )[1]

    return {score_column: topics_by_period}

# One call per score type; `number_of_topics` is left at its default of 1,
# so a single topic is extracted for every low-score period.
negative_periods_rating_topics = generateTopicsPerPeriod(negative_periods_rating_reviews, 'rating_score')
negative_periods_food_topics = generateTopicsPerPeriod(negative_periods_food_reviews, 'food_score')
negative_periods_service_topics = generateTopicsPerPeriod(negative_periods_service_reviews, 'service_score')
negative_periods_atmosphere_topics = generateTopicsPerPeriod(negative_periods_atmosphere_reviews, 'atmosphere_score')

# Each result has a single top-level key (its score column), so the merge
# yields one dictionary keyed by score column.
negative_periods_topics = {**negative_periods_rating_topics, **negative_periods_food_topics, **negative_periods_service_topics, **negative_periods_atmosphere_topics}

Topic 0: 0.050*"the" + 0.025*"and" + 0.021*"was" + 0.021*"to" + 0.016*"i"
Topic 0: 0.040*"the" + 0.024*"was" + 0.019*"to" + 0.018*"and" + 0.013*"we"
Topic 0: 0.048*"the" + 0.030*"and" + 0.023*"was" + 0.018*"to" + 0.012*"i"
Topic 0: 0.040*"the" + 0.026*"was" + 0.025*"and" + 0.016*"to" + 0.014*"were"
Topic 0: 0.046*"the" + 0.028*"was" + 0.027*"and" + 0.021*"to" + 0.014*"it"
Topic 0: 0.050*"the" + 0.024*"was" + 0.024*"and" + 0.020*"i" + 0.019*"to"
Topic 0: 0.045*"the" + 0.032*"was" + 0.028*"i" + 0.027*"and" + 0.015*"my"


#### Extract outliers and pain points

In [119]:
import json
import numpy as np

# Format arrays of words in json format
def format_words(words_list):
    """Turn an iterable of ``(word, weight)`` pairs into a ``{str(word): weight}`` dict.

    Integer weights (Python or numpy) are converted to plain ``int``; any
    other weight is stored unchanged. Later pairs overwrite earlier ones that
    share the same key.
    """
    formatted = {}
    for word, weight in words_list:
        if isinstance(weight, (int, np.integer)):
            weight = int(weight)
        formatted[str(word)] = weight
    return formatted

# Join all the available information
# The four `common_*` collections are not defined in this section of the
# notebook; they must already exist when this cell runs.
words_dict = {
    "common_positive_words": format_words(common_positive_words),
    "common_negative_words": format_words(common_negative_words),
    "common_positive_bigrams": format_words(common_positive_bigrams),
    "common_negative_bigrams": format_words(common_negative_bigrams)
}
print(words_dict)

# Merge the per-group LDA topics with the frequent words/bigrams into one dictionary.
reviews_summary_dict = {**topics_dict, **words_dict}
print(reviews_summary_dict)

{'common_positive_words': {'the': 26133, 'and': 18364, 'was': 15166, 'to': 8506, 'we': 7159, 'of': 6398, 'it': 6260, 'in': 5176, 'for': 5075, 'had': 4958}, 'common_negative_words': {'the': 5250, 'and': 3015, 'was': 2797, 'to': 2113, 'it': 1443, 'of': 1306, 'we': 1209, 'in': 1078, 'food': 1003, 'for': 992}, 'common_positive_bigrams': {'it was': 2441, 'and the': 1936, 'the food': 1907, 'had the': 1505, 'new orleans': 1455, 'this placir': 1374, 'of the': 1299, 'food was': 1297, 'in the': 1149, 'the best': 1133}, 'common_negative_bigrams': {'it was': 471, 'the food': 399, 'this placir': 311, 'and the': 296, 'in the': 295, 'of the': 276, 'on the': 223, 'food was': 219, 'new orleans': 186, 'we were': 180}}
{'pca_cluster': {0: ['0.073*"the" + 0.051*"and" + 0.045*"was" + 0.018*"had" + 0.017*"i"', '0.106*"breakfast" + 0.018*"eggs" + 0.017*"omelette" + 0.015*"toast" + 0.015*"bloody"', '0.050*"the" + 0.046*"and" + 0.041*"was" + 0.033*"we" + 0.025*"to"', '0.058*"i" + 0.043*"the" + 0.031*"to" + 0.0

#### Extract reviews samples

In [120]:
# Calculate total score using the three main scores
reviews_score = reviews.copy()
score_columns = ['food_score', 'service_score', 'atmosphere_score']

# Rounded column means on the columns' own scale, used to impute missing sub-scores
raw_score_means = {score_col: np.round(reviews_score[score_col].mean(), 2) for score_col in score_columns}

# Normalized means (mean / 5); these names and values are unchanged from the
# previous version of this cell in case other cells read them
food_score_mean = raw_score_means['food_score'] / 5
service_score_mean = raw_score_means['service_score'] / 5
atmosphere_score_mean = raw_score_means['atmosphere_score'] / 5

# Impute with the RAW mean: total_score divides every sub-score by 5 below, so
# filling with the already-normalized mean would scale imputed values down twice
for score_col in score_columns:
    reviews_score[score_col] = reviews_score[score_col].fillna(raw_score_means[score_col])

# total_score = rating_score plus the average of the three sub-scores after
# dividing each by 5
reviews_score['total_score'] = np.round(
    reviews_score['rating_score'] +
    (reviews_score['food_score']/5 + reviews_score['service_score']/5 + reviews_score['atmosphere_score']/5) / 3, 2)

In [121]:
# Filter not null reviews
valid_reviews = reviews_score[reviews_score['review'].notna()]

# Select the best and worst reviews in general
# (thresholds apply to total_score = rating_score + averaged, /5-scaled sub-scores)
best_reviews = valid_reviews[valid_reviews['total_score'] > 5]
worst_reviews = valid_reviews[valid_reviews['total_score'] < 2.5]

# "recent_*" frames: the same best/worst rows ordered from newest to oldest
recent_best_reviews = best_reviews.sort_values(by='date', ascending=False)
print('last_positive_reviews')
print(recent_best_reviews.review)
recent_worst_reviews = worst_reviews.sort_values(by='date', ascending=False)
print('\nlast_negative_reviews')
print(recent_worst_reviews.review)

# "*_sample" frames: again the same rows, ordered by total_score instead
# (no rows are sampled or dropped here, only re-sorted)
best_reviews_sample = best_reviews.sort_values(by='total_score', ascending=False)
print('\nbest_reviews_sample')
print(best_reviews_sample.review)
worst_reviews_sample = worst_reviews.sort_values(by='total_score', ascending=True)
print('\nworst_reviews_sample')
print(worst_reviews_sample.review)

# Negative reviews from the low-rating periods; the 'month' column is selected
# by name here rather than through the `time_period` variable
low_score_reviews = negative_periods_rating_reviews[negative_periods_rating_reviews['review'].notna()][['month','review','rating_score']]
print('\nlow_score_reviews')
display(low_score_reviews)
print(low_score_periods)

last_positive_reviews
7089    Our family was visiting from Florida and stopp...
7388    The best crab cakes I've ever had. I mean ever...
7149    The ratatouille in this was amazing, definitel...
7312    Great food, excellent service.\n\nWe were in N...
7360    It was my boyfriend and my first time in New O...
                              ...                        
2055    I only eat BBQ shrimp from here and it is wond...
4777    Best Damn Catfish I've Ever Had !!!!!!!!!! \n\...
4242    Hands down the best seafood I have ever had. W...
856     WTF is up with some of these low starred revie...
3320    I dined here in August. The service was so-so,...
Name: review, Length: 4012, dtype: object

last_negative_reviews
7352    A group of us went to this place on a Friday f...
7416    Absolutely trash. I don't understand how this ...
7400    Hmmm....I was not impressed. First, I have to ...
7374    Tourist trap. Service was pretty bad, food cam...
6723    Just ok. We chose this because it w

Unnamed: 0,month,review,rating_score
5714,2021-07,Meh. Tough to go to Chef Ron's and then this p...,3
6721,2021-07,"Dinner was good, but breakfast not at all. One...",2
6616,2021-07,Our appetizers were very good. Crab cakes and ...,3
6555,2021-07,So finally made it back to Nola! I really want...,3
6430,2021-07,Food is just ok. It's not worth the long wait....,3
6345,2021-07,Not a pleasant experience. Despite having a re...,2
6339,2021-07,Was very surprised. It took an hour and 20 min...,3
7448,2021-07,Had the bbq shrimp and Oysters Rockefeller. Co...,1
6267,2021-07,Not sure really what I can say at this moment ...,1
6268,2021-07,Catfish and shrimp were not freshly cooked and...,2


PeriodIndex(['2021-07', '2021-10'], dtype='period[M]', name='month')


In [122]:
# Join all the samples
# Tag every frame with its origin so rows can be told apart after concatenation.
recent_best_reviews['sample_type'] = 'recent_best_reviews'
recent_worst_reviews['sample_type'] = 'recent_worst_reviews'
best_reviews_sample['sample_type'] = 'best_reviews_sample'
worst_reviews_sample['sample_type'] = 'worst_reviews_sample'
low_score_reviews['sample_type'] = 'low_score_reviews'

# NOTE(review): the recent_* and *_sample frames are different orderings of the
# same rows, so each best/worst review appears twice in the combined output
# (once per sample_type). low_score_reviews has fewer columns than the others.
combined_reviews = pd.concat([
    recent_best_reviews,
    recent_worst_reviews,
    best_reviews_sample,
    worst_reviews_sample,
    low_score_reviews
])

# Save samples
combined_reviews.reset_index(drop=True, inplace=True)
combined_reviews.to_csv(processed_data_path + name + '_sample_selected_reviews.csv', index=False)
print('OK! -> processed sample reviews saved at', processed_data_path + name + '_sample_selected_reviews.csv')

OK! -> processed sample reviews saved at ../data/processed/Oceana Grill_sample_selected_reviews.csv


### ChatGPT API init

In [123]:
import sys
import os
# Add the parent directory (resolved to an absolute path) to sys.path so that
# modules located there can be imported by the following cells.
project_root = os.path.abspath("..")
sys.path.append(project_root)

In [124]:
# Init ChatGPT client
from openai import OpenAI
import openai_setup

# Credentials are read from the `conf` mapping of the openai_setup module,
# so no key is written in the notebook itself.
organization = openai_setup.conf['organization']
project = openai_setup.conf['project']
key = openai_setup.conf['key']

# Client reused by every API call below.
client = OpenAI(
    api_key=key,
    organization=organization,
    project=project
)

In [125]:
# Clean json outputs
def extract_json_string(input_string):
    """Extract and parse the ``{...}`` object embedded in a text answer.

    The span from the first ``{`` to the last ``}`` is taken (so surrounding
    prose or Markdown code fences are ignored) and parsed as JSON. If that
    fails, the span is parsed as a Python literal, which accepts the
    single-quoted dictionaries that models sometimes return.

    Parameters
    ----------
    input_string : text that may contain a JSON object. Non-string values
        (e.g. ``None``) are accepted and yield ``None``.

    Returns
    -------
    dict or None
        The parsed object, or ``None`` when there is no ``{...}`` span or it
        cannot be parsed as a dictionary.
    """
    if not isinstance(input_string, str):
        return None

    # Greedy match with DOTALL: spans newlines and nested objects
    json_match = re.search(r'\{.*\}', input_string, re.DOTALL)
    if json_match is None:
        return None

    json_string = json_match.group(0)
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        # Not strict JSON: try a Python literal. literal_eval only builds
        # literals, it does not execute code.
        try:
            parsed = ast.literal_eval(json_string)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None

    # A brace-delimited Python literal can also be a set; only dicts are valid results
    return parsed if isinstance(parsed, dict) else None


In [126]:
# Extract main insights from API
def extractInsightsWithAI(info_dict, prompt, client):
    """Ask the chat model for insights about ``info_dict`` and parse its answer.

    The user message is ``prompt`` followed by ``str(info_dict)``. The content
    of the first returned choice is passed through ``extract_json_string`` and
    that result is returned.
    """
    system_message = {
        "role": "system",
        "content": "You are a system expert in extracting value from reviews analysed using ML and NLP techniques, to provide valuable and actionable insights to stakeholders in an automated BI tool using AI.",
    }
    user_message = {"role": "user", "content": prompt + str(info_dict)}

    # Send both messages to the gpt-4o-mini chat model
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
    )

    # Only the first choice is used; strip any text around the JSON object
    raw_answer = response.choices[0].message.content
    return extract_json_string(raw_answer)

#### Extract main insights

In [127]:
# Prompt to extract automatically the general insights
# NOTE(review): the format example inside the prompt uses unquoted keys and
# single quotes, which is not strict JSON, while the answer is later parsed
# with json.loads in extract_json_string.
general_insights_prompt = (
    "I have this information extracted from LDA topics using clustering and sentiment analysis, including positive and negative terms, in JSON format.\n"
    "I want you to extract:\n"
    "- 3 positive points\n"
    "- 3 negative points\n"
    "- 3 improvement suggestions based on the negative points\n"
    "\n"
    "Each point should be a logical, simple, and concise sentence that provides value. Do not name specific terms or topics, but focus on delivering direct value to business stakeholders without ambiguity. If you mention something that didn't go well, give examples based on the information.\n"
    "Return the result in English in JSON format, ensuring it is easy to read in a notebook and standardized as follows:\n"
    "\n"
    "{best:['','',''], worst:['','',''], improve:['','','']}\n"
    "\n"
    "Ensure there are no contradictions between positive, negative, and improvement points.\n"
    "The information:\n"
)
# Show the payload that will be appended to the prompt.
print(reviews_summary_dict)

{'pca_cluster': {0: ['0.073*"the" + 0.051*"and" + 0.045*"was" + 0.018*"had" + 0.017*"i"', '0.106*"breakfast" + 0.018*"eggs" + 0.017*"omelette" + 0.015*"toast" + 0.015*"bloody"', '0.050*"the" + 0.046*"and" + 0.041*"was" + 0.033*"we" + 0.025*"to"', '0.058*"i" + 0.043*"the" + 0.031*"to" + 0.027*"it" + 0.021*"was"', '0.056*"the" + 0.035*"is" + 0.026*"and" + 0.021*"you" + 0.020*"food"']}, 'umap_cluster': {0: ['0.068*"the" + 0.042*"was" + 0.042*"and" + 0.027*"i" + 0.020*"it"', '0.047*"the" + 0.030*"i" + 0.027*"and" + 0.026*"to" + 0.023*"is"', '0.005*"glutar" + 0.005*"free" + 0.004*"owner" + 0.003*"sweetest" + 0.003*"nightmar"', '0.057*"and" + 0.051*"was" + 0.051*"the" + 0.024*"great" + 0.023*"food"', '0.042*"the" + 0.036*"to" + 0.020*"and" + 0.017*"we" + 0.015*"food"'], 1: ['0.048*"the" + 0.032*"and" + 0.028*"i" + 0.024*"to" + 0.021*"was"', '0.042*"the" + 0.039*"and" + 0.019*"to" + 0.019*"was" + 0.017*"i"', '0.059*"the" + 0.024*"and" + 0.023*"to" + 0.019*"was" + 0.018*"i"', '0.043*"the" + 0.

In [128]:
# Send the summary dictionary with the general prompt; the result is whatever
# extract_json_string could parse from the model's answer (None if nothing matched).
insigths_summary_dict = extractInsightsWithAI(reviews_summary_dict, general_insights_prompt, client)
print(insigths_summary_dict)

{'best': ['The food was often highlighted as great and enjoyable.', 'Customers expressed satisfaction with breakfast options, particularly the variety.', 'Service was acknowledged positively, with many appreciating the attentiveness of staff.'], 'worst': ['Some patrons experienced dissatisfaction with the food quality, citing it as uneven.', 'A few reviews mentioned issues with the dining environment, indicating it felt cramped.', 'Customers found the wait times to be longer than expected, leading to frustration.'], 'improve': ['Enhance the consistency of food quality to ensure all dishes meet customer expectations.', 'Consider redesigning the dining space to improve comfort and reduce crowding.', 'Implement strategies to streamline service and reduce wait times for a better dining experience.']}


In [129]:
# Save the general insights as an indented JSON file in the processed-data folder.
json_file_path = processed_data_path + name + '_general_insights.json'
with open(json_file_path, 'w') as json_file:
    json.dump(insigths_summary_dict, json_file, indent=4)
print('OK! -> general insights saved at', json_file_path)

OK! -> general insights saved at ../data/processed/Oceana Grill_general_insights.json


#### Extract pain moments

In [130]:
# Prompt asking for problems and improvement suggestions for each low-score date.
# NOTE(review): as in the general prompt, the format example is not strict JSON
# (unquoted keys), while the answer is parsed with json.loads.
negative_periods_insights_prompt = (
    "I have this information extracted from LDA topics using clustering and sentiment analysis, including positive and negative terms at specific moments, in JSON format.\n"
    "\n"
    "I want you to extract:\n"
    "- For each date:\n"
    "- N negative points\n"
    "- N improvement suggestions based on the negative points\n"
    "\n"
    "Each point should be a logical, simple, and concise sentence that provides value. Do not mention specific terms or topics, but focus on delivering direct value to business stakeholders without ambiguity. If you mention something that didn't go well, provide examples based on the information.\n"
    "Return the result in English in JSON format, ensuring it is easy to read in a notebook and standardized as follows:\n"
    "\n"
    "{date: {problems:[problem, problem...], improve:[improve,improve...]}, date:{problems:[problem, problem...], improve:[improve,improve...]}, ...}\n"
    "\n"
    "Make sure there are no contradictions between the points.\n"
    "\n"
    "The information:\n"
)
# Show the payload that will be appended to the prompt.
print(negative_periods_topics)

{'rating_score': {Period('2021-07', 'M'): [(0, '0.050*"the" + 0.025*"and" + 0.021*"was" + 0.021*"to" + 0.016*"i"')], Period('2021-10', 'M'): [(0, '0.040*"the" + 0.024*"was" + 0.019*"to" + 0.018*"and" + 0.013*"we"')]}, 'food_score': {Period('2021-06', 'M'): [(0, '0.048*"the" + 0.030*"and" + 0.023*"was" + 0.018*"to" + 0.012*"i"')], Period('2021-10', 'M'): [(0, '0.040*"the" + 0.026*"was" + 0.025*"and" + 0.016*"to" + 0.014*"were"')]}, 'service_score': {Period('2021-03', 'M'): [(0, '0.046*"the" + 0.028*"was" + 0.027*"and" + 0.021*"to" + 0.014*"it"')], Period('2021-07', 'M'): [(0, '0.050*"the" + 0.024*"was" + 0.024*"and" + 0.020*"i" + 0.019*"to"')]}, 'atmosphere_score': {Period('2021-01', 'M'): [(0, '0.045*"the" + 0.032*"was" + 0.028*"i" + 0.027*"and" + 0.015*"my"')]}}


In [131]:
# NOTE: this reuses the name `insigths_summary_dict`, overwriting the general
# insights computed earlier with the per-period insights.
insigths_summary_dict = extractInsightsWithAI(negative_periods_topics, negative_periods_insights_prompt, client)
print(insigths_summary_dict)

{'2021-01': {'problems': ['Customers felt that the atmosphere did not meet their expectations.', 'Some found the environment to be less inviting and warm.'], 'improve': ['Enhance the decor to create a more welcoming atmosphere.', 'Consider soft lighting and improved seating arrangements.']}, '2021-03': {'problems': ['There were complaints regarding the service speed.', 'Customers expressed dissatisfaction with staff attentiveness.'], 'improve': ['Implement training programs to improve service efficiency.', 'Introduce a system for monitoring service responsiveness.']}, '2021-06': {'problems': ['The quality of food was not consistent, leading to mixed reviews.', 'Some items were not prepared to the expected standards.'], 'improve': ['Standardize recipes and ensure regular training for kitchen staff.', 'Conduct regular quality checks on menu items.']}, '2021-07': {'problems': ['Customers noted that the overall experience fell short of their expectations.', 'Some felt that the service coul

In [132]:
## Save insights
# At this point `insigths_summary_dict` holds the per-period insights from the
# previous cell, not the general ones.
json_file_path = processed_data_path + name + '_worst_periods_insights.json'
with open(json_file_path, 'w') as json_file:
    json.dump(insigths_summary_dict, json_file, indent=4)
print('OK! -> worst periods insights saved at', json_file_path)

OK! -> worst periods insights saved at ../data/processed/Oceana Grill_worst_periods_insights.json


In [133]:
# `most_recommended` and `less_recommended` are not defined in this section of
# the notebook; they must already exist when this cell runs.
print("Top Most Recommended:", most_recommended)
print("Least Recommended :", less_recommended)

Top Most Recommended: []
Least Recommended : []
