In [143]:
import pandas as pd
import numpy as np
import re
import ast

import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [144]:
# Load the processed and cleaned data.
# `name` selects which venue's files are read (presumably a venue identifier - confirm).
name = 'hd'
processed_data_path = '../data/processed/'
raw_data_path = '../data/raw/'

# Per-review table (processed) and per-star review counts (raw summary)
reviews_pro = pd.read_csv(f"{processed_data_path}{name}_reviews.csv")
resumme_raw = pd.read_csv(f"{raw_data_path}resumme_{name}.csv")

# Work on copies so the frames as loaded from disk stay untouched
reviews, resumme = reviews_pro.copy(), resumme_raw.copy()

# Preview: the whole summary table and five random reviews
display(resumme_raw)
display(reviews_pro.sample(5))

Unnamed: 0,stars,reviews
0,5,2290
1,4,1308
2,3,396
3,2,132
4,1,128


Unnamed: 0,review_id,review,local_guide_reviews,rating_score,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,recommendations_list,date,avg_price_per_person
157,157,De esos sitios que tienen malas maneras los em...,219.0,1.0,Comí allí,Otro,10-20 €,2.0,1.0,2.0,[''],2024-02-01,20.0
51,51,Una cafetería con un ambiente agradable. Iba ...,651.0,5.0,,Otro,10-20 €,5.0,5.0,5.0,[''],2024-02-01,20.0
20,20,Siguiendo recomendaciones decidí visitar hoy D...,33.0,5.0,Comí allí,Cena,20-30 €,5.0,5.0,5.0,"['Sándwich de Pollo Frito', 'Hamburguesa Compl...",2024-01-01,30.0
346,346,,348.0,4.0,,,,2.0,4.0,3.0,[''],2023-11-01,
66,66,"Hamburguesas buenísimas, como siempre. El mil ...",427.0,4.0,,,,,,,[''],2018-01-01,


### First draft summary plots 

In [145]:
# Mean of each per-category score column
average_food, average_service, average_atmosphere = (
    reviews[score_column].mean()
    for score_column in ('food_score', 'service_score', 'atmosphere_score')
)
# Overall rating: star values weighted by the number of reviews that gave each star count
average_reviews = resumme_raw['stars'].mul(resumme_raw['reviews']).sum() / resumme_raw['reviews'].sum()

# One row, three panels: headline number | reviews per star | category averages
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "xy"}, {"type": "bar"}, {"type": "bar"}]],
    subplot_titles=("Average Score", "Number of Reviews", "Categories"),
)

# Panel 1: the weighted average drawn as one large text marker on hidden axes
headline_trace = go.Scatter(
    x=[0],
    y=[0],
    text=[f"{average_reviews:.2f}"],
    mode="text",
    textfont=dict(size=120),
)
fig.add_trace(headline_trace, row=1, col=1)

hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
fig.update_xaxes(row=1, col=1, **hidden_axis)
fig.update_yaxes(row=1, col=1, **hidden_axis)

# Panel 2: horizontal bars with the number of reviews per star value
reviews_trace = go.Bar(
    x=resumme_raw['reviews'],
    y=resumme_raw['stars'],
    orientation='h',
    name="Reviews",
    marker=dict(color='lightskyblue'),
    text=resumme_raw['reviews'],
    textposition='auto',
)
fig.add_trace(reviews_trace, row=1, col=2)

# Panel 3: horizontal bars with the average Food / Service / Atmosphere score
category_values = [average_food, average_service, average_atmosphere]
categories_trace = go.Bar(
    x=category_values,
    y=['Food', 'Service', 'Atmosphere'],
    orientation='h',
    name="Categories",
    marker=dict(color='lightgreen'),
    text=[f"{average_food:.2f}", f"{average_service:.2f}", f"{average_atmosphere:.2f}"],
    textposition='auto',
)
fig.add_trace(categories_trace, row=1, col=3)

fig.update_layout(
    height=500,
    width=1200,
    plot_bgcolor="white",
    paper_bgcolor="white",
    showlegend=False,
)
fig.show()

In [146]:
# Convert date column to datetime format and derive the grouping keys.
# Unparseable dates become NaT and therefore fall out of every filter below.
reviews['date'] = pd.to_datetime(reviews['date'], errors='coerce')
reviews['month'] = reviews['date'].dt.to_period('M')
reviews['year'] = reviews['date'].dt.year
# Each week is keyed by the ISO date string of its Monday
reviews['week'] = (
    reviews['date'] - pd.to_timedelta(reviews['date'].dt.weekday, unit='d')
).dt.strftime('%Y-%m-%d')

# Look-back windows. The subplot titles are generated from these values so the
# text can never disagree with the filter that was actually applied.
months_window, years_window, weeks_window = 12, 8, 5
today = pd.to_datetime('today')

# Filter data for the last periods (months, years, weeks)
last_months = reviews[reviews['date'] >= today - pd.DateOffset(months=months_window)]
last_years = reviews[reviews['date'] >= today - pd.DateOffset(years=years_window)]
last_weeks = reviews[reviews['date'] >= today - pd.DateOffset(weeks=weeks_window)]

# Compute averages for the required periods
score_columns = ['rating_score', 'food_score', 'service_score', 'atmosphere_score']
monthly_avg_scores = last_months.groupby('month')[score_columns].mean()
yearly_avg_scores = last_years.groupby('year')[['rating_score']].mean()
weekly_avg_scores = last_weeks.groupby('week')[score_columns].mean()

# Readable trace names for each score column
label_mapping = {
    'rating_score': 'Rating',
    'food_score': 'Food',
    'service_score': 'Service',
    'atmosphere_score': 'Atmosphere'
}

# Create a figure with subplots using the Z-layout:
# one wide plot on the first row, two smaller plots on the second
fig = make_subplots(rows=2, cols=2,
                    specs=[[{"colspan": 2}, None],
                           [{}, {}]],
                    subplot_titles=(f"Monthly Score Trends (Last {months_window} Months)",
                                    f"Annual Rating Score Trends (Last {years_window} Years)",
                                    f"Weekly Score Trends (Last {weeks_window} Weeks)"))

# Add monthly score trends to the first row (rating_score in stronger color)
colors = ['#1f77b4', '#aec7e8', '#aec7e8', '#aec7e8']
for i, column in enumerate(monthly_avg_scores.columns):
    label = label_mapping[column]
    fig.add_trace(
        go.Scatter(x=monthly_avg_scores.index.astype(str), y=monthly_avg_scores[column],
                   mode='lines+markers', name=label,
                   text=[f"{label} - {val:.2f}" for val in monthly_avg_scores[column]],
                   hoverinfo="text", line=dict(color=colors[i])),
        row=1, col=1)

# Add yearly score trends to the second row (left).
# `year` is float whenever any date is NaT; cast to int so labels read "2018", not "2018.0".
year_labels = yearly_avg_scores.index.astype(int).astype(str)
fig.add_trace(
    go.Scatter(x=year_labels, y=yearly_avg_scores['rating_score'],
               mode='lines+markers', name="Rating", line=dict(color='#1f77b4', width=4),
               text=[f"Rating - {val:.2f}" for val in yearly_avg_scores['rating_score']],
               hoverinfo="text"),
    row=2, col=1)

# Add weekly score trends to the second row (right, weaker colors)
for i, column in enumerate(weekly_avg_scores.columns):
    label = label_mapping[column]
    fig.add_trace(
        go.Scatter(x=weekly_avg_scores.index.astype(str), y=weekly_avg_scores[column],
                   mode='lines+markers', name=label,
                   text=[f"{label} - {val:.2f}" for val in weekly_avg_scores[column]],
                   hoverinfo="text", line=dict(color=colors[i])),
        row=2, col=2)

# Enhance presentation: remove gridlines and borders, increase size, and remove legend
fig.update_layout(showlegend=False,
                  title="Score Trends Analysis",
                  title_font=dict(size=28),
                  margin=dict(l=50, r=50, t=100, b=50),
                  paper_bgcolor="white",
                  height=800, width=1200)
fig.update_xaxes(showline=False, showgrid=False)
fig.update_yaxes(showline=False, showgrid=True)

# Years are discrete labels: force a category axis so they are shown verbatim
fig.update_xaxes(type='category', row=2, col=1)

# Show only the day and month for the weekly graph
fig.update_xaxes(tickformat="%d-%b", row=2, col=2)

# Annotate the best and worst month of the plotted window. The positions are
# derived from the data so they stay correct when the window moves or the
# venue changes (they were previously pinned to specific 2024 months).
monthly_rating = monthly_avg_scores['rating_score'].dropna()
if not monthly_rating.empty:
    best_month = monthly_rating.idxmax()
    worst_month = monthly_rating.idxmin()
    fig.add_annotation(x=str(best_month), y=float(monthly_rating[best_month]),
                       text="Highest Score",
                       showarrow=True, arrowhead=2,
                       ax=0, ay=80, row=1, col=1, font=dict(size=14))
    if worst_month != best_month:
        fig.add_annotation(x=str(worst_month), y=float(monthly_rating[worst_month]),
                           text=f"Lowest Score ({worst_month.strftime('%b')})",
                           showarrow=True, arrowhead=2,
                           ax=0, ay=-40, row=1, col=1, font=dict(size=14))

fig.update_traces(marker=dict(size=8), selector=dict(name="Rating"))
fig.show()


### Cleaning and preprocessing

In [147]:
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

from tqdm import tqdm

# Download the NLTK resources used below: the stopword lists and the VADER lexicon
for nltk_resource in ('stopwords', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Load spaCy Spanish model (tokenisation + lemmatisation in clean_text)
nlp = spacy.load('es_core_news_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [148]:
# Clean text, remove stopwords and lemmatize the remaining tokens
def clean_text(text):
    """Normalise one review into a space-separated string of lemmas.

    Steps: lowercase, delete every character that is not a lowercase letter
    (including á é í ó ú ñ ü), a digit or whitespace, run the spaCy pipeline,
    then keep the lemma of every token that is not a Spanish stopword,
    punctuation or whitespace.

    Parameters
    ----------
    text : str
        Raw review text. Non-string values (e.g. NaN) and blank strings
        yield an empty string instead of raising.

    Returns
    -------
    str
        Lemmas joined by single spaces; '' when nothing survives the filters.
    """
    if not isinstance(text, str) or not text.strip():
        return ''

    # The stopword list never changes between calls: build the set once and
    # cache it on the function instead of re-reading it for every review.
    stop_words = getattr(clean_text, '_stop_words', None)
    if stop_words is None:
        stop_words = clean_text._stop_words = set(stopwords.words('spanish'))

    text = re.sub(r'[^a-záéíóúñü0-9\s]', '', text.lower())
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc
              if token.text not in stop_words and not token.is_punct and not token.is_space]
    return ' '.join(tokens)

tqdm.pandas(desc="Cleaning Reviews")
reviews['cleaned_review'] = reviews['review'].fillna('').progress_apply(clean_text)

display(reviews[['review', 'cleaned_review']].sample(5))

Cleaning Reviews: 100%|██████████| 360/360 [00:02<00:00, 148.41it/s]


Unnamed: 0,review,cleaned_review
191,"Comimos patatas bravas, croquetas y guacamole ...",comimos patata brava croqueta guacamolir totop...
176,No es por la estupenda carta de cafetería (pes...,estupendo carta cafetería pese precio entorno ...
68,Buen local donde ser sirve comida hasta tarde....,buen local ser servir comida tarde variado car...
50,Primera vez que me como una hamburguesa de ver...,primero vez hamburguesa verdad carlos majo
55,"Me encanta esta cafetería, siempre que voy el ...",encantar cafetería siempre ir trato bueno cama...


### Embeddings and sentiment analysis

In [150]:
# Embeddings and Sentiment
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch

# Word Clouds and Visualization
from wordcloud import WordCloud


#### Analyze sentiment

In [190]:
# Extract sentiment for each review using VADER combined with the star rating
def analyzeSentiment(df):
    """Add `vader_sentiment` and `sentiment_label` columns to `df` and return it.

    `vader_sentiment` is the VADER compound score of `cleaned_review`.
    `sentiment_label` is decided by `rating_score` first (>= 4 positive,
    <= 2 negative); only for the remaining rows does the compound score decide
    (> 0.05 positive, < -0.05 negative, otherwise neutral).

    The frame is modified in place; the same object is returned.

    NOTE(review): the reviews shown in this notebook are Spanish while the NLTK
    VADER lexicon is presumably English-only - confirm the compound score is
    meaningful for this text.
    """
    analyzer = SentimentIntensityAnalyzer()

    def _compound_score(cleaned_text):
        return analyzer.polarity_scores(cleaned_text)['compound']

    def _label_row(row):
        rating = row['rating_score']
        # The star rating wins whenever it is clearly high or clearly low
        if rating >= 4:
            return 'positive'
        if rating <= 2:
            return 'negative'
        # Middle (or missing) rating: fall back to the text polarity
        compound = row['vader_sentiment']
        if compound > 0.05:
            return 'positive'
        if compound < -0.05:
            return 'negative'
        return 'neutral'

    df['vader_sentiment'] = df['cleaned_review'].apply(_compound_score)
    df['sentiment_label'] = df.apply(_label_row, axis=1)
    return df

# Extract most common words for a selected sentiment
def extractCommonWords(df, sentiment_label='positive', n=10):
    """Return the `n` most frequent words among reviews with the given sentiment.

    Parameters
    ----------
    df : DataFrame with `sentiment_label` and `cleaned_review` columns.
    sentiment_label : label to filter on.
    n : maximum number of (word, count) pairs to return.

    Returns
    -------
    list[tuple[str, int]]
        Pairs sorted by descending count. Empty when no review carries the
        label or when the selected reviews contain no countable token
        (CountVectorizer would otherwise raise on an empty vocabulary).
    """
    selected = df.loc[df['sentiment_label'] == sentiment_label, 'cleaned_review']
    filtered_reviews = selected.fillna('').tolist()

    # Nothing to count: bail out before the vectorizer raises
    if not any(str(review).strip() for review in filtered_reviews):
        return []

    vectorizer = CountVectorizer()
    try:
        word_counts = vectorizer.fit_transform(filtered_reviews).sum(axis=0)
    except ValueError:
        # Raised when every document is reduced to nothing by the tokenizer
        return []

    # Plain ints keep the printed result readable regardless of numpy version
    word_freq = [(word, int(word_counts[0, idx])) for word, idx in vectorizer.vocabulary_.items()]
    return sorted(word_freq, key=lambda pair: pair[1], reverse=True)[:n]

# Extract most common n-grams for a selected sentiment
def extractCommonNgrams(df, sentiment_label='positive', n=2, top_n=10):
    """Return the `top_n` most frequent n-grams among reviews with the given sentiment.

    Parameters
    ----------
    df : DataFrame with `sentiment_label` and `cleaned_review` columns.
    sentiment_label : label to filter on.
    n : n-gram size (2 = bigrams).
    top_n : maximum number of (ngram, count) pairs to return.

    Returns
    -------
    list[tuple[str, int]]
        Pairs sorted by descending count. Empty when no review carries the
        label or when no n-gram of the requested size can be built
        (CountVectorizer would otherwise raise on an empty vocabulary).
    """
    selected = df.loc[df['sentiment_label'] == sentiment_label, 'cleaned_review']
    filtered_reviews = selected.fillna('').tolist()

    # Nothing to count: bail out before the vectorizer raises
    if not any(str(review).strip() for review in filtered_reviews):
        return []

    vectorizer = CountVectorizer(ngram_range=(n, n))
    try:
        ngram_counts = vectorizer.fit_transform(filtered_reviews).sum(axis=0)
    except ValueError:
        # Raised e.g. when every review is shorter than `n` tokens
        return []

    # Plain ints keep the printed result readable regardless of numpy version
    ngram_freq = [(ngram, int(ngram_counts[0, idx])) for ngram, idx in vectorizer.vocabulary_.items()]
    return sorted(ngram_freq, key=lambda pair: pair[1], reverse=True)[:top_n]

# Label every review (adds vader_sentiment and sentiment_label columns)
reviews = analyzeSentiment(reviews)

# Ten most frequent single words for the positive and the negative reviews
common_positive_words, common_negative_words = (
    extractCommonWords(reviews, sentiment_label=label, n=10)
    for label in ('positive', 'negative')
)

print("Top Positive Words:", common_positive_words)
print("Top Negative Words:", common_negative_words)

# Ten most frequent bigrams for the positive and the negative reviews
common_positive_bigrams, common_negative_bigrams = (
    extractCommonNgrams(reviews, sentiment_label=label, n=2, top_n=10)
    for label in ('positive', 'negative')
)

print("Top Positive Bigrams:", common_positive_bigrams)
print("Top Negative Bigrams:", common_negative_bigrams)

Top Positive Words: [('hamburguesa', 128), ('buen', 118), ('comida', 66), ('lugar', 65), ('servicio', 48), ('sitio', 46), ('bien', 44), ('mejor', 42), ('precio', 41), ('ambiente', 36)]
Top Negative Words: [('ir', 10), ('café', 10), ('vez', 8), ('ver', 8), ('si', 8), ('pedir', 8), ('sitio', 7), ('servicio', 7), ('comida', 7), ('parecer', 6)]
Top Positive Bigrams: [('comida buen', 11), ('buen ambiente', 11), ('mejor hamburguesa', 11), ('buen comida', 10), ('buen servicio', 9), ('buen lugar', 8), ('aro cebolla', 7), ('hamburguesa rico', 7), ('precio razonable', 7), ('hamburguesa italiano', 6)]
Top Negative Bigrams: [('ver foto', 3), ('merecer pena', 3), ('nunca tanto', 2), ('si solo', 2), ('último vez', 2), ('huevo revuelto', 2), ('dar yo', 2), ('acompañar hamburguesa', 2), ('pan mohoso', 2), ('gofre frío', 1)]


In [152]:
# Plot the evolution over time of the share of reviews per sentiment
def plotSentimentTrend(df, years_limit = 6):
    """Show a stacked area chart of the monthly percentage of reviews per sentiment.

    Parameters
    ----------
    df : DataFrame with a `date` column (anything `pd.to_datetime` can parse)
        and a `sentiment_label` column. The frame is NOT modified.
    years_limit : how many years back from now to include.

    Months without any review do not appear on the x axis. Nothing is
    returned; the figure is displayed with `fig.show()`.
    """
    # Work on a private two-column copy so the caller's frame is left untouched
    data = df[['date', 'sentiment_label']].copy()
    data['date'] = pd.to_datetime(data['date'], errors='coerce')
    data = data.dropna(subset=['date'])

    # Keep only the requested look-back window
    cutoff = datetime.datetime.now() - pd.DateOffset(years=years_limit)
    data = data[data['date'] >= cutoff]
    if data.empty:
        print(f"No dated reviews in the last {years_limit} years; nothing to plot.")
        return

    # Bucket every review on the last day of its month. MonthEnd(0) rolls a date
    # forward to its own month end, which avoids the deprecated 'M' resample alias.
    month_end = (data['date'] + pd.offsets.MonthEnd(0)).dt.normalize().rename('date')
    sentiment_counts = pd.crosstab(month_end, data['sentiment_label'])

    # Calculate the percentage for each sentiment type within each month
    sentiment_percentage = sentiment_counts.div(sentiment_counts.sum(axis=1), axis=0) * 100
    sentiment_percentage = sentiment_percentage.round(2)
    sentiment_percentage = sentiment_percentage.reset_index().melt(
        id_vars=['date'], value_name='percentage', var_name='sentiment_label')

    # Plot sentiment percentage evolution; the title follows the actual window
    fig = px.area(
        sentiment_percentage,
        x='date',
        y='percentage',
        color='sentiment_label',
        title=f'Sentiment Percentage Over the Last {years_limit} Years',
        labels={'date': '', 'percentage': 'Percentage of Reviews (%)', 'sentiment_label': 'Sentiment'},
        template='plotly_white',
    )

    # Customize layout
    fig.update_layout(
        title=dict(x=0.5, xanchor='center', font=dict(size=18, color='black')),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=True, title='Percentage of Reviews', ticksuffix='%'),
        legend=dict(title='', orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
        margin=dict(l=20, r=20, t=50, b=20),
        plot_bgcolor='rgba(0,0,0,0)',
        hovermode='x unified',
        width=1200,
        height=400,
    )

    # Fixed colour per sentiment; unknown labels fall back to grey
    color_map = {
        'positive': 'rgba(102, 194, 165, 0.7)',
        'neutral': 'rgba(141, 160, 203, 0.7)',
        'negative': 'rgba(252, 141, 98, 0.7)'
    }
    fig.for_each_trace(lambda trace: trace.update(line=dict(width=0, shape='spline'), fill='tonexty', fillcolor=color_map.get(trace.name, 'rgba(150, 150, 150, 0.5)')))

    # Remove the plot frame and keep the visualization as clean as possible
    fig.update_xaxes(showline=False)
    fig.update_yaxes(showline=False, range=[0, 100])  # Percentage scale from 0 to 100

    fig.show()

plotSentimentTrend(reviews)


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



In [189]:

# Extract the most and least mentioned recommended dishes
def analyzeRecommendations(df):
    """Count how often each dish appears in `recommendations_list` and report the extremes.

    Each non-null cell is expected to hold the string form of a Python list
    (e.g. "['Dish A', 'Dish B']"); an actual list is accepted too. Cells that
    cannot be parsed, and entries that are not non-blank strings, are ignored.

    Prints the three most mentioned dishes and every dish tied for the lowest
    mention count, and also returns them.

    Returns
    -------
    tuple[list[tuple[str, int]], list[str]]
        (most_common_dishes, least_mentioned_dishes); both empty when no dish
        is found.
    """
    all_dishes = []

    # Convert string representation of lists to actual lists and extend all_dishes
    for item in df['recommendations_list'].dropna():
        if isinstance(item, list):
            dishes = item
        else:
            try:
                dishes = ast.literal_eval(item)
            except (ValueError, SyntaxError, TypeError):
                # Not a valid Python literal: skip this cell only
                continue
        if isinstance(dishes, list):
            all_dishes.extend(dishes)

    # Keep real dish names only (drops '' placeholders and non-string entries)
    all_dishes = [dish for dish in all_dishes if isinstance(dish, str) and dish.strip() != '']

    # Count the frequency of each dish
    dish_counts = Counter(all_dishes)
    if not dish_counts:
        print("No recommended dishes found.")
        return [], []

    # Most mentioned dishes, and every dish tied for the fewest mentions
    most_common_dishes = dish_counts.most_common(3)
    min_count = min(dish_counts.values())
    worst_dishes = [dish for dish, count in dish_counts.items() if count == min_count]

    print("Top Most Recommended:", most_common_dishes)
    print("Least Recommended :", worst_dishes)
    return most_common_dishes, worst_dishes

# Print the most and the least mentioned recommended dishes across all reviews
analyzeRecommendations(reviews)


Top Most Recommended: [('Hamburguesa Completa Con Huevo', 9), ('Hamburguesa Americana', 8), ('Hamburguesa Italiana', 6)]
Least Recommended : ['Plaza de aparcamiento', 'Flan de Queso', 'Gofre Con Nocilla', 'Grandburguer HD', 'Hamburger', 'Hamburguesa Con Huevo', 'Tarta de Manzana', 'Tataki de Atún', 'Cheesecake', 'Hamburguesa Gallega', 'Brunch Con Baggel Salmón']


#### Calculate embeddings

In [154]:
# Extract the embeddings for each cleaned review
def get_embedding(text):
    """Return one vector per text: the mean over tokens of the model's last hidden state.

    Relies on the module-level `tokenizer` and `model` defined below. Input is
    truncated to 512 tokens and the forward pass runs without gradient tracking.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Average over the token axis, then drop the size-1 batch axis
        pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.numpy()

# Import Bert model and tokenizer (multilingual, cased checkpoint)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# One embedding per cleaned review, with a progress bar
tqdm.pandas(desc="Generating Embeddings")
reviews['embedding'] = reviews['cleaned_review'].progress_apply(get_embedding)

Generating Embeddings: 100%|██████████| 360/360 [00:26<00:00, 13.42it/s]


#### Analyze embeddings

In [188]:
from sklearn.decomposition import PCA

# PCA Embeddings Visualization
def visualizeEmbeddingsPCA(df):
    """Project the review embeddings to 2-D with PCA and show them coloured by rating.

    Reads the `embedding` and `rating_score` columns (and `review_id` when
    present, otherwise a positional range is used for the hover text).

    Returns the (n_reviews, 2) array of PCA coordinates.
    """
    # Stack the per-row vectors into one 2-D matrix and reduce it to two components
    embedding_matrix = np.array(df['embedding'].tolist())
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(embedding_matrix)

    # Share of variance captured by each component, in percent
    var1, var2 = pca.explained_variance_ratio_ * 100

    # Frame consumed by Plotly
    plot_df = pd.DataFrame({
        'PCA Component 1': reduced_embeddings[:, 0],
        'PCA Component 2': reduced_embeddings[:, 1],
        'Rating Score': df['rating_score'],
        'Review ID': df.get('review_id', range(len(df))),  # Optional identifier
    })

    axis_labels = {
        'PCA Component 1': f'PCA 1 ({var1:.1f}% variance)',
        'PCA Component 2': f'PCA 2 ({var2:.1f}% variance)',
        'Rating Score': 'Rating Score',
    }

    # Interactive scatter plot, one point per review
    fig = px.scatter(
        plot_df,
        x='PCA Component 1',
        y='PCA Component 2',
        color='Rating Score',
        color_continuous_scale='Viridis',
        hover_data=['Review ID', 'Rating Score'],
        title=f'Embeddings by Rating Score (PCA 1: {var1:.1f}%, PCA 2: {var2:.1f}%)',
        labels=axis_labels,
    )

    # Clean template and an evenly ticked colour bar
    colorbar_style = dict(title='Rating Score', tickmode='linear')
    fig.update_layout(
        template='plotly_white',
        coloraxis_colorbar=colorbar_style,
        hovermode='closest',
    )

    fig.show()
    return reduced_embeddings

embeddings_pca = visualizeEmbeddingsPCA(reviews)

import umap.umap_ as umap

# UMAP Embeddings Visualization
def visualizeEmbeddingsUMAP(df):
    """Project the review embeddings to 2-D with UMAP and show them coloured by sentiment.

    Reads the `embedding` and `sentiment_label` columns of `df`.

    Returns the (n_reviews, 2) array of UMAP coordinates.
    """
    embeddings = np.array(df['embedding'].tolist())

    # Reduce dimensionality with UMAP (fixed seed for reproducible layouts)
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    reduced_embeddings = reducer.fit_transform(embeddings)

    # Create DataFrame for visualization. The coordinates carry a fresh 0..n-1
    # index, so the labels are attached positionally (to_numpy); assigning the
    # Series itself would align on df's index and scramble or blank the labels
    # whenever df is filtered or otherwise not indexed 0..n-1.
    viz_df = pd.DataFrame(reduced_embeddings, columns=['x', 'y'])
    viz_df['sentiment_label'] = df['sentiment_label'].to_numpy()

    # Scatter plot with Plotly for interactive visualization
    fig = px.scatter(
        viz_df,
        x='x',
        y='y',
        color='sentiment_label',
        title='Embedding Visualization with UMAP',
        labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'},
        color_discrete_map={'positive': 'green', 'neutral': 'gray', 'negative': 'red'},
        opacity=0.7
    )
    fig.update_layout(showlegend=True, legend=dict(title='Sentiment'), margin=dict(l=10, r=10, t=40, b=10))
    fig.show()

    return reduced_embeddings

embeddings_umap = visualizeEmbeddingsUMAP(reviews)




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [177]:
from sklearn.neighbors import NearestNeighbors

# Plot the k-distance curve used to choose DBSCAN's eps
def plotKdistance(reduced_embeddings, k=5, method='PCA'):
    """Show the sorted distance from every point to its k-th returned neighbour.

    Parameters
    ----------
    reduced_embeddings : 2-D array of points (one row per review).
    k : neighbour rank to plot.
    method : name of the projection, used only in the figure title.

    NOTE(review): the points are queried against the set they were fitted on,
    so each point is presumably returned as its own first neighbour (distance
    0) and column k-1 is the distance to the (k-1)-th *other* point - confirm
    this is the convention wanted when reading eps off the curve.
    """
    # Compute k-nearest neighbors
    neighbors = NearestNeighbors(n_neighbors=k)
    neighbors_fit = neighbors.fit(reduced_embeddings)
    distances, _ = neighbors_fit.kneighbors(reduced_embeddings)

    # Sort distances to the k-th nearest neighbor
    k_distances = np.sort(distances[:, k-1])

    # Create interactive line plot
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=np.arange(1, len(k_distances) + 1),
        y=k_distances,
        mode='lines',
        line=dict(color='blue'),
        name='k-distance'
    ))

    # Update layout for clarity
    fig.update_layout(
        title=f'k-Distance Graph for {method}',
        xaxis_title='Points sorted by distance',
        yaxis_title=f'Distance to {k}th Nearest Neighbor',
        template='plotly_white',
        hovermode='x unified'
    )

    # Add light grid lines
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey')

    fig.show()

# Each title now names the projection actually plotted (the labels were swapped)
plotKdistance(embeddings_umap, k= 10, method='UMAP')
plotKdistance(embeddings_pca, k= 10, method='PCA')

In [184]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Function to apply DBSCAN
def apply_dbscan(reduced_embeddings, eps=0.6, min_samples=5):
    """Standardise the points, cluster them with DBSCAN and return one label per point.

    `eps` is therefore expressed in units of the standardised coordinates,
    not of the raw projection.
    """
    standardized_points = StandardScaler().fit_transform(reduced_embeddings)
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    return clusterer.fit_predict(standardized_points)

# PCA Visualization with DBSCAN
def visualizeEmbeddingsPCA_with_DBSCAN(df, eps=0.55, min_samples=10):
    """Cluster the 2-D PCA projection of the embeddings with DBSCAN and plot it.

    Parameters
    ----------
    df : DataFrame with `embedding` and `rating_score` columns; `review_id`
        is used when present, otherwise a positional range.
    eps, min_samples : forwarded to `apply_dbscan`.

    Returns
    -------
    DataFrame with columns pca_component_1, pca_component_2, rating_score,
    pca_cluster and review_id (one row per review).
    """
    embeddings = np.array(df['embedding'].tolist())
    ratings = df['rating_score']

    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)
    var1, var2 = pca.explained_variance_ratio_ * 100

    clusters = apply_dbscan(reduced, eps, min_samples)

    plot_df = pd.DataFrame({
        'pca_component_1': reduced[:, 0],
        'pca_component_2': reduced[:, 1],
        'rating_score': ratings,
        'pca_cluster': clusters,
        'review_id': df.get('review_id', range(len(df)))
    })

    fig = px.scatter(
        plot_df,
        x='pca_component_1',
        y='pca_component_2',
        color='pca_cluster',
        color_continuous_scale='Viridis',
        hover_data=['review_id', 'rating_score'],
        title=f'PCA with DBSCAN (PCA1: {var1:.1f}%, PCA2: {var2:.1f}%)',
        # `labels` must be keyed by column name; the previous mapping was keyed
        # by the display text and therefore never applied.
        labels={
            'pca_component_1': f'PCA 1 ({var1:.1f}% variance)',
            'pca_component_2': f'PCA 2 ({var2:.1f}% variance)',
            'pca_cluster': 'Cluster'
        }
    )

    fig.update_layout(
        template='plotly_white',
        coloraxis_colorbar=dict(title='Cluster'),
        hovermode='closest'
    )

    fig.show()
    return plot_df

# UMAP Visualization with DBSCAN
def visualizeEmbeddingsUMAP_with_DBSCAN(df, eps=0.7, min_samples=10):
    """Cluster the 2-D UMAP projection of the embeddings with DBSCAN and plot it.

    Parameters
    ----------
    df : DataFrame with `embedding` and `sentiment_label` columns; `review_id`
        is used when present, otherwise a positional range.
    eps, min_samples : forwarded to `apply_dbscan`.

    Returns
    -------
    DataFrame with columns umap_component_1, umap_component_2, sentiment,
    umap_cluster and review_id (one row per review).
    """
    embeddings = np.array(df['embedding'].tolist())
    sentiment = df['sentiment_label']

    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    reduced = reducer.fit_transform(embeddings)

    clusters = apply_dbscan(reduced, eps, min_samples)

    plot_df = pd.DataFrame({
        'umap_component_1': reduced[:, 0],
        'umap_component_2': reduced[:, 1],
        'sentiment': sentiment,
        'umap_cluster': clusters,
        'review_id': df.get('review_id', range(len(df)))
    })

    fig = px.scatter(
        plot_df,
        x='umap_component_1',
        y='umap_component_2',
        color='umap_cluster',
        color_continuous_scale='Viridis',
        hover_data=['sentiment', 'umap_cluster'],
        title='UMAP with DBSCAN',
        # `labels` must be keyed by column name; the previous mapping was keyed
        # by the display text and therefore never applied.
        labels={
            'umap_component_1': 'UMAP 1',
            'umap_component_2': 'UMAP 2',
            'umap_cluster': 'Cluster'
        },
        opacity=0.7
    )

    fig.update_layout(
        showlegend=True,
        legend=dict(title='umap_cluster'),
        margin=dict(l=10, r=10, t=40, b=10)
    )

    fig.show()
    return plot_df

# Visualize with DBSCAN clusters (explicit eps/min_samples override the function defaults)
pca_clusters = visualizeEmbeddingsPCA_with_DBSCAN(reviews, eps=0.5, min_samples=5)
umap_clusters = visualizeEmbeddingsUMAP_with_DBSCAN(reviews, eps=0.5, min_samples=5)




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [186]:
### Join PCA and UMAP clusters info to reviews
# Cluster columns left over from an earlier run are dropped first so the cell
# can be re-executed: otherwise the merge would also join on them and silently
# discard every review whose cluster changed. The key is explicit and the join
# is a left join so no review is ever lost.
reviews = (
    reviews
    .drop(columns=['pca_cluster', 'umap_cluster'], errors='ignore')
    .merge(pca_clusters[['review_id', 'pca_cluster']], on='review_id', how='left')
    .merge(umap_clusters[['review_id', 'umap_cluster']], on='review_id', how='left')
)

#### Topics

In [199]:
from gensim import corpora
from gensim.models import LdaModel

# Extract topics using LDA model
def analyzeTopicsLDA(df, number_of_topics = 5):
    """Fit an LDA topic model on `df['cleaned_review']` and print its topics.

    Reviews are split on whitespace; null, non-string and blank reviews are
    skipped. Each topic is printed with its five strongest words.

    Returns
    -------
    (lda_model, topics) on success, where `topics` is what
    `lda_model.print_topics(num_words=5)` returns; (None, []) when there is
    nothing to train on or training raises ValueError.
    """
    # Tokenise every usable review
    tokenized_reviews = []
    for review in df['cleaned_review'].dropna().tolist():
        if isinstance(review, str) and review.strip() != '':
            tokenized_reviews.append(review.split())

    if len(tokenized_reviews) == 0:
        print("No valid reviews to process.")
        return None, []

    # Vocabulary: token -> integer id
    dictionary = corpora.Dictionary(tokenized_reviews)
    if len(dictionary) == 0:
        print("Dictionary is empty after tokenization.")
        return None, []

    # Bag-of-words representation of every review
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_reviews]
    if not any(corpus):
        print("Corpus is empty. No terms found in any document.")
        return None, []

    # Train LDA model (fixed seed so repeated runs give the same topics)
    lda_settings = dict(num_topics=number_of_topics, id2word=dictionary, passes=10, random_state=42)
    try:
        lda_model = LdaModel(corpus, **lda_settings)
    except ValueError as e:
        print(f"LDA Model training failed: {e}")
        return None, []

    # Report the topics
    topics = lda_model.print_topics(num_words=5)
    for topic_id, topic_terms in topics:
        print(f"Topic {topic_id}: {topic_terms}")
    return lda_model, topics

# Fit LDA on all cleaned reviews; lda_model / topics hold this corpus-wide result
print('=== General topics ===')
lda_model, topics = analyzeTopicsLDA(reviews)

=== General topics ===
Topic 0: 0.024*"hamburguesa" + 0.016*"bien" + 0.013*"buena" + 0.011*"lugar" + 0.010*"bonito"
Topic 1: 0.018*"hamburguesa" + 0.009*"poder" + 0.008*"hacer" + 0.008*"lugar" + 0.008*"genial"
Topic 2: 0.023*"buen" + 0.019*"hamburguesa" + 0.014*"servicio" + 0.013*"comida" + 0.011*"carta"
Topic 3: 0.018*"hamburguesa" + 0.018*"buen" + 0.011*"sitio" + 0.011*"excelente" + 0.010*"mejor"
Topic 4: 0.032*"buen" + 0.022*"comida" + 0.021*"lugar" + 0.020*"hamburguesa" + 0.012*"ambiente"


In [207]:
# Columns whose distinct values define the groups to model separately
group_columns = ['pca_cluster', 'umap_cluster', 'sentiment_label']

# topics_dict[group_col][group_val] -> list of topic strings for that group
topics_dict = {group_col: {} for group_col in group_columns}

# Iterate over each grouping column and generate topics
for group_col in group_columns:
    print(f"\n=== Topics by {group_col} ===")
    unique_groups = reviews[group_col].dropna().unique()

    for group_val in unique_groups:
        subset = reviews[reviews[group_col] == group_val]
        print(f"\n--- {group_col} = {group_val} ---")

        # Groups with fewer than 5 reviews are skipped
        if len(subset) < 5:
            print("Not enough data to train LDA.")
            continue

        # Dedicated names keep the corpus-wide `lda_model` / `topics` from the
        # general run intact instead of overwriting them with the last group's
        group_model, group_topics = analyzeTopicsLDA(subset)

        if group_model is not None and group_topics:
            # Store only the topic strings (second element of each topic tuple)
            topics_dict[group_col][group_val] = [topic_terms for _, topic_terms in group_topics]
        else:
            print("No topics generated for this group.\n")


=== Topics by pca_cluster ===

--- pca_cluster = 0 ---
Topic 0: 0.024*"hamburguesa" + 0.012*"mejor" + 0.011*"sitio" + 0.009*"bien" + 0.008*"madrid"
Topic 1: 0.030*"buen" + 0.020*"comida" + 0.019*"hamburguesa" + 0.015*"sitio" + 0.013*"precio"
Topic 2: 0.015*"bien" + 0.010*"buen" + 0.009*"hamburguesa" + 0.007*"servicio" + 0.006*"súper"
Topic 3: 0.027*"buen" + 0.023*"hamburguesa" + 0.022*"lugar" + 0.014*"comida" + 0.013*"servicio"
Topic 4: 0.010*"amplio" + 0.010*"ir" + 0.008*"servicio" + 0.007*"pedir" + 0.007*"lugar"

--- pca_cluster = 1 ---
Topic 0: 0.250*"hamburguesa" + 0.250*"buena" + 0.250*"buen" + 0.250*"grande"
Topic 1: 0.471*"hamburguesa" + 0.469*"buena" + 0.031*"buen" + 0.029*"grande"
Topic 2: 0.250*"hamburguesa" + 0.250*"buen" + 0.250*"buena" + 0.250*"grande"
Topic 3: 0.256*"hamburguesa" + 0.255*"buena" + 0.244*"buen" + 0.244*"grande"
Topic 4: 0.471*"hamburguesa" + 0.323*"buen" + 0.177*"grande" + 0.030*"buena"

--- pca_cluster = 2 ---
No valid reviews to process.
No topics gener

#### Extract the periods with the worst ratings and process their reviews

In [237]:
import pandas as pd

# Analyze low scores and extract negative reviews for the selected time period
def analyzeLowScores(df, score_column, time_period='month', num_periods=1):
    """Return the negative reviews written in the periods with the lowest average score.

    Parameters
    ----------
    df : DataFrame with `time_period`, `score_column` and `sentiment_label`
        columns. It is not modified.
    score_column : score to average per period and to threshold per review.
    time_period : column that identifies the period of each review.
    num_periods : how many of the lowest-scoring periods to keep.

    Returns
    -------
    DataFrame
        Rows of `df` that fall in a selected period, have `score_column` < 3
        and `sentiment_label == 'negative'`, without the `embedding` column
        (if any), plus a `low_score_period` column copying the period, sorted
        by that column with the original row order kept inside each period.
    """
    # Average score per period, lowest first; periods without any score are ignored
    avg_scores = df.groupby(time_period)[score_column].mean().dropna().sort_values(kind='stable')
    low_score_periods = avg_scores.index[:num_periods]

    is_low_period = df[time_period].isin(low_score_periods)
    is_low_score = df[score_column] < 3
    is_negative = df['sentiment_label'] == 'negative'

    # Build a new frame in one chain: no column is ever assigned on a filtered
    # slice of df, which avoids SettingWithCopy problems.
    period_reviews = (
        df.loc[is_low_period & is_low_score & is_negative]
        # Embedding vectors are unhashable; drop them to avoid issues downstream
        .drop(columns=['embedding'], errors='ignore')
        .assign(low_score_period=lambda frame: frame[time_period])
        .sort_values('low_score_period', kind='stable')
    )
    return period_reviews

# Usage
# time_period: grouping column ('month', 'week', 'year', ...)
# num_periods: how many of the lowest-average periods to select
time_period, num_periods = 'month', 3

# Negative reviews from the worst periods, computed once per score type
(
    rating_negative_reviews_by_negative_periods,
    food_negative_reviews_by_negative_periods,
    service_negative_by_negative_periods,
    atmosphere_negative_by_negative_periods,
) = (
    analyzeLowScores(reviews, score_column, time_period, num_periods)
    for score_column in ('rating_score', 'food_score', 'service_score', 'atmosphere_score')
)

In [238]:
# Build the LDA topics of every low-score period, keyed by score column and period
def generateTopicsPerPeriod(df, score_column, number_of_topics=1):
    """Run the LDA topic analysis separately for each low-score period.

    For every distinct value of ``df['low_score_period']`` (in order of first
    appearance) the matching rows are passed to ``analyzeTopicsLDA`` and the
    second element of its return value is stored.

    Returns a nested dict: ``{score_column: {period: topics}}``.
    """
    def _topics_for(period):
        subset = df[df['low_score_period'] == period]
        # analyzeTopicsLDA is expected to return a pair; only the second item is kept
        _first, period_topics = analyzeTopicsLDA(subset, number_of_topics=number_of_topics)
        return period_topics

    per_period = {
        period: _topics_for(period)
        for period in df['low_score_period'].unique()
    }
    return {score_column: per_period}

# Label every topic dictionary with the score column its reviews were selected on,
# so the results of the four score types can be told apart.
rating_negative_topics_by_negative_periods = generateTopicsPerPeriod(rating_negative_reviews_by_negative_periods, 'rating_score')
food_negative_topics_by_negative_periods = generateTopicsPerPeriod(food_negative_reviews_by_negative_periods, 'food_score')
service_topics_by_negative_periods = generateTopicsPerPeriod(service_negative_by_negative_periods, 'service_score')
atmosphere_topics_by_negative_periods = generateTopicsPerPeriod(atmosphere_negative_by_negative_periods, 'atmosphere_score')

Topic 0: 1.000*"estimado"
Topic 0: 0.010*"menos" + 0.010*"café" + 0.009*"precio" + 0.009*"huevo" + 0.009*"hacer"
Topic 0: 0.058*"cómodo" + 0.058*"mesa" + 0.038*"parecer" + 0.038*"mínimo" + 0.038*"podía"
Topic 0: 0.058*"cómodo" + 0.058*"mesa" + 0.038*"parecer" + 0.038*"mínimo" + 0.038*"podía"
Topic 0: 0.081*"mala" + 0.054*"pena" + 0.054*"pro" + 0.054*"pedir" + 0.054*"merecer"


#### Extract outliers and pain points

In [None]:
# TODO: extract the reviews with negative sentiment:
#   - one set for each category with a very low score
#   - the 3 worst words
#   - the 3 worst bigrams
#
# TODO: use the LDA topics as well


In [None]:
# TODO: combine each of the results above into a single string and send it to the GPT API to extract the main pain points to improve

In [209]:
# Instruction prefix for the language model; the LDA topics are appended after "LDA Topics: "
suggested_prompt = (
    "Using the provided LDA topics for different aggrupations "
    "(UMAP clustering, PCA clustering and sentiment Clustering), "
    "generate a list of key strengths and areas for improvement for the venue. "
    "The output should be clear, direct, and suitable for stakeholders, avoiding ambiguity. "
    "Ensure coherence between positive and negative points, with no contradictions. "
    "Organize into 'Key Strengths' and 'Areas for Improvement' with concise, complete ideas. "
    "LDA Topics: "
)

In [211]:
# Preview the full prompt: the instruction text followed by the string form of the topics
f"{suggested_prompt}{topics_dict}"

'Using the provided LDA topics for different aggrupations (UMAP clustering, PCA clustering and sentiment Clustering), generate a list of key strengths and areas for improvement for the venue. The output should be clear, direct, and suitable for stakeholders, avoiding ambiguity. Ensure coherence between positive and negative points, with no contradictions. Organize into \'Key Strengths\' and \'Areas for Improvement\' with concise, complete ideas. LDA Topics: {\'pca_cluster\': {0: [\'0.024*"hamburguesa" + 0.012*"mejor" + 0.011*"sitio" + 0.009*"bien" + 0.008*"madrid"\', \'0.030*"buen" + 0.020*"comida" + 0.019*"hamburguesa" + 0.015*"sitio" + 0.013*"precio"\', \'0.015*"bien" + 0.010*"buen" + 0.009*"hamburguesa" + 0.007*"servicio" + 0.006*"súper"\', \'0.027*"buen" + 0.023*"hamburguesa" + 0.022*"lugar" + 0.014*"comida" + 0.013*"servicio"\', \'0.010*"amplio" + 0.010*"ir" + 0.008*"servicio" + 0.007*"pedir" + 0.007*"lugar"\'], 1: [\'0.250*"hamburguesa" + 0.250*"buena" + 0.250*"buen" + 0.250*"gra

In [229]:
# Show the collected LDA topics; as the last expression of the cell, the dict is rendered as its output.
# NOTE(review): topics_dict is not defined in this section — confirm it is built in an earlier cell.
topics_dict

{'pca_cluster': {0: ['0.024*"hamburguesa" + 0.012*"mejor" + 0.011*"sitio" + 0.009*"bien" + 0.008*"madrid"',
   '0.030*"buen" + 0.020*"comida" + 0.019*"hamburguesa" + 0.015*"sitio" + 0.013*"precio"',
   '0.015*"bien" + 0.010*"buen" + 0.009*"hamburguesa" + 0.007*"servicio" + 0.006*"súper"',
   '0.027*"buen" + 0.023*"hamburguesa" + 0.022*"lugar" + 0.014*"comida" + 0.013*"servicio"',
   '0.010*"amplio" + 0.010*"ir" + 0.008*"servicio" + 0.007*"pedir" + 0.007*"lugar"'],
  1: ['0.250*"hamburguesa" + 0.250*"buena" + 0.250*"buen" + 0.250*"grande"',
   '0.471*"hamburguesa" + 0.469*"buena" + 0.031*"buen" + 0.029*"grande"',
   '0.250*"hamburguesa" + 0.250*"buen" + 0.250*"buena" + 0.250*"grande"',
   '0.256*"hamburguesa" + 0.255*"buena" + 0.244*"buen" + 0.244*"grande"',
   '0.471*"hamburguesa" + 0.323*"buen" + 0.177*"grande" + 0.030*"buena"']},
 'umap_cluster': {0: ['0.024*"hamburguesa" + 0.016*"bien" + 0.013*"buena" + 0.011*"lugar" + 0.010*"bonito"',
   '0.018*"hamburguesa" + 0.009*"poder" + 0.008