In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import os
import sys

sys.path.append(os.path.abspath(os.path.join('..')))
from src import plots
from src import ml_processing

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Load data

In [4]:
def loadData(uploaded_file):
    """Read a CSV source into a DataFrame.

    Args:
        uploaded_file: Anything ``pd.read_csv`` accepts (path or file-like
            object), or ``None`` when nothing was provided.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` if ``uploaded_file`` is
        ``None``.
    """
    # Guard clause: nothing to read.
    if uploaded_file is None:
        return None
    return pd.read_csv(uploaded_file)

def extractPrefix(file_name):
    """Return the part of ``file_name`` that precedes the first ``"_ml"``.

    If ``"_ml"` does not occur, the whole ``file_name`` is returned.
    """
    prefix, _separator, _rest = file_name.partition('_ml')
    return prefix

def loadJson(file_path):
    """Parse a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Read as UTF-8 explicitly: without it, open() falls back to the
    # platform's locale encoding, which can mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def reFormatEmbeddings(embedding_str):
    """Convert a stringified embedding back into a float32 NumPy vector.

    The expected text is whitespace-separated numbers, optionally wrapped in
    square brackets and spread over several lines, e.g.
    ``"[-2.7e-01 -2.0e-01\\n 1.1e-01]"``.

    Args:
        embedding_str: The text to parse. Non-string values (for example the
            NaN of an empty CSV cell, or an already converted array) are
            returned unchanged instead of raising.

    Returns:
        A 1-D ``np.ndarray`` of dtype ``float32`` for string input; the input
        itself for any non-string value.

    Raises:
        ValueError: If a token in the string is not a valid float.
    """
    # Rows without an embedding arrive as NaN (a float); re.sub would raise
    # TypeError on them, so pass anything that is not text straight through.
    if not isinstance(embedding_str, str):
        return embedding_str

    # Drop the brackets and line breaks, then split on whitespace.
    cleaned_str = re.sub(r'[\[\]\n]', '', embedding_str)
    return np.array([float(token) for token in cleaned_str.split()], dtype=np.float32)

# Data directories, given relative to the notebook's working directory.
# The loading cell below builds every input file path from these two values.
processed_path = '../data/processed/'  # processed reviews CSV, insight JSONs, sample-review CSV
raw_path = '../data/raw/'  # resumme_<place>.csv

In [8]:
uploaded_file = 'hd_ml_processed_reviews.csv'

## Load all necessary data
# Load reviews data and extract place from the file name
reviews = loadData(os.path.join(processed_path, uploaded_file))
if 'embedding' in reviews.columns:
    # Convert embeddings from their string form to float32 arrays
    reviews['embedding'] = reviews['embedding'].apply(reFormatEmbeddings)

file_name = uploaded_file
place = extractPrefix(file_name)

# Paths for the JSON and additional CSV files
general_insights_file = os.path.join(processed_path, f"{place}_general_insights.json")
worst_periods_file = os.path.join(processed_path, f"{place}_worst_periods_insights.json")
sample_reviews_file = os.path.join(processed_path, f"{place}_sample_selected_reviews.csv")
resume_file = os.path.join(raw_path, f"resumme_{place}.csv")

# Reset every optional input first. Otherwise a missing file would leave the
# name undefined on a fresh kernel, or silently keep the value loaded by a
# previous run (possibly for a different place).
general_insights = None
worst_periods_insights = None
sample_reviews = None
resume = None

# Load "place"_general_insights.json into a dictionary
if os.path.exists(general_insights_file):
    general_insights = loadJson(general_insights_file)
else:
    print(f"File not found, general_insights set to None: {general_insights_file}")

# Load "place"_worst_periods_insights.json into a dictionary
if os.path.exists(worst_periods_file):
    worst_periods_insights = loadJson(worst_periods_file)
else:
    print(f"File not found, worst_periods_insights set to None: {worst_periods_file}")

# Load "place"_sample_selected_reviews.csv into a DataFrame
if os.path.exists(sample_reviews_file):
    sample_reviews = pd.read_csv(sample_reviews_file)
else:
    print(f"File not found, sample_reviews set to None: {sample_reviews_file}")

# Load resumme_"place".csv from ./data/raw into a DataFrame
if os.path.exists(resume_file):
    resume = pd.read_csv(resume_file)
else:
    print(f"File not found, resume set to None: {resume_file}")

### Dev

In [18]:
display(sample_reviews.sample(3))

# Columns kept for the best/worst review tables, in display order.
sample_columns = ['date', 'rating_score', 'review', 'food_score', 'service_score', 'atmosphere_score', 'meal_type']
# Presentation labels for those columns.
column_labels = {'review':'Review', 'rating_score':'Rating', 'meal_type':'Meal','food_score':'Food', 'service_score':'Service', 'atmosphere_score':'Ambient', 'date':'Date'}

def selectSample(samples, sample_type):
    """Return the rows of ``samples`` tagged ``sample_type``, relabelled for display.

    Args:
        samples: DataFrame with a ``sample_type`` column plus the columns
            listed in ``sample_columns``.
        sample_type: Value of ``sample_type`` to keep.

    Returns:
        A new DataFrame restricted to ``sample_columns`` and renamed with
        ``column_labels``; ``samples`` itself is not modified.
    """
    is_requested_type = samples['sample_type'] == sample_type
    # A single .loc selection plus a non-inplace rename avoids renaming a
    # chained-indexing slice in place.
    return samples.loc[is_requested_type, sample_columns].rename(columns=column_labels)

# best_reviews
best_reviews = selectSample(sample_reviews, 'best_reviews_sample')

# worst_reviews
worst_reviews = selectSample(sample_reviews, 'worst_reviews_sample')

Unnamed: 0,review_id,review,local_guide_reviews,rating_score,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,...,cleaned_review,vader_sentiment,sentiment_label,embedding,pca_cluster,umap_cluster,month,year,total_score,sample_type
23,,La comida me resultó bastante mala. Los canelo...,,1.0,,,,,,,...,,,,,,,2024-08,,,low_score_reviews
12,141.0,Un lugar ideal para tomar una cerveza 🍻 o verm...,224.0,5.0,Comí allí,Comida,10-20 €,5.0,5.0,5.0,...,lugar ideal tomar cerveza vermut rico berenjen...,0.5267,positive,[-2.78942168e-01 -2.01295927e-01 1.19766586e-...,0.0,0.0,2024-05,2024.0,6.0,best_reviews_sample
8,88.0,La cafetería despertó mi interés por su diseño...,8.0,2.0,,,,0.874,0.902,0.894,...,cafetería despertar interés diseño retro cenar...,0.296,negative,[-6.82985008e-01 -6.79774642e-01 4.46038634e-...,0.0,0.0,2024-04,2024.0,2.18,recent_worst_reviews


In [19]:
# Inspect the dictionary loaded from the "<place>_general_insights.json" file.
general_insights

{'best': ['Customers frequently praise the quality and taste of the burgers.',
  'Many enjoy the overall ambiance and atmosphere of the dining space.',
  'The reasonable pricing of the food is consistently noted as a positive aspect.'],
 'worst': ['Several customers have experienced delays when trying to place orders.',
  'There are reports of subpar service that detract from the dining experience.',
  'Some diners have found certain menu items to be disappointing and not worth the price.'],
 'improve': ['Streamline the ordering process to reduce waiting times for customers.',
  'Enhance staff training to improve the quality and responsiveness of customer service.',
  'Review the menu offerings to ensure all items meet customer expectations and value.']}

In [20]:
# Inspect the dictionary loaded from the "<place>_worst_periods_insights.json" file.
worst_periods_insights

{'2023-11': {'problems': ['Customers reported that the café environment was not inviting.',
   'There seems to be a lack of urgency in addressing customer concerns during visits.'],
  'improve': ['Enhance the ambiance of the café to make it more welcoming.',
   'Implement a training program focused on timely customer service response.']},
 '2024-02': {'problems': ['Service was described as poor, leading to negative customer experiences.',
   'Customers felt that their orders were not managed properly.'],
  'improve': ['Conduct regular service quality assessments to identify areas needing improvement.',
   'Increase staff training on order management and customer interaction.']},
 '2024-03': {'problems': ['High noise levels were reported, making the atmosphere uncomfortable.',
   'Customers expressed dissatisfaction with the consistency of their experiences.'],
  'improve': ['Consider soundproofing measures or altering the layout to reduce noise.',
   'Standardize service procedures to 

In [21]:
# Show three randomly drawn rows of sample_reviews (no seed is set, so the
# rows differ from run to run).
display(sample_reviews.sample(3))

Unnamed: 0,review_id,review,local_guide_reviews,rating_score,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,...,cleaned_review,vader_sentiment,sentiment_label,embedding,pca_cluster,umap_cluster,month,year,total_score,sample_type
22,,"Ya no es lo que era, vamos q hasta he encontra...",,3.0,,,,,,,...,,,,,,,2024-03,,,low_score_reviews
1,40.0,El martes fui a cenar con mis hijos y la pasam...,1.0,5.0,Comí allí,Cena,20-30 €,5.0,5.0,5.0,...,martes cenar hijo pasar genial croqueta ensala...,0.4215,positive,[-1.92947149e-01 -3.06159824e-01 2.90438443e-...,0.0,0.0,2024-09,2024.0,6.0,recent_best_reviews
15,100.0,Un maravilloso ambiente. Seguido de una amable...,103.0,1.0,,,,0.874,0.902,0.894,...,maravilloso ambiente seguido amable oferta cam...,0.0,negative,[-3.54738146e-01 -2.41264358e-01 3.90854657e-...,0.0,0.0,2018-01,2018.0,1.18,worst_reviews_sample


In [23]:
# Per sample_type, count the non-null values in every other column.
sample_reviews.groupby('sample_type').count()

Unnamed: 0_level_0,review_id,review,local_guide_reviews,rating_score,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,...,avg_price_per_person,cleaned_review,vader_sentiment,sentiment_label,embedding,pca_cluster,umap_cluster,month,year,total_score
sample_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
best_reviews_sample,5,5,5,5,3,3,3,5,5,5,...,3,5,5,5,5,5,5,5,5,5
low_score_reviews,0,4,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,0,0
recent_best_reviews,5,5,5,5,2,2,2,5,5,5,...,2,5,5,5,5,5,5,5,5,5
recent_worst_reviews,5,5,5,5,2,2,2,5,5,5,...,2,5,5,5,5,5,5,5,5,5
worst_reviews_sample,5,5,5,5,1,1,1,5,5,5,...,1,5,5,5,5,5,5,4,4,5


In [28]:
# Rows tagged 'low_score_reviews' whose `month` value is '2024-08',
# restricted to the columns of interest.
in_period = sample_reviews['month'] == '2024-08'
is_low_score = sample_reviews['sample_type'] == 'low_score_reviews'
period_columns = ['date', 'rating_score', 'review', 'food_score', 'service_score', 'atmosphere_score', 'meal_type']
period_reviews = sample_reviews[in_period & is_low_score][period_columns]
period_reviews

Unnamed: 0,date,rating_score,review,food_score,service_score,atmosphere_score,meal_type
23,,1.0,La comida me resultó bastante mala. Los canelo...,,,,
