In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import networkx as nx

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

sys.path.append(os.path.abspath(os.path.join('..')))
from src import plots
from src import ml_processing

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Load data

In [2]:
def loadData(uploaded_file):
    """Read a CSV into a DataFrame.

    Parameters
    ----------
    uploaded_file : str, path-like, file-like or None
        Source handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when ``uploaded_file`` is ``None``.
    """
    # Guard clause: nothing supplied means nothing to load.
    if uploaded_file is None:
        return None
    return pd.read_csv(uploaded_file)

def extractPrefix(file_name):
    """Return the part of ``file_name`` that precedes the first "_ml".

    When "_ml" does not occur, the whole ``file_name`` is returned unchanged.
    """
    prefix, _separator, _rest = file_name.partition('_ml')
    return prefix

def loadJson(file_path):
    """Parse the JSON document stored at ``file_path`` and return the resulting object.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way on every platform instead of depending on the locale's
    default encoding.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def reFormatEmbeddings(embedding_str):
    """Convert the text form of an embedding back into a float32 vector.

    Parameters
    ----------
    embedding_str : str or any
        Text such as ``"[0.1 0.2\\n 0.3]"`` or ``"[0.1, 0.2, 0.3]"``.
        Brackets are ignored; whitespace (including newlines) and commas
        separate the numbers.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array when a string is given. Any non-string value
        (for example a NaN from an empty CSV cell, or an array that was
        already converted) is returned unchanged, so the function can safely
        be applied to a column more than once.

    Raises
    ------
    ValueError
        If a token in the string cannot be parsed as a float.
    """
    if not isinstance(embedding_str, str):
        return embedding_str
    # Replace delimiters with a space (rather than deleting them) so that two
    # numbers separated only by a newline or a comma are never glued together.
    cleaned_str = re.sub(r'[\[\],]', ' ', embedding_str)
    return np.array([float(token) for token in cleaned_str.split()], dtype=np.float32)

# Data folders, relative to the notebook. The trailing slash is kept so a file
# name can be appended with plain string concatenation.
raw_path = '../data/raw/'
processed_path = '../data/processed/'

In [3]:
uploaded_file = 'hd_ml_processed_reviews.csv'

## Load all necessary data
# Load reviews data and extract place from the file name
reviews = loadData(processed_path + uploaded_file)
if 'embedding' in reviews.columns:
    # Embeddings are stored in the CSV as text; convert each cell to a float array
    reviews['embedding'] = reviews['embedding'].apply(reFormatEmbeddings)

file_name = uploaded_file
place = extractPrefix(file_name)

# Paths for the JSON and additional CSV files
general_insights_file = os.path.join(processed_path, f"{place}_general_insights.json")
worst_periods_file = os.path.join(processed_path, f"{place}_worst_periods_insights.json")
sample_reviews_file = os.path.join(processed_path, f"{place}_sample_selected_reviews.csv")
resume_file = os.path.join(raw_path, f"resumme_{place}.csv")

# Every optional artefact is ALWAYS bound (None when its file is missing).
# Binding only inside an `if` would leave the name undefined on a fresh kernel,
# or silently keep the value loaded for a previous `uploaded_file` on a re-run.

# Load "place"_general_insights.json into a dictionary
general_insights = loadJson(general_insights_file) if os.path.exists(general_insights_file) else None

# Load "place"_worst_periods_insights.json into a dictionary
worst_periods_insights = loadJson(worst_periods_file) if os.path.exists(worst_periods_file) else None

# Load "place"_sample_selected_reviews.csv into a DataFrame
sample_reviews = pd.read_csv(sample_reviews_file) if os.path.exists(sample_reviews_file) else None

# Load resumme_"place".csv from ./data/raw into a DataFrame
resume = pd.read_csv(resume_file) if os.path.exists(resume_file) else None

### Dev

In [None]:
display(sample_reviews.sample(3))


def _selectSampleReviews(samples, sample_type):
    """Return the rows of `samples` tagged `sample_type`, limited to the review columns and renamed for display."""
    # Insertion order of this mapping is also the column order of the result.
    display_names = {
        'date': 'Date',
        'rating_score': 'Rating',
        'review': 'Review',
        'food_score': 'Food',
        'service_score': 'Service',
        'atmosphere_score': 'Ambient',
        'meal_type': 'Meal',
    }
    # One .loc selection plus a non-inplace rename returns a fresh frame and
    # avoids mutating a filtered slice.
    selected = samples.loc[samples['sample_type'] == sample_type, list(display_names)]
    return selected.rename(columns=display_names)


# best_reviews
best_reviews = _selectSampleReviews(sample_reviews, 'best_reviews_sample')

# worst_reviews
worst_reviews = _selectSampleReviews(sample_reviews, 'worst_reviews_sample')

In [None]:
# Show the content loaded from "<place>_general_insights.json" as the cell output
general_insights

In [None]:
# Show the content loaded from "<place>_worst_periods_insights.json" as the cell output
worst_periods_insights

In [None]:
# Peek at three randomly drawn rows of the sample reviews table
display(sample_reviews.sample(3))

In [None]:
# Number of non-null values in every column, per sample type
sample_reviews.groupby('sample_type').count()

In [None]:
# Reviews tagged 'low_score_reviews' for the month '2024-08', limited to the
# date, review text, scores and meal type columns.
period_reviews = sample_reviews.loc[
    (sample_reviews['sample_type'] == 'low_score_reviews') & (sample_reviews['month'] == '2024-08'),
    ['date', 'rating_score', 'review', 'food_score', 'service_score', 'atmosphere_score', 'meal_type'],
]
period_reviews

### Plots

In [4]:
def plotScoreTrends(reviews, app=False, months=12, years=8, weeks=5):
    """Plot monthly, yearly and weekly average-score trends on a 2x2 "Z" layout.

    Row 1 spans both columns and shows the monthly averages of the four score
    columns; row 2 shows the yearly average rating (left) and the weekly
    averages of the four score columns (right).

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain ``date``, ``rating_score``, ``food_score``,
        ``service_score`` and ``atmosphere_score``. It is NOT modified; the
        derived time columns are added to a copy.
    app : bool, default False
        When True the figure is returned; otherwise it is shown and ``None``
        is returned.
    months, years, weeks : int
        Size of each look-back window, counted back from the most recent
        review date. The subplot titles are generated from these values so
        they cannot disagree with the data that is actually plotted.

    Returns
    -------
    plotly.graph_objects.Figure or None
    """
    # Labels and colours are local so the function does not depend on globals
    # that may be defined in a later cell (or not at all).
    score_labels = {
        'rating_score': 'Rating',
        'food_score': 'Food',
        'service_score': 'Service',
        'atmosphere_score': 'Ambient',
    }
    score_columns = list(score_labels)
    colors = ['#32CD32', 'rgba(31, 119, 180, 0.8)', 'rgba(107, 174, 214, 0.8)', 'rgba(158, 202, 225, 0.8)']

    # Derive the time columns on a copy (assign returns a new frame).
    dates = pd.to_datetime(reviews['date'], errors='coerce')
    week_start = dates - pd.to_timedelta(dates.dt.weekday, unit='d')  # Monday of each review's week
    data = reviews.assign(
        date=dates,
        month=dates.dt.to_period('M'),
        year=dates.dt.year,
        week=week_start.dt.strftime('%Y-%m-%d'),
    )

    # Filter data for the last periods (months, years, weeks)
    limit_date = data['date'].max()
    last_months = data[data['date'] >= limit_date - pd.DateOffset(months=months)]
    last_years = data[data['date'] >= limit_date - pd.DateOffset(years=years)]
    last_weeks = data[data['date'] >= limit_date - pd.DateOffset(weeks=weeks)]

    # Compute averages for the required periods
    monthly_avg_scores = last_months.groupby('month')[score_columns].mean()
    yearly_avg_scores = last_years.groupby('year')[['rating_score']].mean()
    weekly_avg_scores = last_weeks.groupby('week')[score_columns].mean()

    # Create a figure with subplots using the Z-layout:
    # 1 large plot on the first row, 2 smaller plots on the second
    fig = make_subplots(rows=2, cols=2,
                        specs=[[{"colspan": 2}, None],
                               [{}, {}]],
                        subplot_titles=(f"Monthly Score Trends (Last {months} Months)",
                                        f"Annual Rating Score Trends (Last {years} Years)",
                                        f"Weekly Score Trends (Last {weeks} Weeks)"))

    def _addScoreTraces(avg_scores, row, col):
        """Add one line per score column of `avg_scores` to the given subplot."""
        x_values = avg_scores.index.astype(str)
        for i, column in enumerate(avg_scores.columns):
            label = score_labels[column]
            fig.add_trace(
                go.Scatter(x=x_values, y=avg_scores[column],
                           mode='lines+markers', name=label,
                           text=[f"{label} - {val:.2f}" for val in avg_scores[column]],
                           hoverinfo="text", line=dict(color=colors[i % len(colors)])),
                row=row, col=col)

    # Add monthly score trends to the first row
    _addScoreTraces(monthly_avg_scores, row=1, col=1)

    # Add yearly score trends to the second row (left).
    # The year is float when any date failed to parse, so cast to int first to
    # get "2024" rather than "2024.0" labels.
    fig.add_trace(
        go.Scatter(x=yearly_avg_scores.index.astype(int).astype(str), y=yearly_avg_scores['rating_score'],
                   mode='lines+markers', name="Rating", line=dict(color='#1f77b4', width=4),
                   text=[f"Rating - {val:.2f}" for val in yearly_avg_scores['rating_score']],
                   hoverinfo="text"),
        row=2, col=1)

    # Add weekly score trends to the second row (right)
    _addScoreTraces(weekly_avg_scores, row=2, col=2)

    # Customize layout
    fig.update_layout(showlegend=False,
                      title="Score Trends Analysis",
                      title_font=dict(size=28),
                      margin=dict(l=50, r=50, t=100, b=50),
                      paper_bgcolor="white",
                      height=800, width=1200)
    fig.update_xaxes(showline=False, showgrid=False)
    fig.update_yaxes(showline=False, showgrid=True)

    # Customize x-axes formatting
    fig.update_xaxes(type='category', row=2, col=1)  # year labels are plain strings, one tick per year
    fig.update_xaxes(tickformat="%d-%b", row=2, col=2)  # Weekly format

    # Annotate the best month and the two weakest months of the monthly rating,
    # located from the data instead of hard-coded dates.
    monthly_rating = monthly_avg_scores['rating_score'].dropna()
    if not monthly_rating.empty:
        best_month = monthly_rating.idxmax()
        fig.add_annotation(x=str(best_month), y=monthly_rating[best_month], text="Highest Score",
                           showarrow=True, arrowhead=2, ax=0, ay=80, row=1, col=1)
        for period, value in monthly_rating.drop(best_month).nsmallest(2).items():
            fig.add_annotation(x=str(period), y=value, text=f"Drop in {period.strftime('%B')}",
                               showarrow=True, arrowhead=2, ax=0, ay=-40, row=1, col=1)

    # Update marker sizes
    fig.update_traces(marker=dict(size=8), selector=dict(name="Rating"))

    # Show the figure
    if app:
        return fig
    else:
        fig.show()


In [38]:
def _groupContiguousPeriods(periods):
    """Group consecutive entries of `periods` that are exactly one period apart into runs (list order kept)."""
    runs = []
    for period in periods:
        if runs:
            gap = period - runs[-1][-1]
            # Period - Period yields a plain int on old pandas and a DateOffset
            # (whose step count is `.n`) on newer versions; accept both.
            if getattr(gap, 'n', gap) == 1:
                runs[-1].append(period)
                continue
        runs.append([period])
    return runs


def plotTrend(reviews, label_mapping, app=False, filter_min=None, filter_max=None):
    """Plot the monthly average of each score in ``label_mapping`` and annotate highs and drops.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain ``date`` and every column named in ``label_mapping``.
        It is NOT modified; ``date``/``month`` are derived on a copy.
    label_mapping : dict
        Maps score column name -> label used for the trace name and hover text.
    app : bool, default False
        When True the figure is returned; otherwise it is shown and ``None``
        is returned.
    filter_min, filter_max : date-like or None
        Inclusive bounds on ``date``. When both are None, the last year
        (counted back from the most recent review) is plotted.

    Returns
    -------
    plotly.graph_objects.Figure or None

    Notes
    -----
    The low-score periods come from ``ml_processing.analyzeLowScores``, which
    is called on the full (unfiltered) data as before. NOTE(review): its
    result is assumed to be a sequence of pandas ``Period`` values comparable
    to the monthly index -- confirm against that helper. Periods that fall
    outside the plotted window are skipped instead of raising ``KeyError``.
    """
    # Convert date column to datetime format and create the month column on a copy
    dates = pd.to_datetime(reviews['date'], errors='coerce')
    reviews = reviews.assign(date=dates, month=dates.dt.to_period('M'))

    # Filter data for the last periods based on filter_min and filter_max
    limit_date = reviews['date'].max()
    if filter_min is None and filter_max is None:
        # If both filters are None, select data from the last year
        start_date = limit_date - pd.DateOffset(years=1)
        selected_reviews = reviews[(reviews['date'] >= start_date) & (reviews['date'] <= limit_date)]
    else:
        # Apply the filters if provided
        selected_reviews = reviews
        if filter_min is not None:
            selected_reviews = selected_reviews[selected_reviews['date'] >= filter_min]
        if filter_max is not None:
            selected_reviews = selected_reviews[selected_reviews['date'] <= filter_max]

    # Compute averages for the required periods using label_mapping keys
    columns_to_average = list(label_mapping.keys())
    monthly_avg_scores = selected_reviews.groupby('month')[columns_to_average].mean()

    # Create a figure to plot the trends
    fig = make_subplots(rows=1, cols=1)

    # One line per score; the first one is drawn thicker. Colours cycle so that
    # a mapping with more than four columns does not overflow the palette.
    colors = ['#32CD32', 'rgba(31, 119, 180, 0.8)', 'rgba(107, 174, 214, 0.8)', 'rgba(158, 202, 225, 0.8)']
    for i, column in enumerate(monthly_avg_scores.columns):
        label = label_mapping[column]
        fig.add_trace(
            go.Scatter(x=monthly_avg_scores.index.astype(str), y=monthly_avg_scores[column],
                       mode='lines+markers', name=label,
                       text=[f"{label} - {val:.2f}" for val in monthly_avg_scores[column]],
                       hoverinfo="text", line=dict(color=colors[i % len(colors)], width=3 if i == 0 else 2)),
            row=1, col=1)

    # Annotations are based on the rating; skip them when it is not plotted or has no data
    if 'rating_score' in monthly_avg_scores.columns:
        monthly_rating = monthly_avg_scores['rating_score'].dropna()
    else:
        monthly_rating = None

    if monthly_rating is not None and not monthly_rating.empty:
        # Analyze low scores and find high score
        _, low_score_periods = ml_processing.analyzeLowScores(reviews, 'rating_score', num_periods=3)
        high_score_period = monthly_rating.idxmax()
        high_score_value = monthly_rating.max()

        # Add annotations for low scores: contiguous periods share ONE annotation,
        # anchored on the last period of the run.
        visible_periods = [period for period in low_score_periods if period in monthly_rating.index]
        for run in _groupContiguousPeriods(visible_periods):
            anchor = run[-1]
            fig.add_annotation(x=str(anchor), y=monthly_rating.loc[anchor] + 0.5,
                               text="Drop in " + " & ".join(period.strftime('%B') for period in run),
                               showarrow=True, arrowhead=2, ax=0, ay=-40, row=1, col=1)

        # Add annotation for high score
        fig.add_annotation(x=str(high_score_period), y=high_score_value - 0.3,
                           text=f"High in {high_score_period.strftime('%B')}",
                           showarrow=True, arrowhead=2, ax=0, ay=40, row=1, col=1)

    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False, title_text='Average Score')
    fig.update_layout(showlegend=False,
                      margin=dict(l=50, r=50, t=100, b=50),
                      paper_bgcolor="white",
                      height=400, width=1200)

    # Show or return the figure depending on the context
    if app:
        return fig
    else:
        fig.show()

In [39]:
# Score columns to plot, mapped to the label used for each trace
label_mapping = dict(
    rating_score='Rating',
    food_score='Food',
    service_score='Service',
    atmosphere_score='Ambient',
)

# Default view: no date filters given, so the last year of reviews is plotted
plotTrend(reviews, label_mapping, app=False)
#plotTrend(reviews, label_mapping, app=False, filter_min="2024-01-01", filter_max="2024-05-01")