In [1]:
import pandas as pd
import numpy as np
import vizro.plotly.express as px
import plotly.io as pio
from datasets import Dataset
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from tqdm.auto import tqdm
from data_cleaning_and_utilities import plot_preset, get_earth_colorscale


### Data loading

In [2]:
path = "../data/cleaned_reviews.csv"
df = pd.read_csv(path)

In [3]:
# Set up plot vizual
pio.templates.default = "vizro_dark"
earth_palette = get_earth_colorscale()

# Sentiment analysis with BERT

### Model Setup and Tokenization

In [4]:
# Mutlilinguar BERT model that analyzes sentiment and can differentiate uppercase and lowercase
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# Set up model to use gpu
device = 0 if torch.cuda.is_available() else -1  
# Set up automatic tokenizer to convert reviews to be usable by model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Set up pipeline to analyze reviews
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

Device set to use cuda:0


### Apply Model to Dataset

In [5]:
# Convert dataset to a Hugging Face Dataset
hf_dataset = Dataset.from_dict({"text": df["text"].astype(str).tolist()})

print(f"Running BERT on {len(hf_dataset)} reviews")

# Run the pipeline on all reviews at once with batching 
results = sentiment_pipeline(hf_dataset["text"], batch_size=64)

# Add results back to DataFrame
df["bert_sentiment"] = [r["label"] for r in results]
df["bert_score"] = [r["score"] for r in results]


Running BERT on 12420 reviews


### Map and Normalize Sentiment Labels

In [6]:
# Remap the score for comparison
map_real = {
    "5 star rating" : "5",
    "4 star rating" : "4",
    "3 star rating" : "3",
    "2 star rating" : "2",
    "1 star rating" : "1"
}
map_bert = {
    "5 stars" : "5",
    "4 stars" : "4",
    "3 stars" : "3",
    "2 stars" : "2",
    "1 star" : "1"
}

df["score"]= df["score"].map(map_real)
df["bert_sentiment"] = df["bert_sentiment"].map(map_bert)

# Convers rating to float type
df[["score", "bert_sentiment"]] = df[["score", "bert_sentiment"]].astype("float")

### Compare users and BERT averages

In [7]:
# Calculate averages of user and BERT scores
reviewers_score_mean = df["score"].mean()
bert_score_mean = df["bert_sentiment"].mean()

print(f"Review score mean - {reviewers_score_mean:.2f}")
print(f"Bert score mean is - {bert_score_mean:.2f}")

Review score mean - 4.53
Bert score mean is - 4.33


### Compare user and BERT sentiment distribution

In [8]:
# Calculate users and BERT sentiment distribution
user_rating = df["score"].value_counts().reset_index()
user_rating.columns = ["Score", "Count"]
bert_rating = df["bert_sentiment"].value_counts().reset_index()
bert_rating.columns = ["Score", "Count"]
# Visualize BERT sentiment distribution across rating with pie chart
fig = px.pie(user_rating , values="Count", names="Score", title="Users Score Distribution", color_discrete_sequence=earth_palette)
fig = plot_preset(fig)
# Adjust legend position
fig.update_layout(
    legend=dict(
        orientation="h",            
        yanchor="bottom", y=-0.2,   
        xanchor="center", x=0.5     
    )
);
fig = px.pie(bert_rating, values="Count", names="Score", title="BERT Score Distribution", color_discrete_sequence=earth_palette)
fig = plot_preset(fig)
fig.update_layout(
    legend=dict(
        orientation="h",            
        yanchor="bottom", y=-0.2,   
        xanchor="center", x=0.5     
    )
);

### Calculating mismatch percentage per score

In [9]:
# Flag reviews where the user-provided score and BERT-predicted sentiment disagree
df["mismatch"] = df["score"] != df["bert_sentiment"]

# Extract only the rows where a mismatch occurs
mismatches = df[df["mismatch"]]

# Count the number of mismatched reviews for each user-provided score
count_per_score = mismatches.groupby("score")["review_id"].count().reset_index()

# Count the total number of reviews for each user-provided score
total_reviews = df.groupby("score")["review_id"].count().reset_index()

# Merge mismatch counts with total review counts by score
count_per_score = pd.merge(count_per_score, total_reviews, on="score", how="right")

# Calculate what percentage of all mismatches belong to each score category
count_per_score["percentages"] = round(count_per_score["review_id_x"] / count_per_score["review_id_x"].sum() * 100, 2)

# Calculate the mismatch rate within each score category
count_per_score["percentages_2"] = round(count_per_score["review_id_x"] / count_per_score["review_id_y"] * 100, 2)

# Rename columns for better readability in the final output
count_per_score.columns = [
    "Score",
    "Mismatched review amount",
    "Total review amount",
    "% of All Mismatch",
    "% Mismatch per Score"
]

# Save the mismatch statistics to a CSV file for reporting or further analysis
output_path = (current_working_directory / "../data/BERT_mismatch_data.csv").resolve()
count_per_score.to_csv(output_path, index=False)

# Preview the result
count_per_score.head()

NameError: name 'current_working_directory' is not defined

In [None]:
# Visualize total reviews and mismatch percentages per score using a grouped bar chart
fig = px.bar(count_per_score, x="Score", y=["Total review amount", "Mismatched review amount"], barmode="group", labels={"variable": "Metric"}, 
             title="Mismatch Metrics by Review Score(Log scale)", log_y = True, color_discrete_sequence=earth_palette,)
fig = plot_preset(fig)
fig.update_layout(
   yaxis_title = "Review Amount",
   yaxis_tickvals = ["250", "500", "1000", "1700", "3000", "5500", "10000"],
   yaxis_ticktext = ["250", "500", "1000", "1700", "3000", "5500", "10000"]
);

### Heatmap showcasing results

In [None]:
# Create confusion matrix
confusion = pd.crosstab(df["score"], df["bert_sentiment"])
# Apply log scaling for better color distribution
confusion_log = np.log1p(confusion)

# Plot the heatmap
fig = px.imshow(
    confusion_log,
    labels=dict(x="BERT Star Rating", y="User Star Rating"),
    x=confusion.columns.astype(str),
    y=confusion.index.astype(str),
    color_continuous_scale="Viridis",
)

# Add raw counts as annotations
fig.update_traces(
    text=confusion.values,
    texttemplate="%{text}",
    textfont={"size": 12, "color": "black"}
)
fig = plot_preset(fig)
fig.update_layout(
    title={
        "text": "User Ratings vs BERT Predicted Ratings",
        "x": 0.45,  
        "xanchor": "center"
    }
);

In [None]:
# Calculate the average user rating by size feedback category
size_users = df.groupby("size")["score"].mean().reset_index()

# Calculate the average BERT-predicted sentiment by size feedback category
size_bert = df.groupby("size")["bert_sentiment"].mean().reset_index()

# Merge both sets of averages into one DataFrame
size_analisys = pd.merge(size_users, size_bert, on="size", how="right")

# Rename columns for clarity
size_analisys.columns = ["Size Feedback", "User Score", "BERT Score"]

# Calculate the difference (drift) between user score and BERT sentiment
size_analisys["Drift (Score - Sentiment)"] = size_analisys["User Score"] - size_analisys["BERT Score"]

# Reshape the DataFrame for grouped bar plotting
size_analisys = size_analisys.melt(
    id_vars="Size Feedback",
    value_vars=["User Score", "BERT Score"],
    var_name="Metric",
    value_name="Value"
)

# Plot average scores by size feedback
fig = px.bar(
    size_analisys,
    x="Size Feedback",
    y="Value",
    color="Metric",
    barmode="group",
    color_discrete_sequence=earth_palette,
    title="Users Rate Higher Than They Feel: Size Feedback vs. Review Sentiment"
)

fig = plot_preset(fig)
fig.update_layout(
    xaxis_title="Metric",
    yaxis_title="Average Score"
);

### Review Length

In [None]:
# Aggregate average review length, BERT sentiment, user score, and count by size feedback
dft = df.groupby("size").agg({
    "review_length": "mean",
    "bert_sentiment": "mean",
    "score": "mean",
    "review_id": "count"
}).reset_index()

# Plot review length distribution across size categories
fig = px.box(dft, y="review_length",  color_discrete_sequence=['#F1B555'])
fig = plot_preset(fig)
fig.update_layout(
    yaxis_title="Review Length",
    width=415
);

### Average review length by category

In [None]:
# Group by product category and compute total review length and number of reviews
dfc = df[["main_category", "review_length", "review_id"]]
dfc = dfc.groupby("main_category").agg({
    "review_length": "sum",
    "review_id": "count"
}).reset_index()

# Calculate average review length per review
dfc["avg_review_length"] = round(dfc["review_length"] / dfc["review_id"], 2)

# Sort by average length for better visual ranking
dfc = dfc.sort_values("avg_review_length", ascending=True)

# Plot horizontal bar chart of average review length by category
fig = px.bar(
    dfc,
    x="avg_review_length",
    y="main_category",
    orientation="h",
    text="avg_review_length",
    color="main_category",
    title="Average Review Length by Product Category",
    color_discrete_sequence=earth_palette
)

fig.update_traces(textposition="outside")
fig = plot_preset(fig)
fig.update_layout(
    xaxis_title="Avg. Review Length (Characters)",
    yaxis_title="Product Category",
    showlegend=False,
    width=1100
);

### Review mismatch per main category

In [None]:
# Select relevant columns and create a copy for mismatch analysis by category
dfc = df[["main_category", "review_id", "score", "bert_sentiment"]].copy()

# Flag mismatches between user rating and BERT sentiment
dfc["mismatches"] = dfc["score"] != dfc["bert_sentiment"]
dfc = dfc.rename(columns = {"review_id" : "Review Amount", "mismatches" : "Mismatches"})
# Drop raw score columns to focus on counts and mismatches
dfc = dfc.drop(["score", "bert_sentiment"], axis=1)

# Group by product category and aggregate total reviews and mismatch counts
dfc = dfc.groupby("main_category").agg({
    "Review Amount": "count",
    "Mismatches": "sum"
}).reset_index()

# Plot grouped bar chart of total reviews vs mismatches per product category
fig = px.bar(dfc, x="main_category", y=["Review Amount", "Mismatches"], barmode="group", title="Review vs Mismatch Count by Product Category(Log scale for Review Amount)", log_y=True,
    color_discrete_sequence=earth_palette
)

fig = plot_preset(fig)

# Update axis labels and customize tick values for clarity
fig.update_layout(
    xaxis_title="Product Category",
    yaxis_title="Review Amount",
    yaxis_tickvals=["30", "100", "250", "500", "1000", "2000", "5000", "10000"],
    yaxis_ticktext=["30", "100", "250", "500", "1000", "2000", "5000", "10000"]
);

# Aspect-Based Sentiment Analysis

In [None]:
# Extract keywords from reviews using bag of words model
# Ensure data type
df["text"] = df["text"].fillna("").astype(str)

# Initialize vectorizer
vectorizer = CountVectorizer(stop_words="english", max_features=1000)

# Fit and transform text data
X = vectorizer.fit_transform(df["text"])

# Sum word frequencies
word_freq = X.sum(axis=0).A1

# Get words
vocab = vectorizer.get_feature_names_out()

# Build DataFrame
freq_df = pd.DataFrame({"word": vocab, "count": word_freq})

# Sort by count
freq_df = freq_df.sort_values(by="count", ascending=False).reset_index(drop=True)

freq_df.head(200).to_string("top_words.txt")

In [None]:
# Create aspect keyword disctinory and list based on bag of words model using top_words.txt
aspect_keywords = {
    "size_and_fit": [
        "size", "fit", "small", "half", "wide", "big", "tight", "narrow", "fits",
        "runs", "sizing", "bigger", "smaller", "snug", "room", "sized", "too small",
        "too big", "toe", "strap", "toes", "calves", "calf", "straps", "ankle"
    ],
    "comfort": [
        "comfortable", "comfy", "feet", "feel", "soft", "uncomfortable", "comfort", "worn"
    ],
    "style_and_appearance": [
        "cute", "look", "stylish", "beautiful", "color", "style", "black", "nice",
        "pretty", "dress", "gorgeous", "classy", "red", "casual", "white", "brown",
        "sexy", "outfit", "looks", "looking", "design"
    ],
    "quality_and_material": [
        "quality", "leather", "material", "suede", "stiff"
    ],
    "wearability": [
        "wear", "walk", "long", "wore", "walking", "day", "hours", "short"
    ],
    "price_and_value": [
        "price", "expensive", "affordable", "worth", "value", "cheap", "overpriced"
    ],
    "delivery": [
        "delivery", "shipping", "arrived", "fast", "slow", "late", "package"
    ]
}
shoe_type_keywords = [
    "sandals", "heels", "sneakers", "loafers", "flats", "boots", 
    "platforms", "wedges", "slides", "mules"
]

In [None]:
# Create columns in main dataframe with keyword aspects
def detect_aspects(text, aspect_keywords):
    text = text.lower()
    detected = []
    for aspect, keywords in aspect_keywords.items():
        if any(kw in text for kw in keywords):
            detected.append(aspect)
    return detected
df["detected_aspects"] = df["text"].apply(lambda x: detect_aspects(x, aspect_keywords))

def detect_shoe_type(text, shoe_type_keywords):
    text = str(text).lower()
    for keyword in shoe_type_keywords:
        if keyword in text:
            return keyword  
    return None 
df["detected_shoe_type"] = df["text"].apply(lambda x: detect_shoe_type(x, shoe_type_keywords))

In [None]:
# Explode the 'detected_aspects' list column so each aspect has its own row
df_exploded = df.explode("detected_aspects")
# Filter out review with no aspects 
df_exploded = df_exploded[df_exploded["detected_aspects"].notna()]

# Assign setiment label to each aspect based on bert sentiment
def sentiment_label(score):
    if score >= 4:
        return "Positive"
    elif score <= 2:
        return "Negative"
    else:
        return "Neutral"

df_exploded["aspect_sentiment"] = df_exploded["bert_sentiment"].apply(sentiment_label)


In [None]:
# Group the exploded review data by aspect and sentiment, count occurrences, pivot to wide format
df_aspects = df_exploded.groupby(["detected_aspects", "aspect_sentiment"]).size().unstack(fill_value=0).reset_index()

# Calculate sentiment percentages for each aspect
df_aspects["Negative_%"] = (df_aspects["Negative"] / (df_aspects["Negative"] + df_aspects["Neutral"] + df_aspects["Positive"]) * 100)
df_aspects["Neutral_%"] = (df_aspects["Neutral"] / (df_aspects["Negative"] + df_aspects["Neutral"] + df_aspects["Positive"]) * 100)
df_aspects["Positive_%"] = (df_aspects["Positive"] / (df_aspects["Negative"] + df_aspects["Neutral"] + df_aspects["Positive"]) * 100)

# Convert absolute sentiment counts and sentiment percentages to long format for plotting
df_aspects_raw = df_aspects.melt(id_vars="detected_aspects", value_vars=["Negative", "Neutral", "Positive"], var_name="Aspects", value_name="Count")

df_aspects_percentages = df_aspects.melt(id_vars="detected_aspects", value_vars=["Negative_%", "Neutral_%", "Positive_%"], var_name="Aspects", value_name="Percentage")

# Create a bar chart showing sentiment count per aspect (log-scaled Y axis for readability)
fig = px.bar(df_aspects_raw, x="detected_aspects", y="Count", color="Aspects",log_y=True,
    color_discrete_sequence=earth_palette, barmode="group", title="Aspect-Level Breakdown of Sentiment in Reviews"
)

# Update chart layout for clarity and consistent scaling
fig.update_layout(
    xaxis_title="Product Aspect Category",
    yaxis_title="Review Count (Log Scale)",
    yaxis_tickvals=[0, 20, 50, 100, 250, 500, 1000, 1700, 3000, 5000, 8000]
);

# Create a bar chart showing sentiment percentage per aspect
fig = px.bar(df_aspects_percentages, x="detected_aspects", y="Percentage", color="Aspects",
    color_discrete_sequence=earth_palette, barmode="group", title="Aspect-Level Breakdown of Sentiment in Reviews"
)

fig.update_layout(
    xaxis_title="Product Aspect Category",
    yaxis_title="Review Percentage"
);


In [None]:
# Explode the 'detected_shoe_type' list column so each aspect has its own row
df_exploded_shoe = df.explode("detected_shoe_type")

# Filter out review with no aspects 
df_exploded = df_exploded_shoe[df_exploded_shoe["detected_shoe_type"].notna()]

df_exploded["shoe_type_sentiment"] = df_exploded["bert_sentiment"].apply(sentiment_label)

In [None]:
# Group the exploded review data by aspect and sentiment, count occurrences, pivot to wide format
df_aspects = df_exploded.groupby(["detected_shoe_type", "shoe_type_sentiment"]).size().unstack(fill_value=0).reset_index()
# Convert absolute sentiment counts and sentiment percentages to long format for plotting
df_aspects = df_aspects.melt(id_vars="detected_shoe_type", value_vars=["Negative", "Neutral", "Positive"], var_name="Aspects", value_name="Count")
# Create a bar chart showing sentiment count per aspect (log-scaled Y axis for readability)
fig = px.bar(df_aspects, x="detected_shoe_type", y="Count", color="Aspects",log_y=True,
    color_discrete_sequence=earth_palette, barmode="group", title="Aspect-Level Breakdown of Sentiment in Reviews"
)

# Update chart layout for clarity and consistent scaling
fig.update_layout(
    xaxis_title="Product Aspect Category",
    yaxis_title="Review Count (Log Scale)",
    yaxis_tickvals=[0, 2, 5, 10, 25, 50, 100, 250, 500, 1000]
);

# BERTopic

In [None]:
# Remove NaNs and short reviews
df = df[df["text"].notna()]
df = df[df["text"].str.len() > 20]
df = df.reset_index(drop=True) 
# Apply BERTopic
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=9)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)

topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, language="english", min_topic_size=10, verbose=True)
topics, probs = topic_model.fit_transform(df["text"])

# Get topic info
df["topic"] = topics
df["topic_prob"] = probs
topic_info = topic_model.get_topic_info()


2025-07-10 10:26:02,522 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/340 [00:00<?, ?it/s]

2025-07-10 10:26:13,598 - BERTopic - Embedding - Completed ✓
2025-07-10 10:26:13,599 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-10 10:26:39,241 - BERTopic - Dimensionality - Completed ✓
2025-07-10 10:26:39,243 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-10 10:26:39,638 - BERTopic - Cluster - Completed ✓
2025-07-10 10:26:39,643 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-10 10:26:39,908 - BERTopic - Representation - Completed ✓


In [None]:
#Use BERT sentiment pipeline to assign sentiment scores
tqdm.pandas()
hf_dataset = Dataset.from_dict({"text": df["text"].astype(str).tolist()})

# Define sentiment function using your pipeline
def get_sentiment_label(example):
    result = sentiment_pipeline(example["text"][:512])
    label = result[0]['label']
    stars = int(label.split()[0])
    if stars >= 4:
        return {"sentiment_label": "Positive"}
    elif stars == 3:
        return {"sentiment_label": "Neutral"}
    else:
        return {"sentiment_label": "Negative"}

# Apply the function to the dataset (batched=False because we use single texts)
hf_dataset = hf_dataset.map(get_sentiment_label)

# Convert back to pandas and merge with your original DataFrame
df["sentiment_label"] = hf_dataset["sentiment_label"]

Map:   0%|          | 0/10855 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# Prepare a list to collect topic summary data
topic_summaries = []
for row in topic_info.itertuples():
    if row.Topic == -1:
        continue  # Skip outliers/no-topic

    # Extract topic metadata
    topic_num = row.Topic
    topic_name = row.Name
    topic_count = row.Count
    topic_keywords = row.Representation

    # Filter and sort reviews assigned to this topic by confidence
    topic_reviews_df = df[df["topic"] == topic_num][["text", "topic_prob", "sentiment_label"]].dropna()
    topic_reviews_df = topic_reviews_df.sort_values("topic_prob", ascending=False)

    # Aggregate sentiment counts and compute percentages
    sentiment_counts = topic_reviews_df["sentiment_label"].value_counts()
    total = sentiment_counts.sum()
    pos_sent = sentiment_counts.get("Positive", 0) / total * 100
    neu_sent = sentiment_counts.get("Neutral", 0) / total * 100
    neg_sent = sentiment_counts.get("Negative", 0) / total * 100

    # Output topic summary details
    print(f"Topic Name and Number: {topic_name}")
    print(f"Review count - {topic_count}")
    print(f"Keywords - {topic_keywords}")
    print(f"Sentiment distribution: Positive: {pos_sent:.1f}%, Neutral: {neu_sent:.1f}%, Negative: {neg_sent:.1f}%")

    # Extract and print the top 5 most confident reviews
    most_confident_reviews = topic_reviews_df.head(5)["text"]
    print("Most Confident Reviews:")
    for i, review in enumerate(most_confident_reviews, 1):
        print(f"{i}. {review.strip()[:300]}")
    print("\n")

    # Store all topic-level insights into a summary dictionary
    topic_summaries.append({
        "Topic Number": topic_num,
        "Topic Name": topic_name,
        "Review Count": topic_count,
        "Keywords": topic_keywords,
        "Positive %": round(pos_sent, 1),
        "Neutral %": round(neu_sent, 1),
        "Negative %": round(neg_sent, 1)
    })

# Create a DataFrame with all topic summaries for further analysis and visualization
topic_summary_df = pd.DataFrame(topic_summaries)
topic_summary_df.head()

Topic Name and Number: 0_shoes_these_are_love
Review count - 381
Keywords - ['shoes', 'these', 'are', 'love', 'comfortable', 'absolutely', 'cute', 'comfy', 'so', 'and']
Sentiment distribution: Positive: 95.3%, Neutral: 3.4%, Negative: 1.3%
Most Confident Reviews:
1. The animal print is cute detail when shoes are off. They are comfy & can be used for almost any shoe-flats & any low heeled styles. Using these for SM boots I bought (Ryder).
2. While these are cute they are also comfy. A bit clunky but are stylish as well. Have had many compliments on my jelly shoes!
3. These are cute and comfortable socks that are a fashion must have, especially when you don't want white socks peeking out from your shoes!
4. I absolutely love these shoes. They are so comfortable. I went to the zoo and wore these sandals, I was very comfortable the entire day! I was very surprised, as usually thicker wedge sandals would hurt my feet. These are the best pair!!!
5. I was drawn in by the design of the shoe an

Unnamed: 0,Topic Number,Topic Name,Review Count,Keywords,Positive %,Neutral %,Negative %
0,0,0_shoes_these_are_love,381,"[shoes, these, are, love, comfortable, absolut...",95.3,3.4,1.3
1,1,1_comfortable_them_comfy_are,357,"[comfortable, them, comfy, are, love, cute, th...",86.3,4.2,9.5
2,2,2_boots_these_love_them,329,"[boots, these, love, them, are, comfortable, s...",95.4,1.8,2.7
3,3,3_sandals_are_these_summer,242,"[sandals, are, these, summer, they, comfortabl...",90.1,5.4,4.5
4,4,4_wide_narrow_feet_foot,229,"[wide, narrow, feet, foot, width, have, if, yo...",40.2,29.7,30.1


In [None]:
# Select top 10 topics for plotting
df = topic_summary_df.head(10)

# Create grouped bar chart of sentiment percentages
fig = px.bar(df, x="Topic Name", y=["Positive %", "Neutral %", "Negative %"], barmode="group",
             color_discrete_sequence=earth_palette, title="BERTopic Sentiment Analysis")

fig = plot_preset(fig)

fig.update_layout(
    xaxis_title="Topic",
    yaxis_title="Percentage",
    legend_title="Sentiment",
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="center",
        x=0.5
    ),
    xaxis_tickangle=30,
    margin=dict(b=150, t=80)
);