Graph Slide 10

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

# --- Configuration ---
# Location of the downloaded data file. Both CSV and Parquet exports are
# supported by load_data(); keep exactly one of the assignments below active.
# When both exports exist, Parquet is generally the faster one for large files.

# DATA_FILE_PATH = "final_data.csv"
DATA_FILE_PATH = "final_data.parquet"  # Recommended for performance if available

# --- Load Data ---
def load_data(file_path):
    """
    Loads data from a CSV or Parquet file into a Pandas DataFrame.

    The reader is selected from the file extension. The extension is compared
    case-insensitively, so "DATA.CSV" is handled the same way as "data.csv".

    Args:
        file_path: Path to the data file, as a string or an os.PathLike
            object (e.g. pathlib.Path).

    Returns:
        The DataFrame produced by pd.read_csv or pd.read_parquet.

    Raises:
        FileNotFoundError: If nothing exists at file_path.
        ValueError: If the extension is neither .csv nor .parquet.
    """
    # Normalise PathLike inputs to a plain string so the suffix checks below
    # work for pathlib.Path objects as well as for strings.
    path_text = os.fspath(file_path)
    if not os.path.exists(path_text):
        raise FileNotFoundError(f"Data file not found: {file_path}")

    print(f"Loading data from {file_path}...")
    # Lower-case only the copy used for extension matching; the original
    # spelling is still what gets passed to the reader.
    lowered = path_text.lower()
    if lowered.endswith('.csv'):
        df = pd.read_csv(path_text)
    elif lowered.endswith('.parquet'):
        df = pd.read_parquet(path_text)
    else:
        raise ValueError("Unsupported file format. Please use .csv or .parquet.")
    print(f"Data loaded successfully. Total rows: {len(df)}")
    return df

# --- Main Analysis and Visualization ---
if __name__ == "__main__":
    # Script entry point: load the review data and render three charts
    # (overall sentiment share, mean stars per key category, and sentiment
    # share per star rating). Each chart is shown with plt.show().
    df = load_data(DATA_FILE_PATH)

    # Fail fast if any column used (or expected) by the charts is missing.
    required_cols = ['sentiment_label', 'stars', 'key_category', 'sentiment_score']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"Missing required columns in DataFrame for these visualizations. Expected: {required_cols}. Found: {df.columns.tolist()}")

    sns.set_theme(style="whitegrid") # Apply a consistent theme for all plots

    # --- Visualization 1: Overall Sentiment Distribution ---
    print("\nGenerating Overall Sentiment Distribution visualization...")
    plt.figure(figsize=(8, 6))
    # Share of each sentiment label in percent, in a fixed display order.
    # fill_value=0 makes a label that never occurs in the data show up as a
    # 0% bar instead of NaN (which would draw no bar and a "nan%" label).
    sentiment_counts = (
        df['sentiment_label']
        .value_counts(normalize=True)
        .reindex(['positive', 'neutral', 'negative'], fill_value=0)
        * 100
    )

    # Custom color mapping for consistency
    sentiment_colors = {'positive': 'green', 'neutral': 'gold', 'negative': 'red'}
    colors_ordered = [sentiment_colors[label] for label in sentiment_counts.index]

    bars = plt.bar(sentiment_counts.index, sentiment_counts.values, color=colors_ordered)
    plt.title('Overall Sentiment Distribution Across All Reviews')
    plt.xlabel('Sentiment Label')
    plt.ylabel('Proportion of Reviews (%)')
    plt.ylim(0, 100) # Ensure y-axis is from 0 to 100%

    # Add percentage labels on top of bars
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 1, f'{yval:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()
    print("Overall Sentiment Distribution visualization generated.")

    # --- Visualization 2: Average Business Stars per Key Category ---
    print("\nGenerating Average Business Stars per Key Category visualization...")
    # Mean of the 'stars' column per key_category, highest average first.
    avg_stars_per_category = df.groupby('key_category')['stars'].mean().sort_values(ascending=False).reset_index()

    plt.figure(figsize=(12, 7))
    sns.barplot(x='stars', y='key_category', data=avg_stars_per_category, palette='viridis')
    plt.title('Average Business Stars per Key Category')
    plt.xlabel('Average Star Rating')
    plt.ylabel('Key Category')
    plt.xlim(1, 5) # Star ratings are typically 1-5
    plt.tight_layout()
    plt.show()
    print("Average Business Stars per Key Category visualization generated.")


    # --- Visualization 3: Sentiment Distribution by Business Stars ---
    print("\nGenerating Sentiment Distribution by Business Stars visualization...")
    # Count reviews per (stars, sentiment_label) and spread the labels into
    # columns. The reindex both fixes the stacking order (negative, neutral,
    # positive) and adds an all-zero column for any label that is absent.
    sentiment_by_stars = (
        df.groupby(['stars', 'sentiment_label'])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=['negative', 'neutral', 'positive'], fill_value=0)
    )

    # Calculate proportions within each star rating
    sentiment_by_stars_props = sentiment_by_stars.div(sentiment_by_stars.sum(axis=1), axis=0) * 100

    plt.figure(figsize=(10, 6))
    sentiment_by_stars_props.plot(
        kind='bar',
        stacked=True,
        ax=plt.gca(),
        color=['#e41a1c', '#ffff33', '#4daf4a'] # Red, Yellow, Green
    )
    plt.title('Sentiment Distribution by Business Star Rating')
    plt.xlabel('Business Star Rating')
    plt.ylabel('Proportion of Reviews (%)')
    plt.xticks(rotation=0) # Keep x-axis labels horizontal
    plt.legend(title='Sentiment Label', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
    print("Sentiment Distribution by Business Stars visualization generated.")

    print("\nAll requested compelling visualizations generated.")

Graph Slide 13 and 14

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from dotenv import dotenv_values
from matplotlib.cm import get_cmap

# --- Load DB config ---
# Connection settings are read from the .env file via dotenv_values().
from urllib.parse import quote

config = dotenv_values()
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

# --- DB connection ---
# Percent-encode the credentials before placing them in the URL: a raw
# password containing characters such as '@', ':' or '/' would otherwise be
# misread as URL structure when the connection string is parsed.
url = (
    f'postgresql://{quote(pg_user, safe="")}:{quote(pg_pass, safe="")}'
    f'@{pg_host}:{pg_port}/{pg_db}'
)
engine = create_engine(url)

# --- Read table ---
# Loads the whole "table_test_4" table from the configured schema.
query = f'SELECT * FROM {pg_schema}."table_test_4"'
df = pd.read_sql(query, con=engine)

# --- Topic name mapping ---
# Replaces the raw topic identifiers stored in 'topic_name' with readable
# display names; values not listed here are left unchanged by replace().
topic_name_map = {
    '7_thai_pho_chinese_soup': 'Asian Cuisine',
    '5_pizza_crust_pizzas_best pizza': 'Pizza',
    '4_hair_massage_dress_salon': 'Hair Salon',
    '2_dr_dentist_office_dental': 'Dentist / Doctors',
    '11_nails_nail_salon_gel': 'Nail Salon Positive',
    '9_room_hotel_stay_desk': 'Hotels',
    '58_used_changed_gone_quality': 'Change for the Worse',
    '31_dog_vet_dogs_cat': 'Pets',
    '14_nails_nail_gel_polish': 'Nail Salon Negative',
    '111_pharmacy_prescription_walgreens_cvs': 'Pharmacy'
}
df['topic_name'] = df['topic_name'].replace(topic_name_map)

# --- Assign sentiment ---
positive_topics = [4, 7, 2, 5, 11]
negative_topics = [14, 9, 31, 58, 111]
# Label each topic id from the list it actually appears in. A topic that is
# in neither list gets no label (NaN) rather than defaulting to 'negative',
# so it is excluded from both the positive and the negative selection.
sentiment_by_topic = {
    **dict.fromkeys(positive_topics, 'positive'),
    **dict.fromkeys(negative_topics, 'negative'),
}
df['sentiment'] = df['topic'].map(sentiment_by_topic)

# --- Pivot prep function ---
def prepare_pivot(data):
    """
    Build a topic-by-category table of percentages.

    Rows are 'topic_name', columns are 'key_category', cells hold the
    'percentage' value (0 where a combination is absent). Rows are ordered
    by each topic's summed percentage, largest first.
    """
    wide = data.pivot(index='topic_name', columns='key_category', values='percentage')
    wide = wide.fillna(0)

    # Rank topics by their total percentage across all categories.
    topic_totals = data.groupby('topic_name')['percentage'].sum()
    ranked_topics = topic_totals.sort_values(ascending=False).index
    return wide.loc[ranked_topics]

# Separate pivots for the topics labelled positive and negative.
pivot_pos = prepare_pivot(df[df['sentiment'] == 'positive'])
pivot_neg = prepare_pivot(df[df['sentiment'] == 'negative'])

# --- Create a color map ---
# One colour per category, built over the union of both pivots' columns so a
# category gets the same colour in the positive and the negative chart.
all_categories = sorted(set(pivot_pos.columns).union(pivot_neg.columns))
# plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated since
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('tab20', len(all_categories))
color_map = {cat: cmap(i) for i, cat in enumerate(all_categories)}

# --- Plot function with shared colors and legend filtering ---
def plot_filtered_legend(pivot, title, exclude_categories=None):
    """
    Draw a horizontal stacked bar chart of a topic-by-category pivot.

    Columns that are zero in every row are dropped, as are any columns named
    in exclude_categories. Bar colours come from the module-level color_map,
    and the figure is displayed with plt.show().

    Args:
        pivot: DataFrame with one row per topic and one column per category.
        title: Chart title.
        exclude_categories: Optional collection of column names to leave out.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # Keep only categories that have at least one non-zero entry.
    has_values = (pivot != 0).any(axis=0)
    shown = pivot.columns[has_values.to_numpy()].tolist()
    if exclude_categories:
        shown = [name for name in shown if name not in exclude_categories]

    # Plot the remaining columns, each in its shared colour.
    subset = pivot[shown]
    subset.plot(
        kind='barh',
        stacked=True,
        ax=ax,
        color=[color_map[name] for name in shown],
    )

    ax.legend(title='Key Category', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_title(title, fontsize=16, pad=15)
    ax.set_xlabel("Percentage of Reviews", fontsize=13)
    ax.set_ylabel("Topic", fontsize=13)
    plt.tight_layout()
    plt.show()

# --- Categories to exclude ---
# Key categories left out of the positive-topic chart.
exclude_pos = [
    'Active Life',
    'Arts & Entertainment',
    'Automotive',
    'Home Services',
    'Hotels & Travel',
    'Local Services',
    'Nightlife',
]
# Key categories left out of the negative-topic chart.
exclude_neg = [
    'Arts & Entertainment',
    'Automotive',
    'Local Services',
    'Nightlife',
    'Home Services',
]

# --- Plots ---
plot_filtered_legend(
    pivot=pivot_pos,
    title="Key Category Distribution – Positive Topics",
    exclude_categories=exclude_pos,
)
plot_filtered_legend(
    pivot=pivot_neg,
    title="Key Category Distribution – Negative Topics",
    exclude_categories=exclude_neg,
)

Graph Slide 15 and 16

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from dotenv import dotenv_values

# --- Load DB config ---
# Connection settings are read from the .env file via dotenv_values().
from urllib.parse import quote

config = dotenv_values()
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

# --- DB connection ---
# Percent-encode the credentials before placing them in the URL: a raw
# password containing characters such as '@', ':' or '/' would otherwise be
# misread as URL structure when the connection string is parsed.
url = (
    f'postgresql://{quote(pg_user, safe="")}:{quote(pg_pass, safe="")}'
    f'@{pg_host}:{pg_port}/{pg_db}'
)
engine = create_engine(url)

# --- Read data ---
# Topic/metro percentages and the per-metro population table.
df = pd.read_sql(f'SELECT * FROM {pg_schema}."table_test_3"', con=engine)
pop_df = pd.read_sql(f'SELECT * FROM {pg_schema}."population_2019"', con=engine)

# --- Rename topic names ---
# Replaces the raw topic identifiers stored in 'topic_name' with readable
# display names; values not listed here are left unchanged by replace().
topic_name_map = {
    '7_thai_pho_chinese_soup': 'Asian Cuisine',
    '5_pizza_crust_pizzas_best pizza': 'Pizza',
    '4_hair_massage_dress_salon': 'Hair Salon',
    '2_dr_dentist_office_dental': 'Dentist / Doctors',
    '11_nails_nail_salon_gel': 'Nail Salon Positive',
    '9_room_hotel_stay_desk': 'Hotels',
    '58_used_changed_gone_quality': 'Change for the Worse',
    '31_dog_vet_dogs_cat': 'Pets',
    '14_nails_nail_gel_polish': 'Nail Salon Negative',
    '111_pharmacy_prescription_walgreens_cvs': 'Pharmacy'
}
df['topic_name'] = df['topic_name'].replace(topic_name_map)

# --- Assign sentiment labels ---
positive_topics = [4, 7, 2, 5, 11]
negative_topics = [14, 9, 31, 58, 111]
# Label each topic id from the list it actually appears in. A topic that is
# in neither list gets no label (NaN) rather than defaulting to 'negative',
# so it is excluded from both the positive and the negative selection.
sentiment_by_topic = {
    **dict.fromkeys(positive_topics, 'positive'),
    **dict.fromkeys(negative_topics, 'negative'),
}
df['sentiment'] = df['topic'].map(sentiment_by_topic)

# --- Pivot prep function (sorted by total review %) ---
def prepare_pivot(data):
    """
    Build a topic-by-metro table of percentages.

    Rows are 'topic_name', columns are 'metro', cells hold the 'percentage'
    value (0 where a combination is absent). Rows are ordered by their row
    total, largest first.
    """
    wide = data.pivot(index='topic_name', columns='metro', values='percentage')
    wide = wide.fillna(0)

    # Rank topics by the sum of their percentages over all metros.
    row_totals = wide.sum(axis=1)
    ranked = row_totals.sort_values(ascending=False)
    return wide.loc[ranked.index]

# Separate pivots for the topics labelled positive and negative.
pivot_pos = prepare_pivot(df[df['sentiment'] == 'positive'])
pivot_neg = prepare_pivot(df[df['sentiment'] == 'negative'])

# --- Population normalization ---
# Scale every metro's population by the largest one, so the biggest metro
# maps to 1.0 and the others to a fraction of it.
pop_map = pop_df.set_index('metro')['Population'].to_dict()
# Compute the maximum once instead of once per metro; default=1 keeps an
# empty population table from raising and simply yields an empty mapping.
largest_population = max(pop_map.values(), default=1)
normalized_pop = {
    metro: population / largest_population
    for metro, population in pop_map.items()
}

# --- Custom metro color palette ---
# Alternative palette, currently disabled:
# metro_color_map = {
#     'Boise': '#88CCEE',
#     'Edmonton': '#CC6677',
#     'Indianapolis': '#DDCC77',
#     'Nashville': '#117733',
#     'New Orleans': '#332288',
#     'Philadelphia': '#AA4499',
#     'Reno': '#44AA99',
#     'Santa Barbara': '#999933',
#     'St. Louis': '#882255',
#     'Tampa': '#661100',
#     'Tucson': '#6699CC'
# }

# Active palette: one hex colour per metro name, plus an 'Other' entry.
metro_color_map = {
    'Philadelphia': '#CAB2D6',   # lavender / light purple
    'Tampa': '#996600',          # brownish
    'St. Louis': '#661100',      # dark red
    'Tucson': '#D9A88C',         # dusty pink/tan
    'Nashville': '#FDC785',      # light orange
    'Reno': '#FDAE6B',           # orange-peach
    'Indianapolis': '#FF7F00',   # bright orange
    'New Orleans': '#33A02C',    # green
    'Santa Barbara': '#B15928',  # brown-orange
    'Edmonton': '#A6CEE3',       # pale blue
    'Boise': '#1F78B4',          # steel blue
    'Other': '#B2DF8A',          # pastel green
}


# --- General plot function ---
def plot_topic_distribution(pivot, title):
    """
    Draw one stacked horizontal bar per topic, split by metro.

    Within a topic the metro segments are laid out left to right from the
    largest to the smallest value. Segment thickness is looked up in the
    module-level normalized_pop (0.5 when the metro is missing) and segment
    colour in metro_color_map ('#888888' when missing). The figure is
    displayed with plt.show().

    Args:
        pivot: DataFrame with one row per topic and one column per metro.
        title: Chart title.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    tick_positions = []
    tick_labels = []
    baseline = 0

    for row_number, (topic, shares) in enumerate(pivot.iterrows()):
        first_row = row_number == 0
        cursor = 0
        ranked = sorted(shares.items(), key=lambda pair: pair[1], reverse=True)

        for metro, share in ranked:
            # Legend entries are taken from the first topic row only, and
            # 'Other' is never given a legend entry.
            if first_row and metro != 'Other':
                legend_label = metro
            else:
                legend_label = None

            ax.barh(
                y=baseline,
                width=share,
                left=cursor,
                height=normalized_pop.get(metro, 0.5),
                color=metro_color_map.get(metro, '#888888'),
                label=legend_label,
            )
            cursor += share

        tick_positions.append(baseline)
        tick_labels.append(topic)
        baseline += 1.2

    # Collapse repeated labels so each metro appears once in the legend.
    handles, labels = ax.get_legend_handles_labels()
    unique_entries = dict(zip(labels, handles))
    ax.legend(
        unique_entries.values(),
        unique_entries.keys(),
        title="Metro",
        bbox_to_anchor=(1.05, 1),
        loc='upper left',
    )

    ax.set_yticks(tick_positions)
    ax.set_yticklabels(tick_labels)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel("Percentage of Reviews")
    ax.set_ylabel("Topic")
    plt.tight_layout()
    plt.show()


# --- Render the metro distribution chart for positive, then negative topics ---
plot_topic_distribution(
    pivot=pivot_pos,
    title="Positive Topics – Metro Distribution (Sorted by Review %)",
)
plot_topic_distribution(
    pivot=pivot_neg,
    title="Negative Topics – Metro Distribution (Sorted by Review %)",
)