In [8]:
import html

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML, display

# Locations of the exported donation and message tables.
# NOTE(review): both are absolute paths into one developer's checkout, so this
# cell only runs on that machine — consider deriving them from a configurable
# data directory.
DONATION_CSV = r"C:/Users/Dev/Documents/GitHub/report_jupyter_books/real_data/12570525/donation_table.csv"
MESSAGES_CSV = r"C:/Users/Dev/Documents/GitHub/report_jupyter_books/real_data/12570525/messages_filtered_table.csv"

# Donations restricted to rows whose "source" column equals "WhatsApp".
donations = pd.read_csv(DONATION_CSV).loc[lambda table: table["source"] == "WhatsApp"]

# Messages restricted to those whose donation_id appears in the filtered donations.
messages = pd.read_csv(MESSAGES_CSV).loc[
    lambda table: table["donation_id"].isin(donations["donation_id"])
]

#Gini and Lorenz 
def calculate_gini(counts):
    """Compute the Gini index of the values stored in ``counts``.

    Args:
        counts: Mapping whose values are the numeric quantities to compare.
            Only ``counts.values()`` is read; the keys are ignored.

    Returns:
        ``0`` when the mapping is empty or its values sum to zero. Otherwise
        ``2 * sum(rank_i * x_i) / (n * sum(x)) - (n + 1) / n``, where the
        values are ranked in ascending order starting at 1.
    """
    ascending = sorted(counts.values())
    size = len(ascending)
    grand_total = sum(ascending)

    # Nothing to compare, or nothing to distribute: report perfect equality.
    if not size or not grand_total:
        return 0

    # Weight every value by its 1-based position in the ascending order.
    rank_weighted = 0
    for rank, value in enumerate(ascending, start=1):
        rank_weighted += rank * value

    return (2 * rank_weighted) / (size * grand_total) - (size + 1) / size

def plot_lorenz_curve(counts, title):
    """Plot the Lorenz curve of ``counts`` with its Gini index in the title.

    Args:
        counts: Mapping whose values are the per-contact totals to plot.
            Only ``counts.values()`` is read.
        title: Text placed in front of the Gini index in the figure title.

    Returns:
        None. The figure is shown with ``plt.show()``. When ``counts`` is
        empty or its values sum to zero, a message is printed and no figure
        is drawn.
    """
    values = np.array(sorted(counts.values()))
    total = values.sum()

    # With no values or a zero total the cumulative shares would be 0/0,
    # which yields a NaN curve; report it instead of drawing a broken plot.
    if values.size == 0 or total == 0:
        print("No data available to plot a Lorenz curve.")
        return

    # Prepend 0 so the curve starts at the origin.
    cumulative = np.insert(np.cumsum(values) / total, 0, 0)
    contacts = np.linspace(0, 1, len(values) + 1)

    gini = calculate_gini(counts)

    fig, ax = plt.subplots(figsize=(6, 5))
    ax.plot(contacts * 100, cumulative * 100, label='Lorenz Curve', color='blue')
    ax.plot([0, 100], [0, 100], linestyle='--', color='gray', label='Perfect Equality')
    # Shade the gap between the equality line and the Lorenz curve.
    ax.fill_between(contacts * 100, contacts * 100, cumulative * 100, color='lightblue', alpha=0.3)
    ax.set_title(f"{title} (Gini Index = {gini:.3f})")
    ax.set_xlabel("Cumulative % of Contacts")
    ax.set_ylabel("Cumulative % of Messages/Words")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

#Burstiness
def compute_burstiness(days):
    """Compute two burstiness scores from a collection of event days.

    The days are sorted, converted with ``pd.to_datetime`` and reduced to the
    whole-day gaps between consecutive entries. With ``r = std / mean`` of
    those gaps (``np.std`` defaults, i.e. population standard deviation) and
    ``n = len(days)``:

    * ``B1 = (r - 1) / (r + 1)``
    * ``B2 = (sqrt(n + 1) * r - sqrt(n - 1)) / ((sqrt(n + 1) - 2) * r + sqrt(n - 1))``

    Args:
        days: Iterable of sortable values accepted by ``pd.to_datetime``.

    Returns:
        A ``(B1, B2)`` tuple. ``(None, None)`` when there are no gaps (fewer
        than two days) or the mean gap is zero. An individual score is
        ``None`` when its denominator is zero.
    """
    ordered = sorted(days)
    timestamps = pd.to_datetime(ordered)
    gaps = np.diff(timestamps).astype('timedelta64[D]').astype(int)

    if not len(gaps):
        return None, None

    mean_gap = np.mean(gaps)
    spread = np.std(gaps)
    if mean_gap == 0:
        return None, None

    ratio = spread / mean_gap
    event_count = len(ordered)

    if (ratio + 1) != 0:
        b1 = (ratio - 1) / (ratio + 1)
    else:
        b1 = None

    if not event_count > 1:
        return b1, None

    # Size-corrected score: same ratio, adjusted for the number of events.
    top = (np.sqrt(event_count + 1) * ratio) - np.sqrt(event_count - 1)
    bottom = ((np.sqrt(event_count + 1) - 2) * ratio) + np.sqrt(event_count - 1)
    if bottom != 0:
        b2 = top / bottom
    else:
        b2 = None
    return b1, b2

def plot_burstiness_dashboard(donor_msgs, donor):
    """Plot the active days of the donor's chat with the most extreme B1 score.

    Only rows whose ``sender_id`` equals ``donor`` are used. For every
    ``conversation_id`` the distinct calendar days with a message are
    collected, conversations with fewer than 10 such days are dropped, and
    ``compute_burstiness`` is evaluated on the rest. The conversation with the
    largest ``|B1|`` is drawn as an event plot and labelled "Bursty"
    (B1 > 0.5), "Regular" (B1 < -0.5) or "Random" (otherwise).

    Args:
        donor_msgs: DataFrame with ``sender_id``, ``datetime`` and
            ``conversation_id`` columns. It is not modified; a copy is used.
        donor: Value compared against ``sender_id``.

    Returns:
        None. Prints a message and returns early when no conversation has at
        least 10 interaction days.
    """
    sent = donor_msgs.loc[donor_msgs['sender_id'] == donor].copy()

    # Timestamps that cannot be parsed become NaT and are discarded.
    sent['datetime'] = pd.to_datetime(sent['datetime'], format='mixed', errors='coerce')
    sent = sent.dropna(subset=['datetime'])
    sent['date_only'] = sent['datetime'].dt.date

    # One sorted list of distinct days per conversation.
    days_by_chat = sent.groupby('conversation_id')['date_only'].apply(
        lambda column: sorted(set(column))
    )
    has_enough_days = days_by_chat.apply(len) >= 10
    days_by_chat = days_by_chat[has_enough_days]

    if days_by_chat.empty:
        print("No chats with ≥10 interaction days for burstiness analysis.")
        return

    # Conversations for which either score is undefined are left out.
    scores = pd.DataFrame(
        [compute_burstiness(day_list) for day_list in days_by_chat],
        index=days_by_chat.index,
        columns=['B1', 'B2'],
    ).dropna()

    chat_id = scores['B1'].abs().idxmax()
    b1_value = scores.loc[chat_id, 'B1']
    b2_value = scores.loc[chat_id, 'B2']

    if b1_value > 0.5:
        chat_kind, line_color = "Bursty", "red"
    elif b1_value < -0.5:
        chat_kind, line_color = "Regular", "green"
    else:
        chat_kind, line_color = "Random", "blue"

    chat_days = days_by_chat[chat_id]

    fig, ax = plt.subplots(figsize=(8, 2))
    ax.eventplot(pd.to_datetime(sorted(chat_days)), orientation='horizontal', colors=line_color)
    ax.set_title(f"{chat_kind} chat (Conversation {chat_id})\nB1={b1_value:.2f}, B2={b2_value:.2f}")
    ax.set_xlabel("Time")
    ax.set_yticks([])
    ax.grid(axis='x')
    plt.show()


# WhatsApp Donation Dashboard

This notebook visualizes real WhatsApp donation data. It lets you explore how a donor's messages and words are distributed across their conversations, using the **Gini index** and **Lorenz curves**.

We will use interactive widgets to select donors and metrics.

In [9]:
# # Import necessary libraries
# import pandas as pd
# import ipywidgets as widgets
# from IPython.display import display, HTML
# import matplotlib.pyplot as plt

# # Import utility functions for data and plots
# from real_data_utils import donations, messages, calculate_gini, plot_lorenz_curve, plot_burstiness_dashboard

## Donor Selection Functions

These functions allow us to:
- Get a list of unique donor IDs.
- Create a dropdown/combobox to select donors.
- Filter messages for a selected donor.

In [10]:
def get_donor_ids():
    """Return the distinct ``donor_id`` values of ``donations`` in sorted order.

    Reads the module-level ``donations`` DataFrame.

    Returns:
        A list of the unique donor IDs, sorted ascending.
    """
    unique_ids = donations['donor_id'].unique()
    return sorted(unique_ids)

def create_donor_selector():
    """Build the combobox used to pick a donor.

    The options come from ``get_donor_ids()``. ``ensure_option=False`` lets
    the user type a value that is not in the list.

    Returns:
        A ``widgets.Combobox`` labelled "Donor:" that is 300px wide.
    """
    donor_options = get_donor_ids()
    selector_layout = widgets.Layout(width='300px')
    return widgets.Combobox(
        description='Donor:',
        placeholder='Type or select donor_id',
        options=donor_options,
        ensure_option=False,
        layout=selector_layout,
    )

def get_donor_messages(donor):
    """Return the rows of ``messages`` that belong to ``donor``'s donations.

    Reads the module-level ``donations`` and ``messages`` DataFrames.

    Args:
        donor: Value compared against the ``donor_id`` column of ``donations``.

    Returns:
        The subset of ``messages`` whose ``donation_id`` appears among the
        donations made by ``donor``.
    """
    is_donor = donations['donor_id'] == donor
    donor_donation_ids = donations.loc[is_donor, 'donation_id']
    belongs_to_donor = messages['donation_id'].isin(donor_donation_ids)
    return messages[belongs_to_donor]

## Gini & Lorenz Dashboard

This section creates an interactive dashboard to analyze message or word distributions for a donor.  

- **Metric options**: Messages or Words  
- **Outputs**: 
  - Bar chart showing counts per conversation
  - Lorenz curve
  - Summary with Gini Index interpretation

In [11]:
def show_gini_dashboard():
    """Build and display the interactive Gini & Lorenz dashboard.

    The dashboard has a donor combobox, a "Load Donor" button and a
    Messages/Words radio selector. Clicking the button or changing the metric
    refreshes three output areas: a bar chart of the donor's counts per
    conversation, the Lorenz curve of those counts, and an HTML summary with
    the Gini index.

    Only messages whose ``sender_id`` equals the selected donor are counted.
    If the donor is unknown, or has nothing to count for the chosen metric, a
    red notice is shown in the summary area and no plots are drawn.

    Returns:
        None. The widgets are rendered with ``display``.
    """
    donor_select = create_donor_selector()
    metric_select = widgets.RadioButtons(
        options=["Messages", "Words"],
        description="Metric:",
        layout=widgets.Layout(width='200px')
    )
    submit_btn = widgets.Button(description="Load Donor", button_style='success')
    bar_output, lorenz_output, summary_output = widgets.Output(), widgets.Output(), widgets.Output()

    def update_dashboard(change=None):
        # ``change`` is supplied by the widget callbacks and is not used.
        for panel in (bar_output, lorenz_output, summary_output):
            panel.clear_output()

        donor = donor_select.value
        metric = metric_select.value
        # The donor text is typed by the user, so escape it before embedding
        # it in HTML.
        safe_donor = html.escape(str(donor))

        # NOTE(review): the combobox value is text; if donor_id is stored as a
        # number this membership test can never match — confirm the dtype.
        if donor not in get_donor_ids():
            with summary_output:
                display(HTML(f"<b style='color:red;'> Donor '{safe_donor}' not found in WhatsApp data.</b>"))
            return

        donor_msgs = get_donor_messages(donor)
        sent_by_donor = donor_msgs[donor_msgs['sender_id'] == donor]
        per_conversation = sent_by_donor.groupby('conversation_id')

        if metric == "Messages":
            counts = per_conversation.size().to_dict()
        else:
            counts = per_conversation['word_count'].sum().to_dict()

        # An empty or all-zero distribution cannot be plotted: the bar chart
        # raises on empty data and the Lorenz curve would divide by zero.
        if not counts or sum(counts.values()) == 0:
            with summary_output:
                display(HTML(
                    f"<b style='color:red;'> Donor '{safe_donor}' has no sent "
                    f"{metric.lower()} to analyse.</b>"
                ))
            return

        gini = calculate_gini(counts)

        with bar_output:
            counts_series = pd.Series(counts).sort_values(ascending=False)
            # Keep the tick labels short: first six characters plus an ellipsis.
            short_labels = [str(x)[:6] + "..." if len(str(x)) > 6 else str(x) for x in counts_series.index]
            counts_series.plot(kind='bar', figsize=(max(6, len(counts_series) * 0.6), 5), color='blue')
            plt.title(f"{metric} Count per Contact")
            plt.xticks(range(len(short_labels)), short_labels, rotation=45, ha='right')
            plt.tight_layout()
            plt.grid(True)
            plt.show()

        with lorenz_output:
            plot_lorenz_curve(counts, title=f"{metric} Distribution")

        with summary_output:
            display(HTML(
                f"<div style='background:#f5f5f5; padding:15px; border-radius:10px; width:100%; height:100%;'>"
                f"<h4 style='margin-top:0;'>Lorenz Curve & Gini Index Summary</h4>"
                f"<p>This curve shows how unequally messages/words are distributed across contacts. If the curve is close to the diagonal, the distribution is equal. If it bows below the diagonal, it indicates inequality.</p>"
                f"<p>A Gini index of {gini:.3f} means: "
                f"{'High inequality (few contacts dominate)' if gini > 0.5 else 'Relatively balanced distribution'}.</p>"
                f"</div>"
            ))

    submit_btn.on_click(update_dashboard)
    metric_select.observe(update_dashboard, names='value')

    display(widgets.VBox([
        widgets.HTML("<h2>WhatsApp Donation Dashboard</h2><p>Visualize donor message distributions using real WhatsApp data.</p>"),
        widgets.HBox([donor_select, submit_btn, metric_select], layout=widgets.Layout(gap='20px')),
        widgets.HBox([bar_output, lorenz_output, summary_output], layout=widgets.Layout(gap='30px'))
    ]))

### Launch the Dashboard

Run the function below to display the interactive dashboard and start exploring donor data.

In [12]:
# Build the widgets and render the dashboard below this cell.
show_gini_dashboard()

VBox(children=(HTML(value='<h2>WhatsApp Donation Dashboard</h2><p>Visualize donor message distributions using …