In [1]:
%pip install plotly


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import plotly.graph_objects as go

# Load the spreadsheet.
# The path is relative, so the workbook is looked up relative to the kernel's
# current working directory.
file_path = 'f84929f2-cfc0-4e3f-a7b5-b2d319802376 (1).xlsx'
df = pd.read_excel(file_path)

# Clean the dataframe by removing rows with missing values in critical columns.
# Only these four columns are checked; a missing '#Shipments' value does not
# cause a row to be dropped here.
df_cleaned = df.dropna(subset=['Sending Agent Company', 'Booking Type', 'Status', 'Error Category'])

# Prepare the data for Sankey diagram
def prepare_sankey_data(df, company):
    """Build Sankey node labels and link lists for one sending agent company.

    The diagram has three stages, 'Booking Type' -> 'Status' -> 'Error Category'.
    Each link carries the summed '#Shipments' of one (source, target) pair.

    Args:
        df: DataFrame with the columns 'Sending Agent Company', 'Booking Type',
            'Status', 'Error Category' and '#Shipments'.
        company: Value of 'Sending Agent Company' whose rows are used.

    Returns:
        Tuple ``(node_labels, source_indices, target_indices, values)``.
        ``node_labels`` has one entry per node, formatted as
        ``"<node>\\n<shipments> (<share>%)"``; the three other lists are
        parallel and describe the links by node position.
    """
    stage_columns = ['Booking Type', 'Status', 'Error Category']
    df_company = df[df['Sending Agent Company'] == company]

    # Nodes are the distinct labels across the three stage columns, in order of
    # first appearance. A label present in more than one column maps to a
    # single node.
    all_nodes = list(pd.concat([df_company[column] for column in stage_columns]).unique())
    node_indices = {node: i for i, node in enumerate(all_nodes)}

    # Aggregate the flows between each pair of consecutive stages.
    source_indices, target_indices, values = [], [], []
    for source_column, target_column in zip(stage_columns, stage_columns[1:]):
        flows = (
            df_company.groupby([source_column, target_column])['#Shipments']
            .sum()
            .reset_index()
        )
        source_indices.extend(flows[source_column].map(node_indices).tolist())
        target_indices.extend(flows[target_column].map(node_indices).tolist())
        values.extend(flows['#Shipments'].tolist())

    # Label every node with its shipment total and share of the company total.
    # Membership is tested on the stage columns only: scanning the whole frame
    # would also count rows where the label merely shows up in an unrelated
    # column (e.g. a free-text field).
    stage_values = df_company[stage_columns]
    grand_total = df_company['#Shipments'].sum()
    node_labels = []
    for node in all_nodes:
        in_stage = stage_values.isin([node]).any(axis=1)
        total_shipments = df_company.loc[in_stage, '#Shipments'].sum()
        # Avoid a nan share when the company has no shipments at all.
        percentage = 100 * total_shipments / grand_total if grand_total else 0.0
        node_labels.append(f"{node}\n{total_shipments} ({percentage:.2f}%)")

    return node_labels, source_indices, target_indices, values

# Create and display Sankey diagram using Plotly
def create_sankey(node_labels, source_indices, target_indices, values, company):
    """Render and show a Plotly Sankey figure for a single company.

    Args:
        node_labels: Text shown on each node.
        source_indices: Source node position of every link.
        target_indices: Target node position of every link.
        values: Value of every link, parallel to the two index lists.
        company: Name inserted into the figure title.
    """
    node_spec = {
        'pad': 15,
        'thickness': 20,
        'line': {'color': "black", 'width': 0.5},
        'label': node_labels,
    }
    link_spec = {
        'source': source_indices,
        'target': target_indices,
        'value': values,
    }

    sankey_trace = go.Sankey(node=node_spec, link=link_spec)
    fig = go.Figure(data=[sankey_trace])
    fig.update_layout(title_text=f'Sankey Diagram for {company}', font_size=10)
    fig.show()

# Get unique Sending Agent Companies (distinct values, taken from the cleaned frame)
sending_agent_companies = df_cleaned['Sending Agent Company'].unique()

# Create and display Sankey diagrams for each Sending Agent Company.
# One figure is shown per company; prepare_sankey_data filters df_cleaned down
# to that company's rows before aggregating.
for company in sending_agent_companies:
    node_labels, source_indices, target_indices, values = prepare_sankey_data(df_cleaned, company)
    create_sankey(node_labels, source_indices, target_indices, values, company)


In [20]:
import pandas as pd
import plotly.graph_objects as go

# Load the spreadsheet.
# The path is relative, so the workbook is looked up relative to the kernel's
# current working directory.
file_path = 'f84929f2-cfc0-4e3f-a7b5-b2d319802376 (1).xlsx'
df = pd.read_excel(file_path)

# Clean the dataframe by removing rows with missing values in critical columns.
# Only these four columns are checked; a missing '#Shipments' value does not
# cause a row to be dropped here.
df_cleaned = df.dropna(subset=['Sending Agent Company', 'Booking Type', 'Status', 'Error Category'])

# Prepare the data for Sankey diagram
def prepare_sankey_data(df):
    """Build Sankey node labels, link lists and node colours for all rows of df.

    The diagram has three stages, 'Booking Type' -> 'Status' -> 'Error Category'.
    Each link carries the summed '#Shipments' of one (source, target) pair.

    Args:
        df: DataFrame with the columns 'Booking Type', 'Status',
            'Error Category' and '#Shipments'.

    Returns:
        Tuple ``(node_labels, source_indices, target_indices, values,
        node_colors)``. ``node_labels`` and ``node_colors`` have one entry per
        node; labels are formatted as ``"<node>\\n<shipments> (<share>%)"``.
        The three link lists are parallel and refer to nodes by position.
    """
    stage_columns = ['Booking Type', 'Status', 'Error Category']

    # Nodes are the distinct labels across the three stage columns, in order of
    # first appearance. A label present in more than one column maps to a
    # single node.
    all_nodes = list(pd.concat([df[column] for column in stage_columns]).unique())
    node_indices = {node: i for i, node in enumerate(all_nodes)}

    # Aggregate the flows between each pair of consecutive stages.
    source_indices, target_indices, values = [], [], []
    for source_column, target_column in zip(stage_columns, stage_columns[1:]):
        flows = (
            df.groupby([source_column, target_column])['#Shipments']
            .sum()
            .reset_index()
        )
        source_indices.extend(flows[source_column].map(node_indices).tolist())
        target_indices.extend(flows[target_column].map(node_indices).tolist())
        values.extend(flows['#Shipments'].tolist())

    # Label every node with its shipment total and share of the overall total.
    # Membership is tested on the stage columns only: scanning the whole frame
    # would also count rows where the label merely shows up in an unrelated
    # column (e.g. the company name or a free-text field).
    stage_values = df[stage_columns]
    grand_total = df['#Shipments'].sum()
    node_labels = []
    for node in all_nodes:
        in_stage = stage_values.isin([node]).any(axis=1)
        total_shipments = df.loc[in_stage, '#Shipments'].sum()
        # Avoid a nan share when there are no shipments at all.
        percentage = 100 * total_shipments / grand_total if grand_total else 0.0
        node_labels.append(f"{node}\n{total_shipments} ({percentage:.2f}%)")

    # Define colors for nodes
    color_palette = {
        'Manual': 'rgb(0, 0, 255)',
        'eBooking': 'rgb(0, 255, 255)',
        'General Booking': 'rgb(245, 170, 66)',
        'Calculated': 'rgb(0, 128, 0)',
        'Incomplete': 'rgb(255, 0, 0)',
        'Error': 'rgb(255, 255, 0)',
        'Error Category': 'rgb(150, 150, 150)'
    }

    # Any node without its own palette entry gets the neutral 'Error Category'
    # grey. This includes booking types and statuses that are missing from the
    # palette, which previously raised KeyError.
    fallback_color = color_palette['Error Category']
    node_colors = [color_palette.get(node, fallback_color) for node in all_nodes]

    return node_labels, source_indices, target_indices, values, node_colors

# Create and display Sankey diagram using Plotly
def create_sankey(node_labels, source_indices, target_indices, values, node_colors):
    """Render and show a coloured Plotly Sankey figure.

    Args:
        node_labels: Text shown on each node.
        source_indices: Source node position of every link.
        target_indices: Target node position of every link.
        values: Value of every link, parallel to the two index lists.
        node_colors: Colour of each node, indexed by node position.
    """
    # Every link takes the colour of the node it starts from.
    link_colors = [node_colors[origin] for origin in source_indices]

    node_spec = {
        'pad': 15,
        'thickness': 20,
        'line': {'color': "black", 'width': 0.5},
        'label': node_labels,
        'color': node_colors,
    }
    link_spec = {
        'source': source_indices,
        'target': target_indices,
        'value': values,
        'color': link_colors,
    }

    sankey_trace = go.Sankey(node=node_spec, link=link_spec)
    fig = go.Figure(data=[sankey_trace])

    # Black 10-pt text for readability.
    fig.update_layout(
        title_text='Sankey Diagram for All Sending Agent Companies',
        font={'size': 10, 'color': 'black'},
    )
    fig.show()

# Prepare the data: labels, link lists and node colours computed over every row
# of df_cleaned (no per-company filtering in this cell).
node_labels, source_indices, target_indices, values, node_colors = prepare_sankey_data(df_cleaned)

# Create and display the Sankey diagram
create_sankey(node_labels, source_indices, target_indices, values, node_colors)


: 