In [217]:
import pandas as pd
import numpy as np
import warnings

import plotly.graph_objects as go
import plotly.io as pio


warnings.filterwarnings('ignore')

In [218]:
def format_value(value):
    if value >= 1000000:
        return f'{value/1000000:.1f}M'
    elif value >= 1000:
        return f'{value/1000:.1f}K'
    else:
        return f'{value:.1f}'

In [219]:
def generate_gray_tones(num_colors):
    # Define the range of gray values (0 = black, 255 = white)
    min_gray = 120
    max_gray = 220

    # Calculate the step size to evenly distribute the tones
    step = (max_gray - min_gray) // (num_colors - 1)

    # Generate a list of random gray colors
    gray_colors = []
    for _ in range(num_colors):
        gray_value = min_gray + step * _
        gray_color = "#{:02X}{:02X}{:02X}".format(gray_value, gray_value, gray_value)
        gray_colors.append(gray_color)

    # Shuffle the list to randomize the order
    #random.shuffle(gray_colors)
    
    return gray_colors

In [220]:
folder_path = r'C:\Users\Rafael_Fagundes\Downloads\compiled_data.csv'

df = pd.read_csv(folder_path, encoding='utf-8')

df = df[(df['BU'] == 'CSB') 
            & (df['Fiscal Quarter'] == '2024-Q3') 
            & (df['Country_x'] == 'United States')]

df = df[(df['Segment ID'] != 0) & (df['Segment ID'] != '0')]

In [270]:
# Fix issue with 'Audience Type Name': 'CRM-1PD\xa0CRM'
df['Audience Type Name'] = df['Audience Type Name'].str.replace('CRM-1PD\xa0CRM', 'CRM-1PD CRM')


df_lvl1 = df.groupby(['Audience Type','Audience Type Name'])['Spend'].sum().reset_index()
df_lvl2 = df.groupby(['Audience Type','Audience Type Name','Audience Source'])['Spend'].sum().reset_index()
df_lvl3 = df.groupby(['Audience Type','Audience Type Name','Audience Source','Display Dell Vehicle Mapped'])['Spend'].sum().reset_index()

df_lvl1['Level'] = 1
df_lvl2['Level'] = 2
df_lvl3['Level'] = 3

# Rename columns to Source, Target & Value
df_lvl1.rename(columns={'Audience Type':'Source', 'Audience Type Name':'Target', 'Spend':'Value'}, inplace=True)
df_lvl2.rename(columns={'Audience Type Name':'Source', 'Audience Source':'Target', 'Spend':'Value'}, inplace=True)
df_lvl3.rename(columns={'Audience Source':'Source', 'Display Dell Vehicle Mapped':'Target', 'Spend':'Value'}, inplace=True)

# Contact the dataframes in just one
concat_df = pd.concat([
                       df_lvl1[['Source','Target','Value','Level']]
                       , df_lvl2[['Source','Target','Value', 'Level']]
                       , df_lvl3[['Source','Target','Value', 'Level']]
                    ], ignore_index=True)

unique_values = pd.unique(concat_df[['Source', 'Target']].values.ravel('K'))

mapping_df = pd.DataFrame({'Label': unique_values.tolist()
                                      , 'Value': [i for i in range(len(unique_values))]})

# Replace values based on the mappings
concat_df['Source'] = concat_df['Source'].replace(mapping_df.set_index('Label')['Value'])
concat_df['Target'] = concat_df['Target'].replace(mapping_df.set_index('Label')['Value'])



##########
level_list = concat_df['Level'].unique().tolist()

if len(level_list) > 1:
    level_list_source = level_list[:-1]
    level_list_target = [level_list[-1]]  # Wrap the last level in a list
else:
    level_list_source, level_list_target = [1], [1]


df_source = concat_df[concat_df['Level'].isin(level_list_source)].groupby(['Source','Level'])['Value'].sum().reset_index()
total_sum_source = df_source[df_source['Level'] == level_list[1]]['Value'].sum()
df_source['Percentage'] = (df_source['Value'] / total_sum_source) * 100

df_target = concat_df[concat_df['Level'].isin(level_list_target)].groupby(['Target','Level'])['Value'].sum().reset_index()
total_sum_source = df_target[df_target['Level'].isin(level_list_target)]['Value'].sum()
df_target['Percentage'] = (df_target['Value'] / total_sum_source) * 100


df_source.rename(columns={'Value':'Total','Source':'Value'}, inplace=True)
df_target.rename(columns={'Value':'Total','Target':'Value'}, inplace=True)

totals_df = pd.concat([df_source, df_target])

merged_df = mapping_df.merge(totals_df, on='Value', how='left')

merged_df['Label_x'] = merged_df['Label'] + "<br>" + merged_df['Percentage'].apply(lambda x: f'{x:.1f}%') + " (" + merged_df['Total'].apply(format_value) + ")"
############



# Sample data
source = concat_df['Source'].values.tolist()
target = concat_df['Target'].values.tolist()
value = concat_df['Value'].values.tolist()
labels = merged_df['Label_x'].values.tolist()

# List of colors for each link based on some condition or data
colors = generate_gray_tones(len(source))

# Create links
link = dict(source=source, target=target, value=value, color=colors)

# Create nodes
node = dict(label=labels, pad=30, thickness=20)

# Create a Sankey object
chart = go.Sankey(link=link, node=node, arrangement="snap")

# Build a figure
fig = go.Figure(chart)

# Add a title to the figure
fig.update_layout(title_text='test', hovermode="x", height=600, width=800)

fig.show()


In [222]:
concat_df

Unnamed: 0,Source,Target,Value,Level
0,0,2,87258.86829,1
1,1,3,0.0,1
2,1,4,83743.712169,1
3,1,5,0.0,1
4,2,6,0.0,2
5,2,7,87258.86829,2
6,3,7,0.0,2
7,4,7,62152.856527,2
8,4,8,20340.422987,2
9,4,9,1250.432654,2


In [223]:
df_source = concat_df.groupby(['Source','Level'])['Value'].sum().reset_index()


In [224]:
total_sum_source = df_source[df_source['Level'] == 1]['Value'].sum()
total_sum_source

171002.580458329

In [225]:
df_source['Percentage'] = (df_source['Value'] / total_sum_source) * 100

In [237]:
df_source

Unnamed: 0,Value,Level,Total,Percentage
0,0,1,87258.86829,51.027808
1,1,1,83743.712169,48.972192
2,2,2,87258.86829,51.027808
3,3,2,0.0,0.0
4,4,2,83743.712169,48.972192
5,5,2,0.0,0.0


In [233]:
concat_df['Level'].unique().tolist()

[1, 2, 3]

In [235]:
level_list

[1, 2]

In [267]:
df_target = concat_df[concat_df['Level'].isin(level_list_target)].groupby(['Target','Level'])['Value'].sum().reset_index()
total_sum_source = df_target[df_target['Level'].isin(level_list_target)]['Value'].sum()
df_target['Percentage'] = (df_target['Value'] / total_sum_source) * 100

In [268]:
df_target

Unnamed: 0,Target,Level,Value,Percentage
0,10,3,0.0,0.0
1,11,3,65538.547136,38.326057
2,12,3,2344.488264,1.371025
3,13,3,103119.545059,60.302918
4,14,3,0.0,0.0
