In [83]:
import pandas as pd
import numpy as np
import warnings

import plotly.graph_objects as go
import plotly.io as pio


# Suppress every warning category for the rest of the session.
# NOTE(review): this would also hide any pandas deprecation / chained-assignment
# warnings emitted by later cells - confirm a blanket filter is intended.
warnings.filterwarnings('ignore')

In [84]:
def generate_gray_tones(num_colors, min_gray=120, max_gray=220):
    """Return `num_colors` gray hex colors evenly spaced between two gray levels.

    Args:
        num_colors: Number of colors to produce. Zero or a negative number
            yields an empty list.
        min_gray: Channel value (0 = black, 255 = white) of the first tone.
        max_gray: Channel value of the last tone.

    Returns:
        A list of '#RRGGBB' strings (upper-case hex, R == G == B) ordered from
        `min_gray` to `max_gray`. With a single color the list holds only the
        `min_gray` tone.
    """
    if num_colors <= 0:
        return []
    if num_colors == 1:
        # No interval to divide; the previous step computation divided by zero here.
        return ["#{0:02X}{0:02X}{0:02X}".format(min_gray)]

    span = max_gray - min_gray
    last_index = num_colors - 1

    gray_colors = []
    for index in range(num_colors):
        # Interpolate per position instead of using a pre-floored step: a floored
        # step becomes 0 once num_colors exceeds the span, collapsing every tone.
        gray_value = min_gray + (span * index) // last_index
        gray_colors.append("#{0:02X}{0:02X}{0:02X}".format(gray_value))

    return gray_colors

In [85]:
def format_value(value):
    """Abbreviate a number for display.

    Args:
        value: A real number (int or float).

    Returns:
        The value in millions with one decimal and an 'M' suffix, in thousands
        with one decimal and a 'K' suffix, or rounded to a whole number with no
        suffix, depending on its magnitude. The sign of `value` is kept.
    """
    magnitude = abs(value)

    # The cut-offs sit where the smaller unit would round up to 1000 of itself,
    # so 999_999 renders as '1.0M' (not '1000.0K') and 999.6 as '1.0K' (not '1000').
    if magnitude >= 999_950:
        return f'{value / 1_000_000:.1f}M'
    if magnitude >= 999.5:
        return f'{value / 1_000:.1f}K'
    return f'{value:.0f}'

In [86]:
# Path of the CSV to load (despite the name, it points at a file, not a folder).
folder_path = r'C:\Users\Rafael_Fagundes\Downloads\compiled_data.csv'

df = pd.read_csv(folder_path, encoding='utf-8')

# Keep only the rows that satisfy all four conditions: BU, fiscal quarter,
# country and audience type.
df = df.loc[
    df['BU'].eq('CSB')
    & df['Fiscal Quarter'].eq('2024-Q3')
    & df['Country_x'].eq('United States')
    & df['Audience Type'].eq('1PD')
]

#df = df[(df['Segment ID'] != 0) & (df['Segment ID'] != '0')]

In [87]:
# Replace the non-breaking space (\xa0) in 'CRM-1PD\xa0CRM' with a regular space.
# regex=False forces a literal match, so the result does not depend on the
# default of `regex`, which differs between pandas versions.
df['Audience Type Name'] = df['Audience Type Name'].str.replace('CRM-1PD\xa0CRM', 'CRM-1PD CRM', regex=False)


def _segments_per_label(frame, column):
    """Count distinct 'Segment ID' values for each value of `column`.

    Returns a frame with two columns: 'Label' (the values of `column`) and
    'Segments' (the number of unique 'Segment ID' values for that label).
    """
    counts = frame.groupby([column])['Segment ID'].nunique().reset_index()
    return counts.rename(columns={column: 'Label', 'Segment ID': 'Segments'})


# Unique segment counts, ignoring rows whose 'Segment ID' is 0 (int) or '0' (str).
filtered_df = df[(df['Segment ID'] != 0) & (df['Segment ID'] != '0')]
df_lvl1_seg = _segments_per_label(filtered_df, 'Audience Type')
df_lvl2_seg = _segments_per_label(filtered_df, 'Audience Type Name')
df_lvl3_seg = _segments_per_label(filtered_df, 'Audience Source')
df_lvl4_seg = _segments_per_label(filtered_df, 'Display Dell Vehicle Mapped')

# NOTE(review): segment_df is not read by the visible code below, which recomputes
# segment counts from the unfiltered df - confirm which of the two is intended.
segment_df = pd.concat([df_lvl1_seg, df_lvl2_seg, df_lvl3_seg, df_lvl4_seg])
######

# Total Spend for every combination of the four hierarchy columns; each link
# table below is derived from this frame.
test = df.groupby(['Audience Type', 'Audience Type Name', 'Audience Source','Display Dell Vehicle Mapped'])['Spend'].sum().reset_index()

# Links: Audience Type -> Audience Type Name.
test1 = test.groupby(['Audience Type','Audience Type Name'])['Spend'].sum().reset_index()
test1 = (
    test1[(test1['Audience Type Name'] != '0')]
    .assign(Level=1)
    .rename(columns={'Audience Type':'Source', 'Audience Type Name':'Target', 'Spend':'Total'})
)

# Links: Audience Type Name -> Audience Source.
test2 = test.groupby(['Audience Type','Audience Type Name','Audience Source'])['Spend'].sum().reset_index()
test2 = (
    test2[(test2['Audience Type Name'] != '0')&(test2['Audience Source'] != '0')]
    .assign(Level=2)
    .rename(columns={'Audience Type Name':'Source', 'Audience Source':'Target', 'Spend':'Total'})
)

# Links: Audience Source -> Display Dell Vehicle Mapped.
# `test` is already aggregated on exactly these four keys, so it is filtered
# directly instead of being grouped a second time.
test3 = (
    test[(test['Audience Source'] != '0')&(test['Display Dell Vehicle Mapped'] != '0')]
    .assign(Level=4)
    .rename(columns={'Audience Source':'Source', 'Display Dell Vehicle Mapped':'Target', 'Spend':'Total'})
)

# Links: Audience Type Name -> Display Dell Vehicle Mapped, built only from the
# rows whose Audience Source is '0'.
test4 = test[test['Audience Source'] == '0'].groupby(['Audience Type', 'Audience Type Name', 'Display Dell Vehicle Mapped'])['Spend'].sum().reset_index()
test4 = (
    test4[(test4['Audience Type Name'] != '0')&(test4['Display Dell Vehicle Mapped'] != '0')]
    .assign(Level=3)
    .rename(columns={'Audience Type Name':'Source', 'Display Dell Vehicle Mapped':'Target', 'Spend':'Total'})
)

# Stack all link tables into one Source/Target/Total/Level frame.
# 'Level' is carried along but is not read by the visible code below.
link_columns = ['Source','Target','Total', 'Level']
concat_df = pd.concat(
    [test1[link_columns], test2[link_columns], test3[link_columns], test4[link_columns]],
    ignore_index=True,
)
                    

# Every distinct name that appears as a link endpoint (Source or Target).
unique_values = pd.unique(concat_df[['Source', 'Target']].values.ravel('K'))

# Assign each name a consecutive integer starting at 0, in the order pd.unique
# returned them; the Sankey links refer to nodes by this index.
mapping_df = pd.DataFrame({'Label': unique_values.tolist()
                           , 'Value': list(range(len(unique_values)))})

# Translate the endpoint names into their node indices. `.map` performs a plain
# lookup against the Label -> Value series (every endpoint is present in it).
label_to_index = mapping_df.set_index('Label')['Value']
concat_df['Source'] = concat_df['Source'].map(label_to_index)
concat_df['Target'] = concat_df['Target'].map(label_to_index)






# Columns whose values are summarised below (spend, spend share, segment count).
columns_to_process = ['Audience Type', 'Audience Type Name', 'Audience Source', 'Display Dell Vehicle Mapped']

# Collect one summary frame per column and concatenate once after the loop,
# rather than growing a DataFrame with concat on every iteration.
per_column_stats = []
for column_name in columns_to_process:
    # Spend per label, and that spend as a percentage of the column's total spend.
    grouped = df.groupby(column_name)['Spend'].sum().reset_index()
    grouped['Percentage'] = (grouped['Spend'] / grouped['Spend'].sum()) * 100
    grouped = grouped.rename(columns={column_name: 'Label'})

    # Number of distinct 'Segment ID' values per label.
    # NOTE(review): this uses the unfiltered df, so the 'Segment ID' 0 / '0'
    # exclusion applied to filtered_df earlier in this cell is not applied
    # here - confirm that is intended.
    grouped_segment = df.groupby(column_name)['Segment ID'].nunique().reset_index()
    grouped_segment = grouped_segment.rename(columns={column_name: 'Label', 'Segment ID': 'Segments'})

    merged_grouped = grouped.merge(grouped_segment, on='Label', how='left')
    per_column_stats.append(merged_grouped)

result_df = pd.concat(per_column_stats, axis=0, ignore_index=True)

# Attach the stats to the node table. mapping_df holds one row per node and its
# row order defines the node indices, so the merge must not add rows. A label
# that occurs in more than one of the processed columns appears several times in
# result_df; only its first occurrence is used, keeping labels and indices aligned.
merged_df = mapping_df.merge(
    result_df.drop_duplicates(subset='Label', keep='first'),
    on='Label',
    how='left',
)

# Node caption: "<label> (<segments>) <br><spend %> (<spend>)".
merged_df['Label_x'] = (
    merged_df['Label']
    + " (" + merged_df['Segments'].apply(format_value) + ") " + "<br>"
    + merged_df['Percentage'].apply(lambda x: f'{x:.1f}%')
    + " (" + merged_df['Spend'].apply(format_value) + ")"
)
    ############


# Plain Python lists for plotly: link endpoints, link sizes and node captions.
source = concat_df['Source'].to_numpy().tolist()
target = concat_df['Target'].to_numpy().tolist()
value = concat_df['Total'].to_numpy().tolist()
labels = merged_df['Label_x'].to_numpy().tolist()

# One gray tone per link.
colors = generate_gray_tones(len(source))

# Link and node specifications passed to the Sankey trace.
link = {'source': source, 'target': target, 'value': value, 'color': colors}
node = {'label': labels, 'pad': 30, 'thickness': 20}

# Build the trace, wrap it in a figure, set the title and render it.
chart = go.Sankey(node=node, link=link, arrangement="snap")
fig = go.Figure(chart)
fig.update_layout(title_text='title')

fig.show()
