In [1]:
import numpy as np
import pandas as pd
# pd.options.display.float_format = '{:.2%}'.format
import plotly.graph_objects as go   

In [2]:
map_1_df = pd.read_csv(r'./onlyDraw/map_moi.csv')
map_name_1 = map_1_df.set_index('LEVEL_06_CODE ')['Tên khối'].to_dict()

# load transaction data
trans_df = pd.read_csv(r'sample data lv pk.csv')[['Lv0', 'Lv1', 'Lv2', 'Lv3', 'Lv4', 'Size']]

# create mapped columns for trans_df
map_columns = ['Lv0', 'Lv1', 'Lv2', 'Lv3', 'Lv4']
mapped_columns = [ col + '_mapped' for col in map_columns]
for i, j in zip(map_columns, mapped_columns):
    trans_df[j] = trans_df[i].map(map_name_1)
    trans_df[j] = trans_df[j] + '_' + i

all_lv_df = trans_df.groupby(mapped_columns)['Size'].sum().reset_index()
lv_04_df = trans_df.groupby(['Lv0_mapped', 'Lv4_mapped'])['Size'].sum().reset_index()

map_2_df = pd.read_csv(r'./onlyDraw/map.csv')
map_name_2 = map_2_df.set_index('ORG_UNIT_ID')['LEVEL_02_NAME'].to_dict()

# load allocation data
al_df = pd.read_csv(r'./Hai ha/KQ phan bo GD2_T092023.csv')
al_df.rename(columns = {'Chi phí nhận phân bổ tại thời điểm':'Size'}, inplace = True)
al_df.dropna(inplace= True)

# create mapped columns for al_df
al_df['Mã đơn vị tổ chức cấp 6_map'] = al_df['Mã đơn vị tổ chức cấp 6'].map(map_name_2) 
al_df['Mã SP cấp 5 PK'] = al_df['Mã SP cấp 5'].astype(str).str[:2]

pk_df = al_df.groupby(['Mã đơn vị tổ chức cấp 6_map',  'Tên phân khúc KH cấp 3'])['Size'].sum().reset_index()

sp_df = al_df.groupby(['Mã đơn vị tổ chức cấp 6_map', 'Mã SP cấp 5 PK'])['Size'].sum().reset_index()
sp_df['Mã SP cấp 5 PK'] = 'SP_' + sp_df['Mã SP cấp 5 PK'].apply(str)

sp_pk_df = al_df.groupby(['Mã đơn vị tổ chức cấp 6_map', 'Mã SP cấp 5 PK', 'Tên phân khúc KH cấp 3'])['Size'].sum().reset_index()
sp_pk_df['Mã SP cấp 5 PK'] = 'SP_' + sp_pk_df['Mã SP cấp 5 PK'].apply(str)

In [3]:
node_list = ['TT DICH VU NOI BO', 'TT QUAN LY CHUNG TOAN HANG', 'TT AO',
            'KHONG CO TRONG CAY', 'TT DOANH THU',
            'TT QUAN LY CHUNG KHOI KINH DOANH', 'TT HO TRO TRUC TIEP',
            'TT HO TRO SAN PHAM', 'TT QUAN LY CHUNG CHI NHANH']

color_list = ['rgba(44, 160, 44, 0.8)', 'rgba(255, 127, 14, 0.8)', 'rgba(140, 86, 75, 0.8)',
        'rgba(23, 190, 207, 0.8)', 'rgba(188, 189, 34, 0.8)', 'rgba(214, 39, 40, 0.8)',
        'rgba(188, 189, 34, 0.8)', 'rgba(31, 119, 180, 0.8)', 'rgba(140, 86, 75, 0.8)']

# color_map = dict(zip(node_list, color_list))

df = pd.read_csv('ColorMap.csv')
color_map_v1 = dict(zip(df['TÊN GỐC'].to_list(), df['Màu'].to_list()))

df = pd.read_csv('ColorMapV2.csv')
color_map_v2 = dict(zip(df['Tên khối'].to_list(), df['Màu'].to_list()))

In [4]:
def sankey_graph(data, mapped_columns, highlighted_nodes, title, color_map= False, autosize= False):
    data[mapped_columns] = data[mapped_columns].astype(str)

    # Function to create a mapping of each unique step to a unique integer
    def create_node_mapping(data, columns):
        unique_nodes = set()
        for col in columns:
            unique_nodes.update(data[col].unique())
        return {node: i for i, node in enumerate(sorted(unique_nodes))}

    # Create
    def create_color_list(node_mapping, color_map):
        new_color_list = []
        for node in node_mapping:
            base_node = node.split('_')[0]
            if base_node in color_map:
                new_color_list.append(color_map[base_node])
            else:
                new_color_list.append('#CB99C9')  # Default color for unmatched nodes
        return new_color_list
    
    # Create the node mapping
    node_mapping = create_node_mapping(data, mapped_columns)

    # Prepare source, target, and value lists for the Sankey diagram
    sources = []
    targets = []
    values = []
    colors = []  # For link colors

    highlight_color = 'rgba(0, 107, 105, 0.2)'  
    default_color = 'rgba(0, 0, 0, 0.1)'  

    # Function to check if a row includes the highlighted node
    def row_includes_highlighted_node(row, highlighted_nodes):
        return any(target in row[mapped_columns].values for target in highlighted_nodes)
        # return highlighted_node in row[mapped_columns].values
    
    # Populate source, target, values, and colors
    for _, row in data.iterrows():
        
        row_highlighted = row_includes_highlighted_node(row, highlighted_nodes)
        
        for i in range(len(mapped_columns) - 1):
            source = node_mapping[row[mapped_columns[i]]]
            target = node_mapping[row[mapped_columns[i + 1]]]
            value = row['Size']
            sources.append(source)
            targets.append(target)
            values.append(value)
            colors.append(highlight_color if row_highlighted else default_color)

    if color_map == False:
        fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=list(node_mapping.keys()),
            color="rgba(255, 199, 47, 0.5)",
            
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=colors  
        )
    )])
    else:
        color_list = create_color_list(node_mapping, color_map)
        fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=list(node_mapping.keys()),
            color=color_list,
            
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=colors  
        )
    )])
    
    if autosize == False:
        fig.update_layout(autosize=autosize, width=1620, height=1080)
    
    fig.update_layout(font_family="Arial", font_color="blue", font_size=14)
    fig.update_layout(title_text= title, title_font_size=24)
    fig.update_layout(xaxis_visible=False, xaxis_showticklabels=False,
                        yaxis_visible=False, yaxis_showticklabels=False)
    
    return fig

def calculate_distribution(data, highlighted_nodes, page_flag= 1):
    if len(highlighted_nodes) == 0:
        return None

    def calculate_percentage(df, level_column, size_column):
        grouped_df = df[[level_column, size_column]].groupby(level_column)[size_column].sum().reset_index()
        total_size = grouped_df[size_column].sum()
        grouped_df[f'Percentage_{level_column}'] = grouped_df[size_column] / total_size * 100
        grouped_df = grouped_df.sort_values(by= f'Percentage_{level_column}')
        return grouped_df.drop(size_column, axis=1)
    
    values_to_filter = highlighted_nodes
    filtered_df = data[data.isin(values_to_filter).any(axis=1)]
    filtered_columns = filtered_df.columns
    filtered_columns = filtered_columns.drop('Size')
    
    if page_flag == 1:
        for col in filtered_columns:
            filtered_df[col] = filtered_df[col].str[:-4]

    results = []
    for col in filtered_columns:
        result = calculate_percentage(filtered_df, col, 'Size')
        results.append(result)

    final_result = pd.concat(results, axis=1)
    return final_result

def generate_format(column_name):
    modified_list = [item.replace('_mapped', '') for item in column_name]
    modified_list = [item.replace('_map', '') for item in modified_list]
    # modified_list = [item.replace('Percentage_', 'Percentage_') for item in modified_list]
    format_dict = {}

    for orig, mod in zip(column_name, modified_list):
        if orig.startswith('Percentage_'):
            format_dict[orig] = st.column_config.NumberColumn(mod, format="%.2f %%")
        else:
            format_dict[orig] = mod
    return format_dict


In [5]:
highlighted_nodes_1 = [ "KHOI BAN LE_Lv0", "KHOI BAN BUON_Lv4"]
selected_columns = ['Lv0_mapped', 'Lv4_mapped']
fig = sankey_graph(lv_04_df, selected_columns, highlighted_nodes_1, title= "<b>Phân bổ giai đoạn 1: Từ trung tâm đến trung tâm</b>", color_map= color_map_v2)
fig     

In [6]:
dist_result = calculate_distribution(lv_04_df, highlighted_nodes_1)
dist_result



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lv0_mapped,Percentage_Lv0_mapped,Lv4_mapped,Percentage_Lv4_mapped
5,KHOI KINH DOANH VON VA TIEN TE,0.000445,,
6,KHOI QUAN LY RUI RO,0.119621,,
3,KHOI GIAM SAT VA TUAN THU,0.332209,KHOI KINH DOANH VON VA TIEN TE,6e-06
2,KHOI CNTT VA NGAN HANG SO,0.669599,KHOI DAU TU,0.0
8,KHOI TAI CHINH-KE TOAN,1.352108,,
7,KHOI TAC NGHIEP,2.471835,,
4,KHOI HO TRO,7.978382,,
0,KHOI BAN BUON,29.868029,KHOI BAN BUON,43.431192
1,KHOI BAN LE,57.207772,KHOI BAN LE,56.568802


In [7]:
highlighted_df = lv_04_df[lv_04_df.isin(highlighted_nodes_1).any(axis=1)]
# st.dataframe(highlighted_df)
sub_fig = sankey_graph(highlighted_df, selected_columns, highlighted_nodes_1, title= "Highlighted", color_map= color_map_v2)
sub_fig



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [9]:
highlighted_nodes_1 = [ "KHOI BAN LE_Lv0", "KHOI BAN BUON_Lv4"]
selected_columns = ['Lv0_mapped', 'Lv1_mapped', 'Lv2_mapped', 'Lv3_mapped', 'Lv4_mapped'] # ['Lv0_mapped', 'Lv4_mapped']
fig = sankey_graph(all_lv_df, selected_columns, highlighted_nodes_1, title= "<b>Phân bổ giai đoạn 1: Từ trung tâm đến trung tâm</b>", color_map= color_map_v2)
fig       

In [10]:
dist_result = calculate_distribution(all_lv_df, highlighted_nodes_1)
dist_result



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lv0_mapped,Percentage_Lv0_mapped,Lv1_mapped,Percentage_Lv1_mapped,Lv2_mapped,Percentage_Lv2_mapped,Lv3_mapped,Percentage_Lv3_mapped,Lv4_mapped,Percentage_Lv4_mapped
5,KHOI KINH DOANH VON VA TIEN TE,0.000445,KHOI KINH DOANH VON VA TIEN TE,0.000493,KHOI KINH DOANH VON VA TIEN TE,0.001329,KHOI QUAN LY RUI RO,0.189841,,
6,KHOI QUAN LY RUI RO,0.11965,KHOI QUAN LY RUI RO,0.120295,KHOI QUAN LY RUI RO,0.178534,KHOI TAC NGHIEP,3.003541,,
3,KHOI GIAM SAT VA TUAN THU,0.332288,KHOI GIAM SAT VA TUAN THU,0.33233,KHOI GIAM SAT VA TUAN THU,0.33233,KHOI GIAM SAT VA TUAN THU,0.33233,KHOI KINH DOANH VON VA TIEN TE,6e-06
2,KHOI CNTT VA NGAN HANG SO,0.669469,KHOI CNTT VA NGAN HANG SO,0.526925,KHOI CNTT VA NGAN HANG SO,0.810318,KHOI CNTT VA NGAN HANG SO,0.852607,KHOI DAU TU,0.0
8,KHOI TAI CHINH-KE TOAN,1.35243,KHOI TAI CHINH-KE TOAN,1.370044,KHOI TAI CHINH-KE TOAN,1.697613,,,,
7,KHOI TAC NGHIEP,2.472424,KHOI TAC NGHIEP,2.476331,KHOI TAC NGHIEP,2.722927,KHOI TAI CHINH-KE TOAN,1.754911,,
4,KHOI HO TRO,7.956738,KHOI HO TRO,8.015775,KHOI HO TRO,3.726057,KHOI HO TRO,2.611595,,
0,KHOI BAN BUON,29.875148,KHOI BAN BUON,29.936392,KHOI BAN BUON,33.309195,KHOI BAN BUON,34.033223,KHOI BAN BUON,43.417709
1,KHOI BAN LE,57.221408,KHOI BAN LE,57.221415,KHOI BAN LE,57.221696,KHOI BAN LE,57.221953,KHOI BAN LE,56.582285


In [11]:
highlighted_df = all_lv_df[all_lv_df.isin(highlighted_nodes_1).any(axis=1)]
# st.dataframe(highlighted_df)
sub_fig = sankey_graph(highlighted_df, selected_columns, highlighted_nodes_1, title= "Highlighted", color_map= color_map_v2)
sub_fig



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

