# Esophageal Cancer Research - Network Analysis & Visualizations
* By Sangwon Baek
* Samsung Medical Center
* August 3rd, 2023

### Import necessary packages and read data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth

from statsmodels.graphics.mosaicplot import mosaic
import itertools

import sys
from pathlib import Path

# A notebook kernel executes cells as `__main__`, which has no parent package,
# so the relative form `from ..src.utils.network_utils import *` raises
# "ImportError: attempted relative import with no known parent package".
# Put the parent of the working directory (the same base that the "../Data"
# and "../Results" paths used elsewhere in this notebook rely on) on sys.path
# and import the helper module absolutely instead.
_PROJECT_ROOT = str(Path("..").resolve())
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

# NOTE(review): the wildcard import is kept because the helper calls in this
# notebook (get_frequency_*, combine_*, plot_*, save_to_excel, ...) are not
# defined here and presumably come from this module; switch to an explicit
# name list once the full set of required helpers is confirmed.
from src.utils.network_utils import *

# Let wide and long frames display up to 200 columns / 200 rows.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# Disable pandas' chained-assignment (SettingWithCopy) warning for the session.
pd.options.mode.chained_assignment = None

### Define custom lymph node locations and preprocess dataset

In [3]:
def N_categorize(x):
    """Map a positive-lymph-node count to an N-category label.

    Parameters
    ----------
    x : number
        Count of positive lymph nodes for one patient.

    Returns
    -------
    str or float
        '0' for a count of 0, '1' for 1-2, '2' for 3-6 and '3' for anything
        else (7 or more for valid integer counts). A missing count
        (None / NaN) yields NaN so that unknown patients are not silently
        assigned to the highest category.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # count would fall through to the final branch and be labelled '3'.
    if pd.isna(x):
        return np.nan
    if x == 0:
        return '0'
    elif 1 <= x <= 2:
        return '1'
    elif 3 <= x <= 6:
        return '2'
    else:  # x >= 7 for valid integer counts
        return '3'

# Hand-placed coordinates for every network node, following the reference diagram.
# Each value is a 2-tuple (presumably (x, y) for the network layout - confirm).
# Abbreviations: ON = other neck; OM = other medial; OA = other abdominal; REG = regional
Lymphnode_Positions = {
    # Primary-site nodes
    "upper": (0, 3),
    "mid": (0, 0),
    "lower": (0, -3),
    # Region-level nodes (same coordinates as the three entries above)
    "neckLN": (0, 3),
    "mediaLN": (0, 0),
    "abdoLN": (0, -3),
    # Stations 101-104, right/left pairs
    "101R": (-0.2, 4),
    "101L": (0.2, 4),
    "102R": (-0.4, 4.3),
    "102L": (0.4, 4.3),
    "104R": (-0.6, 4),
    "104L": (0.6, 4),
    # Stations 105-112
    "106recR": (-0.3, 3),
    "106recL": (0.3, 3),
    "106preR": (-0.05, 2.5),
    "106preL": (0.5, 2.5),
    "107": (0.1, 1.5),
    "105/108/110": (-0.4, 0.5),  #*105
    "112pulR": (0, -0.3),
    "112pulL": (0.3, -0.3),
    # Stations 1/2/7, 8 and 9 (station 9 shares the 'lower' coordinates)
    "1/2/7": (0.4, -2.5),
    "8": (-0.4, -2.5),
    "9": (0, -3),
}

In [4]:
# Load the preprocessed cohort and discard the "Unnamed: 0" column
# (presumably an index column written out by an earlier to_csv - confirm).
df = pd.read_csv("../Data/Preprocessed/ECA_Dataset.csv").drop(columns=["Unnamed: 0"])

# Column names carrying the "pos_" and "total_" prefixes
pos_columns = [name for name in df.columns if name.startswith("pos_")]
total_columns = [name for name in df.columns if name.startswith("total_")]

# T and M categories are pulled out of the pTNM7_1 string with a regex and the
# leading letter is stripped ('T1a' -> '1a', 'Tis' -> 'is', 'M0' -> '0').
# Strings that match none of the listed alternatives yield NaN.
df["T_category"] = (
    df["pTNM7_1"]
    .str.extract("(TX|T0|Tis|T1a|T1b|T2|T3|T4a|T4b)", expand=False)
    .replace("T", "", regex=True)
)
# The N category is derived from the positive-node count, not from pTNM7_1.
df["N_category"] = df["total_pos_LN"].apply(N_categorize)
df["M_category"] = (
    df["pTNM7_1"]
    .str.extract("(M0|M1)", expand=False)
    .replace("M", "", regex=True)
)

def _site_subset(frame, site):
    """Return the rows of `frame` whose Primary_Site equals `site`."""
    return frame.loc[frame["Primary_Site"] == site]


# Primary-site values, in the order the *_upper / *_mid / *_lower frames are unpacked
_SITES = ("upper", "mid", "lower")

# Whole cohort (independent copy of df) and its primary-site subgroups
Whole_df = df.copy()
Whole_upper_df, Whole_mid_df, Whole_lower_df = (_site_subset(df, site) for site in _SITES)

# T-stage cohorts. Rows whose T_category is anything else ('X', '0', 'is' or
# missing) belong to none of these groups. T24 pools T2, T3 and T4.
T1_df = df[df["T_category"].isin(["1a", "1b"])]
T24_df = df[df["T_category"].isin(["2", "3", "4a", "4b"])]
T2_df = df[df["T_category"].isin(["2"])]
T3_df = df[df["T_category"].isin(["3"])]
T4_df = df[df["T_category"].isin(["4a", "4b"])]

# Primary-site subgroups of every T-stage cohort
T1_upper_df, T1_mid_df, T1_lower_df = (_site_subset(T1_df, site) for site in _SITES)
T24_upper_df, T24_mid_df, T24_lower_df = (_site_subset(T24_df, site) for site in _SITES)
T2_upper_df, T2_mid_df, T2_lower_df = (_site_subset(T2_df, site) for site in _SITES)
T3_upper_df, T3_mid_df, T3_lower_df = (_site_subset(T3_df, site) for site in _SITES)
T4_upper_df, T4_mid_df, T4_lower_df = (_site_subset(T4_df, site) for site in _SITES)

In [5]:
# Names of all cohorts: each group on its own, then split by primary site.
# Every name has a matching "<name>_df" DataFrame defined earlier in the notebook.
dataset_descriptors = [
    f"{group}{site_suffix}"
    for group in ("Whole", "T1", "T24", "T2", "T3", "T4")
    for site_suffix in ("", "_upper", "_mid", "_lower")
]

# Count settings to evaluate; 44 is the sentinel that later cells label 'all'
numbers = list(range(1, 11)) + [44]
N_categories = [str(level) for level in range(4)]

# Cohorts of particular interest: grouped by T stage (specific_descriptors)
# or by primary site (final_descriptors)
specific_descriptors = [
    f"{group}_{site}"
    for group in ("T1", "T24")
    for site in ("upper", "mid", "lower")
]
final_descriptors = [
    f"{group}_{site}"
    for site in ("upper", "mid", "lower")
    for group in ("T1", "T24")
]

# (label, DataFrame) pairs, one row per cohort family: the cohort itself
# followed by its upper / mid / lower primary-site subsets.
dfs = [
    ("Whole", Whole_df), ("Whole Upper", Whole_upper_df), ("Whole Mid", Whole_mid_df), ("Whole Lower", Whole_lower_df),
    ("T1", T1_df), ("T1 upper", T1_upper_df), ("T1 mid", T1_mid_df), ("T1 lower", T1_lower_df),
    ("T2", T2_df), ("T2 upper", T2_upper_df), ("T2 mid", T2_mid_df), ("T2 lower", T2_lower_df),
    ("T3", T3_df), ("T3 upper", T3_upper_df), ("T3 mid", T3_mid_df), ("T3 lower", T3_lower_df),
    ("T4", T4_df), ("T4 upper", T4_upper_df), ("T4 mid", T4_mid_df), ("T4 lower", T4_lower_df),
    ("T24", T24_df), ("T24 upper", T24_upper_df), ("T24 mid", T24_mid_df), ("T24 lower", T24_lower_df),
]


### Network Analysis between Primary Site and LN station

In [9]:
# Primary-site -> LN-station edge frequencies, one global per (setting, cohort):
#   EF_PS_<count>_<cohort>   for each entry of `numbers` (44 is stored as 'all')
#   EF_PS_N<cat>_<cohort>    for each entry of `N_categories`
# get_frequency_PS_LN receives the cohort frame plus either a count setting
# (second argument) or an N category (third argument); the other one is None.

# Pass 1: count-based settings
for descriptor in dataset_descriptors:
    cohort_df = globals()[f"{descriptor}_df"]
    for num in numbers:
        suffix = str(num) if num != 44 else 'all'
        globals()[f"EF_PS_{suffix}_{descriptor}"] = get_frequency_PS_LN(cohort_df, num, None)

# Pass 2: N-category settings
for descriptor in dataset_descriptors:
    cohort_df = globals()[f"{descriptor}_df"]
    for category in N_categories:
        globals()[f"EF_PS_N{category}_{descriptor}"] = get_frequency_PS_LN(cohort_df, None, category)

In [10]:
# Build one combined primary-site table per cohort and publish it as
# combined_<cohort>_PS (e.g. combined_Whole_PS, combined_T1_upper_PS, ...).
# dataset_descriptors lists the 24 cohorts in the order they are processed.
for cohort in dataset_descriptors:
    globals()[f"combined_{cohort}_PS"] = combine_edge_frequencies_PS(cohort)

In [11]:
# ("<cohort>_PS", combined table) pairs for all cohorts, in dataset_descriptors order
edge_frequencies_PS = [
    (f"{cohort}_PS", globals()[f"combined_{cohort}_PS"])
    for cohort in dataset_descriptors
]

# Hand the labelled tables to the helper that writes the Excel workbook
save_to_excel('../Results/edge_frequencies_PS.xlsx', edge_frequencies_PS)

#### Network Analysis for Counts

In [None]:
# Count-based primary-site network figures, one cell per cohort
# (whole cohort, T1 and T2-4, each split by primary site).
# NOTE(review): only the cohort name is passed, so the helper presumably looks
# up the EF_PS_* results by name - confirm it can see this notebook's globals.
plot_PS_3x3_networks('Whole_upper')

In [None]:
plot_PS_3x3_networks('Whole_mid')

In [None]:
plot_PS_3x3_networks('Whole_lower')

In [None]:
plot_PS_3x3_networks('T1_upper')

In [None]:
plot_PS_3x3_networks('T1_mid')

In [None]:
plot_PS_3x3_networks('T1_lower')

In [None]:
plot_PS_3x3_networks('T24_upper')

In [None]:
plot_PS_3x3_networks('T24_mid')

In [None]:
plot_PS_3x3_networks('T24_lower')

#### Network Analysis for N_Category

In [None]:
# N-category primary-site network figures, one cell per cohort
# (same nine cohorts as the count-based figures).
# NOTE(review): presumably draws from the EF_PS_N* results - confirm in the helper.
plot_PS_1x3_networks('Whole_upper')

In [None]:
plot_PS_1x3_networks('Whole_mid')

In [None]:
plot_PS_1x3_networks('Whole_lower')

In [None]:
plot_PS_1x3_networks('T1_upper')

In [None]:
plot_PS_1x3_networks('T1_mid')

In [None]:
plot_PS_1x3_networks('T1_lower')

In [None]:
plot_PS_1x3_networks('T24_upper')

In [None]:
plot_PS_1x3_networks('T24_mid')

In [None]:
plot_PS_1x3_networks('T24_lower')

#### Network Analysis for Main figures (counts = 2 and 6) 

In [None]:
# Main-figure primary-site networks for the six cohorts in final_descriptors.
# NOTE(review): the section heading mentions counts 2 and 6, but the label
# passed here is 'all' - confirm which setting the figure should use.
plot_PS_specific_networks(final_descriptors, 'all')

### Network Analysis between two nodes

In [19]:
# Node-to-node edge frequencies, one global per (setting, cohort):
#   EF_TN_<count>_<cohort>   for each entry of `numbers` (44 is stored as 'all')
#   EF_TN_N<cat>_<cohort>    for each entry of `N_categories`
# get_frequency_TN receives the cohort frame plus either a count setting
# (second argument) or an N category (third argument); the other one is None.

# Pass 1: count-based settings
for descriptor in dataset_descriptors:
    cohort_df = globals()[f"{descriptor}_df"]
    for num in numbers:
        suffix = str(num) if num != 44 else 'all'
        globals()[f"EF_TN_{suffix}_{descriptor}"] = get_frequency_TN(cohort_df, num, None)

# Pass 2: N-category settings
for descriptor in dataset_descriptors:
    cohort_df = globals()[f"{descriptor}_df"]
    for category in N_categories:
        globals()[f"EF_TN_N{category}_{descriptor}"] = get_frequency_TN(cohort_df, None, category)

In [20]:
# Build one combined node-to-node table per cohort and publish it as
# combined_<cohort> (e.g. combined_Whole, combined_T24_lower, ...).
# dataset_descriptors lists the 24 cohorts in the order they are processed.
for cohort in dataset_descriptors:
    globals()[f"combined_{cohort}"] = combine_edge_frequencies(cohort)

In [21]:
# (cohort name, combined table) pairs for all cohorts, in dataset_descriptors order
edge_frequencies_TN = [
    (cohort, globals()[f"combined_{cohort}"])
    for cohort in dataset_descriptors
]

# Hand the labelled tables to the helper that writes the Excel workbook
save_to_excel('../Results/edge_frequencies_TN.xlsx', edge_frequencies_TN)

In [22]:
# Build the combined node-degree table of every cohort and publish it as
# combined_<cohort>_Node (e.g. combined_Whole_Node, combined_T3_mid_Node, ...).
# dataset_descriptors lists the 24 cohorts in the order they are processed.
for cohort in dataset_descriptors:
    globals()[f"combined_{cohort}_Node"] = combine_node_degrees(cohort)

In [23]:
# (cohort name, node-degree table) pairs for all cohorts, in dataset_descriptors order
node_degrees_TN = [
    (cohort, globals()[f"combined_{cohort}_Node"])
    for cohort in dataset_descriptors
]

# Hand the labelled tables to the helper that writes the Excel workbook
save_to_excel('../Results/node_degrees_TN.xlsx', node_degrees_TN)

#### Network Analysis (TN) for counts

In [None]:
# Count-based node-to-node network figures, one cell per cohort
# (whole cohort, T1 and T2-4, each split by primary site).
# NOTE(review): only the cohort name is passed, so the helper presumably looks
# up the EF_TN_* results by name - confirm it can see this notebook's globals.
plot_TN_3x3_networks('Whole_upper')

In [None]:
plot_TN_3x3_networks('Whole_mid')

In [None]:
plot_TN_3x3_networks('Whole_lower')

In [None]:
plot_TN_3x3_networks('T1_upper')

In [None]:
plot_TN_3x3_networks('T1_mid')

In [None]:
plot_TN_3x3_networks('T1_lower')

In [None]:
plot_TN_3x3_networks('T24_upper')

In [None]:
plot_TN_3x3_networks('T24_mid')

In [None]:
plot_TN_3x3_networks('T24_lower')

#### Network Analysis (TN) for N_Category

In [None]:
# N-category node-to-node network figures, one cell per cohort
# (same nine cohorts as the count-based figures).
# NOTE(review): presumably draws from the EF_TN_N* results - confirm in the helper.
plot_TN_1x3_networks('Whole_upper')

In [None]:
plot_TN_1x3_networks('Whole_mid')

In [None]:
plot_TN_1x3_networks('Whole_lower')

In [None]:
plot_TN_1x3_networks('T1_upper')

In [None]:
plot_TN_1x3_networks('T1_mid')

In [None]:
plot_TN_1x3_networks('T1_lower')

In [None]:
plot_TN_1x3_networks('T24_upper')

In [None]:
plot_TN_1x3_networks('T24_mid')

In [None]:
plot_TN_1x3_networks('T24_lower')

#### Network Analysis between Two Nodes (ALL)

In [None]:
# Single node-to-node networks drawn from individual EF_TN_<count>_<cohort>
# results; the second argument is the figure title.
# NOTE(review): the first two cells use the Whole_mid cohort, yet their titles
# do not mention the mid primary site - confirm the intended cohort/title.
visualize_Network_TN(EF_TN_2_Whole_mid, '<=2 Metastasis')

In [None]:
visualize_Network_TN(EF_TN_10_Whole_mid, '<=10 Metastasis')

In [None]:
visualize_Network_TN(EF_TN_2_T1, 'T1 <=2 Metastasis')

In [None]:
visualize_Network_TN(EF_TN_2_T24, 'T2-4 <=2 Metastasis')

#### Network Analysis for main figures (counts = 2 and 6)

In [None]:
# Main-figure node-to-node networks for the six cohorts in final_descriptors.
# NOTE(review): the section heading mentions counts 2 and 6, but the label
# passed here is 'all' - confirm which setting the figure should use.
plot_TN_specific_networks(final_descriptors, 'all')

#### Network Analysis figure 3

In [27]:
# Colours used to highlight nodes in the final TN figure
_GOLD = "#FFD700"
_LIGHT_GREEN = "#98BF64"
_DARK_GREEN = "#006400"

# Per-cohort node colour overrides: cohort name -> {node name -> hex colour}
color_mapping = {
    "T1_upper": {
        "104L": _GOLD,
        "106recR": _GOLD,
        "106recL": _GOLD,
        "105/108/110": _LIGHT_GREEN,
    },
    "T24_upper": {
        "105/108/110": _LIGHT_GREEN,
        "1/2/7": _GOLD,
        "106recR": _GOLD,
        "106recL": _GOLD,
    },
    "T1_mid": {
        "105/108/110": _GOLD,
        "1/2/7": _GOLD,
        "106recR": _GOLD,
        "106recL": _GOLD,
    },
    "T24_mid": {
        "105/108/110": _GOLD,
        "1/2/7": _GOLD,
        "106recR": _GOLD,
        "106recL": _GOLD,
        "107": _GOLD,
    },
    "T1_lower": {
        "106recR": _GOLD,
        "106recL": _GOLD,
        "9": _DARK_GREEN,
        "1/2/7": _GOLD,
        "105/108/110": _DARK_GREEN,
    },
    "T24_lower": {
        "106recR": _GOLD,
        "105/108/110": _GOLD,
        "1/2/7": _GOLD,
        "107": _DARK_GREEN,
        "9": _DARK_GREEN,
    },
}

In [None]:
# Draw the final TN figure for the cohorts in final_descriptors, passing the
# 'all' label and the per-cohort node colour overrides from color_mapping.
# NOTE(review): this comment previously said "count = 6", but the label passed
# is 'all' - confirm which setting the published figure should use.
plot_TN_specific_networks_final(final_descriptors, 'all', color_mapping)

### Mosaic Plot

In [None]:
# Mosaic plots for the whole cohort at count settings 1 to 6. Each cell first
# passes the EF_PS_<count>_Whole result through filter_low_frequencies with a
# hand-picked second argument (6, 8, 10, 13, 15, 23), then plots the result.
# NOTE(review): the second argument is presumably a minimum-frequency cutoff and
# the rationale for these particular values is not recorded here - confirm.
EF_PS_1_Whole_filtered = filter_low_frequencies(EF_PS_1_Whole, 6)
plot_mosaic(EF_PS_1_Whole_filtered, 'Mosaic Plot for Single Metastasis')

In [None]:
EF_PS_2_Whole_filtered = filter_low_frequencies(EF_PS_2_Whole, 8)
plot_mosaic(EF_PS_2_Whole_filtered, 'Mosaic Plot for <=2 Metastasis')

In [None]:
EF_PS_3_Whole_filtered = filter_low_frequencies(EF_PS_3_Whole, 10)
plot_mosaic(EF_PS_3_Whole_filtered, 'Mosaic Plot for <=3 Metastasis')

In [None]:
EF_PS_4_Whole_filtered = filter_low_frequencies(EF_PS_4_Whole, 13)
plot_mosaic(EF_PS_4_Whole_filtered, 'Mosaic Plot for <=4 Metastasis')

In [None]:
EF_PS_5_Whole_filtered = filter_low_frequencies(EF_PS_5_Whole, 15)
plot_mosaic(EF_PS_5_Whole_filtered, 'Mosaic Plot for <=5 Metastasis')

In [None]:
EF_PS_6_Whole_filtered = filter_low_frequencies(EF_PS_6_Whole, 23)
plot_mosaic(EF_PS_6_Whole_filtered, 'Mosaic Plot for <=6 Metastasis')

### Nested Pie-chart (Visualization)

In [None]:
# Nested pie charts on the full dataset for count settings 2, 3, 4 and 44;
# the third argument is the matching text label.
# NOTE(review): 44 is the value other cells treat as 'all' - presumably the
# largest count in the data; confirm against the helper.
plot_nested_pie(df, 2, '<=2')

In [None]:
plot_nested_pie(df, 3, '<=3')

In [None]:
plot_nested_pie(df, 4, '<=4')

In [None]:
plot_nested_pie(df, 44, '<=44')