# Clustering Experiments

In [None]:
%run Detector.ipynb

In [None]:
from collections import defaultdict
import plotly.express as px
import seaborn as sns
import pandas as pd
import threading
import os
import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
def get_files_like(file_name_pattern=r'(?s).*', folder=None):
    """
    Returns a sorted list of all entries in `folder` with a name that
    matches `file_name_pattern`.

    `file_name_pattern` is a regular expression applied with `re.search`,
    so it may match anywhere in the name; the default matches every name.

    Each match is returned joined onto `folder`. If `folder` is None the
    current working directory is scanned and the bare names are returned.
    """
    matcher = re.compile(file_name_pattern)
    # os.listdir yields entries in arbitrary order; sorting keeps repeated
    # runs consistent.
    matches = sorted(name for name in os.listdir(folder) if matcher.search(name))
    if folder is None:
        # os.path.join(None, name) raises TypeError; the bare names are
        # already valid relative to the working directory.
        return matches
    return [os.path.join(folder, name) for name in matches]
                
def load_data_from(file_names, **kwargs):
    """
    Returns the dataframe(s) associated with each file name in
    `file_names`.

    `file_names` may be a single path (a str or an os.PathLike object)
    or an iterable of paths. Any `kwargs` are forwarded to `pd.read_csv`.

    A single path, or an iterable holding exactly one path, yields one
    dataframe. Any other iterable yields a list of dataframes (an empty
    list for an empty iterable).
    """
    # isinstance also accepts str subclasses and pathlib paths, which the
    # iteration below would otherwise try (and fail) to loop over.
    if isinstance(file_names, (str, os.PathLike)):
        return pd.read_csv(file_names, **kwargs)
    data = [pd.read_csv(file_name, **kwargs) for file_name in file_names]
    return data[0] if len(data) == 1 else data

def key_formatter(name):
    """
    Shortens a file name to a convenient key: the text after the last
    underscore, cut off at the first period that follows it.
    """
    tail = name.rsplit('_', 1)[-1]
    key, _, _ = tail.partition('.')
    return key

def run_experiments(locations, kernels, clusters, outputs=None, *args, **kwargs):
    """
    Returns a nested dictionary, d, such that d[c][k] is the value
    returned by `Detector(...).run()` for kernel k with a clustering
    value of c.

    `kernels` maps a kernel name to its data, and `clusters` is an
    iterable of clustering values. Results are written into `outputs`
    when one is supplied (and that same object is returned); otherwise
    a fresh dictionary is created for this call. Extra `args` and
    `kwargs` are forwarded to `Detector`.
    """
    # A dictionary used as the default value would be created once and
    # shared by every call, so results from earlier runs would leak into
    # later ones.
    if outputs is None:
        outputs = defaultdict(dict)
    for c in clusters:
        for n, k in kernels.items():
            detector = Detector(k, locations, *args, as_df=True, n_clusters=c, **kwargs)
            # setdefault keeps this working for plain dicts as well
            outputs.setdefault(c, {})[n] = detector.run()
        print('Completed cluster:', c)
    return outputs

def parallelize_experiments(locations, kernels, clusters, *args, **kwargs):
    """
    Runs `run_experiments` on several threads and returns the single
    dictionary that all of them write their results into.

    `clusters` is expected to be a list of lists:

        clusters = [
            [ ...cluster(s)... ],
            [ ...cluster(s)... ],
            ...
            [ ...cluster(s)... ]
        ]

    One thread is started per inner list. The call returns once every
    thread has finished.
    """
    shared = defaultdict(dict)
    workers = []
    for idx, group in enumerate(clusters):
        worker = threading.Thread(
            target=run_experiments,
            name=f'thread {idx}',
            args=(locations, kernels, group, shared, *args),
            kwargs=kwargs,
        )
        workers.append(worker)
        worker.start()

    # Wait for all workers before handing the results back
    for worker in workers:
        worker.join()

    print("Done!")
    return shared

In [None]:
# Folders that are searched for the kernel files and the location files
KER_FOLDER = 'data/kernel1'
LOC_FOLDER = 'data/locations'

# Location data: every file in LOC_FOLDER whose name contains 'stp_cluster'
stp_loc = load_data_from(get_files_like('stp_cluster', LOC_FOLDER))

# One headerless table per kernel file, keyed by the shortened file name
kernels = {
    key_formatter(path): load_data_from(path, header=None)
    for path in get_files_like('kernel', KER_FOLDER)
}

In [None]:
# Display the loaded location data to check that it was read as expected
stp_loc

In [None]:
# Tolerances handed to the experiments below: `b` is passed as `b_tol`
# (bridge labeling) and `s` as `s_tol` (steady state detection)
b = 1e-2
s = 1e-2

In [None]:
# Uncomment these lines to run experiments in series
# clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 150, 200, 250, 300]
# results = run_experiments(stp_loc, kernels, clusters, b_tol=b, s_tol=s)
# results

In [None]:
# Run the experiments in parallel: each inner list of cluster counts is
# processed by its own thread (see parallelize_experiments)
clusters = [
    [2, 3, 4, 5, 6, 7, 8, 9, 10],
    [25, 75, 100],
    [50, 150]
]
results = parallelize_experiments(stp_loc, kernels, clusters, b_tol=b, s_tol=s)
results

# Data Visualization

In [None]:
# Helpers

def get_label_counts(results, cluster_num, kernel_names=None):
    """
    Counts how many rows of each `type` every kernel produced for
    `cluster_num` clusters.

    `results[cluster_num]` maps a kernel name to an object whose
    `results()` method returns a dataframe with a `type` column.
    `kernel_names` restricts the kernels that are counted; by default
    all kernels stored for `cluster_num` are used.

    Returns a dataframe indexed by (`name`, `type`) with a `count`
    column and a `clusters` column set to `cluster_num`. The 'bridge',
    'sink' and 'source' types are always present, with a count of 0
    when a kernel produced none of them.
    """
    expected = ('bridge', 'sink', 'source')
    detectors = results[cluster_num]
    names = detectors.keys() if kernel_names is None else kernel_names
    frames = []
    for name in names:
        counts = detectors[name].results().groupby('type').size().to_frame('count')
        # DataFrame.append was removed in pandas 2.0; reindex adds the
        # missing types (after the observed ones) with a zero count.
        missing = [label for label in expected if label not in counts.index]
        if missing:
            counts = counts.reindex(list(counts.index) + missing, fill_value=0)
        # Keep the level name stable whether or not rows were added
        counts = counts.rename_axis('type')
        frames.append(pd.concat({name : counts}, names=['name']))
    # Concatenate once instead of growing the frame inside the loop
    df = pd.concat(frames) if frames else pd.DataFrame()
    df['clusters'] = cluster_num
    return df

def get_steps(d, k, start=None, final=None):
    """
    Returns the time steps to sample.

    If `k` is an integer, returns `k` evenly spaced integer steps from
    `start` (0 when None) to `final` (`d.ss_step` when None), both
    inclusive. Any other `k` (e.g. a list of ints) is treated as a
    custom range and returned unchanged.
    """
    # NumPy integers count as integers too; bools are passed through as
    # before rather than being read as a sample count.
    if isinstance(k, bool) or not isinstance(k, (int, np.integer)):
        return k
    # Resolve each bound on its own so that `start` is honored even when
    # `final` is left at its default.
    first = 0 if start is None else start
    last = d.ss_step if final is None else final
    return np.round(np.linspace(first, last, k)).astype(int)

# Static Plots
    
def steady_state_lineplot(results, cluster_num, k=200, start=None, final=None, fs=(8,8)):
    """
    Draws, for every kernel stored under `cluster_num`, a line showing
    how many nodes changed population between consecutive sampled time
    steps.

    `k` evenly spaced steps are sampled; pass a list of ints as `k` to
    use a custom range instead. When `final` is None, the largest
    `ss_step` among the kernels is used for all of them. `fs` is the
    figure size.
    """
    detectors = results[cluster_num]
    if final is None:
        final = np.max([det.ss_step for det in detectors.values()])

    df = pd.DataFrame()
    for kernel_name, det in detectors.items():
        steps = get_steps(det, k, start, final)
        populations = np.array([det.migrate(step) for step in steps])
        # A node counts as changed when its population differs from the
        # previous sample by more than np.isclose's tolerance
        changed = ~np.isclose(np.diff(populations, axis=0), 0)
        frame = pd.DataFrame()
        frame['count'] = np.sum(changed, axis=1)
        frame['kname'] = kernel_name
        # The first sample has no predecessor, so it has no count
        frame['steps'] = steps[1:]
        df = pd.concat([df, frame])

    plt.figure(figsize=fs)
    sns.lineplot(data=df, x="steps", y="count", hue="kname")
    plt.title("Steady State Progression")
    plt.xlabel("Time Step")
    plt.ylabel("Number of Migrations")
    
def label_counts_barplot(results, cluster_num, fs=(8,6)):
    """
    Draws a grouped bar plot with one group per kernel and one bar per
    type (sink/bridge/source), using the counts for `cluster_num`
    clusters. `fs` is the figure size.
    """
    counts = get_label_counts(results, cluster_num)
    # Flatten the index into columns; the type level may come back
    # unnamed as 'level_1'
    df = counts.reset_index()
    df = df.rename(columns={'level_1' : 'type'})
    plt.figure(figsize=fs)
    sns.barplot(data=df, x="name", y="count", hue="type")
    plt.title(f"Sink/Bridge/Source Counts for {cluster_num} Cluster(s)")
    plt.xticks(rotation=30)

def label_counts_lineplot(results, kernel_name, fs=(8,6)):
    """
    Draws one line per type (sink/bridge/source) showing its count for
    `kernel_name` against the number of clusters, over every cluster
    count in `results`. `fs` is the figure size.
    """
    per_cluster = [get_label_counts(results, c, [kernel_name]) for c in results]
    # Flatten the index into columns; the type level may come back
    # unnamed as 'level_1'
    df = pd.concat(per_cluster).reset_index()
    df = df.rename(columns={'level_1' : 'type'})
    plt.figure(figsize=fs)
    sns.lineplot(data=df, x="clusters", y="count", hue="type")
    # NOTE(review): the title reads "... for <kernel_name> Cluster(s)" even
    # though the value is a kernel name -- confirm the intended wording
    plt.title(f"Sink/Bridge/Source Counts for {kernel_name} Cluster(s)")
    plt.xticks(rotation=30)

# Interactive Plots
    
def plot_interactive_label_counts(results, kernel_name):
    """
    Shows an animated bar plot of the sink/bridge/source counts for
    `kernel_name`, with one animation frame per cluster count in
    `results`.
    """
    per_cluster = [get_label_counts(results, c, [kernel_name]) for c in results]
    # Flatten the index into columns; the type level may come back
    # unnamed as 'level_1'
    df = pd.concat(per_cluster).reset_index()
    df = df.rename(columns={'level_1' : 'type'})
    # Fix the y axis so the bars are comparable across frames
    y_limits = [0, df["count"].max()]
    fig = px.bar(
        df,
        x="type",
        y="count",
        color="type",
        animation_frame="clusters",
        animation_group="clusters",
        range_y=y_limits,
    )
    fig.show()
    
    
def plot_population_distribution(results, cluster_num, kernel_name, k=200, start=None, final=None):
    """
    Shows an animated bar plot of the population of every node for
    `kernel_name` at `cluster_num` clusters, with one animation frame
    per sampled time step.

    `k` evenly spaced steps between `start` and `final` are sampled;
    pass a list of ints as `k` to use a custom range instead.
    """
    d = results[cluster_num][kernel_name]
    steps = get_steps(d, k, start, final)
    # One node per row of d.tmtx (presumably the transition matrix -- confirm)
    n_nodes = len(d.tmtx)
    populations = np.array([d.migrate(s) for s in steps])
    step_labels = np.array([[s] * n_nodes for s in steps])
    # Long format: one row per (step, node) pair
    df = pd.DataFrame({
        'node'       : list(range(n_nodes)) * len(steps),
        'population' : populations.flatten(),
        'step'       : step_labels.flatten()
    })

    # Fix the y axis so the bars are comparable across frames
    y_limits = [0, df['population'].max()]
    fig = px.bar(
        df,
        x="node",
        y="population",
        color="node",
        animation_frame="step",
        animation_group="node",
        range_y=y_limits,
    )
    fig.show()

In [None]:
print(" Kernels:", list(kernels.keys()))
# `clusters` is a list of unequal-length lists in the parallel setup, which
# NumPy cannot turn into a regular array (newer versions raise ValueError),
# so flatten it by hand. A flat list of ints (serial setup) is accepted too.
flat_clusters = []
for group in clusters:
    flat_clusters.extend(group if isinstance(group, (list, tuple)) else [group])
print("Clusters:", flat_clusters)

In [None]:
# Specify a kernel name and a clustering number
# They are used together as results[CLUSTER_NUM][KERNEL_NAME], so the name
# must be one of the kernels and the number one of the cluster counts that
# were actually run
KERNEL_NAME = '2500'
CLUSTER_NUM = 100

## Steady State Analysis

In [None]:
# For every kernel at CLUSTER_NUM clusters, plot how many nodes changed
# population between consecutive samples, using 300 sampled time steps
steady_state_lineplot(results, CLUSTER_NUM, k=300)

In [None]:
# Sample every step from 0 through 80, then every 50th step from 500 to 950
custom_range = [*range(81), *range(500, 1000, 50)]
plot_population_distribution(results, CLUSTER_NUM, KERNEL_NAME, k=custom_range)

#### Observation(s): 
- Steady state appears to be reached at a later time step as the kernel number decreases.
- The number of clusters does not appear to affect the time step at which steady state occurs.
- The population changes most drastically in the first hundred time steps. Very minor changes in population occur afterwards. This occurs for all combinations of kernels and clusters.
- When the same initial population is used for all kernels, each kernel reaches the same steady state distribution.

## Sink/Source/Bridge Analysis

In [None]:
# Compare the sink/bridge/source counts of all kernels at CLUSTER_NUM clusters
label_counts_barplot(results, CLUSTER_NUM)

In [None]:
# Show how the sink/bridge/source counts of KERNEL_NAME vary with the
# number of clusters
label_counts_lineplot(results, KERNEL_NAME)

#### Observation(s): 
- Assuming all kernels have reached steady state, there appears to be no variation among the number of sinks, sources, and bridges for each of the kernels.
- The number of sinks and sources increases as the number of clusters increases.

# Network Plots

In [None]:
import geojson
from descartes import PolygonPatch

# To plot a background, convert the SHP file(s) to a geojson file
# Use: https://mygeodata.cloud/converter/shp-to-geojson
# JSON text is UTF-8, so state the encoding explicitly instead of relying on
# the platform default (which is not UTF-8 everywhere)
with open("STP.geojson", encoding="utf-8") as json_file:
    json_data = geojson.load(json_file)

# The features of the file; their geometries are used as plot backgrounds
poly = json_data['features']

In [None]:
# Polygon coordinates for the plot backgrounds: the first feature is used for
# `principe`, the second for `sao_tome`
# (presumably the file lists the islands in that order -- confirm)
principe = dict(type='MultiPolygon', coordinates=poly[0]['geometry']['coordinates'])
sao_tome = dict(type='MultiPolygon', coordinates=poly[1]['geometry']['coordinates'])
# Both features combined, built as a new list so the two above stay untouched
all_data = dict(
    type='MultiPolygon',
    coordinates=[
        *poly[0]['geometry']['coordinates'],
        *poly[1]['geometry']['coordinates'],
    ],
)

In [None]:
# Detector output for the selected cluster count and kernel
d = results[CLUSTER_NUM][KERNEL_NAME]
cids = d.clabels()
prps = d.results()

# Rows of `cids` with a latitude above 1.25 are treated as Principe.
# NOTE(review): `.iloc` is positional but is given index labels here; this
# presumably relies on `cids` having a default 0..n-1 index that lines up
# with the rows/columns of the kernel -- confirm
principe_locs = cids[cids['lat'] > 1.25]
principe_tmtx = kernels[KERNEL_NAME].iloc[principe_locs.index, principe_locs.index]
# Keep only the rows of `prps` whose index is a cluster id found on the island
principe_coms = prps[prps.index.isin(principe_locs['cid'].unique())]

# Rows of `cids` with a latitude below 0.50 are treated as Sao Tome
# (same selection as above)
sao_tome_locs = cids[cids['lat'] < 0.50]
sao_tome_tmtx = kernels[KERNEL_NAME].iloc[sao_tome_locs.index, sao_tome_locs.index]
sao_tome_coms = prps[prps.index.isin(sao_tome_locs['cid'].unique())]

In [None]:
# Network plot for the Principe subset, drawn over the Principe background
plot_data(
    principe_tmtx,
    principe_locs,
    principe_coms,
    nodes_fn=lambda x: x**(1/3),
    bordr_mu=3,
    min_prob=0.007,
    edges_mu=100,
    bgrd_crd=principe,
    bgbd_lwd=5,
    fig_size=(10,9),
    plt_bbar=True,
    plt_pbar=True,
    plt_sbar=True,
)

In [None]:
# Network plot for the Sao Tome subset, drawn over the Sao Tome background
plot_data(
    sao_tome_tmtx,
    sao_tome_locs,
    sao_tome_coms,
    nodes_fn=lambda x: x**(1/4),
    bordr_mu=5,
    edges_fn=np.log10,
    min_prob=0.009,
    edges_mu=1,
    bgrd_crd=sao_tome,
    bgbd_lwd=2,
    fig_size=(13,9),
    plt_bbar=True,
    plt_pbar=True,
    # plt_sbar=True,
)