## Contextualization
Raw transactional data is loaded and columns of interest are identified for contextualization (layering).

## Clustering
Each layer is clustered independently over all time windows.

## Temporal Community Segmentation
Clusters are split up based on their timestamp into multiple time windows.


## Feature Engineering
_Features are extracted for each cluster:_
- cluster size
- cluster standard deviation
- cluster scarcity
- cluster popularity (importance I)
- cluster diversity (importance II)
- cluster range/needed space
- cluster center


_Features are extracted for each layer:_
- relative cluster sizes
- layer entropy
- distance from global centers

new:
- number of nodes
- number of clusters
- center of clusters

## Cluster Metrics Calculation

In [None]:
from typing import List
import json
import os
from entities import TimeWindow, Cluster

def calculate_metrics_for_clusters(layer_name: str, feature_names: List[str]):
    '''
    Computes the cluster metrics of every time window of one layer and stores them as json.

    All files found below `input/timeslices/<layer_name>` are parsed as time window json,
    converted to clusters, and the clusters' attribute dicts are written as one list
    to `input/metrics/<layer_name>.json`.

    :param layer_name: Name of the layer for which multiple time windows exist
    :param feature_names: Features of the layer
    '''
    print(f"Working on {layer_name}")

    source_dir = f'input/timeslices/{layer_name}'
    target_file = f'input/metrics/{layer_name}.json'

    all_clusters: List[Cluster] = []

    for directory, _, filenames in os.walk(source_dir):
        for filename in filenames:
            # every file holds exactly one serialized time window
            with open(os.path.join(directory, filename), 'r') as slice_file:
                window_dict = json.load(slice_file)
                window = TimeWindow.create_from_serializable_dict(window_dict)

                # clusters (incl. their metrics) of this single time window
                all_clusters += Cluster.create_multiple_from_time_window(window, feature_names)

    # persist the metrics of all clusters over all time windows
    with open(target_file, 'w') as out_file:
        out_file.write(json.dumps([cluster.__dict__ for cluster in all_clusters]))

In [None]:
# Each entry is [layer name, feature column(s) of that layer].
# NOTE(review): single-feature layers pass a plain str while
# calculate_metrics_for_clusters annotates feature_names as List[str] - confirm
# that the Cluster factory handles both forms.
layers = [
    ['CallTypeLayer', 'call_type'],
    ['DayTypeLayer', 'day_type'],
    ['TaxiIdLayer', 'taxi_id'],

    ['OriginCallLayer', ('call_type', 'origin_call')],
    ['OriginStandLayer', ('call_type', 'origin_stand')],
    ['StartLocationLayer', ('start_location_lat', 'start_location_long')],
    ['EndLocationLayer', ('end_location_lat', 'end_location_long')],
]

# compute and store the cluster metrics layer by layer
for layer in layers:
    calculate_metrics_for_clusters(*layer)

## ML Input Preparation

In [4]:
import json
from entities import Cluster
import collections
import numpy as np
from typing import Iterable

def get_evolution_label(old_size: int, new_size: int) -> int:
    '''
    Maps the size change of a cluster to its evolution label.

    Labels: 0 = continuing, 1 = shrinking, 2 = growing, 3 = dissolving, 4 = forming.
    Equal sizes (including 0 -> 0) always count as continuing.
    If none of the comparisons holds (e.g. for NaN sizes) None is returned.
    '''
    continuing, shrinking, growing, dissolving, forming = range(5)

    if old_size == new_size:
        label = continuing
    elif old_size == 0 and new_size > 0:
        label = forming
    elif new_size == 0 and old_size > 0:
        label = dissolving
    elif old_size > new_size:
        label = shrinking
    elif old_size < new_size:
        label = growing
    else:
        label = None  # sizes are not comparable

    return label

def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> (float, float):
    '''
    Encodes a periodic time value as a point on the unit circle.

    :param time: the time value to encode
    :param max_time_value: length of one full period (default 52)
    :return: tuple (sin, cos) of the angle 2*pi*time/max_time_value
    '''
    angle = 2*np.pi*time/max_time_value
    return np.sin(angle), np.cos(angle)

def create_metrics_training_data(layer_name: str, N: int = 3) -> Iterable[list]:
    """
    Loads the metrics training data for an individual layer from disk.
    A single metrics training data point should look like this:

    (cluster_size, cluster_std_dev, cluster_scarcity, cluster_import1, cluster_import2, cluster_range, cluster_center, time_info) ^ N, evolution_label
    time_info ... the time as 2d cyclic feature, i.e. time_info := (time_f1, time_f2)

    The first tuple represents metrics from the cluster in t_i-(N-1).
    The Nth tuple represents metrics from the cluster in t_i.
    The label is one of {continuing, shrinking, growing, dissolving, forming}, i.e. without {splitting, merging}, and identifies the change for t_i+1.

    This is a generator: the file is read and N is validated when the first data point is requested.

    :param N: number of cluster metric tuples, must be >= 1
    :param layer_name: the name of the layer metrics json file
    :return: yields lists of N metric tuples followed by the evolution label
    :raises ValueError: if N is smaller than 1
    """
    if N < 1:
        raise ValueError(f"N must be at least 1, got {N}")

    path_in = f"input/metrics/{layer_name}.json"
    with open(path_in, 'r') as file:
        data = [Cluster.create_from_dict(cl_d) for cl_d in json.load(file)]

    # group the slices of each cluster together, ordered by time window id
    # NOTE(review): assumes time_window_id sorts chronologically and that there are no gaps
    # between consecutive windows of a cluster - neither is checked here.
    data.sort(key=lambda cl: (cl.cluster_id, cl.time_window_id))

    # sliding window over the last N metric tuples; appending the N+1st drops the oldest one
    window = collections.deque(maxlen=N)

    # the last slice in data has no successor and therefore never yields a label
    for cur_cluster, next_cluster in zip(data, data[1:]):

        if cur_cluster.cluster_id != next_cluster.cluster_id:
            # next cluster slice in list will be another cluster id -> restart window and skip adding the current (last) cluster slice
            window.clear()
            continue

        window.append((cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.center, get_cyclic_time_feature(cur_cluster.get_time_info())))

        if len(window) == N:
            label = get_evolution_label(cur_cluster.size, next_cluster.size)
            yield list(window) + [label]


In [33]:
def flatten_metrics_datapoint(datapoint: list) -> ('X, y: np.array'):
    '''
    Flattens a single metrics data point in the form:
    [(cluster_size, cluster_variance, cluster_density, cluster_import1, cluster_import2, cluster_range, cluster_center, (time_f1, time_f2))^N, evolution_label]
    to one flat np.array that holds all features of all N tuples in order, followed by the evolution label as last element.
    '''
    *_, label = datapoint[-1:] or [datapoint[-1]]  # an empty datapoint raises IndexError here

    # per metrics tuple: all scalar features first, then the unpacked time tuple
    flat_values = [
        value
        for metrics in datapoint[:-1]
        for value in (*metrics[:-1], *metrics[-1])
    ]
    flat_values.append(label)  # y

    return np.asarray(flat_values)


In [55]:
import pandas as pd

def convert_metrics_data_to_dataframe(data: Iterable, columns=None) -> pd.DataFrame:
    '''
    Flattens metrics data points and collects them in a DataFrame, one row per data point.

    :param data: metrics data points as accepted by flatten_metrics_datapoint
    :param columns: column names of the resulting DataFrame; if None, the nine metric
        columns repeated three times followed by 'evolution_label' are used
    :return: DataFrame with one flattened data point per row
    '''
    if columns is None:
        # built per call so that no mutable default list is shared between calls
        columns = ['cluster_size', 'cluster_variance', 'cluster_density', 'cluster_import1', 'cluster_import2',
                   'cluster_range', 'cluster_center', 'time_f1', 'time_f2']*3 + ['evolution_label']

    training_data = [flatten_metrics_datapoint(element) for element in data]

    return pd.DataFrame(data=training_data, columns=columns)

In [59]:
# Two hand-made data points (3 metric tuples + label each) to check the conversion.
p = [
    [
        (1, 1, 1, 1, 1, 1, 1, (1, 1)),
        (2, 2, 2, 2, 2, 2, 2, (2, 2)),
        (3, 3, 3, 3, 3, 3, 3, (3, 3)),
        4,
    ],
    [
        (1, 1, 1, 7, 1, 1, 1, (1, 1)),
        (2, 2, 2, 7, 2, 2, 2, (2, 2)),
        (3, 3, 3, 3, 3, 7, 3, (3, 7)),
        9,
    ],
]
convert_metrics_data_to_dataframe(p)


Unnamed: 0,cluster_size,cluster_variance,cluster_density,cluster_import1,cluster_import2,cluster_range,cluster_center,time_f1,time_f2,cluster_size.1,...,cluster_size.2,cluster_variance.1,cluster_density.1,cluster_import1.1,cluster_import2.1,cluster_range.1,cluster_center.1,time_f1.1,time_f2.1,evolution_label
0,1,1,1,1,1,1,1,1,1,2,...,3,3,3,3,3,3,3,3,3,4
1,1,1,1,7,1,1,1,1,1,2,...,3,3,3,3,3,7,3,3,7,9


## Balancing and Training/Testing Split

In [60]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import collections
import statistics as stat

def balance_dataset(df: DataFrame) -> DataFrame:
    '''
    Placeholder for balancing the label classes of the dataset.

    Balancing is not implemented yet, so the DataFrame is returned unchanged.

    :param df: the metrics training data
    :return: the same DataFrame object that was passed in
    '''
    # TODO: implement the actual balancing
    return df

def save_training_data(layer_name: str, test_dataset_frac: float = .2) -> '(X_train, Y_train, X_test, Y_test)':
    '''
    Builds the metrics training data of one layer and writes it as csv to
    `output/cluster_metrics/data/<layer_name>`.

    NOTE: the training/test split is currently disabled (see the commented code below),
    so `test_dataset_frac` is unused and the function returns None despite its annotation.

    :param layer_name: the name of the layer whose metrics are loaded
    :param test_dataset_frac: intended fraction of the test set (currently unused)
    '''
    # metrics data from disk -> flat DataFrame -> balanced DataFrame
    metrics = create_metrics_training_data(layer_name=layer_name)
    frame = convert_metrics_data_to_dataframe(metrics)
    balanced = balance_dataset(frame)

    balanced.to_csv(f'output/cluster_metrics/data/{layer_name}')

    # # split in training and test set
    # test_size = int(X.shape[0] * test_dataset_frac) 
    # X_train = X[test_size:]
    # Y_train = Y[test_size:]
    # X_test = X[:test_size]
    # Y_test = Y[:test_size]

    # print(f"\nWorking with: {X_train.shape[0]} training points + {X_test.shape[0]} test points ({X_test.shape[0]/(X_train.shape[0]+X_test.shape[0])}).")
    # print(f"Label Occurrences: Total = {collections.Counter(Y_train.tolist() + Y_test.tolist())}, "\
    #       f"Training = {collections.Counter(Y_train)}, Test = {collections.Counter(Y_test)}")
    # try:
    #     print(f"Label Majority Class: Training = {stat.mode(Y_train)}, Test = {stat.mode(Y_test)}\n")
    # except stat.StatisticsError:
    #     print(f"Label Majority Class: no unique mode; found 2 equally common values")

    # return X_train, Y_train, X_test, Y_test