# Community Prediction based on Taxi Dataset

## Load dataset
1. TRIP_ID: (String) It contains a unique identifier for each trip;
1. CALL_TYPE: (char) It identifies the way used to demand this service. It may contain one of three possible values:
    - ‘A’ if this trip was dispatched from the central;
    - ‘B’ if this trip was demanded directly to a taxi driver on a specific stand;
    - ‘C’ otherwise (i.e. a trip demanded on a random street).
1. ORIGIN_CALL: (integer) It contains a unique identifier for each phone number which was used to demand, at least, one service. It identifies the trip’s customer if CALL_TYPE=’A’. Otherwise, it assumes a NULL value;
1. ORIGIN_STAND: (integer): It contains a unique identifier for the taxi stand. It identifies the starting point of the trip if CALL_TYPE=’B’. Otherwise, it assumes a NULL value;
1. TAXI_ID: (integer): It contains a unique identifier for the taxi driver that performed each trip;
1. TIMESTAMP: (integer) Unix Timestamp (in seconds). It identifies the trip’s start; 
1. DAYTYPE: (char) It identifies the daytype of the trip’s start. It assumes one of three possible values:
    - ‘B’ if this trip started on a holiday or any other special day (i.e. extending holidays, floating holidays, etc.);
    - ‘C’ if the trip started on a day before a type-B day;
    - ‘A’ otherwise (i.e. a normal day, workday or weekend).
1. MISSING_DATA: (Boolean) It is FALSE when the GPS data stream is complete and TRUE whenever one (or more) locations are missing
1. POLYLINE: (String): It contains a list of GPS coordinates (i.e. WGS84 format) mapped as a string. The beginning and the end of the string are identified with brackets (i.e. \[ and \], respectively). Each pair of coordinates is also identified by the same brackets as \[LONGITUDE, LATITUDE\]. This list contains one pair of coordinates for each 15 seconds of trip. The last list item corresponds to the trip’s destination while the first one represents its start;


In [None]:
import csv
import json
from datetime import datetime
from typing import Iterator

enum_mapping = {'A': 1, 'B': 2, 'C': 3}

# Column positions in the csv file, in header order
_ENUM_COLUMNS = (1, 6)      # call_type, day_type
_INT_COLUMNS = (2, 3, 4)    # origin_call, origin_stand, taxi_id
_TIMESTAMP_COLUMN = 5
_MISSING_DATA_COLUMN = 7
_POLYLINE_COLUMN = 8


def _convert_field(index: int, value: str):
    '''Converts the raw csv string found in column number index to its typed value.'''
    if index in _ENUM_COLUMNS:
        return enum_mapping[value]
    if index in _INT_COLUMNS:
        # missing values are empty strings in the file and are kept as empty strings
        return int(value) if value != "" else ""
    if index == _TIMESTAMP_COLUMN:
        # kept as unix seconds, because datetime is not serializable
        return int(value)
    if index == _MISSING_DATA_COLUMN:
        return value.lower() == 'true'
    if index == _POLYLINE_COLUMN:
        # polyline AS List[List[float]]
        return json.loads(value)
    # trip_id (column 0) stays a string
    return value


def load_csv_content(path: str = 'input/train.csv') -> Iterator:
    '''
    Returns a generator for all lines in the csv file with correct field types.

    Every yielded entry is a dict mapping the lower-cased header names to the converted values.
    An empty file yields nothing.

    :param path: Location of the csv file, defaults to the training dataset
    :raises ValueError: If a row contains fewer fields than the header
    '''
    # newline='' lets the csv module handle line endings inside quoted fields itself
    with open(path, newline='') as csv_file:
        reader = csv.reader(csv_file)

        header_row = next(reader, None)
        if header_row is None:
            return
        headers = [h.lower() for h in header_row]

        for line in reader:
            if len(line) < len(headers):
                raise ValueError(
                    f"Line {reader.line_num} of {path} has {len(line)} fields, expected {len(headers)}"
                )

            # fields beyond the header are ignored, zip() would drop them anyway
            converted = [_convert_field(i, value) for i, value in enumerate(line[:len(headers)])]
            yield dict(zip(headers, converted))


In [None]:
# Print the first converted entry of a fresh generator as a quick look at the parsed fields.
print(next(load_csv_content()))

## Display some dataset routes

In [None]:
from typing import List
import folium

def displayNodes(nodes: List[List[float]]):
    '''
    Draws one red marker per node on a map and shows the map in the notebook.

    The marker position is built as [node[1], node[0]], i.e. the two values of each node are swapped.
    Each marker's popup is the index of the node inside the list.

    :param nodes: A list of coordinates, eg. [[1,2],[1,3]]
    '''
    city_map = folium.Map(
        location=[41.15, -8.6],
        tiles='stamenterrain',
        zoom_start=12,
        control_scale=True,
    )

    for position, node in enumerate(nodes):
        marker = folium.Marker(
            location=[node[1], node[0]],
            popup=position,
            icon=folium.Icon(color='red', icon='circle'),
        )
        marker.add_to(city_map)

    display(city_map)

In [None]:
# Keep one generator around, so that every next(content) call hands out the following entry.
content = load_csv_content()

In [None]:
# Show the polyline of the next entry on the map; re-running this cell advances to the following entry.
displayNodes(next(content)['polyline'])

# Model Training

## Split dataset into multiple layers
The SMART pipeline is used to split up the data into multiple layers. To do so, the csv file is uploaded to the Semantic Linking microservice for layer creation. <br />
Next, the Role Stage Discovery microservice clusters the individual layers and splits them into multiple time windows based on the timestamp.

## Define features per cluster/ layer
### "Local" features based on single clusters
- cluster size ($\#\ cluster\ nodes$)
- cluster standard deviation (variance from cluster mean)
- cluster scarcity (ratio $\frac{cluster\ range}{cluster\ size}$ ) <br />
Scarcity is preferred over density to avoid divide-by-zero errors
- (cluster trustworthiness)
### "Global" features based on clusters in the context of a layer
- cluster importance I (ratio $\frac{cluster\ size}{\#\ layer\ nodes}$)
- cluster importance II (ratio $\frac{1}{diversity}$, where *diversity* = number of clusters with nodes > 0)

## Calculate the Metrics for the Clusters

In [None]:
from typing import List
import json
import os
from entities import TimeWindow, Cluster

def calculate_metrics_for_clusters(layer_name: str = 'CallTypeLayer', feature_names: List[str] = None):
    '''
    Creates the clusters with their metrics for all time slices of a layer and stores them in one json file.

    Every file below input/timeslices/<layer_name> is read as one time window.
    The clusters of all time windows are collected and written to input/metrics/<layer_name>.json.

    :param layer_name: Name of the layer, used for the input folder and the output file name
    :param feature_names: Passed on unchanged to Cluster.create_multiple_from_time_window, defaults to ['call_type']
    '''
    # a list as default value would be shared between all calls
    if feature_names is None:
        feature_names = ['call_type']

    print(f"Working on {layer_name}")

    path_in = f'input/timeslices/{layer_name}'
    path_out = f'input/metrics/{layer_name}.json'

    complete_clusters: List[Cluster] = []

    for root, _, files in os.walk(path_in):
        for f in files:
            with open(os.path.join(root, f), 'r') as file:
                json_slice = json.load(file)
            time_window = TimeWindow.create_from_serializable_dict(json_slice)

            # create all clusters + metrics for one time window
            clusters = Cluster.create_multiple_from_time_window(time_window, feature_names)
            complete_clusters.extend(clusters)

    # serialize first, so an existing metrics file is not truncated if serialization fails
    serialized = json.dumps([cl.__dict__ for cl in complete_clusters])

    # store the cluster metrics
    os.makedirs(os.path.dirname(path_out), exist_ok=True)
    with open(path_out, 'w') as file:
        file.write(serialized)

In [None]:
# One entry per layer: [layer name, feature name(s)].
# The first three layers pass a single feature name, the remaining ones a tuple of two feature names.
layers = [
    ['CallTypeLayer', 'call_type'],
    ['DayTypeLayer', 'day_type'],
    ['TaxiIdLayer', 'taxi_id'],

    ['OriginCallLayer', ('call_type', 'origin_call')],
    ['OriginStandLayer', ('call_type', 'origin_stand')],
    ['StartLocationLayer', ('start_location_lat', 'start_location_long')],
    ['EndLocationLayer', ('end_location_lat', 'end_location_long')],
]

# calculate and store the metrics layer by layer
for layer_name, layer_features in layers:
    calculate_metrics_for_clusters(layer_name, layer_features)

## Prepare cluster metrics and evolution labels for ML

In [None]:
# Example how to convert time to a cyclic 2d feature

import numpy as np
import matplotlib.pyplot as plt

MAX_TIME_VAL = 52 # for weeks

# the values 1..52
times = np.arange(1, 53)

# position of every value on the unit circle, as angle in radians
angles = 2*np.pi*times/MAX_TIME_VAL

df = {
    'sin_time': np.sin(angles),
    'cos_time': np.cos(angles),
}

# both components as curves over the time values
for series in df.values():
    plt.plot(series)
plt.show()

# both components combined
plt.scatter(df['sin_time'], df['cos_time'])
plt.show()

# feature_new = {i+1:(s,c) for i,(s,c) in enumerate(zip(df['sin_time'], df['cos_time']))}

In [None]:
# Example how to calculate convex hull from points
# Used to calculate 2d area for Scarcity metric

from scipy.spatial import ConvexHull, convex_hull_plot_2d
import numpy as np

# small hand-written set of 2d points used as input for the example
points = np.asarray([[0.0,0.0], [1.0,3.0], [3.0,2.0], [0.0,2.0], [1.0,2.0], [2.0,2.0], [2.0,1.0]])

def _get_polygon_border_points(points) -> 'np.array':
        hull = ConvexHull(points)
        return points[hull.vertices]

import matplotlib.pyplot as plt

res = _get_polygon_border_points(points)

# draw all points first, then the border points on top of them
for point_set in (points, res):
    plt.plot(point_set[:,0], point_set[:,1], 'o')

plt.show()

In [None]:
import json
from entities import Cluster
import collections
import numpy as np
from typing import Iterable

def get_evolution_label(old_size: int, new_size: int) -> int:
    '''
    Returns the evolution label for a change in cluster size as int:
    0 = continuing, 1 = shrinking, 2 = growing, 3 = dissolving, 4 = forming.

    :param old_size: The cluster size before the change
    :param new_size: The cluster size after the change
    '''
    label = None

    if old_size == new_size:
        label = 0 # continuing
    elif old_size == 0:
        label = 4 # forming, as the sizes differ the new size is not 0
    elif new_size == 0:
        label = 3 # dissolving, as the sizes differ the old size is not 0
    elif old_size > new_size:
        label = 1 # shrinking
    elif old_size < new_size:
        label = 2 # growing

    return label

def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> (float, float):
    '''
    Converts a time value to a cyclic 2d feature.

    :param time: The time value to convert
    :param max_time_value: The time value which corresponds to one full cycle
    :return: The tuple (sin, cos) of the angle 2*pi*time/max_time_value
    '''
    angle = 2*np.pi*time/max_time_value
    return (np.sin(angle), np.cos(angle))

def create_metrics_training_data(N: int = 3, layer_name: str = 'CallTypeLayer') -> Iterable:
    r"""
    Generates the metrics training data points for one layer from input/metrics/<layer_name>.json.

    A single metrics training data point should look like this:

    (cluster_size, cluster_std_dev, cluster_scarcity, cluster_import1, cluster_import2, time_info) ^ N, evolution_label
    time_info ... the time as 2d cyclic feature, i.e. time_info := (time_f1, time_f2)

    The first tuple represents metrics from the cluster in t_i-(N-1).
    The Nth tuple represents metrics from the cluster in t_i.
    The label is one of {continuing, shrinking, growing, dissolving, forming} \ {splitting, merging} and identifies the change for t_i+1.

    As this is a generator, the file is read and N is checked when the first data point is requested.

    :param N: number of cluster metric tuples, must be at least 1
    :param layer_name: name of the layer whose metrics file is read
    :raises ValueError: if N is smaller than 1
    """
    if N < 1:
        raise ValueError(f"N must be at least 1, got {N}")

    path_in = f"input/metrics/{layer_name}.json"
    with open(path_in, 'r') as file:
        data = [Cluster.create_from_dict(cl_d) for cl_d in json.load(file)]

    # entries of the same cluster id end up next to each other, ordered by time window id
    data.sort(key=lambda cl: (cl.cluster_id, cl.time_window_id))

    # sliding window over the last N metric tuples: appending the N+1st element removes the oldest one
    window = collections.deque(maxlen=N)

    for cur_cluster, next_cluster in zip(data, data[1:]):

        if cur_cluster.cluster_id != next_cluster.cluster_id:
            # next cluster slice in list will be another cluster id -> restart window and skip adding the current (last) cluster slice
            window.clear()
            continue

        cur_metrics = (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, get_cyclic_time_feature(cur_cluster.get_time_info()))
        window.append(cur_metrics)

        # the label describes the size change from the current to the next slice of the same cluster
        label = get_evolution_label(cur_cluster.size, next_cluster.size)

        # only full windows are valid data points
        if len(window) == N:
            yield list(window) + [label]


In [None]:
def flatten_metrics_datapoint(datapoint: list) -> ('X', 'Y'):
    '''
    Flattens a single metrics data point in the form:
    [(metric_1, ..., metric_k, (time_f1, time_f2))^N, evolution_label]
    to:
    (X: np.array, evolution_label)

    The last element of every metrics tuple is itself a tuple and gets unpacked into X,
    the last element of the data point is returned unchanged as label.
    '''
    flat_values = [
        value
        for metrics in datapoint[:-1]               # all x, the label is excluded
        for value in (*metrics[:-1], *metrics[-1])  # number features followed by the unpacked time tuple
    ]

    return np.asarray(flat_values), datapoint[-1]


In [None]:
def convert_metrics_data_for_training(data: Iterable) -> ('nparray with Xs', 'nparray with Ys'):
    '''
    Flattens every metrics data point and collects features and labels in separate arrays.

    :param data: The metrics data points, each one is passed to flatten_metrics_datapoint
    :return: The tuple (X, Y), where X[i] and Y[i] belong to the ith data point
    '''
    flattened = [flatten_metrics_datapoint(element) for element in data]

    X = [features for features, _ in flattened]
    Y = [label for _, label in flattened]

    return (np.asarray(X), np.asarray(Y))

## Evolution Prediction Approach

### 1. Prediction of cluster evolution based on metrics from clusters in one layer
Use cluster metrics from last N time windows to predict the change in $t_{i+1}$.
Either use normal classification with $(cluster\_metrics)^{N} \cup (label)$ or choose an RNN.


In [None]:
data = list(create_metrics_training_data(layer_name='CallTypeLayer'))

import random
random.shuffle(data)

# hold out the last TEST_SIZE data points of the shuffled data for testing, train on all others
TEST_SIZE = 30
train_metrics = data[:-TEST_SIZE]
test_metrics = data[-TEST_SIZE:]

print(f"Working with: {len(train_metrics)} training points + {len(test_metrics)} test points ({len(test_metrics)/(len(train_metrics)+len(test_metrics))}).")

X_train, Y_train = convert_metrics_data_for_training(train_metrics)
X_test, Y_test = convert_metrics_data_for_training(test_metrics)


# show the label distribution, to compare the classifiers with the majority class later on
import collections
import statistics as stat
print(f"Label Occurrences: Total = {collections.Counter(Y_train.tolist() + Y_test.tolist())}, Training = {collections.Counter(Y_train)}, Test = {collections.Counter(Y_test)}")
print(f"Label Majority Class: Training = {stat.mode(Y_train)}, Test = {stat.mode(Y_test)}\n")

## SVM classifier

In [None]:
# train
from sklearn import svm

# support vector classifier with linear kernel, fitted on the training split
svc = svm.SVC(kernel='linear')
svc.fit(X_train, Y_train)

In [None]:
# verify
# import the submodule explicitly, a plain "import sklearn" does not guarantee that sklearn.metrics is loaded
import sklearn.metrics

pred_Y = svc.predict(X_test)

# compare the predictions for the test split with the true labels
print(sklearn.metrics.classification_report(y_true=Y_test, y_pred=pred_Y))

In [None]:
# export
import pickle

# one path for export and import, so both always refer to the same file
FILE_NAME = 'svc'
MODEL_PATH = f'output/{FILE_NAME}.model'

with open(MODEL_PATH, 'wb') as file:
    pickle.dump(svc, file)

# import
# NOTE: unpickling can execute arbitrary code, only load model files from a trusted source
with open(MODEL_PATH, 'rb') as file:
    svc = pickle.load(file)

## Naive classifiers

In [None]:
# import the submodule explicitly, a plain "import sklearn" does not guarantee that sklearn.metrics is loaded
import sklearn.metrics
import statistics as stat
import random

def show_majority_class_prediction():
    '''Prints the classification report for always predicting the most frequent label of Y_train on Y_test.'''
    print("### Majority Class Prediction: ###")

    majority_class = stat.mode(Y_train)
    print(f"Training majority class = {majority_class}, Test majority class = {stat.mode(Y_test)}")

    pred_Y = len(Y_test) * [majority_class]
    print(sklearn.metrics.classification_report(y_true=Y_test, y_pred=pred_Y))


def show_random_prediction():
    '''Prints the classification report for predicting a randomly chosen label of Y_train for every element of Y_test.'''
    print("### Random Class Prediction: ###")

    classes = list(set(Y_train))
    print(f"Classes: {classes}")

    # every class is chosen with the same probability, independent of its frequency
    pred_Y = random.choices(classes, k=len(Y_test))
    print(sklearn.metrics.classification_report(y_true=Y_test, y_pred=pred_Y))


show_majority_class_prediction()
show_random_prediction()

### 2. Prediction of cluster evolution based on metrics from cluster interaction between multiple layers
*todo*