# Breakdown of the t-SNE Plots
I am here to provide some insights about the clustering structure of the base

## Imports

In [8]:
import pandas as pd
import numpy as np 
import networkx as nx
from networkx import algorithms 
import math
from numpy import arctan2, sin, cos, sqrt, radians

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score

In [4]:
import osmnx as ox 

ModuleNotFoundError: No module named 'osmnx'

In [26]:
from Data import Data 


ModuleNotFoundError: No module named 'Data'

## Making the graph for our data:


In [15]:
# Build `graph` for the Athens address below through osmnx's
# `graph_from_address`, requesting the 'drive' network type. This needs the
# `import osmnx as ox` cell above to have succeeded; without it the call fails
# with the NameError recorded below.
graph = ox.graph_from_address('Athens, Municipality of Athens, Regional Unit of Central Athens, Attica, 10667, Greece', network_type='drive')  

NameError: name 'ox' is not defined

## Data.py Functions:
I am modifying the Data.py functions to concatenate the dataframes from different blocks.

In [16]:
 def process_csv(name):
        """Read the CSV file at `name` and return its lines without the first one.

        The first line (the column header) is discarded. Every remaining line
        is returned unchanged, trailing newline included, so each element can
        be handed to `process` as a single row string.
        """
        with open(name, "r") as handle:
            all_lines = list(handle)

        # Slicing (rather than popping) keeps an empty file returning [].
        return all_lines[1:]

In [17]:
def process(row_str, H, D, col_names, idx_names):
        """Parse one semicolon-separated row into a multi-indexed DataFrame.

        The row consists of `H` header fields followed by a flat run of numeric
        values. The numeric values are reshaped into records of `D` values
        each; the last value of every record becomes the second index level and
        is removed from the data columns.

        Args:
            row_str: One raw line of the CSV file (surrounding whitespace and
                leading/trailing ';' are stripped before splitting).
            H: Number of leading header fields. Field 0 is parsed as the integer
                id, field 1 is stored as `type`, and fields 2 and 3 are parsed
                as floats and stored as `traveled_d` and `avg_speed`.
            D: Number of numeric values per record (data columns plus the
                trailing index value).
            col_names: Names for the `D - 1` data columns.
            idx_names: Names for the two index levels (id, last record value).

        Returns:
            A DataFrame with one row per record, indexed by
            (id, last record value), holding the data columns plus the constant
            columns `type`, `traveled_d` and `avg_speed`.

        Raises:
            ValueError: If a numeric field cannot be parsed or the number of
                numeric values is not a multiple of `D`.
        """
        parts = row_str.strip().strip(";").split(";")
        header = parts[:H]
        # The builtin `float` replaces the `np.float` alias, which was removed
        # from NumPy and raises AttributeError on current releases.
        data = np.array(parts[H:], dtype=float)
        data = data.reshape(-1, D)

        # create MultiIndex from id and time
        timesteps = data[:, -1]
        id_arr = np.full(timesteps.shape, int(header[0].strip()))
        tups = list(zip(id_arr, timesteps))
        mul = pd.MultiIndex.from_tuples(tups, names=idx_names)

        data = data[:, :-1]  # exclude time from data
        df = pd.DataFrame(data, columns=col_names, index=mul)
        df = df.assign(
            type=header[1].strip(),
            traveled_d=float(header[2]),
            avg_speed=float(header[3])
        )
        return df

In [18]:
def create(csv_file):
        """Load `csv_file` and return all of its rows as one DataFrame.

        Each data line returned by `process_csv` is parsed by `process` with a
        4-field header and 6 values per record, then the per-row frames are
        concatenated. The result is indexed by ('id', 'time') and carries the
        columns 'lat', 'lon', 'speed', 'lon_acc' and 'lat_acc' plus whatever
        constant columns `process` adds.
        """
        header_len = 4   # header fields at the start of every row
        record_len = 6   # numeric values per record
        index_levels = ['id', 'time']
        data_columns = ['lat', 'lon', 'speed', 'lon_acc', 'lat_acc']

        rows = process_csv(csv_file)
        per_row_frames = [
            process(row, header_len, record_len, data_columns, index_levels)
            for row in rows
        ]
        return pd.concat(per_row_frames)

## Feature Engineering and Aggregating functions:
I am modifying some of the feature engineering functions and creating a couple of aggregating functions. 

In [19]:
def median(df):
    """Summarise `df` as its first row's `type` followed by column medians.

    Returns a list whose first element is the `type` value of the first row,
    followed by the median (NaN values dropped) of 'speed', 'lon_acc',
    'lat_acc', 'traveled_d', 'avg_speed' and 'bearing', in that order.
    """
    numeric_cols = ('speed', 'lon_acc', 'lat_acc', 'traveled_d', 'avg_speed', 'bearing')
    label = df.iloc[0]['type']
    return [label] + [np.median(df[col].dropna()) for col in numeric_cols]

In [20]:
def AVG(df):
    """Summarise `df` as its first row's `type` followed by column averages.

    Returns a list whose first element is the `type` value of the first row,
    followed by the average (NaN values dropped) of 'speed', 'lon_acc',
    'lat_acc', 'traveled_d', 'avg_speed' and 'bearing', in that order.
    """
    summary = [df.iloc[0]['type']]
    summary.extend(
        np.average(df[column].dropna())
        for column in ('speed', 'lon_acc', 'lat_acc', 'traveled_d', 'avg_speed', 'bearing')
    )
    return summary

In [21]:
def stdev(df):
    """Summarise `df` as its first row's `type` followed by column standard deviations.

    Returns a list whose first element is the `type` value of the first row,
    followed by the standard deviation (NaN values dropped) of 'speed',
    'lon_acc', 'lat_acc', 'traveled_d', 'avg_speed' and 'bearing', in that
    order. The deviation is `np.std` with its default `ddof=0`, i.e. the
    population standard deviation.
    """
    stdevs = [df.iloc[0]['type']]
    for col in ['speed', 'lon_acc', 'lat_acc', 'traveled_d', 'avg_speed', 'bearing']:
        # Work on the raw ndarray so numpy's own `std` (ddof=0) is used rather
        # than being dispatched to the pandas method.
        stdevs.append(np.std(df[col].dropna().to_numpy()))
    return stdevs

In [22]:
def bearing(df):
    """Add a 'bearing' column to `df` and return the same DataFrame.

    The frame is grouped by 'id' and `__calc_bearings` is applied to every
    group; the combined result is stored in `df['bearing']`. `df` is modified
    in place.

    Example usage:
        df = create('sample.csv')
        df = bearing(df)
    """
    per_vehicle = df.groupby('id', as_index=False, group_keys=False)
    df['bearing'] = per_vehicle.apply(__calc_bearings)
    return df

In [23]:
def __bearing(c1, c2):
    """Return the initial bearing from point `c1` to point `c2`.

    Credit to https://bit.ly/3amjz0Q for the bearing formula.

    Args:
        c1: `(lat, lon)` pair of the starting point.
        c2: `(lat, lon)` pair of the destination point.

        The components may be scalars or array-likes; they are passed straight
        to numpy's `sin`/`cos`, which interpret their input as radians.
        NOTE(review): no degree-to-radian conversion happens here -- confirm
        that callers supply radians.

    Returns:
        `arctan2(x, y)` of the east-west (`x`) and north-south (`y`)
        components, i.e. an angle in radians.
    """
    lat1, lon1 = c1
    lat2, lon2 = c2

    dL = lon2 - lon1
    # The east-west component scales with the cosine of the destination
    # latitude (not its longitude).
    x = cos(lat2) * sin(dL)
    y = cos(lat1) * sin(lat2) - sin(lat1) * cos(lat2) * cos(dL)
    return arctan2(x, y)

In [25]:
def __calc_bearings(df):
    """Return the bearing from every row of `df` to the row that follows it.

    Each row's ('lat', 'lon') is paired with the ('lat', 'lon') of the next
    row (`df` shifted up by one) and both are passed to `__bearing`. The last
    row has no successor, so it is paired with the NaN values produced by the
    shift.
    """
    following = df.shift(-1)
    here = (df['lat'], df['lon'])
    there = (following['lat'], following['lon'])
    return __bearing(here, there)

In [30]:
def __calc_directions(df):
    """Flag, per row, whether 'edge_progress' increases towards the next row.

    Returns an integer Series aligned with `df`: 1 where the row's
    'edge_progress' is smaller than the next row's, 0 otherwise. The final row
    has no successor, so when there is more than one row it copies the flag of
    the row before it; a single-row input yields 0.
    """
    progress = df['edge_progress']
    flags = (progress < progress.shift(-1)).astype(int)
    if len(flags) <= 1:
        return flags
    # The last comparison is against NaN; reuse the previous row's flag.
    flags.iloc[-1] = flags.iloc[-2]
    return flags

In [31]:
 def nearest_graph_data(df, graph):
    """Add nearest-node / nearest-edge columns to `df` and return it.

    A row function is obtained from `__construct_graph_data_cols(graph)` and
    applied to every row of `df`. Its per-row results are transposed into four
    columns that are stored on `df` (in place) as 'nearest_node',
    'nearest_edge_start_node', 'nearest_edge_end_node' and 'edge_progress'.
    NOTE(review): presumably the row function looks these values up in the
    osmnx graph and expresses edge progress as a ratio -- confirm against
    `__construct_graph_data_cols`.

    Example usage:
        df = csv_to_df('sample.csv')
        graph = ox.graph_from_address('address_here', network_type='drive') 
        df = nearest_graph_data(df, graph)
    """
    row_fn = __construct_graph_data_cols(graph)
    per_row = df.apply(row_fn, axis=1)
    (
        df['nearest_node'],
        df['nearest_edge_start_node'],
        df['nearest_edge_end_node'],
        df['edge_progress'],
    ) = zip(*per_row)
    return df

In [32]:
def direction(df):
    """Add a 'dir' column describing movement along each edge; return `df`.

    Rows are grouped by ('id', 'nearest_edge_start_node',
    'nearest_edge_end_node') and `__calc_directions` is applied to every
    group, giving 1 where 'edge_progress' increases towards the next row of
    the group and 0 otherwise. `df` is modified in place.
    Note: `nearest_graph_data` must have been run on this df, otherwise the
    grouping columns are missing and this will fail!

    Example usage:
        df = csv_to_df('sample.csv')
        df = direction(df)
    """
    edge_keys = ['id', 'nearest_edge_start_node', 'nearest_edge_end_node']
    per_vehicle_edge = df.groupby(edge_keys, as_index=False, group_keys=False)
    df['dir'] = per_vehicle_edge.apply(__calc_directions)
    return df

In [33]:
 def _calc_vehicle_density(df):
    """Count distinct vehicles per edge, direction and edge-progress interval.

    First adds an 'edge_progress_intervals' column to `df` (in place): every
    'edge_progress' value minus its remainder modulo 0.1, so 0.0 stands for
    progress between 0-10%, 0.1 for 10-20%, and so on. Then the index is
    flattened and the number of unique 'id' values is counted for every
    ('nearest_edge_start_node', 'nearest_edge_end_node', 'dir',
    'edge_progress_intervals') combination.

    `df` must have been processed by `direction` first. Example usage:
        df = csv_to_df(csv.file)
        graph = ox.graph_from_address('Athens, Municipality of Athens, Regional Unit of Central Athens, Attica, 10667, Greece', network_type='drive')
        df = nearest_graph_data(df,graph)
        df = direction(df)
        vehicle_densities = _calc_vehicle_density(df)

    Returns a DataFrame indexed by the four grouping keys with a single
    ('id', 'nunique') column.
    """
    def interval_start(progress):
        # Lower bound of the 0.1-wide interval containing each value.
        return progress - progress % 0.1

    by_start_node = df.groupby(['nearest_edge_start_node'])
    df['edge_progress_intervals'] = by_start_node['edge_progress'].transform(interval_start)

    density_keys = [
        'nearest_edge_start_node',
        'nearest_edge_end_node',
        'dir',
        'edge_progress_intervals',
    ]
    flat = df.reset_index()
    return flat.groupby(density_keys).agg({'id': ['nunique']})