# MMC Implementation

## Correct Implementation

In [6]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import skmob
from tqdm import tqdm
from skmob.preprocessing import (filtering, 
                                 detection, 
                                 compression, 
                                 clustering)

In [3]:
# Silence every Python warning for the rest of the session. This is a blanket
# filter, so any deprecation or pandas warning emitted by later cells is hidden too.
warnings.filterwarnings('ignore')

In [4]:
def get_clusters_from_tdf(tdf,
                          max_speed_kmh = 50,
                          minutes_for_a_stop = 20,
                          spatial_radius_km = 0.2,
                          spatial_radius_compress_km = 0.3,
                          cluster_radius_km = 1,
                          verbose=True):
    '''Get Clusters From TDF

    Runs the preprocessing chain (noise filtering -> stop detection ->
    compression -> clustering) on a trajectory dataframe and summarises the
    resulting clusters by their median coordinates.

    Parameters:
    -----------
        tdf (Trajectory Data Frame): raw trajectory points to process.
        max_speed_kmh (int): forwarded to ``filtering.filter``.
        minutes_for_a_stop (int): forwarded to ``detection.stops``.
        spatial_radius_km (float): forwarded to ``detection.stops``.
        spatial_radius_compress_km (float): forwarded to
            ``compression.compress`` as its ``spatial_radius_km``.
        cluster_radius_km (float): forwarded to ``clustering.cluster``.
        verbose (bool): when truthy, print one INFO line after each stage.
            The shape summary and the cluster count are printed regardless.

    Returns:
    --------
        clusters (Data Frame): one row per cluster with the median lat and
            lng of the stops assigned to it.
        m: whatever ``plot_stops(zoom=11)`` returns for the clustered stops
            (presumably a map object; the notebook displays it directly).
    '''

    def _info(message):
        # Per-stage progress messages are the only output gated by `verbose`.
        if verbose:
            print(message)

    # 1. Noise filtering
    filtered = filtering.filter(tdf, max_speed_kmh=max_speed_kmh)
    _info('INFO: Noise Filtering applied')

    # 2. Stop detection
    stops = detection.stops(
        filtered,
        minutes_for_a_stop=minutes_for_a_stop,
        spatial_radius_km=spatial_radius_km,
        leaving_time=True,
        min_speed_kmh=None,
    )
    _info('INFO: Stops generated applied')

    # 3. Compression of nearby stops
    compressed = compression.compress(
        stops,
        spatial_radius_km=spatial_radius_compress_km,
    )
    _info('INFO: Stops compressed')

    # 4. Clustering; min_samples=1 is passed so a single stop can form a cluster
    clustered = clustering.cluster(
        compressed,
        cluster_radius_km=cluster_radius_km,
        min_samples=1,
    )
    _info('INFO: Clusters generated')

    # Shape of the data after every stage, always printed.
    stages = (tdf, filtered, stops, compressed, clustered)
    print(*(stage.shape for stage in stages))

    # Represent each cluster by the median position of its member stops.
    by_cluster = clustered.groupby(['cluster'])
    clusters = by_cluster[['lat','lng']].median().reset_index()
    print(f'INFO: {len(clusters)} clusters generated.')

    m = clustered.plot_stops(zoom=11)

    return clusters, m

In [7]:
def assign_tdf_points_to_clusters(tdf, clusters, 
                                  max_radius_to_cluster_km=0.2):
    '''Assign TDF Points to Clusters
    
    Labels every row of the TDF with its nearest cluster, provided that
    cluster lies within ``max_radius_to_cluster_km``; rows with no cluster
    in range get a missing value.
    
    Parameters:
    -----------
        tdf (Trajectory Data Frame): tdf to be assigned; needs 'lat' and
            'lng' columns. It is copied, never modified.
        clusters (Data Frame): clusters to be assigned; needs 'lat' and 'lng'
            columns and an integer index (used to build the 'd_cl_NN' labels).
        max_radius_to_cluster_km (float): maximum distance to consider a
            point part of a cluster.
        
    Returns:
    --------
        tdf_ (Trajectory Data Frame): copy of tdf with a 'cluster' column
            holding the label ('d_cl_NN') of the assigned cluster, or NaN.
        cluster_distances (Data Frame): unfiltered distance from each point
            to each cluster, one 'd_cl_NN' column per cluster. Distances are
            whatever ``skmob.utils.utils.distance`` returns (presumably km,
            given the threshold they are compared with).
    
    '''
    ########################## CLUSTER LABELLING #########################
    tdf_ = tdf.copy()
    cluster_distances = pd.DataFrame(index=tdf_.index)

    # Pair up the coordinates once and reuse them for every cluster; this
    # avoids a row-wise DataFrame.apply (one Series built per row) per cluster.
    point_coords = list(zip(tdf_['lat'], tdf_['lng']))
    for i, cluster in tqdm(clusters.iterrows(), total=len(clusters)):
        cluster_coord = (cluster['lat'], cluster['lng'])
        cluster_distances[f'd_cl_{i:02d}'] = [
            skmob.utils.utils.distance(point, cluster_coord)
            for point in point_coords
        ]

    # Nothing to compare against (no clusters or no points): every point is
    # unassigned. Guarded explicitly because idxmin cannot run on an empty frame.
    if cluster_distances.empty:
        tdf_['cluster'] = pd.Series(index=tdf_.index, dtype=object)
        return tdf_, cluster_distances

    # Pick the closest cluster first and apply the radius afterwards. Masking
    # out-of-range distances to NaN *before* idxmin leaves all-NaN rows, which
    # recent pandas versions warn about or reject. Missing distances are
    # treated as infinitely far so they can never win or pass the threshold.
    comparable = cluster_distances.fillna(np.inf)
    nearest_label = comparable.idxmin(axis=1)
    nearest_distance = comparable.min(axis=1)

    # We will not consider the distances higher than max_radius_to_cluster_km
    tdf_['cluster'] = nearest_label.where(nearest_distance <= max_radius_to_cluster_km)
    return tdf_, cluster_distances

In [8]:
def get_mmc_transitions(tdf):
    '''Get MMC Transitions
    
    Returns the rows of the tdf at which the cluster changes, i.e. the
    transitions that occurred among clusters.
    
    Parameters:
    -----------
        tdf (Trajectory Data Frame): tdf with the clusters already assigned
            in a 'cluster' column of string labels.
    
    Returns:
    --------
        transit_df (Trajectory Data Frame): rows whose origin cluster differs
            from the next labelled row's cluster, with the added columns
            'cluster_next' and 'transition' ('<origin>-<next>').
    '''
    ##################### CLUSTER TRANSITIONS ####################
    # Unlabelled rows are discarded first, so "next" means the next *labelled*
    # row in the frame's current order.
    labelled = tdf.dropna(subset=['cluster'])
    labelled = labelled.assign(cluster_next=labelled['cluster'].shift(-1))

    # The last labelled row has no successor and is dropped.
    paired = labelled.dropna(subset=['cluster_next'])
    paired = paired.assign(
        transition=paired['cluster'] + '-' + paired['cluster_next']
    )

    # Keep only genuine moves between two different clusters.
    changes_cluster = paired['cluster'] != paired['cluster_next']
    transit_df = paired[changes_cluster]
    return transit_df

### Evaluation of Implementation

In [9]:
# Headerless CSV for Geolife user 82; the four columns are named on load.
url_usr82 = 'https://raw.githubusercontent.com/bitmapup/mmc3/master/data/geolife_82.csv'
geo82 = (
    pd.read_csv(url_usr82, header=None)
    .set_axis(['user', 'hour', 'lat', 'lng'], axis=1)
)

In [10]:
# Wrap the raw frame as a skmob TrajDataFrame, pointing it at the columns
# that hold the timestamp ('hour') and the user id ('user').
trgeo82 = skmob.TrajDataFrame(geo82, datetime='hour', user_id='user')

In [11]:
# Build the clusters with the default thresholds. verbose=False only mutes the
# per-stage INFO lines; the shape summary and cluster count are still printed.
clusters, m = get_clusters_from_tdf(trgeo82, verbose=False)
# Bare last expression so the notebook renders the returned stops plot.
m

(172547, 4) (98575, 4) (198, 5) (136, 5) (136, 6)
INFO: 35 clusters generated.


In [12]:
# Label every raw point with its nearest cluster (default 0.2 radius) and keep
# the full point-to-cluster distance table. Slow: the recorded run took ~1m38s.
trgeo82_cl, distances = assign_tdf_points_to_clusters(tdf=trgeo82, clusters=clusters)

35it [01:38,  2.80s/it]


In [13]:
# Keep only the rows where the labelled cluster changes from one point to the next.
transit_df = get_mmc_transitions(trgeo82_cl)

In [457]:
# Number of points assigned to each cluster, shown as a single wide row.
# Uses the labelled frame returned by assign_tdf_points_to_clusters; the name
# `tdf_` only exists inside that function and is undefined at notebook scope.
trgeo82_cl['cluster'].value_counts().to_frame().T

Unnamed: 0,d_cl_01,d_cl_02,d_cl_00,d_cl_18,d_cl_03,d_cl_31,d_cl_16,d_cl_29,d_cl_08,d_cl_17,...,d_cl_32,d_cl_33,d_cl_06,d_cl_34,d_cl_28,d_cl_04,d_cl_19,d_cl_22,d_cl_21,d_cl_27
cluster,46163,3614,2797,1758,1747,1474,973,805,804,653,...,48,43,30,30,29,21,21,19,19,12


In [459]:
# Transition matrix between clusters. normalize='index' scales each row to sum
# to 1, so a cell is the share of departures from `cluster` that go to `cluster_next`.
pd.crosstab(transit_df['cluster'], transit_df['cluster_next'], normalize='index')

cluster_next,d_cl_0,d_cl_1,d_cl_10,d_cl_11,d_cl_12,d_cl_13,d_cl_14,d_cl_15,d_cl_16,d_cl_17,...,d_cl_30,d_cl_31,d_cl_32,d_cl_33,d_cl_34,d_cl_4,d_cl_6,d_cl_7,d_cl_8,d_cl_9
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d_cl_0,0.0,0.6,0.044444,0.022222,0.0,0.0,0.0,0.066667,0.022222,0.022222,...,0.0,0.022222,0.022222,0.0,0.022222,0.0,0.0,0.022222,0.0,0.0
d_cl_1,0.688889,0.0,0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022222,0.088889
d_cl_10,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d_cl_11,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d_cl_12,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d_cl_13,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d_cl_14,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d_cl_15,0.111111,0.0,0.777778,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d_cl_16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
d_cl_17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## TESTS

In [462]:
############################### CLUSTER LABELLING ###############################
# Scratch version of the labelling/transition logic, run directly on trgeo82.
# Assign each point to a cluster (where possible).
# NOTE(review): relies on get_distance_from_cluster, which is only defined in a
# later cell, so this cell fails on a fresh top-to-bottom run.

# We will calculate the distance from each point to each cluster
cluster_distances = pd.DataFrame(index=trgeo82.index)
cols_d_cluster = []
for i, cluster in tqdm(clusters.iterrows()):
    cluster_coord = (cluster['lat'], cluster['lng'])
    # One zero-padded distance column per cluster (d_cl_00, d_cl_01, ...).
    cluster_distances[f'd_cl_{i:02d}'] = trgeo82.apply(get_distance_from_cluster, axis=1, args=[cluster_coord])
    cols_d_cluster.append(f'd_cl_{i:02d}')
    # Debug shortcut: stop once the row index exceeds 3, i.e. only the first
    # five clusters are processed when the index runs 0..n-1.
    if i >3:
        break

# Keep an unfiltered copy of the distances before masking them below.
cluster_distances_1 = cluster_distances.copy()

# We will not consider the distances higher than max_radius_to_cluster_km:
# boolean-mask indexing turns those cells into NaN.
max_radius_to_cluster_km = 0.2
cluster_distances = cluster_distances[(cluster_distances <= max_radius_to_cluster_km)]

# We will assign the point to the closest cluster still in range.
# This adds a 'cluster' column to trgeo82 itself (in-place mutation).
trgeo82['cluster'] = cluster_distances[cols_d_cluster].idxmin(axis=1)

############################### CLUSTER TRANSITIONS ###############################
# Drop unlabelled points, pair each point with the next labelled one, and keep
# only the rows where the cluster changes. Rebinds the notebook-level names
# mmc_df and transit_df.
mmc_df = trgeo82.dropna(subset=['cluster'])
mmc_df['cluster_next'] = mmc_df['cluster'].shift(-1)

mmc_df = mmc_df.dropna(subset=['cluster_next'])
transit_df = mmc_df[mmc_df['cluster']!=mmc_df['cluster_next']]

In [269]:
# Median lat/lng per cluster label.
# NOTE(review): trgeo82_fsccl is not defined in any visible cell; presumably it
# is the clustered stops frame from an earlier, deleted cell. Confirm before re-running.
clusters = trgeo82_fsccl.groupby(['cluster'])[['lat','lng']].median().reset_index()
print(clusters.shape)
clusters.head()

(35, 3)


Unnamed: 0,cluster,lat,lng
0,0,39.975632,116.331212
1,1,40.052413,116.400565
2,2,47.629663,-122.13506
3,3,39.932441,116.395893
4,4,35.00445,135.767367


In [330]:
from tqdm import tqdm

In [295]:
def get_distance_from_cluster(row, coord_cluster):
    '''Distance between one trajectory row and a cluster centre.

    Parameters:
    -----------
        row: mapping-like record exposing 'lat' and 'lng' (e.g. a DataFrame row).
        coord_cluster (tuple): (lat, lng) of the cluster.

    Returns:
    --------
        The value returned by ``skmob.utils.utils.distance`` for the two points.
    '''
    return skmob.utils.utils.distance((row['lat'], row['lng']), coord_cluster)

In [373]:
############################### CLUSTER LABELLING ###############################
# Earlier scratch version of the labelling/transition logic, run directly on
# trgeo82. Assign each point to a cluster (where possible).

# We will calculate the distance from each point to each cluster
cluster_distances = pd.DataFrame(index=trgeo82.index)
cols_d_cluster = []
for i, cluster in tqdm(clusters.iterrows()):
    cluster_coord = (cluster['lat'], cluster['lng'])
    # One distance column per cluster; labels are not zero-padded here
    # (d_cl_0, d_cl_1, ...).
    cluster_distances[f'd_cl_{i}'] = trgeo82.apply(get_distance_from_cluster, axis=1, args=[cluster_coord])
    cols_d_cluster.append(f'd_cl_{i}')
    # Debug shortcut: stop once the row index exceeds 3, i.e. only the first
    # five clusters are processed when the index runs 0..n-1.
    if i >3:
        break

# Keep an unfiltered copy of the distances before masking them below.
cluster_distances_1 = cluster_distances.copy()

# We will not consider the distances higher than max_radius_to_cluster_km:
# boolean-mask indexing turns those cells into NaN.
max_radius_to_cluster_km = 0.2
cluster_distances = cluster_distances[(cluster_distances <= max_radius_to_cluster_km)]

# We will assign the point to the closest cluster still in range.
# This adds a 'cluster' column to trgeo82 itself (in-place mutation).
trgeo82['cluster'] = cluster_distances[cols_d_cluster].idxmin(axis=1)

############################### CLUSTER TRANSITIONS ###############################
# Drop unlabelled points, pair each point with the next labelled one, and keep
# only the rows where the cluster changes. Rebinds the notebook-level names
# mmc_df and transit_df.
mmc_df = trgeo82.dropna(subset=['cluster'])
mmc_df['cluster_next'] = mmc_df['cluster'].shift(-1)

mmc_df = mmc_df.dropna(subset=['cluster_next'])
transit_df = mmc_df[mmc_df['cluster']!=mmc_df['cluster_next']]

In [395]:
# Row-normalised transition matrix (each row sums to 1) for the clusters
# labelled by the scratch cell.
pd.crosstab(transit_df['cluster'], transit_df['cluster_next'], normalize='index')

cluster_next,d_cl_0,d_cl_1,d_cl_2,d_cl_3,d_cl_4
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d_cl_0,0.0,0.829268,0.097561,0.04878,0.02439
d_cl_1,0.9,0.0,0.1,0.0,0.0
d_cl_2,0.625,0.375,0.0,0.0,0.0
d_cl_3,0.0,1.0,0.0,0.0,0.0
d_cl_4,1.0,0.0,0.0,0.0,0.0


In [389]:
# Build an '<origin>-<next>' label for every consecutive pair of labelled points.
mmc_df['transition'] = mmc_df['cluster']+'-'+mmc_df['cluster_next']

In [390]:
# Frequency of each pair. mmc_df is not filtered to cluster changes, so
# self-pairs such as 'd_cl_1-d_cl_1' are counted as well.
mmc_df['transition'].value_counts()

d_cl_1-d_cl_1    46123
d_cl_2-d_cl_2     3606
d_cl_0-d_cl_0     2755
d_cl_3-d_cl_3     1745
d_cl_1-d_cl_0       36
d_cl_0-d_cl_1       34
d_cl_4-d_cl_4       20
d_cl_2-d_cl_0        5
d_cl_0-d_cl_2        4
d_cl_1-d_cl_2        4
d_cl_2-d_cl_1        3
d_cl_3-d_cl_1        2
d_cl_0-d_cl_3        2
d_cl_4-d_cl_0        1
d_cl_0-d_cl_4        1
Name: transition, dtype: int64

__Two POI-detection methods to try:__
- stop detection
- noise filtering + clustering (dbscan)

### Clustering.cluster

In [163]:
from skmob.preprocessing import clustering

In [None]:
# Experiment: cluster the raw points directly (no filtering or stop detection)
# with a 10 km radius and at least 100 samples per cluster.
trgeo82cluster_1 = clustering.cluster(trgeo82, cluster_radius_km=10, min_samples=100)
print(trgeo82cluster_1.shape)
trgeo82cluster_1.head(3)