# MMC (Mobility Markov Chain) Implementation

## Correct Implementation

!pip install  seaborn

In [1]:
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import skmob
from tqdm import tqdm
from skmob.preprocessing import (filtering, 
                                 detection, 
                                 compression, 
                                 clustering)

In [2]:
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [3]:
def get_clusters_from_tdf(tdf,
                          filter_noise=True,
                          max_speed_kmh=50,
                          detect_stops=True,
                          minutes_for_a_stop=20,
                          compress=True,
                          spatial_radius_km=0.2,
                          spatial_radius_compress_km=0.3,
                          cluster_radius_km=1,
                          min_samples=1,
                          verbose=True):
    '''Get Clusters From TDF

    Generates clusters from a trajectory dataframe by chaining the
    scikit-mobility preprocessing steps: noise filtering -> stop detection ->
    compression -> clustering. The first three steps are optional; a skipped
    step passes its input through unchanged.

    Parameters:
    -----------
        tdf (Trajectory Data Frame): trajectory to be clustered.
        filter_noise (bool): if True, apply `filtering.filter` first.
        max_speed_kmh (float): forwarded to `filtering.filter`.
        detect_stops (bool): if True, apply `detection.stops`.
        minutes_for_a_stop (float): forwarded to `detection.stops`.
        spatial_radius_km (float): forwarded to `detection.stops`.
        compress (bool): if True, apply `compression.compress`.
        spatial_radius_compress_km (float): forwarded to `compression.compress`
            as its `spatial_radius_km`.
        cluster_radius_km (float): forwarded to `clustering.cluster`.
        min_samples (int): forwarded to `clustering.cluster`.
        verbose (bool): if True, print progress and shape information.

    Returns:
    --------
        clusters (Data Frame): one row per cluster with the median lat and lng
            of its points.
        m: the map object returned by `plot_stops(zoom=11)` on the clustered tdf.
    '''
    # 1. Noise Filtering
    if filter_noise:
        tdf_f = filtering.filter(tdf,
                                 max_speed_kmh=max_speed_kmh)
        if verbose: print('INFO: Noise Filtering applied')
    else:
        # Without this branch tdf_f would be unbound when filtering is skipped.
        tdf_f = tdf

    # 2. Detection Stops
    if detect_stops:
        tdf_fs = detection.stops(tdf_f,
                                 minutes_for_a_stop=minutes_for_a_stop,
                                 spatial_radius_km=spatial_radius_km,
                                 leaving_time=True,
                                 min_speed_kmh=None)
        if verbose: print('INFO: Stops generated applied')
    else:
        tdf_fs = tdf_f

    # 3. Compression
    if compress:
        tdf_fsc = compression.compress(tdf_fs,
                                       spatial_radius_km=spatial_radius_compress_km)
        if verbose: print('INFO: Stops compressed')
    else:
        tdf_fsc = tdf_fs

    # 4. Clustering (honours the caller's min_samples instead of a fixed 1)
    tdf_fsccl = clustering.cluster(tdf_fsc,
                                   cluster_radius_km=cluster_radius_km,
                                   min_samples=min_samples)
    if verbose: print('INFO: Clusters generated')

    # Shapes after each stage, to see how many rows every step kept.
    if verbose:
        print(tdf.shape, tdf_f.shape, tdf_fs.shape, tdf_fsc.shape, tdf_fsccl.shape)

    # One representative coordinate per cluster: the median of its points.
    clusters = tdf_fsccl.groupby(['cluster'])[['lat', 'lng']].median().reset_index()
    if verbose:
        print(f'INFO: {len(clusters)} clusters generated.')

    m = tdf_fsccl.plot_stops(zoom=11)

    return clusters, m

In [4]:
def assign_tdf_points_to_clusters(tdf, clusters, 
                                  max_radius_to_cluster_km=0.2):
    '''Assign TDF Points to Clusters

    Labels each row of the TDF with the nearest cluster that lies within
    `max_radius_to_cluster_km`. The input tdf is copied, not modified.

    Parameters:
    -----------
        tdf (Trajectory Data Frame): tdf to be labelled; needs 'lat' and 'lng'.
        clusters (Data Frame): clusters with 'lat' and 'lng' columns.
        max_radius_to_cluster_km (float): maximum distance to consider a point part of a cluster.

    Returns:
    --------
        tdf_ (Trajectory Data Frame): copy of tdf with a 'cluster' column holding
            the name of the closest distance column (e.g. 'd_cl_00').
        cluster_distances (Data Frame): unfiltered distance from each point to
            each cluster, one 'd_cl_XX' column per cluster.
    '''
    labelled = tdf.copy()
    distances = pd.DataFrame(index=labelled.index)

    # One distance column per cluster, named after the cluster's index label.
    for idx, centre in tqdm(list(clusters.iterrows())):
        centre_coord = (centre['lat'], centre['lng'])
        distances[f'd_cl_{idx:02d}'] = labelled.apply(
            lambda point: skmob.utils.utils.distance((point['lat'], point['lng']),
                                                     centre_coord),
            axis=1,
        )

    # Mask out (NaN) every distance beyond the allowed radius ...
    within_radius = distances.where(distances <= max_radius_to_cluster_km)

    # ... and label each point with the column of its smallest remaining distance.
    labelled['cluster'] = within_radius.idxmin(axis=1)
    return labelled, distances

In [5]:
def get_mmc_transitions(tdf):
    '''Get MMC Transitions

    Returns the rows of the tdf where a transition occurred between two
    different clusters.

    Rows without a cluster are dropped first; each remaining row is then paired
    with the cluster of the following remaining row. The input is not modified.

    NOTE(review): rows are paired in dataframe order with no grouping by user;
    this looks intended for a single, time-ordered trajectory -- confirm.

    Parameters:
    -----------
        tdf (Trajectory Data Frame): tdf with the clusters already assigned
            (a 'cluster' column of string labels).

    Returns:
    --------
        transit_df (Trajectory Data Frame): rows whose 'cluster' differs from
            'cluster_next', with the extra columns 'cluster_next' and
            'transition' ('<cluster>-<cluster_next>').
    '''
    ##################### CLUSTER TRANSITIONS ####################
    # Explicit copies: the original assigned columns onto the result of
    # dropna(), which is chained assignment (SettingWithCopyWarning).
    mmc_df = tdf.dropna(subset=['cluster']).copy()
    mmc_df['cluster_next'] = mmc_df['cluster'].shift(-1)

    # The last labelled row has no successor, so it cannot start a transition.
    mmc_df = mmc_df.dropna(subset=['cluster_next']).copy()
    mmc_df['transition'] = mmc_df['cluster'] + '-' + mmc_df['cluster_next']

    # Keep only real transitions (staying in the same cluster is not one).
    transit_df = mmc_df[mmc_df['cluster'] != mmc_df['cluster_next']].copy()
    return transit_df

In [6]:
def getStationaryVector(transitMatrix):
    '''Get Stationary Vector

    Solves for the vector x satisfying (P.T - I) x = 0 together with
    sum(x) = 1, where P is `transitMatrix`. The stacked (n+1) x n system is
    solved through its normal equations A.T A x = A.T b.

    Parameters:
    -----------
        transitMatrix (ndarray): square (n x n) transition matrix.

    Returns:
    --------
        (ndarray): column vector of shape (n, 1).
    '''
    size = transitMatrix.shape[0]

    # n balance equations stacked on top of one normalisation row of ones.
    balance_rows = transitMatrix.T - np.eye(size)
    system = np.vstack((balance_rows, np.ones((1, size))))

    # Right-hand side: zeros for the balance rows, 1 for the normalisation row.
    target = np.zeros((size + 1, 1))
    target[-1, 0] = 1

    system_t = system.T
    return np.linalg.solve(system_t @ system, system_t @ target)

In [14]:
# Small 3-state example transition matrix (every row sums to 1), used to
# sanity-check getStationaryVector.
P = np.array([[0.2,0.7,0.1],
    [0.9,0.0,0.1],
    [0.2,0.8,0.0]])

In [15]:
P

array([[0.2, 0.7, 0.1],
       [0.9, 0. , 0.1],
       [0.2, 0.8, 0. ]])

In [16]:
getStationaryVector(P)

array([[0.49197861],
       [0.4171123 ],
       [0.09090909]])

### Evaluation of Implementation

In [7]:
# Download geolife_82.csv from the mmc3 repository. The file is read without a
# header row, so the four columns are named explicitly afterwards.
url_usr82 = 'https://raw.githubusercontent.com/bitmapup/mmc3/master/data/geolife_82.csv'
geo82 = pd.read_csv(url_usr82, header=None)
geo82.columns = ['user', 'hour', 'lat', 'lng']

In [8]:
# Wrap the raw frame as a scikit-mobility TrajDataFrame, using 'hour' as the
# datetime column and 'user' as the user id column.
trgeo82 = skmob.TrajDataFrame(geo82, 
                              datetime='hour', 
                              user_id='user')

In [24]:
# Build the clusters with noise filtering only: stop detection and compression
# are switched off, so minutes_for_a_stop, spatial_radius_km and
# spatial_radius_compress_km have no effect in this call.
clusters, m = get_clusters_from_tdf(trgeo82,
                                    verbose=True,
                                    max_speed_kmh= 0.001,
                                    detect_stops=False,
                                    compress=False ,
                                    minutes_for_a_stop=2,
                                    spatial_radius_km=0.2,
                                    spatial_radius_compress_km=.2,
                                    cluster_radius_km=0.5,
                                    min_samples=2)
# Display the map returned by the function as the cell output.
m

INFO: Noise Filtering applied
INFO: Clusters generated
(172547, 4) (40, 4) (40, 4) (40, 4) (40, 5)
INFO: 1 clusters generated.


In [10]:
# Label every point with its nearest cluster (default radius of 0.2 km) and
# keep the full point-to-cluster distance table.
trgeo82_cl, distances = assign_tdf_points_to_clusters(tdf=trgeo82, clusters=clusters)

100%|██████████| 8/8 [00:29<00:00,  3.69s/it]


In [11]:
# Rows where the cluster changes between consecutive labelled points.
transit_df = get_mmc_transitions(trgeo82_cl)

# Number of points assigned to each cluster. The labelled frame in this
# notebook is `trgeo82_cl`; `tdf_` only exists as a local variable inside
# assign_tdf_points_to_clusters and is undefined here.
trgeo82_cl['cluster'].value_counts().to_frame().T

# Transition frequencies, normalised so that each origin-cluster row sums to 1.
pd.crosstab(transit_df['cluster'], transit_df['cluster_next'], normalize='index')

In [12]:
# Row-normalised transition matrix as a plain NumPy array
# (rows: origin cluster, columns: destination cluster).
transitMatrix = pd.crosstab(transit_df['cluster'], transit_df['cluster_next'], normalize='index').values

In [13]:
# Stationary vector of the empirical transition matrix.
staVector = getStationaryVector(transitMatrix)

# Display it as the cell output.
staVector

array([[0.15706366],
       [0.16937648],
       [0.17242702],
       [0.17157506],
       [0.0041741 ],
       [0.15026744],
       [0.17087127],
       [0.00424496]])

## TESTS

In [None]:
############################### CLUSTER LABELLING ###############################
# Scratch version of assign_tdf_points_to_clusters + get_mmc_transitions,
# run directly on trgeo82.
# NOTE(review): relies on a module-level get_distance_from_cluster, which this
# notebook only defines in a later cell -- run that cell first.

# Distance from each point to each cluster, one 'd_cl_XX' column per cluster
cluster_distances = pd.DataFrame(index=trgeo82.index)
cols_d_cluster = []
for i, cluster in tqdm(clusters.iterrows()):
    cluster_coord = (cluster['lat'], cluster['lng'])

    cluster_distances[f'd_cl_{i:02d}'] = trgeo82.apply(get_distance_from_cluster, axis=1, args=[cluster_coord])
    cols_d_cluster.append(f'd_cl_{i:02d}')
    # Stop once the index label exceeds 3, to keep the scratch run short
    if i >3:
        break

# Keep an unfiltered copy of the distances before masking
cluster_distances_1 = cluster_distances.copy()

# Distances above max_radius_to_cluster_km are masked out (become NaN)
max_radius_to_cluster_km = 0.2
cluster_distances = cluster_distances[(cluster_distances <= max_radius_to_cluster_km)]

# Assign each point to its closest remaining cluster; note that this writes the
# 'cluster' column onto trgeo82 itself
trgeo82['cluster'] = cluster_distances[cols_d_cluster].idxmin(axis=1)

############################### CLUSTER TRANSITIONS ###############################
# Drop unlabelled points, then pair each row with the cluster of the next row
mmc_df = trgeo82.dropna(subset=['cluster'])
mmc_df['cluster_next'] = mmc_df['cluster'].shift(-1)

# The last row has no successor; keep only rows where the cluster changes
mmc_df = mmc_df.dropna(subset=['cluster_next'])
transit_df = mmc_df[mmc_df['cluster']!=mmc_df['cluster_next']]

In [None]:
# Median lat/lng per cluster label.
# NOTE(review): trgeo82_fsccl is not defined in any visible cell of this
# notebook -- presumably a clustered TrajDataFrame from an earlier session; confirm.
clusters = trgeo82_fsccl.groupby(['cluster'])[['lat','lng']].median().reset_index()
print(clusters.shape)
clusters.head()

In [None]:
from tqdm import tqdm

In [None]:
def get_distance_from_cluster(row, coord_cluster):
    '''Get Distance From Cluster

    Passes the row's ('lat', 'lng') pair and `coord_cluster` to
    skmob.utils.utils.distance and returns its result.

    Parameters:
    -----------
        row: mapping-like row with 'lat' and 'lng' entries.
        coord_cluster (tuple): (lat, lng) of the cluster.
    '''
    lat, lng = row['lat'], row['lng']
    return skmob.utils.utils.distance((lat, lng), coord_cluster)

In [None]:
############################### CLUSTER LABELLING ###############################
# Scratch labelling + transition extraction run directly on trgeo82; the
# distance columns here are named 'd_cl_<i>' without zero padding.

# Distance from each point to each cluster, one column per cluster
cluster_distances = pd.DataFrame(index=trgeo82.index)
cols_d_cluster = []
for i, cluster in tqdm(clusters.iterrows()):
    cluster_coord = (cluster['lat'], cluster['lng'])

    cluster_distances[f'd_cl_{i}'] = trgeo82.apply(get_distance_from_cluster, axis=1, args=[cluster_coord])
    cols_d_cluster.append(f'd_cl_{i}')
    # Stop once the index label exceeds 3, to keep the scratch run short
    if i >3:
        break

# Keep an unfiltered copy of the distances before masking
cluster_distances_1 = cluster_distances.copy()

# Distances above max_radius_to_cluster_km are masked out (become NaN)
max_radius_to_cluster_km = 0.2
cluster_distances = cluster_distances[(cluster_distances <= max_radius_to_cluster_km)]

# Assign each point to its closest remaining cluster; note that this writes the
# 'cluster' column onto trgeo82 itself
trgeo82['cluster'] = cluster_distances[cols_d_cluster].idxmin(axis=1)

############################### CLUSTER TRANSITIONS ###############################
# Drop unlabelled points, then pair each row with the cluster of the next row
mmc_df = trgeo82.dropna(subset=['cluster'])
mmc_df['cluster_next'] = mmc_df['cluster'].shift(-1)

# The last row has no successor; keep only rows where the cluster changes
mmc_df = mmc_df.dropna(subset=['cluster_next'])
transit_df = mmc_df[mmc_df['cluster']!=mmc_df['cluster_next']]

In [None]:
# Transition frequencies, normalised per origin cluster (each row sums to 1).
pd.crosstab(transit_df['cluster'], transit_df['cluster_next'], normalize='index')

In [None]:
# Label every consecutive pair as '<cluster>-<cluster_next>'; mmc_df still
# contains the pairs where the cluster does not change.
mmc_df['transition'] = mmc_df['cluster']+'-'+mmc_df['cluster_next']

In [None]:
# How often each '<cluster>-<cluster_next>' pair occurs.
mmc_df['transition'].value_counts()

__2 métodos de POIs a probar:__
- stop detection
- noise filtering + clustering (dbscan)

### Clustering.cluster

In [None]:
from skmob.preprocessing import clustering

In [None]:
# Cluster the raw trajectory directly (no filtering, stop detection or
# compression beforehand) with a 10 km radius and min_samples=100.
trgeo82cluster_1 = clustering.cluster(trgeo82, 
                                      cluster_radius_km=10, 
                                      min_samples=100)
print(trgeo82cluster_1.shape)
trgeo82cluster_1.head(3)