# MMC Implementation

## Correct Implementation

In [1]:
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import skmob
from tqdm import tqdm
import sys
sys.path.append('../')
from src.geo_utils import (get_clusters_from_tdf,
                            assign_tdf_points_to_clusters,
                            get_mmc_transitions,
                            get_stationary_vector)


In [2]:
warnings.filterwarnings('ignore')

In [3]:
def get_distance_bw_clusters(cluster_1, cluster_2):
    '''Get Distance Between Clusters
    
    Parameters
    ----------
        cluster_1 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
        cluster_2 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
    
    Returns
    -------
        distance (float): Distance between clusters (one-way)
    '''
    valid_distances = []
    valid_clusters = []
    for i, row_i in cluster_1.iterrows():    
        coord_i = (row_i['lat'], row_i['lng'])

        clusters, distances = [], []    
        for j, row_j in cluster_2.iterrows():
            cluster_j = row_j['cluster']
            coord_j = (row_j['lat'], row_j['lng'])

            distance_ij = skmob.utils.utils.distance(coord_i, coord_j)    
            clusters.append(cluster_j)
            distances.append(distance_ij)

        distances = np.array(distances)

        idx_min = distances.argmin()
        min_dist = distances.min()
        cluster_min = clusters[idx_min]

        valid_distances.append(min_dist)
        valid_clusters.append(cluster_min)

    cluster_est = cluster_1.copy()
    cluster_est['cluster_other'] = valid_clusters
    cluster_est['distance_other'] = valid_distances
    distance = (cluster_est['distance_other'] * cluster_est['sta_vector']).sum()
    return distance

def get_mean_distance_bw_clusters(cluster_1, cluster_2):
    '''Get Mean Distance Between Clusters
    
    Get the distances of cluster_1 to cluster_2 and viceversa and returns the average.

    Parameters
    ----------
        cluster_1 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
        cluster_2 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
    
    Returns
    -------
        mean_distance (float): Distance between clusters (two-way)
    '''
    d1 = get_distance_bw_clusters(cluster_1, cluster_2)
    d2 = get_distance_bw_clusters(cluster_2, cluster_1)
    mean_distance = (d1+d2)/2
    return mean_distance

In [74]:
def get_mmc_clusters_stavectors(geo):
    '''Get MMC Clusters and Stationary Vectors
    
    Parameters
    ----------
        geo (pd.Dataframe): Dataframe of 1 user_id. Must contain only the columns: ['user', 'hour', 'lat', 'lng']
    
    Returns
    -------
        clusters (pd.Dataframe): Dataframe with the clusters and the stationary value
        m (folium.folium.Map): Map generated with the clusters on it.
    '''
    # TDF Definition
    trgeo = skmob.TrajDataFrame(
        geo, 
        datetime='hour',
        user_id='user'
    )

    # Cluster Generation
    clusters, m = get_clusters_from_tdf(
        trgeo,
        verbose=True,
        max_speed_kmh= 0.01,
        detect_stops=False,
        compress=False ,
        minutes_for_a_stop=2,
        spatial_radius_km=0.2,
        spatial_radius_compress_km=.2,
        cluster_radius_km=0.5,
        min_samples=2
    )

    # Cluster Assignation
    trgeo_cl, distances = assign_tdf_points_to_clusters(
        tdf=trgeo, 
        clusters=clusters
    )

    # Generation of Transit Dataframe
    transit_df = get_mmc_transitions(trgeo_cl)

    # Generation of Transit Matrix
    transit_matrix = pd.crosstab(transit_df['cluster'], 
                                 transit_df['cluster_next'],
                                 normalize='index').values
    display(transit_matrix)
    
    #Lógica adicional para remover clusters sin salida o entrada
    ##Retiramos las filas que tienen solo ceros y una columna con 1
    if ((transit_matrix.shape[0]>2 )& (transit_matrix.shape[1]>2)):
        print('Revisando filas de la Matriz de transición...')
        print('mostrando clusters iniciales')
        display(clusters)
        rows_to_drop = (transit_matrix==0).sum(axis=1)==(transit_matrix.shape[1]-1)#todos los valores en 0 excepto 1
        rows_to_drop = [idx for idx,row in enumerate(rows_to_drop) if row]
        if len(rows_to_drop)>0: 
            transit_matrix = np.delete(transit_matrix,rows_to_drop,0)
            clusters=clusters.drop(rows_to_drop, axis=0  )
            print('clusters {} eliminados'.format(rows_to_drop))
            print('Transit matrix actual:')
            display(transit_matrix)
            
        ##Retiramos las columnas que tienen solo ceros una fila con 1
        print('Revisando columnas de la Matriz de transición...')
        print('mostrando clusters iniciales')
        display(clusters)
        cols_to_drop = (transit_matrix==0).sum(axis=0)==(transit_matrix.shape[0]-1)
        cols_to_drop = [idx for idx,col in enumerate(cols_to_drop) if col]
        if len(cols_to_drop)>0: 
            transit_matrix = np.delete(transit_matrix,cols_to_drop,1)
            clusters=clusters.drop(cols_to_drop, axis=0  )
            print('clusters {} eliminados'.format(cols_to_drop))
            print('Transit matrix actual:')
            display(transit_matrix)
    display(transit_matrix)
    # Stationary Vector Assignation
    
    clusters['sta_vector'] = get_stationary_vector(transit_matrix)
    
    return clusters, m

## Reading for 10 users

In [49]:
# Reading 10 frames
data = pd.read_csv('../data/geo_000_009.csv.zip')

data['hour'] = data['date'] + ' ' + data['time']
data['user'] = data['user_id'].map(int)

data = data[geo_columns]

In [73]:
# Extraction of clusters 
geo_clusters = {}
geo_clusters_img = {}
users = sorted(data['user'].unique())
for user in [2]:#users:#tqdm(users):
    #try:
    geo = data[data['user']==user].reset_index(drop=True).copy()

    clusters, m = get_mmc_clusters_stavectors(geo)

    geo_clusters[user] = clusters
    geo_clusters_img[user] = m
    #except:
        #pass

INFO: Noise Filtering applied
INFO: Clusters generated
(248217, 4) (155, 4) (155, 4) (155, 4) (155, 5)
INFO: 6 clusters generated.


100%|██████████| 6/6 [00:26<00:00,  4.37s/it]


array([[0.        , 0.83333333, 0.01190476, 0.04761905, 0.02380952,
        0.08333333],
       [0.16      , 0.        , 0.        , 0.8       , 0.04      ,
        0.        ],
       [0.0625    , 0.03125   , 0.        , 0.        , 0.90625   ,
        0.        ],
       [0.046875  , 0.015625  , 0.        , 0.        , 0.        ,
        0.9375    ],
       [0.95522388, 0.02985075, 0.        , 0.        , 0.01492537,
        0.        ]])

mostrando clusters


Unnamed: 0,cluster,lat,lng
0,0,39.926178,116.337746
1,1,39.90025,116.385515
2,2,39.9895,116.222359
3,3,39.902306,116.367091
4,4,39.90583,116.356539
5,5,39.909956,116.346575


columnas [2] eliminadas


array([[0.        , 0.83333333, 0.04761905, 0.02380952, 0.08333333],
       [0.16      , 0.        , 0.8       , 0.04      , 0.        ],
       [0.0625    , 0.03125   , 0.        , 0.90625   , 0.        ],
       [0.046875  , 0.015625  , 0.        , 0.        , 0.9375    ],
       [0.95522388, 0.02985075, 0.        , 0.01492537, 0.        ]])

Shape of transitMatrix:  (5, 5)


In [None]:
# TODO: Corregir la función de vector estacionario. Hay casos en los que falla

In [None]:
geo_clusters

## Distance Estimation

In [None]:
transitMatrix.T cluster_0 = geo_clusters[0]
cluster_1 = geo_clusters[3]
cluster_2 = geo_clusters[4]

In [None]:
cluster_1

In [None]:
cluster_2

In [None]:
get_mean_distance_bw_clusters(cluster_1=cluster_1, cluster_2=cluster_2)

In [None]:
get_mean_distance_bw_clusters(cluster_1=cluster_0, cluster_2=cluster_2)

## Testing of Parameters

We will test different parameters with cluster combinations to select the best params.

In [None]:
from IPython import display

In [None]:
def test_geo_parameters(geo,
                        verbose=True,
                        max_speed_kmh= 0.01,
                        detect_stops=False,
                        compress=False ,
                        minutes_for_a_stop=2,
                        spatial_radius_km=0.2,
                        spatial_radius_compress_km=.2,
                        cluster_radius_km=0.5,
                        min_samples=2):

    '''Test Parameters for generation of MMCs
    
    '''
    # TDF Definition
    trgeo = skmob.TrajDataFrame(
        geo, 
        datetime='hour',
        user_id='user'
    )

    # Cluster Generation
    clusters, m = get_clusters_from_tdf(
        trgeo,
        verbose=verbose,
        max_speed_kmh= max_speed_kmh,
        detect_stops=detect_stops,
        compress=compress ,
        minutes_for_a_stop=minutes_for_a_stop,
        spatial_radius_km=spatial_radius_km,
        spatial_radius_compress_km=spatial_radius_compress_km,
        cluster_radius_km=cluster_radius_km,
        min_samples=min_samples
    )
    print(f'\nUSER: {geo["user"].values[0]}\n')
    display.display_html(m)

In [None]:
# TESTING PARAMETERS
users = sorted(data['user'].unique())
for user in tqdm(users):
    geo = data[data['user']==user].reset_index(drop=True).copy()
    test_geo_parameters(
        geo,
        verbose=True,
        max_speed_kmh= 0.01,
        detect_stops=False,
        compress=False ,
        minutes_for_a_stop=2,
        spatial_radius_km=0.2,
        spatial_radius_compress_km=.2,
        cluster_radius_km=0.5,
        min_samples=2
    )