# MMC Implementation

## Correct Implementation

In [1]:
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import skmob
from tqdm import tqdm
import sys
sys.path.append('../')
from src.geo_utils import (get_clusters_from_tdf,
                            assign_tdf_points_to_clusters,
                            get_mmc_transitions,
                            get_stationary_vector)


In [2]:
warnings.filterwarnings('ignore')

In [3]:
def get_distance_bw_clusters(cluster_1, cluster_2):
    '''Get Distance Between Clusters

    Every row of cluster_1 is matched with its closest row of cluster_2
    (according to skmob.utils.utils.distance). The matched distances are then
    added up, each one weighted by the 'sta_vector' value of its cluster_1 row.

    Parameters
    ----------
        cluster_1 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
        cluster_2 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
            (only 'lat' and 'lng' are read from this one)

    Returns
    -------
        distance (float): Distance between clusters (one-way)

    Raises
    ------
        ValueError: If cluster_1 has rows but cluster_2 has none, because no
            closest cluster can be chosen.
    '''
    # The coordinates of cluster_2 are the same for every row of cluster_1,
    # so they are extracted once instead of once per outer iteration.
    targets = list(zip(cluster_2['lat'], cluster_2['lng']))

    if len(cluster_1) and not targets:
        raise ValueError('cluster_2 is empty: there is no cluster to measure the distance to')

    # np.min (rather than the builtin min) keeps numpy's NaN propagation.
    nearest_distances = [
        np.min([skmob.utils.utils.distance(origin, target) for target in targets])
        for origin in zip(cluster_1['lat'], cluster_1['lng'])
    ]

    # Reusing the index of cluster_1 keeps the product aligned row by row.
    weighted = pd.Series(nearest_distances, index=cluster_1.index, dtype=float) * cluster_1['sta_vector']
    return weighted.sum()

def get_mean_distance_bw_clusters(cluster_1, cluster_2):
    '''Get Mean Distance Between Clusters

    Averages the two one-way distances: the one measured from cluster_1 to
    cluster_2 and the one measured from cluster_2 to cluster_1.

    Parameters
    ----------
        cluster_1 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
        cluster_2 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']

    Returns
    -------
        mean_distance (float): Distance between clusters (two-way)
    '''
    forward = get_distance_bw_clusters(cluster_1, cluster_2)
    backward = get_distance_bw_clusters(cluster_2, cluster_1)
    return (forward + backward) / 2

In [4]:
def get_mmc_clusters_stavectors(geo):
    '''Get MMC Clusters and Stationary Vectors

    Builds a TrajDataFrame from geo, generates the clusters, assigns every
    point to a cluster, derives the transitions between clusters and their
    row-normalised transition matrix and, when it can be computed, stores the
    stationary vector in the 'sta_vector' column of the clusters.

    Parameters
    ----------
        geo (pd.Dataframe): Dataframe of 1 user_id. Must contain only the columns: ['user', 'hour', 'lat', 'lng']

    Returns
    -------
        clusters (pd.Dataframe): Dataframe with the clusters. The 'sta_vector'
            column is present only when the stationary vector was computed.
        m (folium.folium.Map): Map generated with the clusters on it.
        transit_matrix (np.ndarray): Values of the crosstab of 'cluster' vs
            'cluster_next', normalised per row.
        transit_df (pd.Dataframe): Transitions returned by get_mmc_transitions.
    '''
    # Imported locally so this function does not depend on the notebook-level
    # name `display`, which a later `from IPython import display` rebinds to
    # the module (a module is not callable).
    from IPython.display import display

    # TDF Definition
    trgeo = skmob.TrajDataFrame(
        geo,
        datetime='hour',
        user_id='user'
    )

    # Cluster Generation
    clusters, m = get_clusters_from_tdf(
        trgeo,
        verbose=True,
        max_speed_kmh=0.1,
        detect_stops=False,
        compress=False,
        minutes_for_a_stop=2,
        spatial_radius_km=0.2,
        spatial_radius_compress_km=.2,
        cluster_radius_km=0.5,
        min_samples=2
    )

    # Cluster Assignation
    trgeo_cl, distances = assign_tdf_points_to_clusters(
        tdf=trgeo,
        clusters=clusters
    )

    # Generation of Transit Dataframe
    transit_df = get_mmc_transitions(trgeo_cl)

    # Generation of Transit Matrix
    # NOTE(review): crosstab only includes the labels observed in each column,
    # so the matrix is not square when a cluster never shows up as origin or
    # as destination; presumably get_stationary_vector fails then - confirm.
    transit_matrix = pd.crosstab(transit_df['cluster'],
                                 transit_df['cluster_next'],
                                 normalize='index').values
    display(transit_matrix)

    # NOTE: an earlier experiment (disabled, marked "FIX ANTHONY") pruned the
    # clusters without an exit or an entry before this point: for matrices
    # larger than 2x2 it deleted the rows having a single non-zero entry, then
    # the columns having a single non-zero entry, and dropped the same
    # positions from `clusters`.

    # Stationary Vector Assignation
    # Best effort: the clusters are still returned without 'sta_vector' when
    # the vector cannot be computed or assigned. The reason is printed because
    # this notebook silences warnings; only Exception is caught so that an
    # interrupt still stops the run.
    try:
        clusters['sta_vector'] = get_stationary_vector(transit_matrix)
    except Exception as exc:
        print(f'WARNING: stationary vector not assigned ({type(exc).__name__}: {exc})')

    return clusters, m, transit_matrix, transit_df

## Reading for 10 users

In [5]:
# Reading 10 frames: load the zipped CSV, build the 'hour' and 'user'
# columns and keep only the geo columns
geo_columns = ['user', 'hour', 'lat', 'lng']

data = pd.read_csv('../data/geo_000_009.csv.zip')
data = data.assign(
    hour=data['date'] + ' ' + data['time'],
    user=data['user_id'].map(int),
)[geo_columns]

In [7]:
data = pd.read_parquet('../data/geolife_consolidated.parquet')

In [9]:
data[(data['user']>=0) & (data['user']<10)].to_parquet('../data/geolife_sample.parquet')

In [10]:
data = pd.read_parquet('../data/geolife_sample.parquet')

In [None]:
# Extraction of clusters
# Results are stored per user; users whose processing fails are skipped.
geo_clusters = {}
geo_clusters_img = {}
geo_clusters_transit = {}
geo_clusters_transit_df = {}
# Exception raised for every skipped user, kept so failures can be inspected
geo_clusters_errors = {}
users = sorted(data['user'].unique())
for user in tqdm(users):
    geo = data[data['user']==user].reset_index(drop=True).copy()

    # Only Exception is caught so that interrupting this long loop still works
    try:
        clusters, m, transit_matrix, transit_df = get_mmc_clusters_stavectors(geo)
    except Exception as exc:
        geo_clusters_errors[user] = exc
        # tqdm.write prints without breaking the progress bar
        tqdm.write(f'WARNING: user {user} skipped ({type(exc).__name__}: {exc})')
        continue

    geo_clusters[user] = clusters
    geo_clusters_img[user] = m
    geo_clusters_transit[user] = transit_matrix
    geo_clusters_transit_df[user] = transit_df

 10%|█         | 1/10 [03:15<29:21, 195.74s/it]

INFO: Clusters generated
(108607, 7) (108607, 7) (108607, 7) (108607, 7) (108607, 8)
INFO: 3 clusters generated.



  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:01<00:02,  1.46s/it][A
 67%|██████▋   | 2/3 [00:02<00:01,  1.44s/it][A
100%|██████████| 3/3 [00:04<00:00,  1.62s/it][A


array([[1., 0.],
       [0., 1.]])

 20%|██        | 2/10 [04:18<15:39, 117.44s/it]

Shape of transitMatrix:  (2, 2)


 30%|███       | 3/10 [14:19<39:27, 338.28s/it]

- Reason why only 3 stationary vectors were generated:
    - Only 3 users had more than 2 clusters generated. The remaining users had either just 1 or just 2.

In [70]:
list(geo_clusters)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [71]:
geo_clusters

{0:     cluster        lat         lng  sta_vector
 0         0  40.003246  116.324191    0.245846
 1         1  39.991716  116.349004    0.021381
 2         2  39.981235  116.332803    0.085540
 3         3  39.988041  116.429559    0.075040
 4         4  40.026428  116.316120    0.010689
 5         5  39.987518  116.326923    0.198747
 6         6  39.986203  116.368617    0.160799
 7         7  40.015903  116.324611    0.032067
 8         8  40.005311  116.303108    0.042016
 9         9  39.967301  116.327827    0.054509
 10       10  39.909989  116.304525    0.009171
 11       11  39.984702  116.318417    0.064196,
 1:    cluster        lat         lng
 0        0  40.013800  116.306496
 1        1  40.001162  116.309039
 2        2  40.012005  116.321618
 3        3  39.988112  116.309925
 4        4  39.984094  116.319236,
 2:    cluster        lat         lng
 0        0  39.899802  116.384715
 1        1  39.926184  116.337697
 2        2  39.908563  116.346460
 3        3  39

In [75]:
geo_clusters[6]

Unnamed: 0,cluster,lat,lng
0,0,39.981983,116.34465
1,1,39.959561,116.340412
2,2,39.920292,116.304426
3,3,39.984071,116.320147


In [79]:
geo_clusters_transit[6]

array([[0. , 0.5, 0.5],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ]])

In [141]:
geo_clusters_transit_df[6]

Unnamed: 0,uid,datetime,lat,lng,cluster,cluster_next,transition
92,6,2008-10-23 07:07:05,39.984248,116.322444,d_cl_03,d_cl_00,d_cl_03-d_cl_00
365,6,2008-10-24 10:51:18,39.982554,116.346848,d_cl_00,d_cl_02,d_cl_00-d_cl_02
2048,6,2008-10-25 06:06:30,39.918977,116.303938,d_cl_02,d_cl_00,d_cl_02-d_cl_00
5371,6,2008-11-05 11:53:58,39.981906,116.342307,d_cl_00,d_cl_01,d_cl_00-d_cl_01
6028,6,2008-11-06 06:38:31,39.960441,116.340483,d_cl_01,d_cl_00,d_cl_01-d_cl_00
10789,6,2008-11-12 11:51:42,39.982794,116.342671,d_cl_00,d_cl_01,d_cl_00-d_cl_01
11157,6,2008-11-13 06:21:55,39.961223,116.339768,d_cl_01,d_cl_00,d_cl_01-d_cl_00
12929,6,2008-11-16 06:07:49,39.982195,116.346976,d_cl_00,d_cl_02,d_cl_00-d_cl_02
13444,6,2008-11-16 06:49:18,39.918702,116.30378,d_cl_02,d_cl_00,d_cl_02-d_cl_00
15763,6,2008-11-19 11:50:53,39.982055,116.342364,d_cl_00,d_cl_01,d_cl_00-d_cl_01


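- This kind of cluster should not exist: `d_cl_03` is only the origin of the first transition and never a `cluster_next`, so the transition matrix of user 6 has 4 rows but only 3 columns (it is not square).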

In [78]:
dd['cluster_next'].value_counts()

d_cl_00    7
d_cl_01    3
d_cl_02    3
Name: cluster_next, dtype: int64

In [55]:
dd

Unnamed: 0,uid,datetime,lat,lng,cluster,cluster_next,transition


In [103]:
# 0, 3, 4, 7
geo_clusters[]

Unnamed: 0,cluster,lat,lng,sta_vector
0,0,40.000718,116.326845,0.5
1,1,40.005502,116.333585,0.5


In [142]:
geo_clusters[0]

Unnamed: 0,cluster,lat,lng,sta_vector
0,0,40.003246,116.324191,0.245846
1,1,39.991716,116.349004,0.021381
2,2,39.981235,116.332803,0.08554
3,3,39.988041,116.429559,0.07504
4,4,40.026428,116.31612,0.010689
5,5,39.987518,116.326923,0.198747
6,6,39.986203,116.368617,0.160799
7,7,40.015903,116.324611,0.032067
8,8,40.005311,116.303108,0.042016
9,9,39.967301,116.327827,0.054509


In [137]:
cluster_list = [geo_clusters[0], geo_clusters[3], geo_clusters[4], geo_clusters[7],
               geo_clusters[0], geo_clusters[3]]

In [1]:
import sys

In [2]:
sys.path.append('../')

In [None]:
from src.geo_utils import get_mmc_distances_matrix

In [152]:
mmc_matrix = get_mmc_distances_matrix(cluster_list, cluster_list)
mmc_matrix

array([[0.    , 0.5945, 0.8877, 1.7545, 0.    , 0.5945],
       [0.5945, 0.    , 0.7266, 1.4086, 0.5945, 0.    ],
       [0.8877, 0.7266, 0.    , 1.5542, 0.8877, 0.7266],
       [1.7545, 1.4086, 1.5542, 0.    , 1.7545, 1.4086],
       [0.    , 0.5945, 0.8877, 1.7545, 0.    , 0.5945],
       [0.5945, 0.    , 0.7266, 1.4086, 0.5945, 0.    ]])

In [149]:
print(get_mmc_distances_matrix(cluster_list, cluster_list))

[[0.     0.5945 0.8877 1.7545 0.     0.5945]
 [0.5945 0.     0.7266 1.4086 0.5945 0.    ]
 [0.8877 0.7266 0.     1.5542 0.8877 0.7266]
 [1.7545 1.4086 1.5542 0.     1.7545 1.4086]
 [0.     0.5945 0.8877 1.7545 0.     0.5945]
 [0.5945 0.     0.7266 1.4086 0.5945 0.    ]]


In [153]:
mmc_matrix

array([[0.    , 0.5945, 0.8877, 1.7545, 0.    , 0.5945],
       [0.5945, 0.    , 0.7266, 1.4086, 0.5945, 0.    ],
       [0.8877, 0.7266, 0.    , 1.5542, 0.8877, 0.7266],
       [1.7545, 1.4086, 1.5542, 0.    , 1.7545, 1.4086],
       [0.    , 0.5945, 0.8877, 1.7545, 0.    , 0.5945],
       [0.5945, 0.    , 0.7266, 1.4086, 0.5945, 0.    ]])

In [111]:
distance_matrix

array([[0.        , 0.59449514, 0.88774429, 1.75452123],
       [0.59449514, 0.        , 0.726551  , 1.40861446],
       [0.88774429, 0.726551  , 0.        , 1.55416061],
       [1.75452123, 1.40861446, 1.55416061, 0.        ]])

In [109]:
distance_matrix

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

## Distance Estimation

In [81]:
cluster_0 = geo_clusters[0]
cluster_1 = geo_clusters[3]
cluster_2 = geo_clusters[4]

In [84]:
get_mean_distance_bw_clusters(cluster_1=cluster_1, cluster_2=cluster_2)

0.7265510040177836

In [85]:
get_mean_distance_bw_clusters(cluster_1=cluster_0, cluster_2=cluster_2)

0.8877442937799871

## Testing of Parameters

We will test different parameters with cluster combinations to select the best params.

In [None]:
from IPython import display

In [None]:
def test_geo_parameters(geo,
                        verbose=True,
                        max_speed_kmh= 0.01,
                        detect_stops=False,
                        compress=False ,
                        minutes_for_a_stop=2,
                        spatial_radius_km=0.2,
                        spatial_radius_compress_km=.2,
                        cluster_radius_km=0.5,
                        min_samples=2):
    '''Test Parameters for generation of MMCs

    Generates the clusters of one user with the given parameters, prints the
    user and shows the resulting map. Nothing is returned.

    Relies on a global `display` that exposes `display_html`
    (e.g. `from IPython import display`).

    Parameters
    ----------
        geo (pd.Dataframe): Dataframe with the columns 'user', 'hour', 'lat' and 'lng'.
            The user printed is the one of the first row.
        verbose, max_speed_kmh, detect_stops, compress, minutes_for_a_stop,
        spatial_radius_km, spatial_radius_compress_km, cluster_radius_km,
        min_samples: Forwarded unchanged to get_clusters_from_tdf.
    '''
    # Parameters handed over as they are to the cluster generation
    cluster_params = dict(
        verbose=verbose,
        max_speed_kmh=max_speed_kmh,
        detect_stops=detect_stops,
        compress=compress,
        minutes_for_a_stop=minutes_for_a_stop,
        spatial_radius_km=spatial_radius_km,
        spatial_radius_compress_km=spatial_radius_compress_km,
        cluster_radius_km=cluster_radius_km,
        min_samples=min_samples,
    )

    # TDF Definition
    trajectories = skmob.TrajDataFrame(geo, datetime='hour', user_id='user')

    # Cluster Generation (only the map is needed here)
    _, cluster_map = get_clusters_from_tdf(trajectories, **cluster_params)

    user_label = geo["user"].values[0]
    print(f'\nUSER: {user_label}\n')
    display.display_html(cluster_map)

In [None]:
# TESTING PARAMETERS
# Shows the cluster map of every user for one fixed set of parameters
users = sorted(data['user'].unique())
for user in tqdm(users):
    geo = data.loc[data['user'] == user].reset_index(drop=True).copy()
    test_geo_parameters(
        geo,
        verbose=True,
        # stop detection / compression
        detect_stops=False, compress=False, minutes_for_a_stop=2,
        max_speed_kmh=0.1,
        # radii (km)
        spatial_radius_km=0.2, spatial_radius_compress_km=.2, cluster_radius_km=0.5,
        min_samples=2,
    )