# MMC Implementation

## Correct Implementation

In [1]:
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import skmob
from tqdm import tqdm
import sys
sys.path.append('../')
from src.geo_utils import (get_clusters_from_tdf,
                            assign_tdf_points_to_clusters,
                            get_mmc_transitions,
                            get_stationary_vector)


In [2]:
warnings.filterwarnings('ignore')

In [99]:
def get_distance_bw_clusters(cluster_1, cluster_2):
    '''Get Distance Between Clusters
    
    Parameters
    ----------
        cluster_1 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
        cluster_2 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
    
    Returns
    -------
        distance (float): Distance between clusters (one-way)
    '''
    valid_distances = []
    valid_clusters = []
    for i, row_i in cluster_1.iterrows():    
        coord_i = (row_i['lat'], row_i['lng'])

        clusters, distances = [], []    
        for j, row_j in cluster_2.iterrows():
            cluster_j = row_j['cluster']
            coord_j = (row_j['lat'], row_j['lng'])

            distance_ij = skmob.utils.utils.distance(coord_i, coord_j)    
            clusters.append(cluster_j)
            distances.append(distance_ij)

        distances = np.array(distances)

        idx_min = distances.argmin()
        min_dist = distances.min()
        cluster_min = clusters[idx_min]

        valid_distances.append(min_dist)
        valid_clusters.append(cluster_min)

    cluster_est = cluster_1.copy()
    cluster_est['cluster_other'] = valid_clusters
    cluster_est['distance_other'] = valid_distances
    distance = (cluster_est['distance_other'] * cluster_est['sta_vector']).sum()
    return distance

def get_mean_distance_bw_clusters(cluster_1, cluster_2):
    '''Get Mean Distance Between Clusters
    
    Get the distances of cluster_1 to cluster_2 and viceversa and returns the average.

    Parameters
    ----------
        cluster_1 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
        cluster_2 (pd.Dataframe): df with columns ['cluster', 'lat', 'lng', 'sta_vector']
    
    Returns
    -------
        mean_distance (float): Distance between clusters (two-way)
    '''
    d1 = get_distance_bw_clusters(cluster_1, cluster_2)
    d2 = get_distance_bw_clusters(cluster_2, cluster_1)
    mean_distance = (d1+d2)/2
    return mean_distance

In [8]:
def get_mmc_clusters_stavectors(geo):
    '''Get MMC Clusters and Stationary Vectors
    
    Parameters
    ----------
        geo (pd.Dataframe): Dataframe of 1 user_id. Must contain only the columns: ['user', 'hour', 'lat', 'lng']
    
    Returns
    -------
        clusters (pd.Dataframe): Dataframe with the clusters and the stationary value
        m (folium.folium.Map): Map generated with the clusters on it.
    '''
    # TDF Definition
    trgeo = skmob.TrajDataFrame(
        geo, 
        datetime='hour',
        user_id='user'
    )

    # Cluster Generation
    clusters, m = get_clusters_from_tdf(
        trgeo,
        verbose=True,
        max_speed_kmh= 0.01,
        detect_stops=False,
        compress=False ,
        minutes_for_a_stop=2,
        spatial_radius_km=0.2,
        spatial_radius_compress_km=.2,
        cluster_radius_km=0.5,
        min_samples=2
    )

    # Cluster Assignation
    trgeo_cl, distances = assign_tdf_points_to_clusters(
        tdf=trgeo, 
        clusters=clusters
    )

    # Generation of Transit Dataframe
    transit_df = get_mmc_transitions(trgeo_cl)

    # Generation of Transit Matrix
    transit_matrix = pd.crosstab(transit_df['cluster'], 
                                 transit_df['cluster_next'],
                                 normalize='index').values

    # Stationary Vector Assignation
    clusters['sta_vector'] = get_stationary_vector(transit_matrix)

    return clusters, m

In [24]:
#url_usr82 = 'https://raw.githubusercontent.com/bitmapup/mmc3/master/data/geolife_82.csv'
#geo82 = pd.read_csv(url_usr82, header=None)

geo_columns = ['user', 'hour', 'lat', 'lng']
geo = pd.read_csv('../data/geo82.csv.zip')
geo.columns = geo_columns

## Reading for 10 users

In [29]:
# Reading 10 frames
data = pd.read_csv('../data/geo_000_009.csv.zip')

data['hour'] = data['date'] + ' ' + data['time']
data['user'] = data['user_id'].map(int)

data = data[geo_columns]

In [43]:
# Extraction of clusters 
geo_clusters = {}
geo_clusters_img = {}
users = sorted(data['user'].unique())
for user in tqdm(users):
    try:
        geo = data[data['user']==user].reset_index(drop=True).copy()

        clusters, m = get_mmc_clusters_stavectors(geo)

        geo_clusters[user] = clusters
        geo_clusters_img[user] = m
    except:
        pass

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(173870, 4) (211, 4) (211, 4) (211, 4) (211, 5)
INFO: 2 clusters generated.



 50%|█████     | 1/2 [00:02<00:02,  2.66s/it][A
100%|██████████| 2/2 [00:05<00:00,  2.61s/it][A
 10%|█         | 1/10 [00:11<01:40, 11.12s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(108607, 4) (46, 4) (46, 4) (46, 4) (46, 5)
INFO: 4 clusters generated.



 25%|██▌       | 1/4 [00:01<00:04,  1.63s/it][A
 50%|█████     | 2/4 [00:03<00:03,  1.63s/it][A
 75%|███████▌  | 3/4 [00:04<00:01,  1.61s/it][A
100%|██████████| 4/4 [00:06<00:00,  1.58s/it][A
 20%|██        | 2/10 [00:20<01:20, 10.09s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(248217, 4) (155, 4) (155, 4) (155, 4) (155, 5)
INFO: 6 clusters generated.



 17%|█▋        | 1/6 [00:03<00:18,  3.73s/it][A
 33%|███▎      | 2/6 [00:07<00:14,  3.67s/it][A
 50%|█████     | 3/6 [00:10<00:10,  3.64s/it][A
 67%|██████▋   | 4/6 [00:14<00:07,  3.74s/it][A
 83%|████████▎ | 5/6 [00:18<00:03,  3.70s/it][A
100%|██████████| 6/6 [00:22<00:00,  3.68s/it][A
 30%|███       | 3/10 [00:52<02:21, 20.21s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(485226, 4) (390, 4) (390, 4) (390, 4) (390, 5)
INFO: 3 clusters generated.



 33%|███▎      | 1/3 [00:06<00:13,  6.77s/it][A
 67%|██████▋   | 2/3 [00:13<00:06,  6.96s/it][A
100%|██████████| 3/3 [00:20<00:00,  6.93s/it][A
 40%|████      | 4/10 [01:43<03:14, 32.45s/it]
  0%|          | 0/8 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(439397, 4) (384, 4) (384, 4) (384, 4) (384, 5)
INFO: 8 clusters generated.



 12%|█▎        | 1/8 [00:06<00:44,  6.34s/it][A
 25%|██▌       | 2/8 [00:13<00:40,  6.78s/it][A
 38%|███▊      | 3/8 [00:20<00:34,  6.82s/it][A
 50%|█████     | 4/8 [00:26<00:26,  6.67s/it][A
 62%|██████▎   | 5/8 [00:33<00:19,  6.65s/it][A
 75%|███████▌  | 6/8 [00:39<00:13,  6.56s/it][A
 88%|████████▊ | 7/8 [00:46<00:06,  6.52s/it][A
100%|██████████| 8/8 [00:52<00:00,  6.56s/it][A
 50%|█████     | 5/10 [03:02<04:05, 49.18s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(109046, 4) (85, 4) (85, 4) (85, 4) (85, 5)
INFO: 1 clusters generated.



100%|██████████| 1/1 [00:01<00:00,  1.60s/it][A
 60%|██████    | 6/10 [03:07<02:16, 34.05s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(31830, 4) (24, 4) (24, 4) (24, 4) (24, 5)
INFO: 2 clusters generated.



 50%|█████     | 1/2 [00:00<00:00,  2.35it/s][A
100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A
 70%|███████   | 7/10 [03:09<01:10, 23.44s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(87217, 4) (60, 4) (60, 4) (60, 4) (60, 5)
INFO: 1 clusters generated.



100%|██████████| 1/1 [00:01<00:00,  1.17s/it][A
 80%|████████  | 8/10 [03:12<00:34, 17.06s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(77910, 4) (41, 4) (41, 4) (41, 4) (41, 5)
INFO: 1 clusters generated.



100%|██████████| 1/1 [00:01<00:00,  1.17s/it][A
 90%|█████████ | 9/10 [03:15<00:12, 12.69s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

INFO: Noise Filtering applied
INFO: Clusters generated
(84616, 4) (41, 4) (41, 4) (41, 4) (41, 5)
INFO: 2 clusters generated.



 50%|█████     | 1/2 [00:01<00:01,  1.44s/it][A
100%|██████████| 2/2 [00:02<00:00,  1.46s/it][A
100%|██████████| 10/10 [03:20<00:00, 20.06s/it]


In [104]:
# TODO: Corregir la función de vector estacionario. Hay casos en los que falla

In [44]:
geo_clusters_img

{0: <folium.folium.Map at 0x7fe4c7698950>,
 3: <folium.folium.Map at 0x7fe4c78b3390>,
 4: <folium.folium.Map at 0x7fe4f954f990>}

## Distance Estimation

In [101]:
cluster_0 = geo_clusters[0]
cluster_1 = geo_clusters[3]
cluster_2 = geo_clusters[4]

In [52]:
cluster_1

Unnamed: 0,cluster,lat,lng,sta_vector
0,0,39.999859,116.326749,0.474359
1,1,39.987315,116.309833,0.384615
2,2,39.995477,116.288653,0.141026


In [51]:
cluster_2

Unnamed: 0,cluster,lat,lng,sta_vector
0,0,39.999944,116.326035,0.312925
1,1,39.9391,116.348471,0.136054
2,2,39.966866,116.378085,0.054422
3,3,39.986601,116.381269,0.108844
4,4,39.974776,116.331752,0.217687
5,5,39.974543,116.41531,0.013605
6,6,39.987192,116.398921,0.061224
7,7,39.987036,116.389978,0.095238


In [100]:
get_mean_distance_bw_clusters(cluster_1=cluster_1, cluster_2=cluster_2)

2.245555609708128

In [102]:
get_mean_distance_bw_clusters(cluster_1=cluster_0, cluster_2=cluster_2)

1.9047901696538245

## Testing of Parameters

We will test different parameters with cluster combinations to select the best params.

In [112]:
from IPython import display

In [119]:
def test_geo_parameters(geo,
                        verbose=True,
                        max_speed_kmh= 0.01,
                        detect_stops=False,
                        compress=False ,
                        minutes_for_a_stop=2,
                        spatial_radius_km=0.2,
                        spatial_radius_compress_km=.2,
                        cluster_radius_km=0.5,
                        min_samples=2):

    '''Test Parameters for generation of MMCs
    
    '''
    # TDF Definition
    trgeo = skmob.TrajDataFrame(
        geo, 
        datetime='hour',
        user_id='user'
    )

    # Cluster Generation
    clusters, m = get_clusters_from_tdf(
        trgeo,
        verbose=verbose,
        max_speed_kmh= max_speed_kmh,
        detect_stops=detect_stops,
        compress=compress ,
        minutes_for_a_stop=minutes_for_a_stop,
        spatial_radius_km=spatial_radius_km,
        spatial_radius_compress_km=spatial_radius_compress_km,
        cluster_radius_km=cluster_radius_km,
        min_samples=min_samples
    )
    print(f'\nUSER: {geo["user"].values[0]}\n')
    display.display_html(m)

In [None]:
# Extraction of clusters 
users = sorted(data['user'].unique())
for user in tqdm(users):
    geo = data[data['user']==user].reset_index(drop=True).copy()
    test_geo_parameters(
        geo,
        verbose=True,
        max_speed_kmh= 0.01,
        detect_stops=False,
        compress=False ,
        minutes_for_a_stop=2,
        spatial_radius_km=0.2,
        spatial_radius_compress_km=.2,
        cluster_radius_km=0.5,
        min_samples=2
    )

  0%|          | 0/10 [00:00<?, ?it/s]

INFO: Noise Filtering applied
INFO: Clusters generated
(173870, 4) (211, 4) (211, 4) (211, 4) (211, 5)
INFO: 2 clusters generated.

USER: 0



 10%|█         | 1/10 [00:06<00:56,  6.28s/it]

INFO: Noise Filtering applied
INFO: Clusters generated
(108607, 4) (46, 4) (46, 4) (46, 4) (46, 5)
INFO: 4 clusters generated.

USER: 1



 20%|██        | 2/10 [00:09<00:35,  4.46s/it]

INFO: Noise Filtering applied
INFO: Clusters generated
(248217, 4) (155, 4) (155, 4) (155, 4) (155, 5)
INFO: 6 clusters generated.

USER: 2



 30%|███       | 3/10 [00:21<00:55,  7.96s/it]