In [None]:
import os
import numpy as np
from dask import persist
import dask.dataframe as df
from operator import add
from dask.distributed import Client
import matplotlib.pyplot as plt
from functools import reduce
import geopandas as gpd

# Locate the project's data folders relative to the parent of the current
# working directory (the notebook is expected to run from a sub-folder).
head, tail = os.path.split(os.getcwd())
data_dir = os.path.join(head, 'data')
data_raw_dir, data_interim_dir = (
    os.path.join(data_dir, stage) for stage in ('raw', 'interim')
)

In [None]:
from dask.distributed import Client

# Create the Dask distributed client with default arguments. To attach to an
# already-running scheduler instead, pass its address, e.g. "tcp://127.0.0.1:58293".
client = Client()
# Last expression of the cell: renders the client summary in the notebook.
client

# Loading data

In [None]:
# Read the interim parking-spot CSV as a Dask dataframe in 512 KiB blocks,
# then keep only the spot identifier and its centre coordinates.
park_spot_data = df.read_csv(
    os.path.join(data_interim_dir, 'spots_with_cities.csv'),
    encoding='cp1252',
    blocksize='512KiB',
)
park_spot_data = park_spot_data[
    ['sNoPlace', 'nPositionCentreLongitude', 'nPositionCentreLatitude']
]

# Borough names; in this notebook only the length of the list is used, as the
# number of clusters (k) to fit.
list_of_borough = [
    'Plateau-Mont-Royal',
    'Saint-Léonard',
    'Rosemont - La Petite-Patrie',
    'Outremont',
    'Ville-Marie',
    'Mercier - Hochelaga-Maisonneuve',
    'LaSalle',
    'Villeray - Saint-Michel - Parc-Extension',
    'Rivière-des-Prairies - Pointe-aux-Trembles',
    'Côte-des-Neiges - Notre-Dame-de-Grâce',
    'Lachine',
    'Saint-Laurent',
    'Ahuntsic - Cartierville',
    'Sud-Ouest',
    'Anjou',
    'Montréal-Nord',
    'Verdun',
    'Pierrefonds - Roxboro',
]


# Initializing centroids randomly

In [None]:
# Pick k = len(list_of_borough) starting centroids from a 0.2 % random sample
# of the spots. The draw is seeded so that the initial centroids -- and hence
# the clustering -- are the same on every Restart & Run All.
# head(..., npartitions=-1) looks across all partitions, so the k rows are not
# limited to what the first partition happens to contain.
# NOTE(review): if the sample holds fewer than k rows, fewer centroids are
# returned and the fit silently uses a smaller k.
init_centroids = (
    park_spot_data
    .sample(frac=0.002, random_state=42)
    .head(len(list_of_borough), npartitions=-1)
)

In [None]:
# Build the working centroid table: renumber the sampled rows from 0 and expose
# each sampled spot's identifier as the centroid's `name`.
centroids = init_centroids.reset_index(drop=True)
centroids = centroids.rename(columns={'sNoPlace': 'name'})

In [None]:
# Current cluster label of every spot; starts as an empty string so it can be
# compared against the labels computed by the fitting loop.
park_spot_data['centroids']=''
# Number of fitting passes run so far (incremented by the fitting loop).
epoch=0

In [None]:
def assignCentroid(row, centroids):
    """Return the name of the centroid closest to ``row``.

    Closeness is the squared Euclidean distance between the row's
    ``nPositionCentreLongitude`` / ``nPositionCentreLatitude`` and the same two
    fields of each centroid.

    Parameters
    ----------
    row : object exposing ``nPositionCentreLongitude`` and
        ``nPositionCentreLatitude`` attributes.
    centroids : table exposing ``itertuples()`` whose records carry ``name``,
        ``nPositionCentreLongitude`` and ``nPositionCentreLatitude``.

    Returns
    -------
    The ``name`` of the nearest centroid. When several centroids are equally
    near, the one appearing last in ``centroids`` is returned.
    """
    def _keep_nearer(best, challenger):
        # The current best survives only when it is strictly nearer, so ties
        # go to the later centroid.
        return best if best[1] < challenger[1] else challenger

    scored = (
        (candidate.name,
         (row.nPositionCentreLongitude - candidate.nPositionCentreLongitude) ** 2
         + (row.nPositionCentreLatitude - candidate.nPositionCentreLatitude) ** 2)
        for candidate in centroids.itertuples()
    )
    nearest_name, _ = reduce(_keep_nearer, scored)
    return nearest_name

# Fitting loop

In [None]:
# k-means fitting: alternately (1) label every spot with its nearest centroid
# and (2) move each centroid to the mean position of the spots labelled with it,
# until a pass changes no label.
max_epochs = 100  # safety cap so assignments that never settle cannot loop forever
while epoch < max_epochs:
    epoch += 1
    # Step 1: nearest-centroid label for every spot. The labels are centroid
    # names, so `meta` takes its dtype from the names themselves instead of
    # hard-coding a float.
    park_spot_data['new_centroids'] = park_spot_data.apply(
        assignCentroid,
        axis=1,
        args=(centroids,),
        meta=('new_centroids', centroids['name'].dtype),
    )
    # Materialise the labels on the cluster once; they are read several times below.
    park_spot_data = park_spot_data.persist()
    # len() triggers the computation and counts the spots whose label changed.
    spot_changed_number = len(
        park_spot_data[park_spot_data['centroids'] != park_spot_data['new_centroids']]
    )
    print('Epoch ',epoch,':',spot_changed_number)
    if spot_changed_number==0:
        break
    # Step 2: new centroid = mean coordinates of its members. The result has at
    # most one row per cluster, so it is pulled into a local pandas frame; that
    # keeps `centroids` the same kind of object on every pass and lets
    # assignCentroid iterate it cheaply for each row.
    # A centroid that attracted no spot has no group here and drops out.
    centroids = (
        park_spot_data[['nPositionCentreLongitude','nPositionCentreLatitude','new_centroids']]
        .groupby('new_centroids')
        .mean()
        .reset_index()
        .rename(columns={'new_centroids':'name'})
        .compute()
    )
    park_spot_data['centroids']=park_spot_data['new_centroids']
else:
    # Reached only when the cap is hit without a zero-change pass.
    print('Stopped after', max_epochs, 'epochs without converging')
    

# Plotting the result

In [None]:
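# Disabled alternative: one matplotlib scatter series per cluster.
# (Column names updated to the coordinate columns actually loaded above.)
# groups = park_spot_data.groupby('centroids')
# groups.head()
# for name, group in groups:
#     plt.plot(group.nPositionCentreLongitude, group.nPositionCentreLatitude, marker='.', linestyle='',  label=name)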


In [None]:
# Collect the clustered Dask dataframe into a single in-memory result for plotting.
parking_spots = park_spot_data.compute()

In [None]:
# Preview the first rows to check the coordinate and cluster-label columns.
parking_spots.head()

In [None]:
# Turn each spot into a point geometry (x = longitude, y = latitude).
# The coordinate columns kept when the data was loaded are
# nPositionCentreLongitude / nPositionCentreLatitude; the frame has no
# nLongitude / nLatitude columns, so those are the ones that must be used here.
# NOTE(review): no CRS is set on the result -- confirm it matches the borough layer.
parking_spots_geo = gpd.GeoDataFrame(
    parking_spots,
    geometry=gpd.points_from_xy(
        parking_spots.nPositionCentreLongitude,
        parking_spots.nPositionCentreLatitude,
    ),
)
parking_spots_geo

In [None]:
# Load the borough GeoJSON from the raw data folder; it is drawn as the base
# layer of the map below.
boroughs = gpd.read_file(os.path.join(data_raw_dir, 'montreal_boroughs.geojson'))

In [None]:
fig, ax = plt.subplots(1, figsize=(8, 4))
# Base layer: borough outlines.
boroughs.plot(ax=ax)
# Overlay the spots coloured by their cluster label. `column` is what ties the
# colormap to the clusters (without it the colours do not encode the label);
# `categorical=True` treats each label as a discrete class rather than a number.
parking_spots_geo.plot(ax=ax, column='centroids', categorical=True, marker='o', cmap = "hsv", alpha=0.075, label='centroids')
# NOTE(review): this title does not describe a parking-spot clustering map --
# confirm the intended wording.
plt.title('Towing in the year')
# plt.savefig(os.path.join(RPT_FIGURES_DIR, output_filename))

In [None]:


# List the distinct cluster labels that ended up assigned to at least one spot.
parking_spots_geo['centroids'].unique()