In [38]:
#conda install -c conda-forge tslearn
#pip install https://github.com/scikit-learn-contrib/scikit-learn-extra/archive/master.zip

import os
import pathlib

import dask
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytest
#import seaborn as sns 
import xarray as xr
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from shapely.ops import cascaded_union, unary_union
from scipy import stats

from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score

import geokit as gk
import metis_utils.io_tools as ito
import metis_utils.time_tools as tto
import metis_utils.computation_tools as cto
import spagat.grouping as spgm
import spagat.representation as spr
#import spagatti.input as spti
#import spagatti.utils as spu
#import spagatti.dataset as sptd

In [14]:
#dummy data 
data = pd.read_csv("/home/s-patil/data/humidity.csv") 
data = data.dropna()
data.head()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
1,2012-10-01 13:00:00,76.0,81.0,88.0,81.0,88.0,82.0,22.0,23.0,50.0,...,71.0,58.0,93.0,68.0,50.0,63.0,22.0,51.0,51.0,50.0
2,2012-10-01 14:00:00,76.0,80.0,87.0,80.0,88.0,81.0,21.0,23.0,49.0,...,70.0,57.0,91.0,68.0,51.0,62.0,22.0,51.0,51.0,50.0
3,2012-10-01 15:00:00,76.0,80.0,86.0,80.0,88.0,81.0,21.0,23.0,49.0,...,70.0,57.0,87.0,68.0,51.0,62.0,22.0,51.0,51.0,50.0
4,2012-10-01 16:00:00,77.0,80.0,85.0,79.0,88.0,81.0,21.0,23.0,49.0,...,69.0,57.0,84.0,68.0,52.0,62.0,22.0,51.0,51.0,50.0
5,2012-10-01 17:00:00,78.0,79.0,84.0,79.0,88.0,80.0,21.0,24.0,49.0,...,69.0,57.0,80.0,68.0,54.0,62.0,23.0,51.0,51.0,50.0


In [24]:
xarray_values = data.values
xarray_values = xarray_values[:,1:]
xarray_values

array([[76.0, 81.0, 88.0, ..., 51.0, 51.0, 50.0],
       [76.0, 80.0, 87.0, ..., 51.0, 51.0, 50.0],
       [76.0, 80.0, 86.0, ..., 51.0, 51.0, 50.0],
       ...,
       [82.0, 33.0, 22.0, ..., 95.0, 95.0, 60.0],
       [87.0, 35.0, 20.0, ..., 96.0, 96.0, 56.0],
       [87.0, 40.0, 22.0, ..., 96.0, 96.0, 60.0]], dtype=object)

In [25]:
time = data.datetime.values
time

array(['2012-10-01 13:00:00', '2012-10-01 14:00:00',
       '2012-10-01 15:00:00', ..., '2017-10-27 22:00:00',
       '2017-10-27 23:00:00', '2017-10-28 00:00:00'], dtype=object)

In [21]:
regions_ids = list(data.columns)
regions_ids.pop(0)
regions_ids

['Vancouver',
 'Portland',
 'San Francisco',
 'Seattle',
 'Los Angeles',
 'San Diego',
 'Las Vegas',
 'Phoenix',
 'Albuquerque',
 'Denver',
 'San Antonio',
 'Dallas',
 'Houston',
 'Kansas City',
 'Minneapolis',
 'Saint Louis',
 'Chicago',
 'Nashville',
 'Indianapolis',
 'Atlanta',
 'Detroit',
 'Jacksonville',
 'Charlotte',
 'Miami',
 'Pittsburgh',
 'Toronto',
 'Philadelphia',
 'New York',
 'Montreal',
 'Boston',
 'Beersheba',
 'Tel Aviv District',
 'Eilat',
 'Haifa',
 'Nahariyya',
 'Jerusalem']

In [26]:
test_xr_DataArray = xr.DataArray(xarray_values, 
                                 coords={'time': time, 'region_ids': regions_ids}, 
                                 dims=['time','region_ids'])

test_xr_DataArray

<xarray.DataArray (time: 36263, region_ids: 36)>
array([[76.0, 81.0, 88.0, ..., 51.0, 51.0, 50.0],
       [76.0, 80.0, 87.0, ..., 51.0, 51.0, 50.0],
       [76.0, 80.0, 86.0, ..., 51.0, 51.0, 50.0],
       ...,
       [82.0, 33.0, 22.0, ..., 95.0, 95.0, 60.0],
       [87.0, 35.0, 20.0, ..., 96.0, 96.0, 56.0],
       [87.0, 40.0, 22.0, ..., 96.0, 96.0, 60.0]], dtype=object)
Coordinates:
  * time        (time) object '2012-10-01 13:00:00' ... '2017-10-28 00:00:00'
  * region_ids  (region_ids) <U17 'Vancouver' 'Portland' ... 'Jerusalem'

In [27]:
#http://xarray.pydata.org/en/stable/generated/xarray.DataArray.transpose.html  DOES NOT WORK !!!

## Transposing dimensions is enough
transposed_test_xr_DataArray = test_xr_DataArray.transpose(transpose_coords= True)
transposed_test_xr_DataArray

<xarray.DataArray (region_ids: 36, time: 36263)>
array([[76.0, 76.0, 76.0, ..., 82.0, 87.0, 87.0],
       [81.0, 80.0, 80.0, ..., 33.0, 35.0, 40.0],
       [88.0, 87.0, 86.0, ..., 22.0, 20.0, 22.0],
       ...,
       [51.0, 51.0, 51.0, ..., 95.0, 96.0, 96.0],
       [51.0, 51.0, 51.0, ..., 95.0, 96.0, 96.0],
       [50.0, 50.0, 50.0, ..., 60.0, 56.0, 60.0]], dtype=object)
Coordinates:
  * time        (time) object '2012-10-01 13:00:00' ... '2017-10-28 00:00:00'
  * region_ids  (region_ids) <U17 'Vancouver' 'Portland' ... 'Jerusalem'

In [54]:
kmedoids_cluster = KMedoids(n_clusters=2, metric='euclidean').fit(transposed_test_xr_DataArray)
kmedoids_cluster

KMedoids(init='heuristic', max_iter=300, metric='euclidean', n_clusters=2,
         random_state=None)

In [55]:
kmedoids_cluster.labels_

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0])

In [56]:
silhouette_score(transposed_test_xr_DataArray, kmedoids_cluster.labels_ )

0.08825941086200596