In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

In [2]:
%matplotlib auto

Using matplotlib backend: Qt5Agg


In [3]:
data = pd.read_csv('minute_weather.csv')

In [22]:
data.shape

(1587257, 13)

<h1 style="color:purple">Data Sampling</h1>

In [21]:
# get every 10th row
sampled_df = data[(data['rowID'] % 10) == 0]
sampled_df.shape

(158726, 13)

In [None]:
sampled_df.describe()

In [23]:
sampled_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,158726.0,793625.0,458203.93751,0.0,396812.5,793625.0,1190437.5,1587250.0
air_pressure,158726.0,916.830161,3.051717,905.0,914.8,916.7,918.7,929.5
air_temp,158726.0,61.851589,11.833569,31.64,52.7,62.24,70.88,99.5
avg_wind_direction,158680.0,162.1561,95.278201,0.0,62.0,182.0,217.0,359.0
avg_wind_speed,158680.0,2.775215,2.057624,0.0,1.3,2.2,3.8,31.9
max_wind_direction,158680.0,163.462144,92.452139,0.0,68.0,187.0,223.0,359.0
max_wind_speed,158680.0,3.400558,2.418802,0.1,1.6,2.7,4.6,36.0
min_wind_direction,158680.0,166.774017,97.441109,0.0,76.0,180.0,212.0,359.0
min_wind_speed,158680.0,2.134664,1.742113,0.0,0.8,1.6,3.0,31.6
rain_accumulation,158725.0,0.000318,0.011236,0.0,0.0,0.0,0.0,3.12


In [24]:
# how many rows have value zero for x columns
sample_rows, sample_columns = sampled_df.shape
no_rain, columns = sampled_df[sampled_df['rain_accumulation'] == 0].shape
sampled_df[sampled_df['rain_duration'] == 0].shape

(157237, 13)

In [7]:
print('No of days without rain: ', no_rain)
print('Out of sample: ', sample_rows)
print('% no rain: ', (no_rain / sample_rows))

No of days without rain:  157812
Out of sample:  158726
% no rain:  0.9942416491312073


In [8]:
print(len(sampled_df.columns))
print(type(sampled_df.columns))

13
<class 'pandas.core.indexes.base.Index'>


In [25]:
# deleting the rain columns as there is an insignificant amount of rain.
del sampled_df['rain_accumulation']
del sampled_df['rain_duration']


In [26]:
rows_before = sampled_df.shape[0]
sampled_df = sampled_df.dropna()
rows_after = sampled_df.shape[0]

In [27]:
rows_before - rows_after

46

In [28]:
sampled_df.columns

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')

In [29]:
len(sampled_df.columns)

11

<h1 style="color:purple">Select Features of Interest for Clustering</h1>
<h4><ol>
    <li>What do we want to base the clustering on</li>
    </ol>


In [30]:
features = ['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
            'max_wind_direction', 'max_wind_speed', 'relative_humidity']
len(features)

7

<ol start='2'><li><h4>Apply selected columns as a filter</h4></li></ol>

In [31]:
select_df = sampled_df[features]
select_df.columns


Index(['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
       'max_wind_direction', 'max_wind_speed', 'relative_humidity'],
      dtype='object')

In [32]:
len(select_df.columns)

7

<ol start='3'><li><h4>Scale the Data</h4></li></ol>

In [33]:
x = StandardScaler().fit_transform(select_df)

print('shape: ', x.shape)
print('type: ', type(x))
print()
print(x)

shape:  (158680, 7)
type:  <class 'numpy.ndarray'>

[[-1.48456281  0.24544455 -0.68385323 ... -0.62153592 -0.74440309
   0.49233835]
 [-1.48456281  0.03247142 -0.19055941 ...  0.03826701 -0.66171726
  -0.34710804]
 [-1.51733167  0.12374562 -0.65236639 ... -0.44847286 -0.37231683
   0.40839371]
 ...
 [-0.30488381  1.15818654  1.90856325 ...  2.0393087  -0.70306017
   0.01538018]
 [-0.30488381  1.12776181  2.06599745 ... -1.67073075 -0.74440309
  -0.04948614]
 [-0.30488381  1.09733708 -1.63895404 ... -1.55174989 -0.62037434
  -0.05711747]]


In [37]:
plt.scatter(x,x)

<matplotlib.collections.PathCollection at 0x234874f20b8>

<ol start='4'><li><h4>Use k-mean clustering</h4></li></ol>

In [34]:
kmeans = KMeans(n_clusters=12)
model = kmeans.fit(x)
print("model\n", model)

model
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)


<h5>The clusters have now been created</h5>

<ol start='5'><li><h4>You can now view the cluster centers</h4></li></ol>

In [39]:
centers = model.cluster_centers_
centers

array([[-0.84002474, -1.19862151,  0.37511805,  0.35661453,  0.47357947,
         0.34456084,  1.36230622],
       [ 0.13178406,  0.84358797,  1.410738  , -0.63856848,  1.67479528,
        -0.5893245 , -0.71399846],
       [-0.16424999,  0.86412183, -1.31099357, -0.58984787, -1.1667006 ,
        -0.60516001, -0.64111121],
       [ 1.19025844, -0.25461999, -1.15487591,  2.12282305, -1.05339992,
         2.23945426, -1.13469273],
       [ 0.73288269,  0.43261265,  0.28561733, -0.53398709,  0.47336531,
        -0.5402235 , -0.77214188],
       [ 0.06006972, -0.78781194, -1.19669912, -0.5708903 , -1.04262825,
        -0.58541687,  0.8777325 ],
       [-1.18010859, -0.87564065,  0.44663625,  1.97745237,  0.53859908,
         1.9389203 ,  0.91376446],
       [ 0.2339313 ,  0.31909585,  1.88794143, -0.65198177, -1.55164369,
        -0.57681439, -0.28251551],
       [ 0.25085981, -0.9944655 ,  0.65947935, -0.54728597,  0.85080864,
        -0.52997109,  1.15874504],
       [-0.69406128,  0.5468

In [42]:
print(type(centers))
print(len(centers))
print(centers.shape)
print(centers[0])

<class 'numpy.ndarray'>
12
(12, 7)
[-0.84002474 -1.19862151  0.37511805  0.35661453  0.47357947  0.34456084
  1.36230622]


<h2>Plot the cluster centers</h2>
<ol><li><h4>Create helper functions to aid in creating the 
    data to plot</h4></li></ol>

In [47]:
def pd_centers(featuresUsed, centers):
    colNames = list(featuresUsed)
    colNames.append('prediction')
    
    # zip with a column called 'prediction' (index)
    Z = [np.append(A, index) for index, A in enumerate(centers)]
    
    # convert to pandas dataframe for plotting
    P = pd.DataFrame(Z, columns=colNames)
    P['prediction'] = P['prediction'].astype(int)
    return P

<ol start='2'><li><h4>Create helper functions to aid in creating the 
    data to plot</h4></li></ol>

In [50]:
def parallel_plot(data):
    my_colors = list(islice(cycle(['b', 'r', 'g', 'y', 'k']), None, len(data)))
    plt.figure().gca().axes.set_ylim([-3, +3])
    parallel_coordinates(data, 'prediction', color=my_colors, marker='o')

<ol start='2'><li><h4>Fun Helper Functions </h4></li></ol>


In [51]:
P = pd_centers(features, centers)
P

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity,prediction
0,-0.840025,-1.198622,0.375118,0.356615,0.473579,0.344561,1.362306,0
1,0.131784,0.843588,1.410738,-0.638568,1.674795,-0.589325,-0.713998,1
2,-0.16425,0.864122,-1.310994,-0.589848,-1.166701,-0.60516,-0.641111,2
3,1.190258,-0.25462,-1.154876,2.122823,-1.0534,2.239454,-1.134693,3
4,0.732883,0.432613,0.285617,-0.533987,0.473365,-0.540224,-0.772142,4
5,0.06007,-0.787812,-1.196699,-0.57089,-1.042628,-0.585417,0.877732,5
6,-1.180109,-0.875641,0.446636,1.977452,0.538599,1.93892,0.913764,6
7,0.233931,0.319096,1.887941,-0.651982,-1.551644,-0.576814,-0.282516,7
8,0.25086,-0.994466,0.659479,-0.547286,0.850809,-0.529971,1.158745,8
9,-0.694061,0.54685,0.178103,-0.583777,0.347398,-0.597186,-0.11677,9


<ol start='3'><li><h4>Start Plotting</h4></li></ol>

In [52]:
# dry days
parallel_plot(P[P['relative_humidity'] < -0.5])

In [53]:
# Warm days
parallel_plot(P[P['air_temp'] > 0.5])

In [54]:
# cool days
parallel_plot(P[(P['relative_humidity'] > 0.5) & (P['air_temp'] < 0.5)])