In [22]:
import numpy as np
import pandas as pd
import datetime

# 1. Functions

In [23]:
def add_day_of_year_column(df_src, column_name='date'):
    df_out = df_src.copy()
    df_out['day_of_year'] = df_out[column_name].apply(lambda d: date_to_nth_day(str(d)))
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    date = datetime.datetime.strptime(date, format)
    new_year_day = datetime.datetime(year=date.year, month=1, day=1)
    return (date - new_year_day).days + 1 + (365 * (date.year - 2014))

# 2. Generating Cluster File

*Idea:* _Cluster the stations based on the existing training data and add the cluster groups as additional input for predictions_

In [24]:
df_time_path = '../data/tmp/df_time.csv'
df_time = pd.read_csv(df_time_path)
df_time = df_time.drop(['Unnamed: 0'], axis=1)

In [25]:
df_time.shape

(18683824, 3)

In [26]:
df_time.head()

Unnamed: 0,station,date,TMIN
0,AE000041196,20140101,128
1,AE000041196,20140102,145
2,AE000041196,20140103,140
3,AE000041196,20140106,162
4,AE000041196,20140109,115


In [27]:
df_test = df_time.copy()
df_test.head()
df_test['date'] = df_test['date'].apply(lambda d: date_to_nth_day(str(d)))

# create pivot table
df_test = df_test.drop_duplicates(['station','date'])
df_pivot = df_test.pivot(index='station', columns='date', values='TMIN')

#fill NaN value forward
df_pivot = df_pivot.fillna(method='ffill', axis=1)
df_pivot = df_pivot.fillna(method='bfill', axis=1)
df_pivot = df_pivot.groupby('station').min()
df_flattened = pd.DataFrame(df_pivot.to_records())

KeyboardInterrupt: 

In [None]:
df_flattened.to_csv('../data/cluster/pre_clustering_temperature.csv', index=False)

# Cluster by min temperature similarity

In [28]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [31]:
df_cluster = pd.read_csv('../data/cluster/pre_clustering_temperature.csv')

df_cluster.shape

(15693, 1461)

In [35]:
results = []
df_silhouette = df_cluster.drop('station', axis=1)

#df_silhouette.head()
#for i in [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 25, 50, 100, 500]:
    #kmeans = KMeans(n_clusters=i, random_state=1).fit(df_silhouette)
    #score = silhouette_score(df_silhouette, kmeans.labels_)
    #print('[' + str(i) + '] :: ', score)
    #results.append((i, score))

In [42]:
selected_k = 2

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels = pd.DataFrame(columns=['station'])
df_station_cluster_labels['station'] = df_cluster['station']
df_station_cluster_labels['cluster_id_2'] = pd.Series(kmeans.labels_)

selected_k = 4

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels['cluster_id_4'] = pd.Series(kmeans.labels_)

selected_k = 6

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels['cluster_id_6'] = pd.Series(kmeans.labels_)


#df_station_cluster_labels = df_station_cluster_labels.fillna(5)




In [47]:
df_station_cluster_labels.describe()

Unnamed: 0,cluster_id_2,cluster_id_4,cluster_id_6
count,15693.0,15693.0,15693.0
mean,6.4e-05,1.035047,3.026381
std,0.007983,1.046909,1.934353
min,0.0,0.0,0.0
25%,0.0,0.0,1.0
50%,0.0,1.0,4.0
75%,0.0,1.0,5.0
max,1.0,3.0,5.0


In [44]:
df_station_cluster_labels.to_csv('../data/cluster/station_clustertemperature.csv', index=False)

# Cluster by location

In [20]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [21]:
results = []
df_red = df_flattened.drop('station')
for i in range(2,1000,100):
    kmeans = KMeans(n_clusters=i, random_state=1).fit(df_head)
    score = silhouette_score(df_head, kmeans.labels_)
    print('[' + str(i) + '] :: ', score)
    results.append((i, score))

ValueError: labels ['station'] not contained in axis

In [None]:
best_k, best_score = select_max(results)

k = best_k # best result

kmeans = KMeans(n_clusters=k, random_state=0).fit(df_pivot)

clustering = []
for i in range(0, k):
    clustering.append([[],[],[]])
    
for i, e in enumerate(df_pivot):
    c_i = kmeans.labels_[i]
    clustering[c_i][0].append(e[0])
    clustering[c_i][1].append(e[1])
    clustering[c_i][2].append(e[2])

for c, s in enumerate(clustering):
    print('C' + str(c), ':: ')
    plt.figure(1,figsize=(12,5))
    plt.subplot(221)
    plt.scatter(s[0], s[1])
    plt.scatter(kmeans.cluster_centers_[c][0],kmeans.cluster_centers_[c][1])
    plt.legend(['Training Data','Cluster Center'])
    plt.subplot(222)
    plt.scatter(s[0], s[2])
    plt.scatter(kmeans.cluster_centers_[c][0],kmeans.cluster_centers_[c][2])
    plt.legend(['Training Data','Cluster Center'])
    plt.subplot(223)
    plt.scatter(s[1], s[2])
    plt.scatter(kmeans.cluster_centers_[c][1],kmeans.cluster_centers_[c][2])
    plt.legend(['Training Data','Cluster Center'])
    plt.show()