In [1]:
import numpy as np
import pandas as pd
import datetime

# 1. Functions

In [2]:
def add_day_of_year_column(df_src, column_name='date'):
    df_out = df_src.copy()
    df_out['day_of_year'] = df_out[column_name].apply(lambda d: date_to_nth_day(str(d)))
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    date = datetime.datetime.strptime(date, format)
    new_year_day = datetime.datetime(year=date.year, month=1, day=1)
    return (date - new_year_day).days + 1 + (365 * (date.year - 2014))

# 2. Generating Cluster File

*Idea:* _Cluster the stations based on the existing training data and add the cluster groups as additional input for predictions_

In [3]:
df_time_path = '../data/tmp/df_time.csv'
df_time = pd.read_csv(df_time_path)
df_time = df_time.drop(['Unnamed: 0'], axis=1)

In [4]:
df_time.shape

(18683824, 3)

In [5]:
df_time.head()

Unnamed: 0,station,date,TMIN
0,AE000041196,20140101,128
1,AE000041196,20140102,145
2,AE000041196,20140103,140
3,AE000041196,20140106,162
4,AE000041196,20140109,115


In [6]:
df_test = df_time.copy()
df_test.head()
df_test['date'] = df_test['date'].apply(lambda d: date_to_nth_day(str(d)))

# create pivot table
df_test = df_test.drop_duplicates(['station','date'])
df_pivot = df_test.pivot(index='station', columns='date', values='TMIN')

#fill NaN value forward
df_pivot = df_pivot.fillna(method='ffill', axis=1)
df_pivot = df_pivot.fillna(method='bfill', axis=1)
df_pivot = df_pivot.groupby('station').min()
df_flattened = pd.DataFrame(df_pivot.to_records())

In [7]:
df_flattened.to_csv('../data/cluster/pre_clustering_temperature.csv', index=False)

# Cluster by min temperature similarity

In [8]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [9]:
df_cluster = pd.read_csv('../data/cluster/pre_clustering_temperature.csv')

df_cluster.shape

(15693, 1461)

In [11]:
results = []
df_silhouette = df_cluster.drop('station', axis=1)

df_silhouette.head()
for i in [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 25, 50, 100, 500]:
    kmeans = KMeans(n_clusters=i, random_state=1).fit(df_silhouette)
    score = silhouette_score(df_silhouette, kmeans.labels_)
    print('[' + str(i) + '] :: ', score)
    results.append((i, score))

[2] ::  0.9848858407547253
[3] ::  0.3445747097540417
[4] ::  0.2512567148333283
[5] ::  0.21522268934446143
[6] ::  0.2308841230041261
[7] ::  0.20332684001619356
[8] ::  0.2122391904479273
[9] ::  0.21868539469614845
[10] ::  0.20099075267167657
[15] ::  0.17353536618757207
[25] ::  0.16146646588403274
[50] ::  0.14215508117300074
[100] ::  0.12921120540616574
[500] ::  0.11520510006300916


In [32]:
selected_k = 2

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels = pd.DataFrame(columns=['station', 'cluster_id'])
df_station_cluster_labels['station'] = df_cluster['station']
df_station_cluster_labels['cluster_id'] = pd.Series(kmeans.labels_)

df_station_cluster_labels = df_station_cluster_labels.fillna(5)


df_station_cluster_labels.to_csv('../data/cluster/station_clustering__by_temperature_2.csv', index=False)

In [13]:
selected_k = 3

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels = pd.DataFrame(columns=['station', 'cluster_id'])
df_station_cluster_labels['station'] = df_cluster['station']
df_station_cluster_labels['cluster_id'] = pd.Series(kmeans.labels_)

df_station_cluster_labels = df_station_cluster_labels.fillna(5)


df_station_cluster_labels.to_csv('../data/cluster/station_clustering__by_temperature_3.csv', index=False)

In [14]:
selected_k = 4

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels = pd.DataFrame(columns=['station', 'cluster_id'])
df_station_cluster_labels['station'] = df_cluster['station']
df_station_cluster_labels['cluster_id'] = pd.Series(kmeans.labels_)

df_station_cluster_labels = df_station_cluster_labels.fillna(5)


df_station_cluster_labels.to_csv('../data/cluster/station_clustering__by_temperature_4.csv', index=False)

In [15]:
selected_k = 6

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels = pd.DataFrame(columns=['station', 'cluster_id'])
df_station_cluster_labels['station'] = df_cluster['station']
df_station_cluster_labels['cluster_id'] = pd.Series(kmeans.labels_)

df_station_cluster_labels = df_station_cluster_labels.fillna(5)


df_station_cluster_labels.to_csv('../data/cluster/station_clustering__by_temperature_6.csv', index=False)

# Cluster by location and temperature

In [16]:
from sklearn.metrics import silhouette_score
from sklearn import preprocessing
from sklearn.cluster import KMeans

In [17]:
def get_station_df():
    STATIONS_FILE = '../data/ghcnd-stations.csv'
    return pd.read_csv(STATIONS_FILE, header=None, names=['station','lat', 'long', 'elev'], sep=';')
    
def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    df_out = df_src.copy()
    return df_out.join(df_stations.set_index(foreign_index), on=src_index)

In [21]:
df_cluster = pd.read_csv('../data/cluster/pre_clustering_temperature.csv')

df_stations = get_station_df()
df_cluster_location = add_coordinates(df_cluster, df_stations, src_index='station', foreign_index='station')
df_cluster_location = df_cluster_location.drop('station', axis=1)
df_cluster_location.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1454,1455,1456,1457,1458,1459,1460,lat,long,elev
0,128.0,145.0,140.0,140.0,140.0,162.0,162.0,162.0,115.0,115.0,...,126.0,126.0,150.0,132.0,114.0,113.0,114.0,25.333,55.517,34.0
1,159.0,159.0,160.0,156.0,150.0,186.0,186.0,186.0,148.0,148.0,...,199.0,162.0,162.0,189.0,189.0,176.0,176.0,25.255,55.364,10.4
2,136.0,128.0,136.0,136.0,112.0,141.0,141.0,141.0,122.0,126.0,...,164.0,164.0,155.0,186.0,186.0,186.0,126.0,24.433,54.651,26.8
3,131.0,137.0,137.0,137.0,106.0,164.0,164.0,164.0,164.0,124.0,...,129.0,152.0,139.0,139.0,124.0,124.0,124.0,24.262,55.609,264.9
4,25.0,25.0,25.0,68.0,85.0,58.0,50.0,46.0,48.0,48.0,...,29.0,36.0,36.0,104.0,104.0,79.0,52.0,36.7167,3.25,24.0


In [20]:
x = df_cluster_location.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_cluster_location_scaled = pd.DataFrame(x_scaled)

df_cluster_location_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1453,1454,1455,1456,1457,1458,1459,1460,1461,1462
0,0.464359,0.760133,0.740572,0.719975,0.71232,0.726079,0.726079,0.726079,0.696685,0.696685,...,0.824376,0.818875,0.838557,0.821462,0.799057,0.803419,0.81286,0.668597,0.655624,0.181357
1,0.477132,0.769435,0.753576,0.730088,0.718574,0.741088,0.741088,0.741088,0.717323,0.717323,...,0.894434,0.853194,0.849953,0.875594,0.869811,0.863248,0.872361,0.668145,0.655198,0.177218
2,0.467656,0.748837,0.737971,0.717446,0.694809,0.712946,0.712946,0.712946,0.701063,0.703565,...,0.860845,0.8551,0.843305,0.872745,0.866981,0.872745,0.824376,0.66338,0.653213,0.180094
3,0.465595,0.754817,0.738622,0.718078,0.691057,0.72733,0.72733,0.72733,0.72733,0.702314,...,0.827255,0.843661,0.82811,0.82811,0.808491,0.813865,0.822457,0.662388,0.65588,0.22186
4,0.42192,0.680399,0.6658,0.674463,0.677924,0.661038,0.656035,0.653533,0.654784,0.654784,...,0.731286,0.733079,0.730294,0.794872,0.789623,0.77113,0.753359,0.73459,0.510114,0.179603


In [23]:
results = []

for i in [2, 3, 4, 5, 15, 25, 100]:
    kmeans = KMeans(n_clusters=i, random_state=1).fit(df_cluster_location_scaled)
    score = silhouette_score(df_cluster_location_scaled, kmeans.labels_)
    print('[' + str(i) + '] :: ', score)
    results.append((i, score))

[2] ::  0.3529523081101939
[3] ::  0.25956195513684555
[4] ::  0.2316225532227601
[5] ::  0.19076715952905401
[15] ::  0.1690584151083668
[25] ::  0.15986059512081532
[100] ::  0.1327477303687484


In [25]:
selected_k = 2

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels = pd.DataFrame(columns=['station', 'cluster_id'])
df_station_cluster_labels['station'] = df_cluster['station']
df_station_cluster_labels['cluster_id'] = pd.Series(kmeans.labels_)

df_station_cluster_labels = df_station_cluster_labels.fillna(5)


df_station_cluster_labels.to_csv('../data/cluster/station_clustering_by_location_and_temperature_2.csv', index=False)

In [26]:
selected_k = 3

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels = pd.DataFrame(columns=['station', 'cluster_id'])
df_station_cluster_labels['station'] = df_cluster['station']
df_station_cluster_labels['cluster_id'] = pd.Series(kmeans.labels_)

df_station_cluster_labels = df_station_cluster_labels.fillna(5)


df_station_cluster_labels.to_csv('../data/cluster/station_clustering_by_location_and_temperature_3.csv', index=False)

In [35]:
selected_k = 4

kmeans = KMeans(n_clusters=selected_k, random_state=1).fit(df_silhouette)

df_station_cluster_labels = pd.DataFrame(columns=['station', 'cluster_id'])
df_station_cluster_labels['station'] = df_cluster['station']
df_station_cluster_labels['cluster_id'] = pd.Series(kmeans.labels_)

df_station_cluster_labels = df_station_cluster_labels.fillna(5)


df_station_cluster_labels.to_csv('../data/cluster/station_clustering_by_location_and_temperature_4.csv', index=False)

# Generate Single Combined Cluster File

In [36]:
paths = [
    '../data/cluster/station_clustering__by_temperature_2.csv',
    '../data/cluster/station_clustering__by_temperature_3.csv',
    '../data/cluster/station_clustering__by_temperature_4.csv',
    '../data/cluster/station_clustering__by_temperature_6.csv',
    '../data/cluster/station_clustering_by_location_and_temperature_2.csv',
    '../data/cluster/station_clustering_by_location_and_temperature_3.csv',
    '../data/cluster/station_clustering_by_location_and_temperature_4.csv',
]

df_joined = pd.DataFrame()
df_joined['station'] = pd.read_csv(paths[0])['station']
for i, path in enumerate(paths):
    df_new = pd.read_csv(path)
    df_new = df_new.rename(index=str, columns={'cluster_id': 'cluster_id_' + str(i)})
    df_joined = df_joined.join(df_new.set_index('station'), on='station')
    
df_joined.head()

Unnamed: 0,station,cluster_id_0,cluster_id_1,cluster_id_2,cluster_id_3,cluster_id_4,cluster_id_5,cluster_id_6
0,AE000041196,0,1,3,4,3,1,3
1,AEM00041194,0,1,3,4,3,1,3
2,AEM00041217,0,1,3,4,3,1,3
3,AEM00041218,0,1,3,4,3,1,3
4,AG000060390,0,1,1,1,1,1,1


In [37]:
df_joined.isnull().any()

station         False
cluster_id_0    False
cluster_id_1    False
cluster_id_2    False
cluster_id_3    False
cluster_id_4    False
cluster_id_5    False
cluster_id_6    False
dtype: bool

In [39]:
df_joined.to_csv('../data/cluster/all_clusters.csv', index=False)