In [1]:
import numpy as np
import pandas as pd

# Preprocessing

In [2]:
# Column names assigned to the header-less yearly CSV files when they are read.
COLUMNS = [
    'station',
    'date',
    'feature',
    'value',
    'measurement',
    'quality',
    'source',
    'hour',
]

In [3]:
# Load the 2014 observations. The file has no header row, so the column
# names are supplied from COLUMNS.
df = pd.read_csv(
    '../data/2014.csv',
    names=COLUMNS,
    header=None,
)

In [4]:
# Stack the 2015-2017 observations below the rows already loaded into df.
# DataFrame.append was removed in pandas 2.0, and appending one file at a time
# copies the growing frame on every call, so all years are concatenated once.
# The row index is deliberately not reset, matching what append produced.
df = pd.concat(
    [df] + [
        pd.read_csv('../data/{}.csv'.format(year), header=None, names=COLUMNS)
        for year in (2015, 2016, 2017)
    ]
)

In [5]:
# Preview the first five rows of the combined frame
df.head(n=5)

Unnamed: 0,station,date,feature,value,measurement,quality,source,hour
0,ASN00015643,20140101,TMAX,424,,,a,
1,ASN00015643,20140101,TMIN,227,,,a,
2,ASN00015643,20140101,PRCP,0,,,a,
3,US1MNCV0008,20140101,PRCP,0,,,N,
4,US1MNCV0008,20140101,SNOW,0,,,N,


In [6]:
# Keep only the features that are available in the test set, plus the feature to be predicted.
selected_features = [
    'TMIN',
]

In [7]:
# Drop every row whose feature is not listed in selected_features
df = df.loc[df['feature'].isin(selected_features)]

In [8]:
# Reshape to one row per (station, date) with one column per feature; if a
# station reports the same feature more than once on a date, keep the minimum.
# The string alias 'min' replaces np.min because recent pandas versions emit a
# FutureWarning when a NumPy reducer is passed as a callable; the result is the same.
df_pivot = df.pivot_table(index=['station','date'], columns='feature', values='value', aggfunc='min')

In [9]:
# Open question: should station location information be included?
# The stations file is ';'-separated and has no header row.
df_stations = pd.read_csv(
    '../data/ghcnd-stations.csv',
    sep=';',
    header=None,
    names=['station','lat', 'long', 'elev'],
)

In [10]:
# Use the station identifier as the row index of the station table
df_stations = df_stations.set_index(keys='station')

In [11]:
# Take the TMIN column of the pivot and turn its index levels back into
# ordinary columns, then show how many rows there are.
df_time = df_pivot['TMIN'].reset_index()
len(df_time)

18683824

In [13]:
# Distinct station identifiers, in order of first appearance
stations = df_time['station'].unique()

In [14]:
def derive_all_nth_day_features(input_data, feature, N):
    '''
        Generates a new dataframe with <feature>_1 .. <feature>_N columns
        which hold the feature values of the previous N rows of the same
        station.

        Column <feature>_n of a row contains the <feature> value found n rows
        earlier within that row's station. Rows with fewer than n predecessors
        in their station get NaN. Lags are counted in rows, not calendar days,
        so a gap in a station's dates is not detected.

        The lags are computed with a per-station shift, so the result does not
        depend on the index labels of input_data or on any notebook-level
        variable, and values never leak from one station into the next.

        @param input_data:  Pandas dataframe with columns
                            'station' and <feature>.
                            The rows of each station need to be ordered by date.
        @param feature:     Name of the feature column that should be used.
        @param N:           How many feature columns should be generated.
                            Values below 1 add no columns.
        @return:            Copy of input_data (same rows, same index) with
                            the N additional columns. input_data itself is
                            not modified.
    '''

    # avoid side effects on input dataframe
    data = input_data.copy()

    # Group once and reuse the grouping for every lag. sort=False skips an
    # unnecessary sort of the group keys; shift() keeps the original row order.
    feature_by_station = data.groupby('station', sort=False)[feature]

    for n in range(1, N+1):
        data["{}_{}".format(feature, n)] = feature_by_station.shift(n)

    return data

In [None]:
# Number of lag columns to derive for TMIN
N = 20
df_train = derive_all_nth_day_features(df_time, 'TMIN', N)

# Write the derived features to disk (N is part of the file name), then preview them
file_path = '../data/export_features_2014_{}.csv'.format(N)
df_train.to_csv(path_or_buf=file_path)

df_train.head()