# 0. Imports

In [36]:
import pandas as pd
import numpy as np
import datetime

# 1. Setup and Functions

In [78]:
# CSV that get_original_df() reads as the raw training frame.
TRAIN_FILE = '../data/export_features_2014_to_2017_20.csv'
# Header-less, semicolon-separated CSV that get_station_df() reads.
STATIONS_FILE = '../data/ghcnd-stations.csv'

def get_original_df():
    """Read TRAIN_FILE and return it without the 'Unnamed: 0' column."""
    raw_frame = pd.read_csv(TRAIN_FILE)
    return raw_frame.drop(['Unnamed: 0'], axis=1)

def get_station_df():
    """Read STATIONS_FILE (no header row, ';' separated) as a station table.

    Returns:
        DataFrame with the columns 'station', 'lat', 'long' and 'elev'.
    """
    column_names = ['station', 'lat', 'long', 'elev']
    return pd.read_csv(STATIONS_FILE, sep=';', header=None, names=column_names)

def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    """Attach the station table's columns to every row of ``df_src``.

    Args:
        df_src: frame holding a station identifier in column ``src_index``.
        df_stations: station table; its ``foreign_index`` column becomes the
            join key.
        src_index: name of the key column in ``df_src``.
        foreign_index: name of the key column in ``df_stations``.

    Returns:
        A new frame (``df_src`` is left untouched) with the remaining columns
        of ``df_stations`` appended; rows without a matching station get NaN.
    """
    stations_by_key = df_stations.set_index(foreign_index)
    enriched = df_src.copy().join(stations_by_key, on=src_index)
    return enriched

def add_day_of_year_column(df_src, column_name='date'):
    """Return a copy of ``df_src`` with an added 1-based 'day_of_year' column.

    The dates are parsed in one vectorised pass instead of calling
    ``strptime`` once per row, which matters for frames with many rows.

    Args:
        df_src: frame whose ``column_name`` column holds dates written as
            yyyymmdd (integers or strings; values are stringified first).
        column_name: name of the date column.

    Returns:
        A new frame; ``df_src`` itself is not modified.

    Raises:
        ValueError: if a value cannot be parsed as a yyyymmdd date.
    """
    df_out = df_src.copy()
    parsed_dates = pd.to_datetime(df_out[column_name].astype(str), format='%Y%m%d')
    # Cast explicitly so the column dtype does not depend on the pandas version.
    df_out['day_of_year'] = parsed_dates.dt.dayofyear.astype('int64')
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    """Convert a date string to its 1-based ordinal day within its year.

    Args:
        date: the date as text.
        format: ``strptime`` pattern describing ``date``.

    Returns:
        1 for January 1st, up to 365 (366 in leap years) for December 31st.
    """
    parsed = datetime.datetime.strptime(date, format)
    return parsed.timetuple().tm_yday


# 2. Set Up Train Data

In [29]:
# Load the raw training frame from TRAIN_FILE.
df = get_original_df()

In [31]:
# Preview the first rows of the raw training frame.
df.head()
# Disabled scratch experiments, not executed (the second line is not valid
# Python as written):
# df.groupby(['loc', 'date']).mean()
# df.groupby('20150101').count()df_test.iloc[0]

Unnamed: 0,station,date,TMIN,TMIN_1,TMIN_2,TMIN_3,TMIN_4,TMIN_5,TMIN_6,TMIN_7,...,TMIN_11,TMIN_12,TMIN_13,TMIN_14,TMIN_15,TMIN_16,TMIN_17,TMIN_18,TMIN_19,TMIN_20
0,AE000041196,20140101,128,,,,,,,,...,,,,,,,,,,
1,AE000041196,20140102,145,128.0,,,,,,,...,,,,,,,,,,
2,AE000041196,20140103,140,145.0,128.0,,,,,,...,,,,,,,,,,
3,AE000041196,20140106,162,140.0,145.0,128.0,,,,,...,,,,,,,,,,
4,AE000041196,20140109,115,162.0,140.0,145.0,128.0,,,,...,,,,,,,,,,


In [32]:
# Load the station table (station id, lat, long, elev) from STATIONS_FILE.
df_stations = get_station_df()

In [33]:
# Preview the station table.
df_stations.head()

Unnamed: 0,station,lat,long,elev
0,ACW00011604,17.1167,-61.7833,10.1
1,ACW00011647,17.1333,-61.7833,19.2
2,AE000041196,25.333,55.517,34.0
3,AEM00041194,25.255,55.364,10.4
4,AEM00041217,24.433,54.651,26.8


In [34]:
# Attach lat / long / elev to every training row via the 'station' column.
df_joined = add_coordinates(df, df_stations)
df_joined.head()

Unnamed: 0,station,date,TMIN,TMIN_1,TMIN_2,TMIN_3,TMIN_4,TMIN_5,TMIN_6,TMIN_7,...,TMIN_14,TMIN_15,TMIN_16,TMIN_17,TMIN_18,TMIN_19,TMIN_20,lat,long,elev
0,AE000041196,20140101,128,,,,,,,,...,,,,,,,,25.333,55.517,34.0
1,AE000041196,20140102,145,128.0,,,,,,,...,,,,,,,,25.333,55.517,34.0
2,AE000041196,20140103,140,145.0,128.0,,,,,,...,,,,,,,,25.333,55.517,34.0
3,AE000041196,20140106,162,140.0,145.0,128.0,,,,,...,,,,,,,,25.333,55.517,34.0
4,AE000041196,20140109,115,162.0,140.0,145.0,128.0,,,,...,,,,,,,,25.333,55.517,34.0


In [46]:
# Derive the 1-based 'day_of_year' feature from the 'date' column.
df_date = add_day_of_year_column(df_joined)
df_date.head()

Unnamed: 0,station,date,TMIN,TMIN_1,TMIN_2,TMIN_3,TMIN_4,TMIN_5,TMIN_6,TMIN_7,...,TMIN_15,TMIN_16,TMIN_17,TMIN_18,TMIN_19,TMIN_20,lat,long,elev,day_of_year
0,AE000041196,20140101,128,,,,,,,,...,,,,,,,25.333,55.517,34.0,1
1,AE000041196,20140102,145,128.0,,,,,,,...,,,,,,,25.333,55.517,34.0,2
2,AE000041196,20140103,140,145.0,128.0,,,,,,...,,,,,,,25.333,55.517,34.0,3
3,AE000041196,20140106,162,140.0,145.0,128.0,,,,,...,,,,,,,25.333,55.517,34.0,6
4,AE000041196,20140109,115,162.0,140.0,145.0,128.0,,,,...,,,,,,,25.333,55.517,34.0,9


In [47]:
# Persist the enriched frame. The index is written as well, which is why the
# training cell drops an 'Unnamed: 0' column after reading this file back.
df_date.to_csv('../data/df_train_mike.csv')

# 3. Training

In [60]:
from sklearn.ensemble import RandomForestRegressor

# Newer scikit-learn releases no longer ship `sklearn.externals.joblib`;
# prefer the standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [61]:
def save_model(model, PATH):
    """Serialise ``model`` to the file ``PATH`` using joblib."""
    joblib.dump(value=model, filename=PATH)
    
def load_model(PATH):
    """Return the object stored in the joblib file ``PATH``.

    NOTE(review): joblib persistence is pickle-based, so only load files
    from a trusted source.
    """
    restored = joblib.load(filename=PATH)
    return restored
    


In [56]:
# Reload the persisted training frame, discard the index column that was
# written alongside it, and keep only rows in which every column is populated.
df_train = (
    pd.read_csv('../data/df_train_mike.csv')
    .drop(['Unnamed: 0'], axis=1)
    .dropna(axis=0, how='any')
)

df_train.head()

Unnamed: 0,station,date,TMIN,TMIN_1,TMIN_2,TMIN_3,TMIN_4,TMIN_5,TMIN_6,TMIN_7,...,TMIN_15,TMIN_16,TMIN_17,TMIN_18,TMIN_19,TMIN_20,lat,long,elev,day_of_year
20,AE000041196,20140202,155,147.0,149.0,130.0,138.0,123.0,115.0,120.0,...,159.0,115.0,162.0,140.0,145.0,128.0,25.333,55.517,34.0,33
21,AE000041196,20140203,155,155.0,147.0,149.0,130.0,138.0,123.0,115.0,...,145.0,159.0,115.0,162.0,140.0,145.0,25.333,55.517,34.0,34
22,AE000041196,20140207,99,155.0,155.0,147.0,149.0,130.0,138.0,123.0,...,137.0,145.0,159.0,115.0,162.0,140.0,25.333,55.517,34.0,38
23,AE000041196,20140208,112,99.0,155.0,155.0,147.0,149.0,130.0,138.0,...,118.0,137.0,145.0,159.0,115.0,162.0,25.333,55.517,34.0,39
24,AE000041196,20140209,123,112.0,99.0,155.0,155.0,147.0,149.0,130.0,...,117.0,118.0,137.0,145.0,159.0,115.0,25.333,55.517,34.0,40


In [108]:
def train_simple_random_forest_model():
    """Fit the baseline forest (no lag features) and persist it.

    Reads the notebook-level ``df_train`` frame, uses 'day_of_year', 'lat',
    'long' and 'elev' as inputs and 'TMIN' as target, and stores the fitted
    model at '../models/mike/simple_random_forest.pkl' via ``save_model``.
    """
    feature_names = ['day_of_year', 'lat', 'long', 'elev']
    forest_settings = dict(
        n_estimators=100,
        min_samples_split=10,
        min_samples_leaf=10,
        max_features=3,
        max_depth=3,
        bootstrap=True,
        random_state=1337,
    )

    forest = RandomForestRegressor(**forest_settings)
    forest.fit(df_train[feature_names], df_train['TMIN'])

    save_model(forest, '../models/mike/simple_random_forest.pkl')

In [110]:
def train_20_days_random_forest_model():
    """Fit the forest that also sees the 20 lag columns and persist it.

    Reads the notebook-level ``df_train`` frame. Inputs are 'day_of_year',
    'lat', 'long', 'elev' followed by 'TMIN_1' ... 'TMIN_20' (in that order);
    the target is 'TMIN'. The fitted model is stored at
    '../models/mike/20_days_random_forest.pkl' via ``save_model``.
    """
    base_features = ['day_of_year', 'lat', 'long', 'elev']
    lag_features = ['TMIN_' + str(lag) for lag in range(1, 21)]
    feature_names = base_features + lag_features

    forest = RandomForestRegressor(
        n_estimators=100,
        min_samples_split=10,
        min_samples_leaf=10,
        bootstrap=True,
        random_state=1337,
    )
    forest.fit(df_train[feature_names], df_train['TMIN'])

    save_model(forest, '../models/mike/20_days_random_forest.pkl')

In [425]:
def train_5_days_random_forest_model():
    """Fit the forest that also sees the 5 most recent lag columns and persist it.

    Reads the notebook-level ``df_train`` frame. Inputs are 'day_of_year',
    'lat', 'long', 'elev' followed by 'TMIN_1' ... 'TMIN_5' (in that order);
    the target is 'TMIN'. The fitted model is stored at
    '../models/mike/5_days_random_forest.pkl' via ``save_model``.
    """
    base_features = ['day_of_year', 'lat', 'long', 'elev']
    lag_features = ['TMIN_' + str(lag) for lag in range(1, 6)]
    feature_names = base_features + lag_features

    forest = RandomForestRegressor(
        n_estimators=100,
        min_samples_split=10,
        min_samples_leaf=10,
        bootstrap=True,
        random_state=1337,
    )
    forest.fit(df_train[feature_names], df_train['TMIN'])

    save_model(forest, '../models/mike/5_days_random_forest.pkl')

In [426]:
# Fit and persist all three models; each call writes one .pkl file under
# ../models/mike/.
train_simple_random_forest_model()
train_20_days_random_forest_model()
train_5_days_random_forest_model()

# 4. Predict

In [338]:
# CSV listing the rows that have to be predicted.
SUBMISSION_PATH = '../data/2018_test.csv'


def load_submission_file():
    """Read the CSV at SUBMISSION_PATH and return it as a DataFrame."""
    return pd.read_csv(SUBMISSION_PATH)

def prepare_submission_file(df_test):
    """Enrich a test frame with station coordinates and a 'day_of_year' column.

    Args:
        df_test: frame with the station id in 'ID' and the date in 'DATE'.

    Returns:
        A new frame with the station table's columns and 'day_of_year' added.
    """
    with_coordinates = add_coordinates(
        df_test,
        get_station_df(),
        src_index='ID',
        foreign_index='station',
    )
    return add_day_of_year_column(with_coordinates, column_name='DATE')
    
def save_submission(df_src, PATH):
    """Write the two-column submission CSV and return the frame that was written.

    Args:
        df_src: frame with the columns 'DATE', 'ID' and 'DATA_VALUE'.
        PATH: destination of the CSV file (written without the index).

    Returns:
        DataFrame with 'SUB_ID' (stringified date immediately followed by the
        station id) and 'DATA_VALUE'.
    """
    submission_ids = df_src['DATE'].map(str) + df_src['ID']
    df_submission = pd.DataFrame({
        'SUB_ID': submission_ids,
        'DATA_VALUE': df_src['DATA_VALUE'],
    })
    df_submission.to_csv(PATH, index=False)
    return df_submission

In [287]:
def generate_predictions_from_simple_random_forest_model():
    """Predict every submission row with the baseline forest and save the result.

    Loads '../models/mike/simple_random_forest.pkl', enriches the submission
    frame with coordinates and 'day_of_year', predicts 'DATA_VALUE' from
    those four columns and writes the submission CSV to
    '../data/predictions/prediction_simple_random_forest.csv'.
    """
    forest = load_model('../models/mike/simple_random_forest.pkl')
    feature_names = ['day_of_year', 'lat', 'long', 'elev']
    output_path = '../data/predictions/prediction_simple_random_forest.csv'

    submission_frame = prepare_submission_file(load_submission_file())

    # All rows are independent here, so one batched predict call suffices.
    submission_frame['DATA_VALUE'] = forest.predict(submission_frame[feature_names])

    save_submission(submission_frame, output_path)


In [427]:
# helper functions for iteratively generating predictions

def load_compltete_test_frame():
    """Read '../data/2018_submission_complete.csv' and return it as a DataFrame."""
    complete_frame = pd.read_csv('../data/2018_submission_complete.csv')
    return complete_frame

def generate_complete_test_frame(df_src, required_dates=None):
    """Build one (station, date) row for every station and every required date.

    Args:
        df_src: frame with a station id column 'ID'; only its distinct values
            are used, in order of first appearance.
        required_dates: iterable of dates (each is stringified). Defaults to
            every day from 20180101 through 20180220 inclusive.

    Returns:
        DataFrame with the columns 'ID' and 'DATE' (dates as strings). As
        before, the index is not reset: every station's rows are labelled
        0..len(required_dates)-1. An input without stations yields an empty
        frame with the same two columns.
    """
    if required_dates is None:
        required_dates = pd.date_range('2018-01-01', '2018-02-20').strftime('%Y%m%d')
    # Materialise once; the list is identical for every station.
    required_dates = [str(date) for date in required_dates]

    stations = df_src['ID'].unique()
    station_frames = []
    for i, station in enumerate(stations):
        print('Station (',i,'/',len(stations),') ::', station, '\r', end='')
        rows = [[station, date] for date in required_dates]
        station_frames.append(pd.DataFrame(rows, columns=['ID', 'DATE']))

    print('')

    if not station_frames:
        return pd.DataFrame(columns=['ID', 'DATE'])

    # Concatenate once instead of growing a frame inside the loop: repeated
    # DataFrame.append is quadratic and no longer exists in pandas 2.x.
    return pd.concat(station_frames)

def generate_and_save_complete_test_frame():
    """Expand the submission file to all required days and store the result.

    The expanded frame is written (without index) to
    '../data/2018_submission_complete.csv'.
    """
    print('* Loading submission file')
    submission_frame = load_submission_file()

    # The submission file does not list every day; fill in the missing ones.
    print('* Generating complete test frame \t\t\t')
    complete_frame = generate_complete_test_frame(submission_frame)

    complete_frame.to_csv('../data/2018_submission_complete.csv', index=False)
    
def generate_initial_lookup_of_last_20_days():
    """Return the observed TMIN values of late December 2017 as a lookup frame.

    Reads '../data/df_train_mike.csv', keeps 'station', 'date' and 'TMIN' for
    the dates 20171210 through 20171231 inclusive, adds 'day_of_year' and
    sorts the rows by station and date.

    Returns:
        DataFrame with the columns 'station', 'date', 'TMIN', 'day_of_year'.
    """
    seed_dates = [str(day) for day in range(20171210, 20171232)]

    df_lookup = pd.read_csv('../data/df_train_mike.csv')
    df_lookup = df_lookup[['station', 'date', 'TMIN']]
    # Compare on the string form: the digits-only 'date' column is parsed as
    # integers by read_csv, and isin() never matches an integer against a
    # string, so filtering the raw column against string literals keeps no
    # rows at all.
    df_lookup = df_lookup[df_lookup['date'].astype(str).isin(seed_dates)]
    df_lookup = add_day_of_year_column(df_lookup, column_name='date')
    df_lookup = df_lookup.sort_values(by=['station', 'date'])

    return df_lookup

def get_last_20_day_indeces(current_day_index, days_in_previous_year=365):
    """Return the 1-based day-of-year numbers of the 20 days before a given day.

    The previous formula ``(i - 1) % 366`` was only right at the year wrap;
    for any other day it was shifted back by one (yesterday was skipped and
    day 0, which does not exist, could be returned).

    Args:
        current_day_index: 1-based day of year of the day being predicted.
        days_in_previous_year: length of the preceding year, used when the
            window reaches back across January 1st (366 after a leap year).

    Returns:
        List of 20 ints ordered oldest first; the last entry is yesterday.
    """
    window = range(current_day_index - 20, current_day_index)
    return [
        int(day) if day >= 1 else int(day) + days_in_previous_year
        for day in window
    ]

def get_last_5_day_indeces(current_day_index, days_in_previous_year=365):
    """Return the 1-based day-of-year numbers of the 5 days before a given day.

    The previous formula ``(i - 1) % 366`` was only right at the year wrap;
    for any other day it was shifted back by one (yesterday was skipped and
    day 0, which does not exist, could be returned).

    Args:
        current_day_index: 1-based day of year of the day being predicted.
        days_in_previous_year: length of the preceding year, used when the
            window reaches back across January 1st (366 after a leap year).

    Returns:
        List of 5 ints ordered oldest first; the last entry is yesterday.
    """
    window = range(current_day_index - 5, current_day_index)
    return [
        int(day) if day >= 1 else int(day) + days_in_previous_year
        for day in window
    ]

In [374]:
def get_value_from_lookup(df_look_up, day_of_year, default=120):
    """Return the TMIN of the latest lookup row at or before ``day_of_year``.

    Args:
        df_look_up: frame with the columns 'day_of_year' and 'TMIN'.
        day_of_year: upper bound (inclusive) for the row's 'day_of_year'.
        default: value returned when no row qualifies. 120 is the fallback
            this notebook has always used.

    Returns:
        The 'TMIN' value of the qualifying row with the largest
        'day_of_year', or ``default`` when there is none.
    """
    candidates = df_look_up[df_look_up['day_of_year'] <= day_of_year]
    if len(candidates) == 0:
        return default
    # Only the maximum is needed, so locate it directly instead of sorting the
    # whole selection. The position is used rather than the label because the
    # lookup frames may carry repeated index labels.
    latest_position = candidates['day_of_year'].values.argmax()
    return candidates['TMIN'].iloc[latest_position]

In [423]:
def generate_predictions_from_20_days_random_forest_model():
    """Forecast TMIN day by day with the 20-lag forest and write the results.

    Steps:
      1. Load '../models/mike/20_days_random_forest.pkl' and the complete
         test frame, enrich the frame via ``prepare_submission_file`` and
         sort it by station and date.
      2. Per station, walk the dates in order. Every prediction is appended
         to that station's lookup frame so later days can use it as a lag.
      3. Write all predictions to
         '../data/predictions/2018_20_days_predictions_complete.csv' and the
         rows requested by the submission file to
         '../data/predictions/2018_20_days_submission_.csv'.

    The feature vector is 'day_of_year', 'lat', 'long', 'elev' followed by the
    lag values for TMIN_1 ... TMIN_20, i.e. most recent day first, which is
    the column order the model is trained with in this notebook.
    """
    model = load_model('../models/mike/20_days_random_forest.pkl')

    # The complete test frame is read from disk;
    # generate_and_save_complete_test_frame() rebuilds that file if needed.
    print('* Loading complete test frame... ')
    df_test = load_compltete_test_frame()

    print('* Enriching test frame (date, coordinates) ...')
    df_test = prepare_submission_file(df_test)
    df_test = df_test.sort_values(by=['ID', 'DATE'])

    print('* Generating Lookup Table ...')
    df_lookup = generate_initial_lookup_of_last_20_days()

    # Predictions are collected in a plain list and turned into a frame once;
    # growing a DataFrame row by row is quadratic and DataFrame.append no
    # longer exists in pandas 2.x.
    prediction_rows = []

    print('* Iteratively generating predictions ...')
    stations = df_test['ID'].unique()
    for s, station in enumerate(stations):
        print(' '*30, '\r', '(' + str(s+1) + '/' + str(len(stations)) + ') ::', station, '\r', end='')
        df_filtered_lookup = df_lookup[df_lookup['station'] == station]
        df_filtered_test = df_test[df_test['ID'] == station]

        for i, row in df_filtered_test.iterrows():
            # day-of-year numbers of the preceding 20 days, oldest first
            last_20_days_indices = get_last_20_day_indeces(row['day_of_year'])

            features = [
                row['day_of_year'],
                row['lat'],
                row['long'],
                row['elev'],
            ]

            # TMIN_1 is the most recent day, so feed the lags newest first.
            for day_index in reversed(last_20_days_indices):
                features.append(int(get_value_from_lookup(df_filtered_lookup, day_index)))

            # predict temperature for that day
            prediction = model.predict([features])

            # feed the prediction back so the following days can look it up
            predicted_row = pd.DataFrame(
                [{
                    'station': station,
                    'date': row['DATE'],
                    'day_of_year': row['day_of_year'],
                    'TMIN': prediction[0],
                }],
                columns=df_filtered_lookup.columns,
            )
            df_filtered_lookup = pd.concat([df_filtered_lookup, predicted_row])

            prediction_rows.append([str(row['DATE']) + station, prediction[0]])

    print('')

    print('* Saving full prediction set ...')
    df_prediction = pd.DataFrame(prediction_rows, columns=['SUB_ID', 'DATA_VALUE'])
    df_prediction.to_csv('../data/predictions/2018_20_days_predictions_complete.csv', index=False)
    prediction_by_id = dict(prediction_rows)

    print('* Generating submission file ...')
    df_submission = load_submission_file()

    predictions = []
    for date, station_id in zip(df_submission['DATE'], df_submission['ID']):
        SUB_ID = str(date) + station_id
        if SUB_ID in prediction_by_id:
            predictions.append([SUB_ID, prediction_by_id[SUB_ID]])
        else:
            print('\t', 'Error :: <' + SUB_ID + '> not in index')

    df_final_predictions = pd.DataFrame(predictions, columns=['SUB_ID', 'DATA_VALUE'])
    df_final_predictions.to_csv('../data/predictions/2018_20_days_submission_.csv', index=False)

    print('DONE')


In [424]:
# Run the iterative 20-lag forecast; it writes the complete prediction CSV and
# the submission CSV under ../data/predictions/.
generate_predictions_from_20_days_random_forest_model()

* Loading complete test frame... 
* Enriching test frame (date, coordinates) ...
* Generating Lookup Table ...
* Iteratively generating predictions ...
 (11874/11874) :: ZI000067983  
* Saving full prediction set ...
* Generating submission file ...
DONE


In [428]:
def generate_predictions_from_5_days_random_forest_model():
    """Forecast TMIN day by day with the 5-lag forest and write the results.

    Steps:
      1. Load '../models/mike/5_days_random_forest.pkl' and the complete
         test frame, enrich the frame via ``prepare_submission_file`` and
         sort it by station and date.
      2. Per station, walk the dates in order. Every prediction is appended
         to that station's lookup frame so later days can use it as a lag.
      3. Write all predictions to
         '../data/predictions/2018_5_days_predictions_complete.csv' and the
         rows requested by the submission file to
         '../data/predictions/2018_5_days_submission_.csv'.

    The feature vector is 'day_of_year', 'lat', 'long', 'elev' followed by the
    lag values for TMIN_1 ... TMIN_5, i.e. most recent day first, which is
    the column order the model is trained with in this notebook.
    """
    model = load_model('../models/mike/5_days_random_forest.pkl')

    # The complete test frame is read from disk;
    # generate_and_save_complete_test_frame() rebuilds that file if needed.
    print('* Loading complete test frame... ')
    df_test = load_compltete_test_frame()

    print('* Enriching test frame (date, coordinates) ...')
    df_test = prepare_submission_file(df_test)
    df_test = df_test.sort_values(by=['ID', 'DATE'])

    print('* Generating Lookup Table ...')
    df_lookup = generate_initial_lookup_of_last_20_days()

    # Predictions are collected in a plain list and turned into a frame once;
    # growing a DataFrame row by row is quadratic and DataFrame.append no
    # longer exists in pandas 2.x.
    prediction_rows = []

    print('* Iteratively generating predictions ...')
    stations = df_test['ID'].unique()
    for s, station in enumerate(stations):
        print(' '*30, '\r', '(' + str(s+1) + '/' + str(len(stations)) + ') ::', station, '\r', end='')
        df_filtered_lookup = df_lookup[df_lookup['station'] == station]
        df_filtered_test = df_test[df_test['ID'] == station]

        for i, row in df_filtered_test.iterrows():
            # day-of-year numbers of the preceding 5 days, oldest first
            last_5_days_indices = get_last_5_day_indeces(row['day_of_year'])

            features = [
                row['day_of_year'],
                row['lat'],
                row['long'],
                row['elev'],
            ]

            # TMIN_1 is the most recent day, so feed the lags newest first.
            for day_index in reversed(last_5_days_indices):
                features.append(int(get_value_from_lookup(df_filtered_lookup, day_index)))

            # predict temperature for that day
            prediction = model.predict([features])

            # feed the prediction back so the following days can look it up
            predicted_row = pd.DataFrame(
                [{
                    'station': station,
                    'date': row['DATE'],
                    'day_of_year': row['day_of_year'],
                    'TMIN': prediction[0],
                }],
                columns=df_filtered_lookup.columns,
            )
            df_filtered_lookup = pd.concat([df_filtered_lookup, predicted_row])

            prediction_rows.append([str(row['DATE']) + station, prediction[0]])

    print('')

    print('* Saving full prediction set ...')
    df_prediction = pd.DataFrame(prediction_rows, columns=['SUB_ID', 'DATA_VALUE'])
    df_prediction.to_csv('../data/predictions/2018_5_days_predictions_complete.csv', index=False)
    prediction_by_id = dict(prediction_rows)

    print('* Generating submission file ...')
    df_submission = load_submission_file()

    predictions = []
    for date, station_id in zip(df_submission['DATE'], df_submission['ID']):
        SUB_ID = str(date) + station_id
        if SUB_ID in prediction_by_id:
            predictions.append([SUB_ID, prediction_by_id[SUB_ID]])
        else:
            print('\t', 'Error :: <' + SUB_ID + '> not in index')

    df_final_predictions = pd.DataFrame(predictions, columns=['SUB_ID', 'DATA_VALUE'])
    df_final_predictions.to_csv('../data/predictions/2018_5_days_submission_.csv', index=False)

    print('DONE')


In [429]:
# Run the iterative 5-lag forecast; it writes the complete prediction CSV and
# the submission CSV under ../data/predictions/.
generate_predictions_from_5_days_random_forest_model()

* Loading complete test frame... 
* Enriching test frame (date, coordinates) ...
* Generating Lookup Table ...
* Iteratively generating predictions ...
 (11874/11874) :: ZI000067983  
* Saving full prediction set ...
* Generating submission file ...
DONE
