# 0. Imports

In [36]:
import pandas as pd
import numpy as np
import datetime

# 1. Setup and Functions

In [78]:
# CSV with the training feature table; read by get_original_df().
TRAIN_FILE = '../data/export_features_2014_to_2017_20.csv'
# Header-less, ';'-separated station table; read by get_station_df().
STATIONS_FILE = '../data/ghcnd-stations.csv'

def get_original_df(path=None):
    """Load the training feature table from a CSV file.

    Args:
        path: CSV file to read. When omitted, the module-level ``TRAIN_FILE``
            is used, so existing zero-argument calls behave as before.

    Returns:
        pandas.DataFrame: the file contents without the ``'Unnamed: 0'`` column.

    Raises:
        KeyError: if the file has no ``'Unnamed: 0'`` column.
    """
    csv_path = TRAIN_FILE if path is None else path
    df_original = pd.read_csv(csv_path)
    # 'Unnamed: 0' is the name pandas assigns to a first column whose header
    # cell is empty (e.g. an index written by DataFrame.to_csv).
    return df_original.drop(['Unnamed: 0'], axis=1)

def get_station_df(path=None):
    """Load the station metadata table.

    The file is read as header-less, semicolon-separated text and the four
    columns are named ``station``, ``lat``, ``long`` and ``elev``.

    Args:
        path: file to read. When omitted, the module-level ``STATIONS_FILE``
            is used, so existing zero-argument calls behave as before.

    Returns:
        pandas.DataFrame: one row per line of the file.
    """
    csv_path = STATIONS_FILE if path is None else path
    return pd.read_csv(
        csv_path,
        header=None,
        names=['station', 'lat', 'long', 'elev'],
        sep=';',
    )

def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    """Left-join the station columns onto ``df_src``.

    Args:
        df_src: frame holding a station-id column named ``src_index``.
        df_stations: station table holding the id column ``foreign_index``
            plus the columns to attach.
        src_index: name of the station-id column in ``df_src``.
        foreign_index: name of the station-id column in ``df_stations``.

    Returns:
        pandas.DataFrame: a new frame with the rows and index of ``df_src``
        and the remaining ``df_stations`` columns appended. Rows without a
        matching station get NaN in the appended columns.
    """
    stations_by_id = df_stations.set_index(foreign_index)
    # DataFrame.join already returns a new frame and leaves df_src untouched,
    # so no defensive copy of the (potentially large) source is needed.
    return df_src.join(stations_by_id, on=src_index)

def add_day_of_year_column(df_src, column_name='date'):
    """Return a copy of ``df_src`` with an added ``day_of_year`` column.

    Args:
        df_src: frame with a date column whose values render, via ``str``,
            as ``YYYYMMDD`` (e.g. the integer ``20140109``).
        column_name: name of that date column.

    Returns:
        pandas.DataFrame: copy of ``df_src`` plus an int64 ``day_of_year``
        column, where 1 January is day 1.

    Raises:
        ValueError: if a value cannot be parsed as ``YYYYMMDD``.
    """
    # Parse the whole column at once instead of calling strptime per row;
    # the values are first rendered as strings, as the per-row version did.
    parsed = pd.to_datetime(df_src[column_name].astype(str), format='%Y%m%d')
    df_out = df_src.copy()
    # Cast explicitly so the column dtype does not depend on the pandas version.
    df_out['day_of_year'] = parsed.dt.dayofyear.astype('int64')
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    """Return the 1-based day of the year for a date string.

    Args:
        date: date text to parse.
        format: ``strptime`` format describing ``date``.

    Returns:
        int: 1 for 1 January, up to 365 (366 in leap years).
    """
    parsed = datetime.datetime.strptime(date, format)
    # tm_yday is the 1-based ordinal day within the parsed date's year.
    return parsed.timetuple().tm_yday


# 2. Set Up Train Data

In [29]:
# Load the training feature table (read from TRAIN_FILE).
df = get_original_df()

In [31]:
# Preview the first rows of the loaded training table.
df.head()

Unnamed: 0,station,date,TMIN,TMIN_1,TMIN_2,TMIN_3,TMIN_4,TMIN_5,TMIN_6,TMIN_7,...,TMIN_11,TMIN_12,TMIN_13,TMIN_14,TMIN_15,TMIN_16,TMIN_17,TMIN_18,TMIN_19,TMIN_20
0,AE000041196,20140101,128,,,,,,,,...,,,,,,,,,,
1,AE000041196,20140102,145,128.0,,,,,,,...,,,,,,,,,,
2,AE000041196,20140103,140,145.0,128.0,,,,,,...,,,,,,,,,,
3,AE000041196,20140106,162,140.0,145.0,128.0,,,,,...,,,,,,,,,,
4,AE000041196,20140109,115,162.0,140.0,145.0,128.0,,,,...,,,,,,,,,,


In [32]:
# Load the station table (columns: station, lat, long, elev).
df_stations = get_station_df()

In [33]:
# Preview the first rows of the station table.
df_stations.head()

Unnamed: 0,station,lat,long,elev
0,ACW00011604,17.1167,-61.7833,10.1
1,ACW00011647,17.1333,-61.7833,19.2
2,AE000041196,25.333,55.517,34.0
3,AEM00041194,25.255,55.364,10.4
4,AEM00041217,24.433,54.651,26.8


In [34]:
# Attach each station's lat/long/elev to the training rows, matched on 'station'.
df_joined = add_coordinates(df, df_stations)
df_joined.head()

Unnamed: 0,station,date,TMIN,TMIN_1,TMIN_2,TMIN_3,TMIN_4,TMIN_5,TMIN_6,TMIN_7,...,TMIN_14,TMIN_15,TMIN_16,TMIN_17,TMIN_18,TMIN_19,TMIN_20,lat,long,elev
0,AE000041196,20140101,128,,,,,,,,...,,,,,,,,25.333,55.517,34.0
1,AE000041196,20140102,145,128.0,,,,,,,...,,,,,,,,25.333,55.517,34.0
2,AE000041196,20140103,140,145.0,128.0,,,,,,...,,,,,,,,25.333,55.517,34.0
3,AE000041196,20140106,162,140.0,145.0,128.0,,,,,...,,,,,,,,25.333,55.517,34.0
4,AE000041196,20140109,115,162.0,140.0,145.0,128.0,,,,...,,,,,,,,25.333,55.517,34.0


In [46]:
# Derive the 'day_of_year' feature from the 'date' column (the default column_name).
df_date = add_day_of_year_column(df_joined)
df_date.head()

Unnamed: 0,station,date,TMIN,TMIN_1,TMIN_2,TMIN_3,TMIN_4,TMIN_5,TMIN_6,TMIN_7,...,TMIN_15,TMIN_16,TMIN_17,TMIN_18,TMIN_19,TMIN_20,lat,long,elev,day_of_year
0,AE000041196,20140101,128,,,,,,,,...,,,,,,,25.333,55.517,34.0,1
1,AE000041196,20140102,145,128.0,,,,,,,...,,,,,,,25.333,55.517,34.0,2
2,AE000041196,20140103,140,145.0,128.0,,,,,,...,,,,,,,25.333,55.517,34.0,3
3,AE000041196,20140106,162,140.0,145.0,128.0,,,,,...,,,,,,,25.333,55.517,34.0,6
4,AE000041196,20140109,115,162.0,140.0,145.0,128.0,,,,...,,,,,,,25.333,55.517,34.0,9


In [47]:
# Persist the engineered training table. to_csv also writes the index by default,
# which is why the file gains an 'Unnamed: 0' column that is dropped on re-load.
df_date.to_csv('../data/df_train_mike.csv')

# 3. Training

In [60]:
from sklearn.ensemble import RandomForestRegressor

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [61]:
def save_model(model, PATH):
    """Serialize ``model`` to the file at ``PATH`` with joblib.

    Args:
        model: object to persist (here, a fitted estimator).
        PATH: destination file path.
    """
    joblib.dump(value=model, filename=PATH)
    
def load_model(PATH):
    """Load and return the object stored at ``PATH`` by joblib.

    NOTE(review): joblib files are pickle-based, so only load files that come
    from a trusted source.

    Args:
        PATH: path of a file previously written with ``joblib.dump``.

    Returns:
        The deserialized object.
    """
    restored = joblib.load(PATH)
    return restored
    


In [56]:
# Re-load the persisted training table, discard the index column that was
# written alongside it, and keep only rows without any missing value.
df_train = (
    pd.read_csv('../data/df_train_mike.csv')
    .drop(['Unnamed: 0'], axis=1)
    .dropna(axis=0, how='any')
)

df_train.head()

Unnamed: 0,station,date,TMIN,TMIN_1,TMIN_2,TMIN_3,TMIN_4,TMIN_5,TMIN_6,TMIN_7,...,TMIN_15,TMIN_16,TMIN_17,TMIN_18,TMIN_19,TMIN_20,lat,long,elev,day_of_year
20,AE000041196,20140202,155,147.0,149.0,130.0,138.0,123.0,115.0,120.0,...,159.0,115.0,162.0,140.0,145.0,128.0,25.333,55.517,34.0,33
21,AE000041196,20140203,155,155.0,147.0,149.0,130.0,138.0,123.0,115.0,...,145.0,159.0,115.0,162.0,140.0,145.0,25.333,55.517,34.0,34
22,AE000041196,20140207,99,155.0,155.0,147.0,149.0,130.0,138.0,123.0,...,137.0,145.0,159.0,115.0,162.0,140.0,25.333,55.517,34.0,38
23,AE000041196,20140208,112,99.0,155.0,155.0,147.0,149.0,130.0,138.0,...,118.0,137.0,145.0,159.0,115.0,162.0,25.333,55.517,34.0,39
24,AE000041196,20140209,123,112.0,99.0,155.0,155.0,147.0,149.0,130.0,...,117.0,118.0,137.0,145.0,159.0,115.0,25.333,55.517,34.0,40


In [108]:
def train_simple_random_forest_model():
    """Fit a random-forest regressor for ``TMIN`` and save it to disk.

    Reads the notebook-level ``df_train`` frame, uses ``day_of_year``,
    ``lat``, ``long`` and ``elev`` as features, and writes the fitted model
    to ``'../models/mike/simple_random_forest.pkl'`` via ``save_model``.
    Returns nothing.
    """
    feature_columns = ['day_of_year', 'lat', 'long', 'elev']
    features = df_train[feature_columns]
    target = df_train['TMIN']

    forest = RandomForestRegressor(
        n_estimators=100,
        max_depth=3,
        max_features=3,
        min_samples_split=10,
        min_samples_leaf=10,
        bootstrap=True,
        random_state=1337,  # fixed seed so repeated runs build the same forest
    )
    forest.fit(features, target)

    save_model(forest, '../models/mike/simple_random_forest.pkl')

In [109]:
# Fit the simple random-forest model on df_train and save it to disk.
train_simple_random_forest_model()

# 4. Predict

In [99]:
# CSV holding the rows for which predictions are generated.
SUBMISSION_PATH = '../data/2018_test.csv'


def load_submission_file():
    """Read the CSV at ``SUBMISSION_PATH`` and return it as a DataFrame."""
    return pd.read_csv(SUBMISSION_PATH)

def prepare_submission_file(df_test):
    """Add the model's input features to the submission rows.

    Args:
        df_test: submission frame with an ``ID`` (station id) column and a
            ``DATE`` column.

    Returns:
        pandas.DataFrame: a new frame with the station columns joined on
        ``ID`` and a ``day_of_year`` column derived from ``DATE``.
    """
    with_coordinates = add_coordinates(
        df_test,
        get_station_df(),
        src_index='ID',
        foreign_index='station',
    )
    return add_day_of_year_column(with_coordinates, column_name='DATE')
    
def save_submission(df_src, PATH):
    """Build the two-column submission table, write it as CSV, and return it.

    Args:
        df_src: frame with ``DATE``, ``ID`` and ``DATA_VALUE`` columns.
        PATH: destination CSV path (written without the index).

    Returns:
        pandas.DataFrame: columns ``SUB_ID`` (``str(DATE)`` followed by
        ``ID``) and ``DATA_VALUE``.
    """
    submission_ids = df_src['DATE'].map(str) + df_src['ID']
    df_submission = pd.DataFrame(
        {'SUB_ID': submission_ids, 'DATA_VALUE': df_src['DATA_VALUE']},
        columns=['SUB_ID', 'DATA_VALUE'],  # pin the column order explicitly
    )
    df_submission.to_csv(PATH, index=False)
    return df_submission

In [106]:
def generate_predictions_from_simple_random_forest_model():
    """Predict ``DATA_VALUE`` for the submission rows and write the result.

    Loads the model saved at ``'../models/mike/simple_random_forest.pkl'``,
    prepares the submission file's features, predicts from ``day_of_year``,
    ``lat``, ``long`` and ``elev``, and writes the submission CSV to
    ``'../data/predictions/prediction_simple_random_forest.csv'``.
    Returns nothing.
    """
    model = load_model('../models/mike/simple_random_forest.pkl')
    # Must match the columns (and their order) the model was fitted on.
    required_features = [
        'day_of_year',
        'lat',
        'long',
        'elev',
    ]
    prediction_file_path = '../data/predictions/prediction_simple_random_forest.csv'

    df_test = prepare_submission_file(load_submission_file())

    # create predictions
    df_test['DATA_VALUE'] = model.predict(df_test[required_features])

    # save predictions
    save_submission(df_test, prediction_file_path)




In [107]:
# Run the prediction pipeline; this writes the submission CSV to disk.
generate_predictions_from_simple_random_forest_model()