In [4]:
import numpy as np
import pandas as pd

# README

**WHAT:** Code for splitting the training set by years and running cross validation to test the generalizability of our models.

**HOW:** Just copy and paste these functions into your notebooks. Use this code to benchmark your models offline with regard to generalizability. I commented them quite extensively, but just let me know if you have any questions. Best, Mike. 


In [5]:
def calculate_rsme(df_submission, df_test):
    '''
    Compute the root mean squared error (RMSE) between predicted and correct TMIN values.

    @param df_submission        pandas dataframe with two columns
                                    [1] SUB_ID (like in submission file)
                                    [2] DATA_VALUE (predicted value like in submission file)
    @param df_test              pandas dataframe with two columns
                                    [1] SUB_ID (like in submission file)
                                    [2] TMIN (correct TMIN)
    @return                     the RMSE as a float, computed over the rows of df_test
                                that have a prediction; NaN if there are none.
                                A UserWarning is emitted when rows of df_test have
                                no prediction, because those rows are left out of the score.
    '''
    import warnings

    # Right join: keep every row of df_test. Rows whose SUB_ID does not occur
    # in df_submission end up with NaN in DATA_VALUE.
    df_compare = df_submission.join(df_test.set_index('SUB_ID'), on='SUB_ID', how='right')

    # .mean() below skips NaN, so unpredicted rows would silently vanish from
    # the score. Make that visible instead of reporting an optimistic RMSE.
    unscored = df_compare['DATA_VALUE'].isna() & df_compare['TMIN'].notna()
    n_unscored = int(unscored.sum())
    if n_unscored:
        warnings.warn(
            '%d of %d test rows have no prediction and are excluded from the RSME'
            % (n_unscored, len(df_compare))
        )

    # return RSME
    return ((df_compare['DATA_VALUE'] - df_compare['TMIN']) ** 2).mean() ** .5

In [12]:
def split_data_set_by_year(df_src, test_years = []):
    '''
    Partition a dataframe into train and test rows according to the year of each row.

    @param df_src               pandas dataframe holding a 'date' column whose
                                values are in YYYYMMDD format
    @param test_years           list of integer years whose rows go into the test set
    @return                     tuple (df_train, df_test). Both are selected from a
                                copy of df_src that carries an additional integer
                                'year' column; df_src itself is not modified.
    '''
    def _leading_year(raw_date):
        # the first four characters of a YYYYMMDD value are the year
        return int(str(raw_date)[:4])

    df_with_year = df_src.copy()
    df_with_year['year'] = df_with_year['date'].apply(_leading_year)

    # one boolean mask decides membership for both halves of the split
    in_test = df_with_year['year'].isin(test_years)

    return df_with_year[~in_test], df_with_year[in_test]
    

In [22]:
def calculate_mean_rsme_over_years(df_src, func, years = [2014, 2015, 2016, 2017]):
    '''
    Cross validate a model by year: each year in `years` is held out once as the
    test set, `func` predicts it from the remaining rows, and the per-year
    RSME values are averaged.

    @param df_src               the pandas dataframe with the unfiltered train data. It needs
                                to contain a 'station' column, a 'date' column in YYYYMMDD format
                                and a 'TMIN' column with the correct tmin values.
    @param func                 a callable taking ONE argument, the tuple (df_train, df_test),
                                and returning a pandas dataframe with the predictions for df_test.

                                * df_train is a pandas dataframe with the rows of df_src that
                                are not in the held-out year (plus the 'year' column added
                                by split_data_set_by_year).
                                * df_test is a pandas dataframe with the rows of the held-out
                                year, i.e. mutually exclusive with df_train. The 'TMIN'
                                column is removed.

                                * the returned dataframe should be in the same format as
                                the submission file. It should have the columns SUB_ID
                                (the date as a string followed by the station) and the
                                predicted DATA_VALUE for all rows in df_test.
    @param years                list of years as integers; each is held out once
    @return                     the mean of the per-year RSME values
    '''
    rsme = []
    for year in years:
        df_train, df_holdout = split_data_set_by_year(df_src, test_years = [year])

        # generate solutions df from the held-out rows only, and do it BEFORE
        # TMIN is dropped so the correct values are still available
        df_correct = pd.DataFrame({
            'SUB_ID': df_holdout['date'].astype(str) + df_holdout['station'].astype(str),
            'TMIN': df_holdout['TMIN'],
        })

        # the model must not see the correct values
        df_test = df_holdout.drop(['TMIN'], axis=1)

        # callback to the supplied function to get predictions
        # (a single tuple argument, matching the existing calling convention)
        df_predicted = func((df_train, df_test))

        # calculate rsme for this round
        rsme.append(calculate_rsme(df_predicted, df_correct))

    # return average rsme
    return np.mean(rsme)
        

In [None]:
# EXAMPLE USAGE

PATH = '../../data/tmp/df_time.csv'

# read the csv and discard its 'Unnamed: 0' column in one step
df_original = pd.read_csv(PATH).drop(columns=['Unnamed: 0'])

# hold out all rows from 2017 as the test set
df_train, df_test = split_data_set_by_year(df_original, test_years=[2017])