In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from random import seed
import matplotlib.pyplot as plt
import pickle

In [None]:
# Directory that holds every rescaled year-month CSV used in this notebook.
# Change this one constant to run the notebook from another machine/checkout.
DATA_DIR = 'C:\\Users\\conor\\General_College\\UTRECHT ADS\\Thesis\\Data\\rescale_dfs\\'

# index_col=0 makes the first CSV column the row index instead of a feature.
# The 1/3/5/10 "deg" frames are labelled as 1, 3, 5 and 10 degree resolutions
# further down; "full" is the table the aggregated predictions are scored against.
yearmon_1deg_df = pd.read_csv(DATA_DIR + 'one_yrmon.csv', index_col=0)
yearmon_3deg_df = pd.read_csv(DATA_DIR + 'three_yrmon.csv', index_col=0)
yearmon_5deg_df = pd.read_csv(DATA_DIR + 'five_yrmon.csv', index_col=0)
yearmon_10deg_df = pd.read_csv(DATA_DIR + 'ten_yrmon.csv', index_col=0)
yearmonth_full_df = pd.read_csv(DATA_DIR + 'full_yrmon.csv', index_col=0)

Testing different resolutions:

In [None]:
#function to run CV

def CV_resolutions(df, full_df=None, years=range(2000, 2019)):
    """Leave-one-year-out CV error of a random forest, scored per year-month.

    For every year in ``years`` a fresh ``RandomForestRegressor`` is fitted on
    the rows of ``df`` from all other years and used to predict the held-out
    year.  The predictions are averaged per year-month, matched against the
    ``bio_flux`` rows of ``full_df`` with the same year-month, and the mean
    squared error of that comparison is recorded.  The per-year errors are
    finally averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'bio_flux'`` (target), ``'year'`` and ``'month'``;
        every other column is used as a feature.  ``'month'`` must hold
        strings because it is concatenated onto ``str(year)`` to build the
        year-month key.  The frame is not modified.
    full_df : pandas.DataFrame, optional
        Reference table with ``'year'``, ``'month'`` and ``'bio_flux'``.
        Defaults to the notebook-level ``yearmonth_full_df``.  Not modified.
    years : iterable of int, optional
        Years to hold out, one fold each.  Defaults to 2000-2018 inclusive.

    Returns
    -------
    float
        Mean over the folds of the per-fold MSE.  A fold whose year-months
        find no match in ``full_df`` contributes NaN.

    Raises
    ------
    ValueError
        If ``years`` is empty.

    Notes
    -----
    Prints each held-out year (space separated) as a progress indicator.
    """
    if full_df is None:
        # Backward-compatible default: the full table loaded at the top of
        # the notebook, resolved at call time.
        full_df = yearmonth_full_df

    years = list(years)
    if not years:
        raise ValueError('years must contain at least one year')

    # Target and fold keys; everything else in df is a model feature.
    meta_cols = ['bio_flux', 'year', 'month']

    # The reference values do not depend on the fold, so key them once here
    # instead of copying and re-keying the full table on every iteration.
    full_keys = full_df['year'].astype(str) + full_df['month']
    actuals = pd.DataFrame({
        'yearmonth': full_keys.to_numpy(),
        'bio_flux': full_df['bio_flux'].to_numpy(),
    })

    error_output = 0
    for yr in years:
        print(yr, end=' ')

        train = df[df['year'] != yr]  # every year except the held-out one
        test = df[df['year'] == yr]   # the held-out year

        # training model:
        regr_cv = RandomForestRegressor(n_estimators=100, random_state=1)
        regr_cv.fit(train.drop(columns=meta_cols), train['bio_flux'])

        # testing the model: one prediction per row of the held-out year
        preds_cv = regr_cv.predict(test.drop(columns=meta_cols))

        # Average the predictions within each year-month of the fold.
        fold_keys = test['year'].astype(str) + test['month']
        fold_preds = pd.DataFrame({
            'yearmonth': fold_keys.to_numpy(),
            'cv preds': preds_cv,
        })
        monthly_preds = fold_preds.groupby('yearmonth', as_index=False)['cv preds'].mean()

        # Inner merge keeps only year-months present in both tables.
        preds_w_act_cv = monthly_preds.merge(actuals, on='yearmonth')

        error_output += np.mean((preds_w_act_cv['cv preds'] - preds_w_act_cv['bio_flux'])**2)

    # Divide by the actual number of folds rather than a hard-coded 19.
    return error_output / len(years)

In [None]:
def fullfull(df, years=range(2000, 2019)):
    """Leave-one-year-out CV error of a random forest trained and tested on ``df``.

    For every year in ``years`` a fresh ``RandomForestRegressor`` is fitted on
    the rows from all other years and scored, row by row, on the held-out
    year.  The per-year mean squared errors are then averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'bio_flux'`` (target), ``'year'`` and ``'month'``;
        every other column is used as a feature.  The frame is not modified.
    years : iterable of int, optional
        Years to hold out, one fold each.  Defaults to 2000-2018 inclusive.

    Returns
    -------
    float
        Mean over the folds of the per-fold MSE.

    Raises
    ------
    ValueError
        If ``years`` is empty.
    """
    years = list(years)
    if not years:
        raise ValueError('years must contain at least one year')

    # Target and fold keys; everything else in df is a model feature.
    meta_cols = ['bio_flux', 'year', 'month']

    fullfull_output = 0
    for yr in years:
        # Use the frame that was passed in, not a notebook-level global.
        train = df[df['year'] != yr]  # every year except the held-out one
        test = df[df['year'] == yr]   # the held-out year

        # training the full model:
        regr_full_cv = RandomForestRegressor(n_estimators=100, random_state=1)
        regr_full_cv.fit(train.drop(columns=meta_cols), train['bio_flux'])

        # testing the full model on full data:
        preds_full_cv = regr_full_cv.predict(test.drop(columns=meta_cols))
        fullfull_output += np.mean((preds_full_cv - np.array(test['bio_flux']))**2)

    # Divide by the actual number of folds rather than a hard-coded 19.
    return fullfull_output / len(years)
        

In [None]:
# Cross-validated MSE per input table, keyed by the label shown on the bar
# chart.  Entries are evaluated top to bottom, so insertion order (and thus
# bar order) is 1, 3, 5, 10 degrees and finally the full table.
CV_res_actual = {
    r'$1^{\circ}$ ': CV_resolutions(yearmon_1deg_df),
    r'$3^{\circ}$ ': CV_resolutions(yearmon_3deg_df),
    r'$5^{\circ}$ ': CV_resolutions(yearmon_5deg_df),
    r'$10^{\circ}$ ': CV_resolutions(yearmon_10deg_df),
    'Full': fullfull(yearmonth_full_df),
}

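Same data, different model (the full-data model and the $1^{\circ}$ model, each tested on both the full and the $1^{\circ}$ data):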

In [None]:
CV_matching = {}

# Held-out years (one fold each) and the columns that are never model features.
_cv_years = range(2000, 2019)
_meta_cols = ['bio_flux', 'year', 'month']

# Observed full-table flux per year-month.  This lookup is the same for every
# fold, so it is built once rather than inside the loops.
_full_actuals = pd.DataFrame({
    'yearmonth': (yearmonth_full_df['year'].astype(str) + yearmonth_full_df['month']).to_numpy(),
    'bio_flux': yearmonth_full_df['bio_flux'].to_numpy(),
})


def _year_split(frame, yr):
    """Return (rows from all other years, rows from year ``yr``) of ``frame``."""
    return frame[frame['year'] != yr], frame[frame['year'] == yr]


def _fit_forest(train):
    """Fit the notebook's fixed-seed random forest on ``train``'s feature columns."""
    model = RandomForestRegressor(n_estimators=100, random_state=1)
    model.fit(train.drop(columns=_meta_cols), train['bio_flux'])
    return model


def _row_mse(model, test):
    """MSE of ``model``'s row-wise predictions against ``test``'s own ``bio_flux``."""
    preds = model.predict(test.drop(columns=_meta_cols))
    return np.mean((preds - np.array(test['bio_flux']))**2)


def _monthly_mse(model, test):
    """MSE of per-year-month mean predictions against the full-table ``bio_flux``."""
    fold_preds = pd.DataFrame({
        'yearmonth': (test['year'].astype(str) + test['month']).to_numpy(),
        'cv preds': model.predict(test.drop(columns=_meta_cols)),
    })
    monthly = fold_preds.groupby('yearmonth', as_index=False)['cv preds'].mean()
    # Inner merge keeps only year-months present in both tables.
    paired = monthly.merge(_full_actuals, on='yearmonth')
    return np.mean((paired['cv preds'] - paired['bio_flux'])**2)


# full model: trained on the full table, tested on both tables
fullfull_MSE = 0
fullpix_MSE = 0
print('full model:')
for yr in _cv_years:
    train_full, test_full = _year_split(yearmonth_full_df, yr)
    _, test_pix = _year_split(yearmon_1deg_df, yr)

    full_model = _fit_forest(train_full)
    fullfull_MSE += _row_mse(full_model, test_full)     # full model on full data
    fullpix_MSE += _monthly_mse(full_model, test_pix)   # full model on 1-degree data
    print(yr, end = ' ')


# pixel model: trained on the 1-degree table, tested on both tables
pixpix_MSE = 0
pixfull_MSE = 0
print('\npixel model:')
for yr in _cv_years:
    print(yr, end = ' ')
    train_pix, test_pix = _year_split(yearmon_1deg_df, yr)
    _, test_full = _year_split(yearmonth_full_df, yr)

    pix_model = _fit_forest(train_pix)
    pixpix_MSE += _monthly_mse(pix_model, test_pix)     # pixel model on 1-degree data
    pixfull_MSE += _row_mse(pix_model, test_full)       # pixel model on full data


# Average the accumulated per-fold errors over the number of folds (19).
_n_folds = len(_cv_years)
CV_matching['full model: full data'] = fullfull_MSE/_n_folds
CV_matching[r'full model: $1^{\circ}$ data'] = fullpix_MSE/_n_folds
CV_matching[r'$1^{\circ}$ model: $1^{\circ}$ data'] = pixpix_MSE/_n_folds
CV_matching[r'$1^{\circ}$ model: full data'] = pixfull_MSE/_n_folds

Plotting:

In [None]:
# Bar chart of the cross-validated MSE for each data resolution.
plt.bar(list(CV_res_actual.keys()), list(CV_res_actual.values()), color='xkcd:kelly green')
plt.title('MSE for data with different resolutions')
# Single tick-label setting; an earlier rotation=30 call was always overridden by this one.
plt.xticks(rotation=10, ha='right')
plt.xlabel('Data Resolution')
# Closing bracket sits outside the math span, matching the other figure's label.
plt.ylabel(r'MSE [$(mol m^{-2} s^{-1})^2$]')
plt.show()

In [None]:
# Bar chart of the cross-validated MSE for each model/data combination.
plt.bar(list(CV_matching.keys()), list(CV_matching.values()), color='xkcd:kelly green')
plt.title('MSE for training/testing models on different data resolutions')
# Single tick-label setting; an earlier rotation=30 call was always overridden by this one.
plt.xticks(rotation=10, ha='right')
plt.xlabel('Model: Data mix')
plt.ylabel(r'MSE [$(mol m^{-2} s^{-1})^2$]')
plt.show()