In [1]:
import sys
import py_compile
import pandas as pd
import xgboost as xgb
import numpy as np

# Put the parent directory on the module search path so that the local
# `utility` module (../utility.py) can be imported from this notebook.
sys.path.append(r"../")
# Byte-compile ../utility.py before importing it.
# NOTE(review): presumably meant to refresh the cached bytecode / surface
# syntax errors early — confirm it is still needed.
py_compile.compile("../utility.py")
# `rmse` is the only helper this notebook takes from the local module.
from utility import rmse
from sklearn.model_selection import train_test_split 

In [2]:
# Block-holdout evaluation of an XGBoost regressor for PC1.
#
# The first `n_samples` rows of the input table are split into consecutive
# blocks of `takeout` rows.  Each pass trains on every row outside one block,
# predicts the rows inside it, and appends predictions/observations to
# `predPC` / `obsPC`, so both end up ordered by row position.

# Columns that are never used as predictors (the target itself plus the
# listed index columns).
exclude = ['PC1', 'AO', 'NAO', 'nina3.4','nina3','nina4','nina1+2','GlobeSST']

df = pd.read_csv("../data/PC1-input.csv")

features = [feature for feature in df.columns if feature not in exclude]
target = ['PC1']

predPC = []
obsPC = []

# Size of each held-out block of consecutive rows.
takeout = 10

# Number of leading rows that take part in the evaluation.
# NOTE(review): presumably one row per year for 1979-2018 (the plotting cell
# uses that axis) — confirm against the CSV before changing.
n_samples = 40

# Booster settings are identical for every fold, so build them once.
# `n_estimators` is intentionally absent: it is a scikit-learn wrapper
# argument, while the native xgb.cv / xgb.train calls below take the number
# of boosting rounds from `num_boost_round`.
# NOTE(review): 'silent' was superseded by 'verbosity' in newer XGBoost
# releases — confirm against the installed version.
param = {'nthread': 6,  # when use hyperthread, xgboost may become slower
         'learning_rate': 0.01,  # so called `eta` value
         'max_depth': 10,
         'min_child_weight': 0.1,  # 5
         'silent': 1,
         'subsample': 0.1,
         'colsample_bytree': 1.0}

for fold_start in range(0, n_samples, takeout):
    # Clip the last block so a `takeout` that does not divide `n_samples`
    # cannot index past the evaluated rows.
    fold_stop = min(fold_start + takeout, n_samples)
    test_rows = list(range(fold_start, fold_stop))
    train_rows = [row for row in range(n_samples)
                  if not fold_start <= row < fold_stop]

    train_df = df.iloc[train_rows][features]
    train_target = df.iloc[train_rows][target]
    test_df = df.iloc[test_rows][features]
    test_target = df.iloc[test_rows][target]

    dtrain = xgb.DMatrix(train_df, label=train_target, feature_names=features)
    dtest = xgb.DMatrix(test_df, label=test_target, feature_names=features)

    # Cross-validate on the training rows only; with early stopping the
    # returned history has one row per retained boosting round.
    cv_res = xgb.cv(param, dtrain, num_boost_round=6000,
                    early_stopping_rounds=300, nfold=5,
                    metrics='rmse', show_stdv=False)

    # Refit on all training rows with the round count chosen by CV.
    bst = xgb.train(param, dtrain, num_boost_round=cv_res.shape[0])
    pred = bst.predict(dtest)

    predPC.extend(pred)
    # `test_target` is a one-column frame; flatten it to a 1-D sequence.
    obsPC.extend(test_target.values.ravel())

    print("This is the %i training" % fold_start)

predPC = np.array(predPC)
obsPC = np.array(obsPC)

This is the 0 training
This is the 10 training
This is the 20 training
This is the 30 training


In [3]:
# Skill of the pooled held-out predictions against the observations.
# `rmse` is the helper imported from the local utility module.
pred_rmse = rmse(predPC, obsPC)
print("RMSE of pred: %.2f" % pred_rmse)

# Off-diagonal entry of the 2x2 correlation matrix = correlation between
# the prediction and observation series.
pred_corr = np.corrcoef(predPC, obsPC)[0, 1]
print("CORR of pred: %f" % pred_corr)

RMSE of pred: 28.40
CORR of pred: 0.369357


In [4]:
import matplotlib.pyplot as plt

# Build the x-axis from the number of predictions instead of a fixed
# 1979-2018 range, so the plot still works if the sample count changes.
start_year = 1979
xx = list(range(start_year, start_year + len(predPC)))

# Predicted vs. observed PC1 on one set of axes.
fig, ax = plt.subplots()
ax.plot(xx, predPC, color='green', label='pred')
ax.plot(xx, obsPC, color='red', label='obs')
ax.set_xlabel('Year')
ax.set_ylabel('PC1')
ax.set_title('PC1: held-out XGBoost prediction vs. observation')
ax.legend()
plt.show()

<Figure size 640x480 with 1 Axes>

In [5]:
# Write the pooled predictions to a text file, one value per line, with the
# hold-out block size encoded in the file name.
oo = pd.DataFrame(predPC)
# Explicit booleans: neither the column header nor the row index is written.
oo.to_csv('XGBoost-PC1-TAKE'+str(takeout)+'.txt', header=False, index=False)