In [38]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Read the quasar table: columns 0-9 are the features, column 10 the labels.
data = np.loadtxt("quasars.csv", delimiter=",")
X = data[:, :10]
y = data[:, 10]

# Hold out 20% of the rows as the test set, then split a further 10% of the
# remaining training rows off as a validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=120
)
X_train_1, X_eval_1, y_train_1, y_eval_1 = train_test_split(
    X_train, y_train, test_size=0.1, random_state=120
)

# XGBoost regressor with hand-picked hyperparameters.
reg = xgb.XGBRegressor(
    colsample_bytree=0.5,
    learning_rate=0.1,
    max_depth=4,
    reg_lambda=1,
    n_estimators=500,
)

# The order of eval_set matters: the first pair (validation split) is logged
# as "validation_0" and the second pair (training split) as "validation_1".
reg.fit(
    X_train_1,
    y_train_1,
    eval_set=[(X_eval_1, y_eval_1), (X_train_1, y_train_1)],
)

[0]	validation_0-rmse:0.63016	validation_1-rmse:0.64193
[1]	validation_0-rmse:0.61302	validation_1-rmse:0.62328
[2]	validation_0-rmse:0.59994	validation_1-rmse:0.60784
[3]	validation_0-rmse:0.58641	validation_1-rmse:0.59402
[4]	validation_0-rmse:0.57616	validation_1-rmse:0.58212
[5]	validation_0-rmse:0.56638	validation_1-rmse:0.57222
[6]	validation_0-rmse:0.55837	validation_1-rmse:0.56363
[7]	validation_0-rmse:0.55225	validation_1-rmse:0.55648
[8]	validation_0-rmse:0.54670	validation_1-rmse:0.54955
[9]	validation_0-rmse:0.54187	validation_1-rmse:0.54390
[10]	validation_0-rmse:0.53720	validation_1-rmse:0.53919
[11]	validation_0-rmse:0.53460	validation_1-rmse:0.53584
[12]	validation_0-rmse:0.53142	validation_1-rmse:0.53221
[13]	validation_0-rmse:0.52917	validation_1-rmse:0.52909
[14]	validation_0-rmse:0.52753	validation_1-rmse:0.52573
[15]	validation_0-rmse:0.52513	validation_1-rmse:0.52305
[16]	validation_0-rmse:0.52355	validation_1-rmse:0.52081
[17]	validation_0-rmse:0.52246	validation

In [40]:
from matplotlib import pyplot as plt
# Per-round RMSE histories recorded while fitting `reg`. "validation_0" is the
# validation split and "validation_1" the training split (eval_set order).
evals = reg.evals_result()
val_errors = np.array(evals["validation_0"]["rmse"])
train_errors = np.array(evals["validation_1"]["rmse"])

# Learning curves: RMSE against the 1-based boosting round, written to disk.
ax = plt.gca()
for curve, curve_label in (
    (val_errors, "Validation error"),
    (train_errors, "Training error"),
):
    ax.plot(np.arange(1, len(curve) + 1), curve, label=curve_label)
ax.grid()
ax.legend()
ax.set_xlabel("Boosting iterations")
ax.set_ylabel("RMSE")
plt.savefig("boost1.png")
plt.close()

# Test-set RMSE of the fitted regressor.
test_residuals = reg.predict(X_test) - y_test
test_RMSE = np.sqrt(np.mean(test_residuals ** 2))

# R^2 = 1 - MSE / Var(y), evaluated on the test set.
test_var = np.var(y_test)
test_R2 = 1 - test_RMSE**2 / test_var
print(test_RMSE, test_R2, test_var)

0.5192326874307802 0.3524055230553442 0.4163139021329715


In [48]:
# colsample_bytree: consider a symmetric grid for the fraction of features sampled per tree.
# learning_rate: the XGBoost default is 0.3, so make a grid that contains it.
# reg_lambda: logarithmically spaced grid.
# n_estimators (boosting rounds): the error seems to stabilize after 300-400 rounds, so the grid covers 300-500.
# max_depth: try trees of depth 2-6.
import os.path
from joblib import dump,load
from sklearn.model_selection import GridSearchCV

# Grid-search the XGBoost hyperparameters with 3-fold cross-validation and cache
# the fitted search on disk, because the search is slow: 4*4*5*3*5 = 1200
# parameter combinations, each fitted on 3 folds.
# NOTE(review): joblib.load unpickles the file, so only load a cache file that
# this notebook wrote itself.
if os.path.isfile("xgb_cv_1.joblib"):
    xgb_cv = load("xgb_cv_1.joblib")  # reuse the previously fitted search
else:
    model = xgb.XGBRegressor()  # base estimator; the grid supplies the hyperparameters
    paramgrid = {
        "colsample_bytree": np.array([0.3, 0.4, 0.5, 0.6]),
        "learning_rate": np.array([0.05, 0.1, 0.2, 0.3]),
        "reg_lambda": np.array([0.01, 0.1, 1, 10, 100]),
        "n_estimators": np.array([300, 400, 500]),
        "max_depth": np.array([2, 3, 4, 5, 6]),
    }
    # The search must be bound to `xgb_cv` on this path too: the evaluation
    # below reads that name, so a cache miss would otherwise end in a NameError.
    xgb_cv = GridSearchCV(model, param_grid=paramgrid, verbose=1, cv=3)
    xgb_cv.fit(X_train, y_train)
    dump(xgb_cv, "xgb_cv_1.joblib")  # persist so later runs skip the search

# Test-set RMSE of the search result; GridSearchCV.predict delegates to the
# best estimator, refitted on all of X_train (refit=True is the default).
test_RMSE_cv = np.sqrt(np.mean((xgb_cv.predict(X_test) - y_test) ** 2))

# R^2 = 1 - MSE / Var(y), evaluated on the test set.
test_R2_cv = 1 - test_RMSE_cv**2 / np.var(y_test)
print(test_RMSE_cv, test_R2_cv, np.var(y_test))

0.5038327385764918 0.3902499360196523 0.4163139021329715


In [46]:
from sklearn.neighbors import KNeighborsRegressor
# Baseline for comparison: k-nearest-neighbours regression with scikit-learn's
# default settings, fitted on the full training split and scored on the same
# test set as the XGBoost models.
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn_RMSE = np.sqrt(np.mean((knn.predict(X_test) - y_test) ** 2))

# R^2 = 1 - MSE / Var(y), evaluated on the test set.
knn_R2 = 1 - knn_RMSE**2 / np.var(y_test)
# Print the test-set variance as well, matching the other evaluation cells and
# the three-value output recorded for this cell.
print(knn_RMSE, knn_R2, np.var(y_test))

0.49341186931168185 0.41521224362118925 0.4163139021329715
