<a href="https://colab.research.google.com/github/DerekLeeCS/FreqML/blob/master/Assignment 4/Derek_Lee_Gradient_Boosted_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Data

In [None]:
# California Housing Dataset from Google Colab sample_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Load data
trainData = pd.read_csv("sample_data/california_housing_train.csv")
testData = pd.read_csv("sample_data/california_housing_test.csv")

# Fixed seed so the 80-10-10 split below (and every number computed from it)
# is the same after a kernel restart.
SPLIT_SEED = 42

# Merges data into a single DataFrame. ignore_index=True gives the merged frame
# fresh 0..n-1 row labels instead of keeping the labels of each CSV, which would
# otherwise repeat.
originalData = pd.concat([trainData,testData], ignore_index=True)

# Splits into 80-20, then halves the 20% into validation and test (80-10-10)
train, test = train_test_split(originalData, test_size=0.2, random_state=SPLIT_SEED)
valid, test = train_test_split(test, test_size=0.5, random_state=SPLIT_SEED)


# Separate into input and output: the last column is treated as the target,
# every other column as a feature.
xTrain,yTrain = train.iloc[:,:-1],train.iloc[:,-1]
xValid,yValid = valid.iloc[:,:-1],valid.iloc[:,-1]
xTest,yTest = test.iloc[:,:-1],test.iloc[:,-1]

# Modules

---



In [None]:
import numpy as np
import random
import xgboost as xgb
import csv

# Linspace: the lambda sweep in main() evaluates `instances` evenly spaced
# values between `start` and `end`
start, end = 0, 10
instances = 50

# XGBoost Parameters
depth = 3         # used as 'max_depth' in the xgb.train parameter dict
num_round = 10    # boosting rounds for xgb.train; also n_estimators for XGBRegressor

# XGBoost

---

Varying the L2 regularization strength (`lambda`) over the range tested did not end up having a significant impact on the error.

In [None]:
def avgAbsError(yReal,yPred):
    """Return the mean absolute relative error of yPred against yReal.

    Each term is |yReal[i] - yPred[i]| / |yReal[i]|, so the result is a
    fraction of the true value (0.1 means 10% off on average), not an error
    expressed in the target's own units.

    Parameters
    ----------
    yReal : array-like of numbers
        True target values.
    yPred : array-like of numbers
        Predicted values; must hold as many elements as yReal.

    Returns
    -------
    float
        Mean of the per-element relative errors. A zero in yReal makes the
        corresponding term inf or nan (numpy division semantics).

    Raises
    ------
    ValueError
        If the inputs are empty or their sizes differ.
    """
    # Flatten both inputs so an (n,1) prediction array cannot broadcast
    # against an (n,) target array into an (n,n) matrix.
    real = np.asarray(yReal, dtype=float).ravel()
    pred = np.asarray(yPred, dtype=float).ravel()

    if real.size == 0:
        raise ValueError("avgAbsError requires at least one value")
    if real.size != pred.size:
        raise ValueError("yReal and yPred must have the same number of elements")

    # Divide by |yReal| so that negative targets still give a non-negative error
    return float(np.mean(np.abs(real - pred) / np.abs(real)))

def _boosterParams(l2Penalty):
    """Build the xgb.train parameter dict for one value of 'lambda'."""
    return {'max_depth':depth, 'eta':1, 'objective':'reg:squarederror', 'lambda':l2Penalty}

def main():
    """Sweep 'lambda' on the validation split, then report test-set error.

    Trains one booster per candidate lambda on the training split, scores each
    on the validation split with avgAbsError, retrains with the best lambda,
    and prints:
      * the test-set error of that booster,
      * the error of a baseline that always predicts the training-set mean,
      * the dictionary returned by bst.get_score(importance_type='gain').

    Reads the module-level splits (xTrain/yTrain, xValid/yValid, xTest/yTest)
    and constants (start, end, instances, depth, num_round). Returns nothing.
    """
    # Read in data
    dtrain = xgb.DMatrix(data=xTrain,label=yTrain)
    dvalid = xgb.DMatrix(data=xValid,label=yValid)
    dtest = xgb.DMatrix(data=xTest,label=yTest)

    # Candidate lambdas and one validation error per candidate
    lamb = np.linspace(start,end,instances)
    validErrors = np.zeros(instances)
    yValidArr = yValid.to_numpy()   # Converted once; it is the same for every candidate

    # Runs XGBoost on all lambdas
    for i in range(instances):
        bst = xgb.train(_boosterParams(lamb[i]), dtrain, num_round)
        validErrors[i] = avgAbsError(yValidArr, bst.predict(dvalid))

    # Finding optimal lambda.
    # np.argmin returns the first minimum, so ties resolve to the smallest
    # lambda and the choice is identical on every run (an unseeded random
    # tie-break would make the reported error irreproducible).
    index = np.argmin(validErrors)

    # Runs XGBoost on optimal lambda
    bst = xgb.train(_boosterParams(lamb[index]), dtrain, num_round)
    preds = bst.predict(dtest)

    # Error
    testError = avgAbsError(yTest.to_numpy(),preds)
    print("Error: ", end="")
    print(testError)

    # Baseline Error: always predict the training-set mean, scored with the
    # same avgAbsError metric so the two numbers are comparable
    yAvg = np.full(yTest.size, yTrain.mean())
    baseError = avgAbsError(yTest.to_numpy(),yAvg)
    print("Baseline Error: ", end="")
    print(baseError)

    print(bst.get_score(importance_type='gain'))

main()

Error: 0.23605685606350685
Baseline Error: 0.5955792897649087
{'median_income': 5746030751627.777, 'latitude': 1011101951058.8235, 'housing_median_age': 874768243088.8889, 'longitude': 1596388962807.6924, 'total_rooms': 158143785633.33334, 'total_bedrooms': 1109163457500.0, 'population': 564148518160.0, 'households': 100909285000.0}


# Testing with Scikit-Learn API

---



In [None]:
# Fit the scikit-learn style wrapper on the training split
xg_reg = xgb.XGBRegressor(
    n_estimators=num_round,
    max_depth=5,
    learning_rate=1,
    colsample_bytree=0.3,
    alpha=10,
    objective='reg:squarederror',
)
xg_reg.fit(xTrain, yTrain)

# Score on the test split with the same avgAbsError helper used by main()
preds = xg_reg.predict(xTest)
error = avgAbsError(yTest.to_numpy(), preds)
print("Error: ", error, sep="")

Error: 44901.97884765625


# Install XGBoost

---
Instructions from https://xgboost.readthedocs.io/en/latest/build.html


In [None]:
# Installs the xgboost package with pip3 via a shell escape.
# NOTE(review): this cell sits after the cells that `import xgboost`; on a
# fresh kernel without xgboost installed it has to be run before them.
#
# Ensure that you are downloading one of the following:
#   * xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl
#   * xgboost-{version}-py2.py3-none-win_amd64.whl
!pip3 install xgboost



In [None]:
labels = list(originalData.columns)

# Normalize the data
# Every column of originalData is passed to preprocessing.scale, so the target
# column is rescaled along with the features.
# NOTE(review): the train/valid/test splits were taken from the unscaled
# originalData in the first cell, so this cell does not change them.
vals = originalData.values
vals = preprocessing.scale(vals)
originalData = pd.DataFrame(vals, columns=labels)

# head() must be called; printing the bare attribute shows the bound-method
# repr rather than the first rows.
print(originalData.head())