## Train a model on linear data using the XGBoost algorithm
### The model is trained with XGBoost installed in the notebook instance
### In later examples, we will train using SageMaker's XGBoost algorithm

In [None]:
# Install xgboost into the notebook instance from the conda-forge channel.
# -y answers the confirmation prompt automatically so the cell does not block.
!conda install -y -c conda-forge xgboost

In [None]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

In [None]:
# Load the complete dataset; later cells read its x, y and y_noisy columns.
df = pd.read_csv('linear_all.csv')

In [None]:
# Preview the first rows of the full dataset.
df.head()

In [None]:
# Overlay the 'ideal fit' column (y) and the 'Target' column (y_noisy)
# against the single input feature.
plt.scatter(x=df['x'], y=df['y'], label='ideal fit')
plt.scatter(x=df['x'], y=df['y_noisy'], color='r', marker='+', label='Target')
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.grid(True)
plt.legend()

In [None]:
# CSV files holding the training and validation splits.
train_file = 'linear_train.csv'
validation_file = 'linear_validation.csv'

In [None]:
# Specify the column names as the file does not have column header
df_train = pd.read_csv(train_file,names=['y_noisy','x'])
df_validation = pd.read_csv(validation_file,names=['y_noisy','x'])

In [None]:
# Preview the first rows of the training split.
df_train.head()

In [None]:
# Preview the first rows of the validation split.
df_validation.head()

In [None]:
# Show how the training and validation samples are spread over the feature.
plt.scatter(x=df_train['x'], y=df_train['y_noisy'], label='Training')
plt.scatter(
    x=df_validation['x'],
    y=df_validation['y_noisy'],
    color='r',
    marker='+',
    label='Validation',
)
plt.title('Training and Validation Data')
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.grid(True)
plt.legend()

In [None]:
# Column 0 holds the target and every remaining column is a feature.
# .values yields the target as a 1-D NumPy array, as Series.ravel() did, but
# without the deprecation warning pandas 2.2+ raises for Series.ravel().
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0].values

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].values

In [None]:
# Build the regressor with library defaults (no arguments are passed).
# Settings noted by the original author, not applied here:
#   max_depth = 5, objective="reg:linear", num_round = 50
# NOTE(review): newer XGBoost releases rename "reg:linear" to
# "reg:squarederror" - confirm against the installed version before using it.
regressor = xgb.XGBRegressor()

In [None]:
# Display the regressor so its configured parameters can be inspected.
regressor

In [None]:
# Fit on the training split. Both splits are passed as eval_set so a metric is
# recorded for each; later cells read them back from evals_result() under the
# keys 'validation_0' (training split) and 'validation_1' (validation split).
regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])

In [None]:
# Retrieve the metrics recorded for each eval_set entry during fit.
eval_result = regressor.evals_result()

In [None]:
# Display the raw metric history.
eval_result

In [None]:
# One index per recorded RMSE value; used as the x-axis of the error plot.
training_rounds = range(len(eval_result['validation_0']['rmse']))

In [None]:
# Prints the range object itself (its start and stop), not the individual values.
print(training_rounds)

In [None]:
# Plot the RMSE history of both eval sets against the round index:
# 'validation_0' is the training split, 'validation_1' the validation split.
plt.scatter(training_rounds, eval_result['validation_0']['rmse'], label='Training Error')
plt.scatter(training_rounds, eval_result['validation_1']['rmse'], label='Validation Error')
plt.title('Training Vs Validation Error')
plt.xlabel('Iterations')
plt.ylabel('RMSE')
plt.grid(True)
plt.legend()

In [None]:
# Plot the feature importance of the fitted model.
xgb.plot_importance(regressor)

In [None]:
# Re-check the full dataset's column layout before selecting the feature.
df.head()

In [None]:
# Take the first column of the full dataset as the model input.
# The column slice `:1` (rather than the scalar index 0) keeps the result a
# DataFrame instead of collapsing it to a Series.
X_test = df.iloc[:,:1]
print(X_test[:5])

In [None]:
# X_test is a DataFrame. Per the original note ("This works"), this is the
# form the author found usable as prediction input.
type(X_test)

In [None]:
# Indexing the column with the scalar 0 yields a Series. Per the original note
# ("This does not work"), the author found this form unusable as prediction
# input - the type() call itself runs fine.
type(df.iloc[:5,0])

In [None]:
# Predict the target for every row of the full dataset.
result = regressor.predict(X_test)

In [None]:
# Show the first five predictions.
result[:5]

In [None]:
# Store the predictions alongside the actual values for plotting and residuals.
df['y_predicted'] = result

In [None]:
# Confirm the y_predicted column was added.
df.head()

In [None]:
# Compare the actual targets (red) with the model's predictions (blue).
plt.scatter(df['x'], df['y_noisy'], color='r', label='actual')
plt.scatter(df['x'], df['y_predicted'], color='b', label='predicted')
plt.grid(True)
plt.legend()

In [None]:
# Compare the spread of actual and predicted targets side by side.
# Tick labels are set with plt.xticks rather than boxplot's `labels` keyword,
# which Matplotlib 3.9 deprecated in favour of `tick_labels`; boxes are drawn
# at positions 1..N by default, so the ticks line up on any Matplotlib version.
plt.boxplot([df.y_noisy, df.y_predicted])
plt.xticks([1, 2], ['actual', 'predicted'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('Target')
plt.grid(True)

In [None]:
# Over-prediction and under-prediction should balance out around zero.
# Residuals are computed over every row of df (loaded from linear_all.csv),
# not only the rows in the training file.
residuals = df.y_predicted - df.y_noisy

plt.hist(residuals)
plt.grid(True)
plt.xlabel('(Predicted - Actual)')
plt.ylabel('Count')
plt.title('Residuals Distribution')
# Reference line at zero error; x=0 is axvline's default, spelled out here.
plt.axvline(x=0, color='g')


### Tree-based algorithms cannot extrapolate: their predictions are capped at the lowest and highest values learned from the training data.

In [None]:
# True Function
def straight_line(x, slope=5, intercept=8):
    """Evaluate the noise-free line ``slope * x + intercept``.

    The defaults (5 and 8) reproduce the original hard-coded function, so
    existing one-argument calls are unaffected.

    Args:
        x: A number or NumPy array of input values.
        slope: Gradient of the line.
        intercept: Value of the line at ``x == 0``.

    Returns:
        The line evaluated at ``x``; arrays are handled element-wise.
    """
    return slope * x + intercept

In [None]:
# X is outside range of training samples
# The DataFrame column is named 'x' to match the feature name used in training.
X = np.array([1000,25,-100,5000])
tmp_df = pd.DataFrame({'x':X})

In [None]:
# Print the true line's values next to the model's predictions for the same
# inputs so the two can be compared directly.
print("True Function:\n{0}".format(straight_line(X)))
print("Predicted:\n{0}".format(regressor.predict(tmp_df)))

In [None]:
# X is inside range of training samples
# The DataFrame column is named 'x' to match the feature name used in training.
X = np.array([0,1,3,5,7,9,11,15,18])
tmp_df = pd.DataFrame({'x':X})

In [None]:
# Same comparison as for the out-of-range inputs, now for in-range inputs.
print("True Function:\n{0}".format(straight_line(X)))
print("Predicted:\n{0}".format(regressor.predict(tmp_df)))

In [None]:
# Summary statistics, including the min/max of x and of the predictions.
df.describe()