## Train model using PCA components
### The model is trained with XGBoost installed in the notebook instance

In [None]:
# Install xgboost in the notebook instance.
# -y answers the confirmation prompt automatically; -c conda-forge selects the conda-forge channel.
!conda install -y -c conda-forge xgboost

In [None]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

In [None]:
# Data splits, given as paths relative to the notebook's working directory.
train_file = "bike_train_pca.csv"
validation_file = "bike_validation_pca.csv"
test_file = "bike_test_pca.csv"

# Text file holding the column names; it is read and split on commas further down.
column_list_file = "bike_train_column_list_pca.txt"

In [None]:
# Read the comma-separated column names that label the header-less CSV files.
# Each name is stripped so that a trailing newline at the end of the file (or
# spaces after a comma) does not become part of a column name.
with open(column_list_file, 'r') as f:
    columns = [name.strip() for name in f.read().split(',')]

In [None]:
# Show the parsed column names to confirm the list was read as expected.
columns

In [None]:
# The CSV files carry no header row, so the column names come from `columns`.
# header=None states explicitly what pandas already does when `names` is given.
df_train = pd.read_csv(train_file, header=None, names=columns)
df_validation = pd.read_csv(validation_file, header=None, names=columns)

In [None]:
# Preview the first two rows of the training data.
df_train.head(2)

In [None]:
# Preview the first two rows of the validation data.
df_validation.head(2)

In [None]:
# Preview columns 1 up to (but excluding) the last two, first two rows only.
# NOTE(review): the training features in this notebook are taken with iloc[:, 1:],
# which keeps the last two columns as well — confirm which slice is intended.
df_train.iloc[:,1:-2].head(2)

In [None]:
# Column 0 is the target; every column after it is a feature.
# The target is converted to a 1-D NumPy array with to_numpy() rather than
# Series.ravel(), which is deprecated in recent pandas releases.
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0].to_numpy()

# Same layout for the validation split.
X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

In [None]:
# XGBoost training parameter reference:
#   https://xgboost.readthedocs.io/en/stable/parameter.html
# Scikit-learn wrapper (XGBRegressor) reference:
#   https://xgboost.readthedocs.io/en/stable/python/python_api.html
#
# The scikit-learn wrapper takes the number of boosting rounds as n_estimators
# and the step size as learning_rate (the wrapper's name for eta). num_round
# belongs to the native training API and is not an XGBRegressor parameter, so
# it is not passed here.
regressor = xgb.XGBRegressor(
    max_depth=5,
    learning_rate=0.1,
    subsample=0.7,
    n_estimators=150,
)

In [None]:
# Display the configured estimator.
regressor

In [None]:
# Fit on the training split while tracking the evaluation metric on two sets:
# the first eval_set entry is the training data, the second the validation data.
regressor.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_validation, y_validation)],
)

In [None]:
# Evaluation history collected during fit(), one entry per eval_set element.
eval_result = regressor.evals_result()

In [None]:
# One x-axis position per recorded round, sized from the first eval set's RMSE history.
training_rounds = range(len(eval_result['validation_0']['rmse']))

In [None]:
# Learning curves: RMSE per boosting round for the training set ('validation_0')
# and the validation set ('validation_1').
plt.scatter(training_rounds, eval_result['validation_0']['rmse'], label='Training Error')
plt.scatter(training_rounds, eval_result['validation_1']['rmse'], label='Validation Error')
plt.title('Training Vs Validation Error')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.grid(True)
plt.legend()

In [None]:
# Plot the feature importances of the fitted regressor.
xgb.plot_importance(regressor)

In [None]:
# NOTE(review): this cell repeats the previous cell exactly and produces the same
# plot — consider removing it or switching to a different importance_type.
xgb.plot_importance(regressor)

In [None]:
def adjust_count(x):
    """Floor a value at zero.

    Returns 0 when ``x`` is negative; otherwise returns ``x`` unchanged.
    """
    return 0 if x < 0 else x

In [None]:
# Prepare data for submission to Kaggle.
# Unlike the train/validation files, this file is read with its own header row
# (no names= are supplied); the 'datetime' column is parsed into timestamps.
df_test = pd.read_csv(test_file,parse_dates=['datetime'])

In [None]:
# Preview the first two rows of the test data.
df_test.head(2)

In [None]:
# Exclude datetime (the first column) for prediction.
# Also drop 'count' when it is present: a later cell appends the predictions to
# df_test under that name, so re-running this cell would otherwise feed the
# predictions back into the model as an extra feature.
X_test = df_test.iloc[:, 1:].drop(columns=['count'], errors='ignore')

In [None]:
# Preview the feature columns that will be passed to the model.
X_test.head(2)

In [None]:
# Run the fitted regressor over the test features.
result = regressor.predict(X_test)

In [None]:
# Look at the first five raw predictions.
result[:5]

In [None]:
# Convert result to actual count.
# np.expm1 computes exp(x) - 1, the inverse of log1p.
# NOTE(review): presumably the target was stored as log1p(count) when the
# training files were prepared — confirm against the data-preparation notebook.
df_test["count"] = np.expm1(result)

In [None]:
# Preview the test data with the new 'count' column.
df_test.head()

In [None]:
# List rows whose predicted count is negative (expm1 of a negative prediction
# falls between -1 and 0).
df_test[df_test["count"] < 0]

In [None]:
# Replace negative predicted counts with 0 by applying adjust_count element-wise.
df_test['count'] = df_test['count'].map(adjust_count)

In [None]:
# Repeat the negative-count check; no rows should remain after the adjustment.
df_test[df_test["count"] < 0]

In [None]:
# Write only the datetime and predicted count columns, without the index,
# to the submission CSV.
df_test[['datetime','count']].to_csv('predicted_count_pca.csv',index=False)