## Train a model on the Mushroom dataset using the XGBoost algorithm
### The model is trained with the XGBoost library installed on the notebook instance
### In later examples, we will train using SageMaker's XGBoost algorithm

In [None]:
# Install xgboost in notebook instance.
#### Command to install xgboost
!conda install -y -c conda-forge xgboost

In [None]:
%matplotlib inline
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

In [None]:
# Input file names, resolved relative to the notebook's working directory.
column_list_file = "mushroom_train_column_list.txt"  # comma-separated column names
train_file = "mushroom_train.csv"                    # training data, no header row
validation_file = "mushroom_validation.csv"          # validation data, no header row

In [None]:
# Read the comma-separated column names used for the header-less CSV files.
# Each name is stripped so a trailing newline (or stray spaces) in the file
# does not end up embedded in a column name.
with open(column_list_file, 'r') as f:
    columns = [name.strip() for name in f.read().split(',')]

In [None]:
# Display the column names read from the column list file.
columns

In [None]:
# The training and validation CSV files have no header row, so the first line
# is treated as data (header=None) and the column names are supplied explicitly.
df_train = pd.read_csv(train_file, header=None, names=columns)
df_validation = pd.read_csv(validation_file, header=None, names=columns)

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Preview the first rows of the validation data.
df_validation.head()

In [None]:
# Column 0 holds the target; every column after it is a feature.
# Series.to_numpy() is used instead of Series.ravel(), which is deprecated
# since pandas 2.2; both yield a 1-D NumPy array of the target values.
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0].to_numpy()

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

In [None]:
# Create the classifier: binary logistic objective, 50 estimators.
# XGBoost training parameter reference:
#   https://xgboost.readthedocs.io/en/stable/parameter.html
classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=50,
)

In [None]:
# Display the classifier and its configured parameters.
classifier

In [None]:
# Configure the evaluation metric on the estimator rather than in fit():
# the fit(eval_metric=...) argument was deprecated in XGBoost 1.6 and removed
# in 2.0, where passing it raises a TypeError.
classifier.set_params(eval_metric=['logloss'])

# Train while recording log loss on both data sets; the order of eval_set
# determines the result keys ('validation_0' = training, 'validation_1' = validation).
classifier.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_validation, y_validation)],
)

In [None]:
# Retrieve the evaluation history recorded during fit() for the eval_set entries.
eval_result = classifier.evals_result()

In [None]:
# One x-axis position per recorded log loss value of the first eval_set entry.
training_rounds = range(len(eval_result['validation_0']['logloss']))

In [None]:
# Show the range of round indices used as the x-axis of the plot.
print(training_rounds)

In [None]:
# Plot log loss per training round for the training data ('validation_0')
# and the validation data ('validation_1').
plt.scatter(x=training_rounds, y=eval_result['validation_0']['logloss'], label='Training Error')
plt.scatter(x=training_rounds, y=eval_result['validation_1']['logloss'], label='Validation Error')
plt.grid(True)
# The x-axis is the training round index, not an input feature.
plt.xlabel('Training Round')
plt.ylabel('LogLoss')
plt.title('Training Vs Validation Error')
plt.legend()
# Render the figure explicitly so the cell does not echo the Legend object repr.
plt.show()

In [None]:
# Plot the feature importance of the trained classifier.
xgb.plot_importance(classifier)

In [None]:
# Load the full encoded data set. No column names are passed here, so pandas
# takes them from the first line of the file.
df = pd.read_csv('mushroom_encoded_all.csv')

In [None]:
# Preview the first rows of the full data set.
df.head()

In [None]:
# Features are every column after the target in column 0.
# 'predicted_class' is dropped first (ignored when absent) so that re-running
# this cell after predictions have been attached to df does not pull the
# prediction column into the feature matrix.
X_test = df.drop(columns=['predicted_class'], errors='ignore').iloc[:, 1:]

# Rich display of the first rows instead of a plain-text print.
X_test.head()

In [None]:
# Predict the class for every row of the test features.
result = classifier.predict(X_test)

In [None]:
# Inspect the first five predictions.
result[:5]

In [None]:
# Store the predictions next to the actual values for comparison.
df['predicted_class'] = result

In [None]:
# Translate the encoded target (0/1) into readable labels.
# replace() leaves values that are not in the mapping untouched, so re-running
# this cell is a no-op; map() would turn the already-translated labels into NaN.
df['class_edible'] = df['class_edible'].replace({0: 'edible', 1: 'poisonous'})

In [None]:
# Translate the predicted class (0/1) into readable labels.
# replace() leaves values that are not in the mapping untouched, so re-running
# this cell is a no-op; map() would turn the already-translated labels into NaN.
df['predicted_class'] = df['predicted_class'].replace({0: 'edible', 1: 'poisonous'})

In [None]:
# Preview the data with the actual and predicted class labels.
df.head()

In [None]:
# Count of rows per actual class.
df.class_edible.value_counts()

In [None]:
# Count of rows per predicted class.
df.predicted_class.value_counts()

In [None]:
# Cross-tabulate actual classes (rows) against predicted classes (columns).
print('Confusion matrix - Actual versus Predicted')
pd.crosstab(index=df['class_edible'], columns=df['predicted_class'])

In [None]:
# Text report of per-class metrics comparing actual and predicted labels.
# sklearn's metrics module is imported in the notebook's import cell.
print(metrics.classification_report(df['class_edible'], df['predicted_class']))