In [9]:
import pandas as pd
from tabnet.classifier import TabNetClassifier
from tabnet.autoencoder import TabNetAutoencoder
import tabnet_utils
from sklearn.model_selection import train_test_split
from datetime import datetime
import tensorflow
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

In [10]:
import pickle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load covtype.csv and shuffle the rows once.
# The shuffle is seeded: without a fixed random_state the row order changes on
# every run, which makes the later train_test_split(random_state=42) produce a
# different split each time even though the split itself is seeded.
df = pd.read_csv('covtype.csv')
data = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [11]:
# Preview the first five rows of the shuffled frame.
data.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,3162,39,10,524,20,3388,221,219,132,3842,...,0,0,0,0,0,0,0,0,0,0
1,2952,60,17,210,39,4781,231,202,99,2647,...,0,0,0,0,0,0,0,0,0,1
2,2256,56,5,134,48,1577,223,229,141,382,...,0,0,0,0,0,0,0,0,0,5
3,3260,300,8,256,25,4484,199,237,178,751,...,0,0,0,0,0,0,0,1,0,0
4,3018,356,7,67,8,1242,209,227,157,1293,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Target is the Cover_Type column; every other column is a feature.
y = data['Cover_Type']
X = data.drop(columns='Cover_Type')

# Hold out 30% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [13]:
# Distinct class labels present in the training target, in order of appearance.
pd.unique(y_train)

array([0, 1, 5, 2, 4, 6, 3])

In [14]:
# (rows, columns) of the training and test feature frames, in that order.
tuple(frame.shape for frame in (X_train, X_test))

((406708, 54), (174304, 54))

In [15]:
import numpy as np
import xgboost as xgb

# Train an XGBoost multi-class baseline and report its test error twice:
# once from hard labels (multi:softmax) and once from the argmax of the
# per-class probabilities (multi:softprob).

# Number of Cover_Type classes (labels 0..6); used by both the booster
# configuration and the probability reshape below.
num_class = 7
n_test = y_test.shape[0]
# Plain ndarray of the true labels, so the comparisons below are explicitly
# element-wise by position and never involve the shuffled pandas index.
y_test_values = y_test.to_numpy()

xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)

# setup parameters for xgboost
param = {
    # use softmax multi-class classification (predict() returns class labels)
    'objective': 'multi:softmax',
    # learning rate (step-size shrinkage applied to each boosting round)
    'eta': 0.01,
    'max_depth': 10,
    'nthread': 10,
    'num_class': num_class,
}

# Both sets are scored after every boosting round; the test entry is only
# monitored here (no early stopping is configured).
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 100
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)
error_rate = np.sum(pred != y_test_values) / n_test
print('Test error using softmax = {}'.format(error_rate))

# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist)
# Note: this convention has been changed since xgboost-unity.
# predict() may hand back a flat 1-D array, so reshape to (n_test, num_class);
# presumably a no-op on releases that already return 2-D -- TODO confirm
# against the installed xgboost version.
pred_prob = bst.predict(xg_test).reshape(n_test, num_class)
pred_label = np.argmax(pred_prob, axis=1)
error_rate = np.sum(pred_label != y_test_values) / n_test
print('Test error using softprob = {}'.format(error_rate))



[0]	train-mlogloss:1.92295	test-mlogloss:1.92324
[1]	train-mlogloss:1.90064	test-mlogloss:1.90122
[2]	train-mlogloss:1.87892	test-mlogloss:1.87977
[3]	train-mlogloss:1.85776	test-mlogloss:1.85888
[4]	train-mlogloss:1.83715	test-mlogloss:1.83853
[5]	train-mlogloss:1.81704	test-mlogloss:1.81868
[6]	train-mlogloss:1.79741	test-mlogloss:1.79931
[7]	train-mlogloss:1.77822	test-mlogloss:1.78037
[8]	train-mlogloss:1.75950	test-mlogloss:1.76189
[9]	train-mlogloss:1.74118	test-mlogloss:1.74381
[10]	train-mlogloss:1.72318	test-mlogloss:1.72605
[11]	train-mlogloss:1.70574	test-mlogloss:1.70882
[12]	train-mlogloss:1.68853	test-mlogloss:1.69183
[13]	train-mlogloss:1.67176	test-mlogloss:1.67527
[14]	train-mlogloss:1.65542	test-mlogloss:1.65914
[15]	train-mlogloss:1.63927	test-mlogloss:1.64319
[16]	train-mlogloss:1.62346	test-mlogloss:1.62758
[17]	train-mlogloss:1.60801	test-mlogloss:1.61232
[18]	train-mlogloss:1.59280	test-mlogloss:1.59731
[19]	train-mlogloss:1.57798	test-mlogloss:1.58267
[20]	train

[64]	train-mlogloss:1.10702	test-mlogloss:1.11769
[65]	train-mlogloss:1.09952	test-mlogloss:1.11029
[66]	train-mlogloss:1.09198	test-mlogloss:1.10286
[67]	train-mlogloss:1.08446	test-mlogloss:1.09545
[68]	train-mlogloss:1.07726	test-mlogloss:1.08835
[69]	train-mlogloss:1.06996	test-mlogloss:1.08115
[70]	train-mlogloss:1.06275	test-mlogloss:1.07405
[71]	train-mlogloss:1.05572	test-mlogloss:1.06712
[72]	train-mlogloss:1.04892	test-mlogloss:1.06042
[73]	train-mlogloss:1.04210	test-mlogloss:1.05369
[74]	train-mlogloss:1.03535	test-mlogloss:1.04704
[75]	train-mlogloss:1.02868	test-mlogloss:1.04046
[76]	train-mlogloss:1.02212	test-mlogloss:1.03399
[77]	train-mlogloss:1.01562	test-mlogloss:1.02759
[78]	train-mlogloss:1.00927	test-mlogloss:1.02132
[79]	train-mlogloss:1.00297	test-mlogloss:1.01511
[80]	train-mlogloss:0.99672	test-mlogloss:1.00895
[81]	train-mlogloss:0.99042	test-mlogloss:1.00274
[82]	train-mlogloss:0.98424	test-mlogloss:0.99665
[83]	train-mlogloss:0.97813	test-mlogloss:0.99063


In [16]:
# Shape of the held-out target: a 1-tuple holding the number of test rows.
(len(y_test),)

(174304,)

In [17]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Support-weighted precision, recall and F-score of the softprob predictions.
print(precision_recall_fscore_support(y_test, pred_label, average='weighted'))

# Store the matrix under its own name: rebinding `confusion_matrix` to the
# result would shadow the sklearn function, so any later call to it would fail.
# Rows are the true classes, columns the predicted classes.
cover_type_confusion = confusion_matrix(y_test, pred_label)
print(cover_type_confusion)

# Per-class precision / recall / F1 with support counts.
print(classification_report(y_test, pred_label))

(0.848744092408673, 0.8473184780613181, 0.8450162276022541, None)
[[51474 11527     4     0     6     7   393]
 [ 8137 76441   263     0    81   148    22]
 [    2   791  9654    58     1   374     0]
 [    0     1   114   652     0    24     0]
 [   44  1647    41     0  1098     5     0]
 [    7   782  1063    18     1  3407     0]
 [ 1031    21     0     0     0     0  4965]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83     63411
           1       0.84      0.90      0.87     85092
           2       0.87      0.89      0.88     10880
           3       0.90      0.82      0.86       791
           4       0.93      0.39      0.55      2835
           5       0.86      0.65      0.74      5278
           6       0.92      0.83      0.87      6017

    accuracy                           0.85    174304
   macro avg       0.88      0.75      0.80    174304
weighted avg       0.85      0.85      0.85    174304

