In [1]:
import pandas as pd
from tabnet.classifier import TabNetClassifier
from tabnet.autoencoder import TabNetAutoencoder
import tabnet_utils
from sklearn.model_selection import train_test_split
from datetime import datetime
import tensorflow
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

In [2]:
df = pd.read_csv('symptom/Training.csv')
data = df.sample(frac=1).reset_index(drop=True)

In [3]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['prognosis'] = le.fit_transform(data['prognosis'])

In [4]:
data.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,22
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,31


In [10]:
# Create a Pandas dataframe with all the features
X = data.loc[:, data.columns != 'prognosis']
y = data['prognosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
X_train.shape, X_test.shape

((3444, 132), (1476, 132))

In [12]:
import numpy as np
import xgboost as xgb

xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.01
param['max_depth'] = 10
param['nthread'] = 10
param['num_class'] = 41

watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 100
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)
error_rate = np.sum(pred != y_test) / y_test.shape[0]
print('Test error using softmax = {}'.format(error_rate))

# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist)
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
pred_prob = bst.predict(xg_test).reshape(y_test.shape[0], 41)
pred_label = np.argmax(pred_prob, axis=1)
error_rate = np.sum(pred_label != y_test) / y_test.shape[0]
print('Test error using softprob = {}'.format(error_rate))

[0]	train-mlogloss:3.55757	test-mlogloss:3.56023
[1]	train-mlogloss:3.42076	test-mlogloss:3.42570




[2]	train-mlogloss:3.29905	test-mlogloss:3.30626
[3]	train-mlogloss:3.18961	test-mlogloss:3.19823
[4]	train-mlogloss:3.09024	test-mlogloss:3.10009
[5]	train-mlogloss:2.99932	test-mlogloss:3.01020
[6]	train-mlogloss:2.91573	test-mlogloss:2.92773
[7]	train-mlogloss:2.83814	test-mlogloss:2.85121
[8]	train-mlogloss:2.76606	test-mlogloss:2.77999
[9]	train-mlogloss:2.69855	test-mlogloss:2.71334
[10]	train-mlogloss:2.63518	test-mlogloss:2.65070
[11]	train-mlogloss:2.57547	test-mlogloss:2.59177
[12]	train-mlogloss:2.51900	test-mlogloss:2.53586
[13]	train-mlogloss:2.46553	test-mlogloss:2.48318
[14]	train-mlogloss:2.41469	test-mlogloss:2.43284
[15]	train-mlogloss:2.36629	test-mlogloss:2.38507
[16]	train-mlogloss:2.32011	test-mlogloss:2.33946
[17]	train-mlogloss:2.27585	test-mlogloss:2.29557
[18]	train-mlogloss:2.23342	test-mlogloss:2.25360
[19]	train-mlogloss:2.19270	test-mlogloss:2.21340
[20]	train-mlogloss:2.15356	test-mlogloss:2.17458
[21]	train-mlogloss:2.11590	test-mlogloss:2.13737
[22]	tra

[66]	train-mlogloss:1.14514	test-mlogloss:1.17150
[67]	train-mlogloss:1.13196	test-mlogloss:1.15831
[68]	train-mlogloss:1.11900	test-mlogloss:1.14534
[69]	train-mlogloss:1.10623	test-mlogloss:1.13255
[70]	train-mlogloss:1.09366	test-mlogloss:1.11995
[71]	train-mlogloss:1.08129	test-mlogloss:1.10756
[72]	train-mlogloss:1.06910	test-mlogloss:1.09534
[73]	train-mlogloss:1.05709	test-mlogloss:1.08329
[74]	train-mlogloss:1.04526	test-mlogloss:1.07142
[75]	train-mlogloss:1.03362	test-mlogloss:1.05977
[76]	train-mlogloss:1.02214	test-mlogloss:1.04826
[77]	train-mlogloss:1.01083	test-mlogloss:1.03690
[78]	train-mlogloss:0.99968	test-mlogloss:1.02572
[79]	train-mlogloss:0.98868	test-mlogloss:1.01470
[80]	train-mlogloss:0.97782	test-mlogloss:1.00378
[81]	train-mlogloss:0.96711	test-mlogloss:0.99304
[82]	train-mlogloss:0.95656	test-mlogloss:0.98246
[83]	train-mlogloss:0.94615	test-mlogloss:0.97201
[84]	train-mlogloss:0.93589	test-mlogloss:0.96171
[85]	train-mlogloss:0.92577	test-mlogloss:0.95154


In [14]:
from sklearn.metrics import precision_recall_fscore_support
print(precision_recall_fscore_support(y_test, pred_label, average='weighted'))

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
confusion_matrix = confusion_matrix(y_test, pred_label)
print(confusion_matrix)

print(classification_report(y_test, pred_label))

(1.0, 1.0, 1.0, None)
[[31  0  0 ...  0  0  0]
 [ 0 29  0 ...  0  0  0]
 [ 0  0 36 ...  0  0  0]
 ...
 [ 0  0  0 ... 34  0  0]
 [ 0  0  0 ...  0 38  0]
 [ 0  0  0 ...  0  0 35]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       1.00      1.00      1.00        29
           2       1.00      1.00      1.00        36
           3       1.00      1.00      1.00        37
           4       1.00      1.00      1.00        35
           5       1.00      1.00      1.00        35
           6       1.00      1.00      1.00        38
           7       1.00      1.00      1.00        41
           8       1.00      1.00      1.00        26
           9       1.00      1.00      1.00        41
          10       1.00      1.00      1.00        37
          11       1.00      1.00      1.00        36
          12       1.00      1.00      1.00        37
          13       1.00      1.00      1.00        37
          1