## Standard Preprocessing before every Model

Use this notebook as the starting point for your models.

## 1. Loading data

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

In [2]:
data = arff.loadarff('dataset2_norm.arff')
df = pd.DataFrame(data[0])

In [3]:
df.head()

Unnamed: 0,holdtime1,holdtime2,holdtime3,holdtime4,holdtime5,holdtime6,holdtime7,holdtime8,holdtime9,holdtime10,...,fingerarea9,fingerarea10,fingerarea11,fingerarea12,fingerarea13,fingerarea14,meanholdtime,meanpressure,meanfingerarea,user_id
0,0.538793,0.462222,0.362903,0.27451,0.300366,0.384259,0.430147,0.46729,0.24,0.374429,...,0.296296,0.296296,0.222222,0.21147,0.283154,0.185185,0.44703,0.387546,0.364089,b'1'
1,0.435345,0.382222,0.354839,0.285714,0.106227,0.328704,0.363971,0.485981,0.344,0.365297,...,0.259259,0.185185,0.185185,0.354839,0.21147,0.148148,0.423762,0.445704,0.369322,b'1'
2,0.478448,0.453333,0.399194,0.338936,0.340659,0.375,0.338235,0.345794,0.296,0.365297,...,0.296296,0.333333,0.222222,0.283154,0.175627,0.185185,0.454455,0.464092,0.371658,b'1'
3,0.396552,0.444444,0.415323,0.338936,0.3663,0.416667,0.404412,0.640187,0.276,0.410959,...,0.37037,0.185185,0.222222,0.283154,0.247312,0.296296,0.522772,0.39723,0.396828,b'1'
4,0.469828,0.453333,0.290323,0.271709,0.340659,0.361111,0.408088,0.635514,0.324,0.378995,...,0.333333,0.222222,0.222222,0.21147,0.318996,0.074074,0.493564,0.455577,0.365646,b'1'


### 1.1 X-data

In [4]:
x = df.iloc[:,:71]

In [5]:
x.head()

Unnamed: 0,holdtime1,holdtime2,holdtime3,holdtime4,holdtime5,holdtime6,holdtime7,holdtime8,holdtime9,holdtime10,...,fingerarea8,fingerarea9,fingerarea10,fingerarea11,fingerarea12,fingerarea13,fingerarea14,meanholdtime,meanpressure,meanfingerarea
0,0.538793,0.462222,0.362903,0.27451,0.300366,0.384259,0.430147,0.46729,0.24,0.374429,...,0.222222,0.296296,0.296296,0.222222,0.21147,0.283154,0.185185,0.44703,0.387546,0.364089
1,0.435345,0.382222,0.354839,0.285714,0.106227,0.328704,0.363971,0.485981,0.344,0.365297,...,0.185185,0.259259,0.185185,0.185185,0.354839,0.21147,0.148148,0.423762,0.445704,0.369322
2,0.478448,0.453333,0.399194,0.338936,0.340659,0.375,0.338235,0.345794,0.296,0.365297,...,0.259259,0.296296,0.333333,0.222222,0.283154,0.175627,0.185185,0.454455,0.464092,0.371658
3,0.396552,0.444444,0.415323,0.338936,0.3663,0.416667,0.404412,0.640187,0.276,0.410959,...,0.296296,0.37037,0.185185,0.222222,0.283154,0.247312,0.296296,0.522772,0.39723,0.396828
4,0.469828,0.453333,0.290323,0.271709,0.340659,0.361111,0.408088,0.635514,0.324,0.378995,...,0.296296,0.333333,0.222222,0.222222,0.21147,0.318996,0.074074,0.493564,0.455577,0.365646


In [6]:
X = x.values

In [7]:
X.shape

(2142, 71)

### 1.2 Y-data

In [8]:
label = []
for i in range(42):
    for j in range(51):
        label.append(i)

In [9]:
y = np.array(label)

In [23]:
y.shape

(2142,)

### 1.3 Before Model

In [11]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [24]:
Y = to_categorical(y)

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2)

In [29]:
y.shape

(2142,)

In [30]:
X.shape

(2142, 71)

In [31]:
X_train.shape

(1713, 71)

In [32]:
Y_train.shape

(1713,)

# ----------------Here Start Your Model------------------

# XGBoost

In [45]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [46]:
model = XGBClassifier()

In [48]:
model.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [50]:
# make predictions for test data
y_pred = model.predict(X_test)

In [51]:
predictions = [round(value) for value in y_pred]

In [53]:
# evaluate predictions
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 89.98%


## WHAT?
Source to read: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/