## import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

### prepare data


In [2]:
df_train = pd.read_csv("data/optics/opt_train.csv", delimiter=',', header=None, names=[i for i in range(65)], index_col=False)

In [3]:
df_test = pd.read_csv("data/optics/opt_test.csv", delimiter=',', header=None, names=[i for i in range(65)], index_col=False)

### exploratory

In [4]:
df_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0,1,6,15,12,1,0,0,0,7,...,0,0,0,6,14,7,1,0,0,0
1,0,0,10,16,6,0,0,0,0,7,...,0,0,0,10,16,15,3,0,0,0
2,0,0,8,15,16,13,0,0,0,1,...,0,0,0,9,14,0,0,0,0,7
3,0,0,0,3,11,16,0,0,0,0,...,0,0,0,0,1,15,2,0,0,4
4,0,0,5,14,4,0,0,0,0,0,...,0,0,0,4,12,14,7,0,0,6


In [5]:
df_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0,0,5,13,9,1,0,0,0,0,...,0,0,0,6,13,10,0,0,0,0
1,0,0,0,12,13,5,0,0,0,0,...,0,0,0,0,11,16,10,0,0,1
2,0,0,0,4,15,12,0,0,0,0,...,0,0,0,0,3,11,16,9,0,2
3,0,0,7,15,13,1,0,0,0,8,...,0,0,0,7,13,13,9,0,0,3
4,0,0,0,1,11,0,0,0,0,0,...,0,0,0,0,2,16,4,0,0,4


### get features

In [6]:
x_train = df_train.iloc[:,:-1]
x_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0,1,6,15,12,1,0,0,0,7,...,0,0,0,0,6,14,7,1,0,0
1,0,0,10,16,6,0,0,0,0,7,...,3,0,0,0,10,16,15,3,0,0
2,0,0,8,15,16,13,0,0,0,1,...,0,0,0,0,9,14,0,0,0,0
3,0,0,0,3,11,16,0,0,0,0,...,0,0,0,0,0,1,15,2,0,0
4,0,0,5,14,4,0,0,0,0,0,...,12,0,0,0,4,12,14,7,0,0


In [7]:
x_test = df_test.iloc[:,:-1]
x_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0,0,5,13,9,1,0,0,0,0,...,0,0,0,0,6,13,10,0,0,0
1,0,0,0,12,13,5,0,0,0,0,...,0,0,0,0,0,11,16,10,0,0
2,0,0,0,4,15,12,0,0,0,0,...,5,0,0,0,0,3,11,16,9,0
3,0,0,7,15,13,1,0,0,0,8,...,9,0,0,0,7,13,13,9,0,0
4,0,0,0,1,11,0,0,0,0,0,...,0,0,0,0,0,2,16,4,0,0


### get labels

In [8]:
y_train = df_train.iloc[:,-1:]
y_train.head(5)

Unnamed: 0,64
0,0
1,0
2,7
3,4
4,6


In [9]:
y_test = df_test.iloc[:,-1:]
y_test.head(5)

Unnamed: 0,64
0,0
1,1
2,2
3,3
4,4


### Normalize data

#### min max scale

In [10]:
x_train = (x_train-x_train.min())/(x_train.max()-x_train.min())
x_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,,0.125,0.375,0.9375,0.75,0.0625,0.0,0.0,0.0,0.466667,...,0.0,0.0,0.0,0.0,0.375,0.875,0.4375,0.0625,0.0,0.0
1,,0.0,0.625,1.0,0.375,0.0,0.0,0.0,0.0,0.466667,...,0.1875,0.0,0.0,0.0,0.625,1.0,0.9375,0.1875,0.0,0.0
2,,0.0,0.5,0.9375,1.0,0.8125,0.0,0.0,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.5625,0.875,0.0,0.0,0.0,0.0
3,,0.0,0.0,0.1875,0.6875,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0625,0.9375,0.125,0.0,0.0
4,,0.0,0.3125,0.875,0.25,0.0,0.0,0.0,0.0,0.0,...,0.75,0.0,0.0,0.0,0.25,0.75,0.875,0.4375,0.0,0.0


In [11]:
x_train.fillna(0, inplace = True)
x_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.125,0.375,0.9375,0.75,0.0625,0.0,0.0,0.0,0.466667,...,0.0,0.0,0.0,0.0,0.375,0.875,0.4375,0.0625,0.0,0.0
1,0.0,0.0,0.625,1.0,0.375,0.0,0.0,0.0,0.0,0.466667,...,0.1875,0.0,0.0,0.0,0.625,1.0,0.9375,0.1875,0.0,0.0
2,0.0,0.0,0.5,0.9375,1.0,0.8125,0.0,0.0,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.5625,0.875,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.1875,0.6875,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0625,0.9375,0.125,0.0,0.0
4,0.0,0.0,0.3125,0.875,0.25,0.0,0.0,0.0,0.0,0.0,...,0.75,0.0,0.0,0.0,0.25,0.75,0.875,0.4375,0.0,0.0


In [12]:
x_test = (x_test-x_test.min())/(x_test.max()-x_test.min())
x_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,,0.0,0.3125,0.8125,0.5625,0.0625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.375,0.8125,0.625,0.0,0.0,0.0
1,,0.0,0.0,0.75,0.8125,0.3125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.6875,1.0,0.625,0.0,0.0
2,,0.0,0.0,0.25,0.9375,0.75,0.0,0.0,0.0,0.0,...,0.3125,0.0,0.0,0.0,0.0,0.1875,0.6875,1.0,0.5625,0.0
3,,0.0,0.4375,0.9375,0.8125,0.0625,0.0,0.0,0.0,0.5,...,0.5625,0.0,0.0,0.0,0.4375,0.8125,0.8125,0.5625,0.0,0.0
4,,0.0,0.0,0.0625,0.6875,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.125,1.0,0.25,0.0,0.0


In [13]:
x_test.fillna(0, inplace = True)
x_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,0.3125,0.8125,0.5625,0.0625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.375,0.8125,0.625,0.0,0.0,0.0
1,0.0,0.0,0.0,0.75,0.8125,0.3125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.6875,1.0,0.625,0.0,0.0
2,0.0,0.0,0.0,0.25,0.9375,0.75,0.0,0.0,0.0,0.0,...,0.3125,0.0,0.0,0.0,0.0,0.1875,0.6875,1.0,0.5625,0.0
3,0.0,0.0,0.4375,0.9375,0.8125,0.0625,0.0,0.0,0.0,0.5,...,0.5625,0.0,0.0,0.0,0.4375,0.8125,0.8125,0.5625,0.0,0.0
4,0.0,0.0,0.0,0.0625,0.6875,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.125,1.0,0.25,0.0,0.0


### Trainning

In [14]:
classifier = DecisionTreeClassifier()
classifier.fit(x_train, y_train.values.ravel())

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [15]:
y_pred = classifier.predict(x_test) 

### Evaluate

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.8530884808013356
[[172   0   1   0   2   2   0   0   1   0]
 [  0 156   4   8   1   1   1   0   6   5]
 [  1   8 144   4   2   0   0   7  10   1]
 [  0   2   7 143   0   5   0   4  16   6]
 [  4  10   1   0 139   2   7   4  10   4]
 [  1   1   6   3   1 162   6   1   0   1]
 [  1   3   1   0   1   0 170   0   5   0]
 [  0   3   5   0  13   1   0 142  10   5]
 [  0   5   6   3   4   1   0   4 148   3]
 [  0   4   0   5   4   5   0   1   4 157]]
