## import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

### prepare data


In [2]:
df_train = pd.read_csv("data/letter/let_train.csv", delimiter=',', header=None, names=[i for i in range(17)], index_col=False)

In [3]:
df_test = pd.read_csv("data/letter/let_test.csv", delimiter=',', header=None, names=[i for i in range(17)], index_col=False)

### exploratory

In [4]:
df_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8,19
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10,8
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9,3
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8,13
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10,6


In [5]:
df_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,4,8,5,6,3,8,7,3,7,10,4,7,2,8,5,9,18
1,3,5,5,3,2,8,8,2,5,10,4,6,5,8,1,7,13
2,5,9,7,4,4,6,4,3,2,8,4,10,7,3,1,8,12
3,1,1,2,1,0,8,14,1,5,6,10,8,0,8,0,8,19
4,3,2,5,3,2,7,10,1,7,7,11,8,1,11,2,8,24


### get features

In [6]:
x_train = df_train.iloc[:,:-1]
x_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [7]:
x_test = df_test.iloc[:,:-1]
x_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,4,8,5,6,3,8,7,3,7,10,4,7,2,8,5,9
1,3,5,5,3,2,8,8,2,5,10,4,6,5,8,1,7
2,5,9,7,4,4,6,4,3,2,8,4,10,7,3,1,8
3,1,1,2,1,0,8,14,1,5,6,10,8,0,8,0,8
4,3,2,5,3,2,7,10,1,7,7,11,8,1,11,2,8


### get labels

In [8]:
y_train = df_train.iloc[:,-1:]
y_train.head(5)

Unnamed: 0,16
0,19
1,8
2,3
3,13
4,6


In [9]:
y_test = df_test.iloc[:,-1:]
y_test.head(5)

Unnamed: 0,16
0,18
1,13
2,12
3,19
4,24


### Normalize data

#### min max scale

In [10]:
x_train = (x_train-x_train.min())/(x_train.max()-x_train.min())
x_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.133333,0.533333,0.2,0.333333,0.066667,0.533333,0.866667,0.0,0.4,0.4,0.666667,0.533333,0.0,0.533333,0.0,0.5
1,0.333333,0.8,0.2,0.466667,0.133333,0.666667,0.333333,0.333333,0.266667,0.866667,0.2,0.6,0.133333,0.533333,0.266667,0.642857
2,0.266667,0.733333,0.4,0.533333,0.4,0.666667,0.4,0.133333,0.4,0.666667,0.2,0.466667,0.2,0.466667,0.2,0.571429
3,0.466667,0.733333,0.4,0.4,0.2,0.333333,0.6,0.266667,0.4,0.266667,0.266667,0.666667,0.4,0.666667,0.133333,0.5
4,0.133333,0.066667,0.2,0.066667,0.066667,0.533333,0.4,0.4,0.4,0.4,0.333333,0.6,0.066667,0.466667,0.333333,0.642857


In [11]:
x_test = (x_test-x_test.min())/(x_test.max()-x_test.min())
x_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.285714,0.533333,0.357143,0.4,0.2,0.533333,0.466667,0.2,0.466667,0.666667,0.266667,0.384615,0.133333,0.538462,0.333333,0.6
1,0.214286,0.333333,0.357143,0.2,0.133333,0.533333,0.533333,0.133333,0.333333,0.666667,0.266667,0.307692,0.333333,0.538462,0.066667,0.466667
2,0.357143,0.6,0.5,0.266667,0.266667,0.4,0.266667,0.2,0.133333,0.533333,0.266667,0.615385,0.466667,0.153846,0.066667,0.533333
3,0.071429,0.066667,0.142857,0.066667,0.0,0.533333,0.933333,0.066667,0.333333,0.4,0.666667,0.461538,0.0,0.538462,0.0,0.533333
4,0.214286,0.133333,0.357143,0.2,0.133333,0.466667,0.666667,0.066667,0.466667,0.466667,0.733333,0.461538,0.066667,0.769231,0.133333,0.533333


### Trainning

In [12]:
classifier = DecisionTreeClassifier()
classifier.fit(x_train, y_train.values.ravel())

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [13]:
y_pred = classifier.predict(x_test) 

### Evaluate

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.7785778577857786
[[247   0   1   3   0   1   0   1   0   2   1   0   2   0   1   0   2   1
    4   0   0   2   2   2   0   2]
 [  0 184   0  16   0   3   2   1   2   0   0   0   1   0   6  12   4   2
    2   0   1   2   0   2   0   0]
 [  1   0 187   0   9   0   7   2   0   1   2   3   0   0   5   2   2   2
    0   2   0   0   0   1   0   0]
 [  0   2   0 233   0   7   0   6   0   1   0   0   1   3   4   1   0  12
    1   0   0   0   0   5   1   0]
 [  0   5   7   0 171   2   5   1   5   2   0   7   1   0   1   4   2   4
   16  11   0   2   0  14   0   2]
 [  0   4   2   3   1 215   0   4   4   2   0   0   0   0   1  24   0   2
    2   1   0   1   0   1   2   0]
 [  0  21   8   4   6   0 166   4   3   1   0   2   3   1   8   2   6  11
    5   1   0   0   2   5   0   4]
 [  0  13   0  30   2   6   0 150   1   0   2   1   0   3   3   4   0   5
    3   0   0   0   1   5   0   1]
 [  2   5   0   0   1   3   0   1 228  17   0   0   0   1   0   4   1   0
    3   0   0   0   0   0   1   2]
