https://github.com/ageron/handson-ml2/blob/master/06_decision_trees.ipynb <br/>
https://github.com/ageron/handson-ml2/blob/master/04_training_linear_models.ipynb

# Setup

In [20]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is 
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
import pandas as pd

batch_size = 512
train_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_train.csv'))
test_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_test.csv'))

X_train = train_data[:, 1:785]
y_train = train_data[:, 0]
X_test = test_data[:, 1:785]
y_test = test_data[:, 0]

# Training

In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score

In [4]:
clf = DecisionTreeClassifier(random_state=42)

In [5]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [6]:
y_train_pred = clf.predict(X_train)

In [7]:
y_test_pred = clf.predict(X_test)

In [8]:
cross_val_y_train_pred = cross_val_predict(clf, X_train, y_train, cv=10)

## Cross-validation Matrix

In [9]:
confusion_matrix(y_train, cross_val_y_train_pred)

array([[4490,   37,  128,  256,   55,   12,  937,    4,   77,    4],
       [  43, 5682,   15,  173,   35,    3,   39,    0,    8,    2],
       [ 148,   22, 4096,  102,  882,   11,  677,    0,   57,    5],
       [ 273,  163,  106, 4786,  324,   15,  286,    2,   43,    2],
       [  64,   30,  942,  294, 3933,    5,  681,    1,   48,    2],
       [  15,    8,    6,   14,    2, 5376,   11,  348,   63,  157],
       [ 922,   39,  742,  239,  644,    9, 3260,    2,  137,    6],
       [   0,    0,    0,    2,    1,  365,    1, 5227,   31,  373],
       [  55,   13,   85,   44,   62,   85,  139,   34, 5455,   28],
       [   5,    1,    7,   10,    7,  156,    7,  358,   25, 5424]],
      dtype=int64)

class 0: TP=4490 | FP=1510 | TN=52475 (60000-TP+FP+FN) | FN=1525

In [10]:
cv_cnf_matrix = confusion_matrix(y_train, cross_val_y_train_pred)

In [11]:
CV_FP = cv_cnf_matrix.sum(axis=0) - np.diag(cv_cnf_matrix) 
CV_FN = cv_cnf_matrix.sum(axis=1) - np.diag(cv_cnf_matrix)
CV_TP = np.diag(cv_cnf_matrix)
CV_TN = cv_cnf_matrix.sum() - (CV_FP + CV_FN + CV_TP)
CV_FP = CV_FP.astype(float)
CV_FN = CV_FN.astype(float)
CV_TP = CV_TP.astype(float)
CV_TN = CV_TN.astype(float)

avg_cv_FP=0
avg_cv_FN=0
avg_cv_TP=0
avg_cv_TN=0

for i in CV_FP:
    avg_cv_FP+=i
    
for i in CV_FN:
    avg_cv_FN+=i
    
for i in CV_TP:
    avg_cv_TP+=i

for i in CV_TN:
    avg_cv_TN+=i
    
avg_cv_FP = avg_cv_FP/10
avg_cv_FN = avg_cv_FN/10
avg_cv_TP = avg_cv_TP/10
avg_cv_TN = avg_cv_TN/10

## Training Set Matrix

In [12]:
confusion_matrix(y_train, y_train_pred)

array([[6000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, 6000,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0, 6000,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0, 6000,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 6000,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0, 6000,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0, 6000,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 6000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0, 6000,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 6000]],
      dtype=int64)

Can see that it has 60000 TP because its the set it trained on

In [13]:
train_cnf_matrix = confusion_matrix(y_train, y_train_pred)

In [14]:
train_FP = train_cnf_matrix.sum(axis=0) - np.diag(train_cnf_matrix) 
train_FN = train_cnf_matrix.sum(axis=1) - np.diag(train_cnf_matrix)
train_TP = np.diag(train_cnf_matrix)
train_TN = train_cnf_matrix.sum() - (train_FP + train_FN + train_TP)
train_FP = train_FP.astype(float)
train_FN = train_FN.astype(float)
train_TP = train_TP.astype(float)
train_TN = train_TN.astype(float)

avg_train_FP=0
avg_train_FN=0
avg_train_TP=0
avg_train_TN=0

for i in train_FP:
    avg_train_FP+=i
    
for i in train_FN:
    avg_train_FN+=i
    
for i in train_TP:
    avg_train_TP+=i

for i in train_TN:
    avg_train_TN+=i
    
avg_train_FP = avg_train_FP/10
avg_train_FN = avg_train_FN/10
avg_train_TP = avg_train_TP/10
avg_train_TN = avg_train_TN/10

## Test Set Matrix

In [15]:
cfm = confusion_matrix(y_test, y_test_pred)
confusion_matrix(y_test, y_test_pred)

array([[732,   6,  19,  41,   7,   3, 175,   3,  14,   0],
       [  4, 960,   4,  19,   7,   0,   4,   0,   2,   0],
       [ 26,   5, 683,  17, 137,   3, 118,   0,  10,   1],
       [ 44,  28,  15, 813,  60,   2,  36,   0,   2,   0],
       [  5,   4, 146,  50, 690,   1,  93,   0,  10,   1],
       [  1,   1,   1,   0,   1, 873,   2,  66,  21,  34],
       [166,  12, 105,  39, 102,   1, 561,   0,  13,   1],
       [  0,   0,   0,   0,   0,  56,   0, 857,   8,  79],
       [ 19,   1,  16,   7,  17,  10,  20,   8, 898,   4],
       [  1,   0,   0,   1,   1,  25,   2,  69,   5, 896]], dtype=int64)

7963 TP

class 0: TP=732 | FP=268 (6+19+41+7+3+175+3+14+0) | TN=8734 (60000-TP+FP+FN) or (960+5+28+4+...+4+683+15+146+...+19+17+...) | FN=266 (4+26+44+5+1+166+19+1)

class 1: TP=960 | FP=40 | TN=8943 | FN=57

class 2: TP=683 | FP=317 | TN=8694 | FN=306

class 3: TP=813 | FP=187 | TN=8826 | FN=174

class 4: TP=690 | FP=310 | TN=8668 | FN=332

class 5: TP=873 | FP=127 | TN=8899 | FN=101

class 6: TP=561 | FP=439 | TN=8550 | FN=450

class 7: TP=857 | FP=143 | TN=8854 | FN=146

class 8: TP=898 | FP=102 | TN=8915 | FN=85

class 9: TP=896 | FP=104 | TN=8880 | FN=120

With the whole set taken into account there are 7963 TP so accuracy is 80%

Test set displays an okay matrix, pretty good results, 1k items per class so 960tp on class 1 is great, seems to nail class 1 pretty well

In [16]:
test_cnf_matrix = confusion_matrix(y_test, y_test_pred)

In [17]:
test_FP = test_cnf_matrix.sum(axis=0) - np.diag(test_cnf_matrix) 
test_FN = test_cnf_matrix.sum(axis=1) - np.diag(test_cnf_matrix)
test_TP = np.diag(test_cnf_matrix)
test_TN = test_cnf_matrix.sum() - (test_FP + test_FN + test_TP)
test_FP = test_FP.astype(float)
test_FN = test_FN.astype(float)
test_TP = test_TP.astype(float)
test_TN = test_TN.astype(float)

avg_test_FP=0
avg_test_FN=0
avg_test_TP=0
avg_test_TN=0

for i in test_FP:
    avg_test_FP+=i
    
for i in test_FN:
    avg_test_FN+=i
    
for i in test_TP:
    avg_test_TP+=i

for i in test_TN:
    avg_test_TN+=i
    
avg_test_FP = avg_test_FP/10
avg_test_FN = avg_test_FN/10
avg_test_TP = avg_test_TP/10
avg_test_TN = avg_test_TN/10

## Accuracy

80% accuracy for cross validation, 80% accuracy with decision tree, both worse than Categorical Naive Bayes which had 87% accuracy.

### Cross-validation

In [18]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_train, cross_val_y_train_pred))

print(accuracy_score(y_train, cross_val_y_train_pred, normalize=False))

accuracy = accuracy_score(y_train, cross_val_y_train_pred)

0.7954833333333333
47729


### Decision Tree Train

In [19]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_train, y_train_pred))

print(accuracy_score(y_train, y_train_pred, normalize=False))

accuracy = accuracy_score(y_train, y_train_pred)


1.0
60000


### Decision Tree Test

In [20]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_test_pred))

print(accuracy_score(y_test, y_test_pred, normalize=False))

accuracy = accuracy_score(y_test, y_test_pred)

0.7963
7963


## Specificity

TN / (TN + FP) How sensitive is the classifier to the negative cases? - A highly specific test for cancer: if "YES" then you can be sure it's "YES". Closer to 1 means less false positives. 100% Specificity misses some true positives but in return provides no false positives.

### Cross-validation

In [21]:
avg_cv_TN/(avg_cv_TN+avg_cv_FP)

0.9772759259259259

### Decision Tree Train

In [22]:
avg_train_TN/(avg_train_TN+avg_train_FP)

1.0

### Decision Tree Test

In [23]:
avg_test_TN/(avg_test_TN+avg_test_FP)

0.9773666666666666

98% specificity for cv is incredible in comparison to 78% for Categorical Naive Bayes. Decision Tree Test also has 98%.

## Precision and Recall
Recall is intuitevely the ability of the classifier to find all the positive samples.

The best value is 1, and the worst value is 0.

### precision = TP / (TP+FP)

43% for Categorical Naive Bayes, so a big jump to 80% in both cross validation and test decision tree. So it classifies positive cases with 80% confidence, every 4 in 5 is correct.

In [24]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_train, cross_val_y_train_pred, average='micro')

0.7954833333333333

In [25]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_train, y_train_pred, average='micro')

1.0

In [26]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_test, y_test_pred, average='micro')

0.7963

### recall = TP / (TP+FN)
Similar to sensitivity (how sensitive is our classifier to the true cases? closer to 1 when theres no false negatives) with 100% sensitivity you catch all cases that could be true, so there will be more false postitives

In [27]:
recall_score(y_train, cross_val_y_train_pred, average='micro')

0.7954833333333333

In [28]:
recall_score(y_test, y_test_pred, average='micro')

0.7963

recall for Categorical Naive Bayes was .95 which was a really sensitive classifier, this time it is .75 so less sensitive.

F1 = 2 / ((1 / precision) + (1 / recall)) = 2 x ((precision x recall) / (precision + recall)) = TP / (TP + ((FN + FP) / 2))

In [29]:
from sklearn.metrics import f1_score

f1_score(y_train, cross_val_y_train_pred, average='micro')

0.7954833333333333

In [30]:
from sklearn.metrics import f1_score

f1_score(y_test, y_test_pred, average='micro')

0.7963

In [31]:
y_scores = cross_val_score(clf, X_train, y_train, cv=10)

In [32]:
y_score = clf.predict_proba(X_test)

In [33]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_score, average='macro', multi_class='ovo')

0.8868333333333334

# Does the decision tree generalize well to the new data?

The decision tree does not generalize well to the new data, but in comparison to cross validation, it matches its results for all performance metrices. It is therefore that the decision tree is overfitting.

# Experimenting with various tree parameters

## Shallow Depth

In [34]:
clf2 = DecisionTreeClassifier(random_state=42, max_depth=2)

In [35]:
clf2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2, random_state=42)

In [36]:
y_test_pred2 = clf2.predict(X_test)

In [37]:
cfm2 = confusion_matrix(y_test, y_test_pred2)
confusion_matrix(y_test, y_test_pred2)

array([[  0,   6,   0,   0, 967,   0,   0,  13,   0,  14],
       [  0, 840,   0,   0, 155,   0,   0,   2,   0,   3],
       [  0,  12,   0,   0, 971,   0,   0,   7,   0,  10],
       [  0,  11,   0,   0, 964,   0,   0,   2,   0,  23],
       [  0,   1,   0,   0, 996,   0,   0,   3,   0,   0],
       [  0,  29,   0,   0,  45,   0,   0, 691,   0, 235],
       [  0,  13,   0,   0, 973,   0,   0,  10,   0,   4],
       [  0,   0,   0,   0,   3,   0,   0, 866,   0, 131],
       [  0,  11,   0,   0, 682,   0,   0,  42,   0, 265],
       [  0,   0,   0,   0,  50,   0,   0, 109,   0, 841]], dtype=int64)

In [38]:
test_cnf_matrix2 = confusion_matrix(y_test, y_test_pred2)

In [39]:
test_FP2 = test_cnf_matrix2.sum(axis=0) - np.diag(test_cnf_matrix2) 
test_FN2 = test_cnf_matrix2.sum(axis=1) - np.diag(test_cnf_matrix2)
test_TP2 = np.diag(test_cnf_matrix2)
test_TN2 = test_cnf_matrix2.sum() - (test_FP2 + test_FN2 + test_TP2)
test_FP2 = test_FP2.astype(float)
test_FN2 = test_FN2.astype(float)
test_TP2 = test_TP2.astype(float)
test_TN2 = test_TN2.astype(float)

avg_test_FP2=0
avg_test_FN2=0
avg_test_TP2=0
avg_test_TN2=0

for i in test_FP2:
    avg_test_FP2+=i
    
for i in test_FN2:
    avg_test_FN2+=i
    
for i in test_TP2:
    avg_test_TP2+=i

for i in test_TN2:
    avg_test_TN2+=i
    
avg_test_FP2 = avg_test_FP2/10
avg_test_FN2 = avg_test_FN2/10
avg_test_TP2 = avg_test_TP2/10
avg_test_TN2 = avg_test_TN2/10

### Accuracy

In [40]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_test_pred2))

print(accuracy_score(y_test, y_test_pred2, normalize=False))

accuracy = accuracy_score(y_test, y_test_pred2)

0.3543
3543


### Specificity

In [41]:
avg_test_TN2/(avg_test_TN2+avg_test_FP2)

0.9282555555555555

### Precision

In [42]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_test, y_test_pred2, average='micro')

0.3543

### Recall

In [43]:
recall_score(y_test, y_test_pred2, average='micro')

0.3543

In [44]:
from sklearn.metrics import f1_score

f1_score(y_test, y_test_pred2, average='micro')

0.35429999999999995

### Shallow Depth Conclusion

It can be seen from the results of the major metrics that reducing the max depth of the decision tree to 2 has a negative impact on the classifiers ability to accuractely classify unseen data. This is because by having a depth of 2 it is only allowing for 4 leaf nodes in total, and therefore it can only classify 4 different classes. If I was to up the depth too much, the classifier wouldn't perform as well as it would have too many choices, but in this case, it is over simplifying the classifications to two classes only.

## Confidence Threshold for Pruning

### Threshold 1.0

In [45]:
clf3 = DecisionTreeClassifier(random_state=42, ccp_alpha=1.0)

In [46]:
clf3.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=1.0, random_state=42)

In [47]:
y_test_pred3 = clf3.predict(X_test)

In [48]:
cfm3 = confusion_matrix(y_test, y_test_pred3)
confusion_matrix(y_test, y_test_pred3)

array([[1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1000,    0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int64)

### Confidence Threshold for Pruning Conclusion

Because the lowest cost label is 0, it classifies as 0 everytime.

## Splitting Criteria

Splitting nodes (children not leafs) into x nodes, so a child node will be split into 6 here, and each of them 6 will either be split into another 6 or have 2 leaf nodes. My initial understanding tells me that a high split will return poor results because it will have too many to choose from.

## Min split = 6 (deafault is 2)

In [49]:
clf4 = DecisionTreeClassifier(random_state=42, min_samples_split=6)

In [50]:
clf4.fit(X_train, y_train)

DecisionTreeClassifier(min_samples_split=6, random_state=42)

In [51]:
y_test_pred4 = clf4.predict(X_test)

In [52]:
cfm4 = confusion_matrix(y_test, y_test_pred4)
confusion_matrix(y_test, y_test_pred4)

array([[756,   7,  24,  41,  10,   2, 146,   1,  13,   0],
       [  4, 963,   3,  16,   5,   0,   7,   0,   2,   0],
       [ 24,   5, 680,   9, 148,   3, 122,   0,   8,   1],
       [ 54,  30,  20, 809,  49,   1,  35,   0,   2,   0],
       [ 11,   6, 164,  47, 673,   1,  89,   0,   8,   1],
       [  3,   0,   1,   0,   2, 874,   3,  65,  19,  33],
       [170,  12, 103,  37, 109,   1, 553,   0,  15,   0],
       [  0,   0,   0,   0,   0,  61,   0, 862,   4,  73],
       [ 16,   2,  26,   6,  13,  10,  16,   7, 900,   4],
       [  2,   0,   0,   1,   1,  28,   2,  70,   5, 891]], dtype=int64)

In [53]:
test_cnf_matrix4 = confusion_matrix(y_test, y_test_pred4)

In [54]:
test_FP4 = test_cnf_matrix4.sum(axis=0) - np.diag(test_cnf_matrix4) 
test_FN4 = test_cnf_matrix4.sum(axis=1) - np.diag(test_cnf_matrix4)
test_TP4 = np.diag(test_cnf_matrix4)
test_TN4 = test_cnf_matrix4.sum() - (test_FP4 + test_FN4 + test_TP4)
test_FP4 = test_FP4.astype(float)
test_FN4 = test_FN4.astype(float)
test_TP4 = test_TP4.astype(float)
test_TN4 = test_TN4.astype(float)

avg_test_FP4=0
avg_test_FN4=0
avg_test_TP4=0
avg_test_TN4=0

for i in test_FP4:
    avg_test_FP4+=i
    
for i in test_FN4:
    avg_test_FN4+=i
    
for i in test_TP4:
    avg_test_TP4+=i

for i in test_TN4:
    avg_test_TN4+=i
    
avg_test_FP4 = avg_test_FP4/10
avg_test_FN4 = avg_test_FN4/10
avg_test_TP4 = avg_test_TP4/10
avg_test_TN4 = avg_test_TN4/10

In [55]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_test_pred4))

print(accuracy_score(y_test, y_test_pred4, normalize=False))

accuracy = accuracy_score(y_test, y_test_pred4)

0.7961
7961


In [56]:
avg_test_TN4/(avg_test_TN4+avg_test_FP4)

0.9773444444444445

In [57]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_test, y_test_pred4, average='micro')

0.7961

In [58]:
recall_score(y_test, y_test_pred4, average='micro')

0.7961

In [59]:
from sklearn.metrics import f1_score

f1_score(y_test, y_test_pred4, average='micro')

0.7961

### Min Split = 6 Conclusion

it can be seen that with a sample split of 6, the accuracy is as good as it was with default split.

## Min Split = 40000

In [60]:
clf5 = DecisionTreeClassifier(random_state=42, min_samples_split=40000)

In [61]:
clf5.fit(X_train, y_train)

DecisionTreeClassifier(min_samples_split=40000, random_state=42)

In [62]:
y_test_pred5 = clf5.predict(X_test)

In [63]:
cfm5 = confusion_matrix(y_test, y_test_pred5)
confusion_matrix(y_test, y_test_pred5)

array([[  0,   6,   0,   0, 967,   0,   0,  27,   0,   0],
       [  0, 840,   0,   0, 155,   0,   0,   5,   0,   0],
       [  0,  12,   0,   0, 971,   0,   0,  17,   0,   0],
       [  0,  11,   0,   0, 964,   0,   0,  25,   0,   0],
       [  0,   1,   0,   0, 996,   0,   0,   3,   0,   0],
       [  0,  29,   0,   0,  45,   0,   0, 926,   0,   0],
       [  0,  13,   0,   0, 973,   0,   0,  14,   0,   0],
       [  0,   0,   0,   0,   3,   0,   0, 997,   0,   0],
       [  0,  11,   0,   0, 682,   0,   0, 307,   0,   0],
       [  0,   0,   0,   0,  50,   0,   0, 950,   0,   0]], dtype=int64)

### Min Split = 40000 conclusion

It can be seen from the above matrix that with a split of 40000 the algorithm performs poorly, as there are 10 classes and it is only classifying images as one of three: class 1, class 4, and class 7.

## Random Splitter (default = best)

In [64]:
clf6 = DecisionTreeClassifier(random_state=42, splitter='random')

In [65]:
clf6.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42, splitter='random')

In [66]:
y_test_pred6 = clf6.predict(X_test)

In [67]:
cfm6 = confusion_matrix(y_test, y_test_pred6)
confusion_matrix(y_test, y_test_pred6)

array([[732,   7,  29,  38,  14,   4, 159,   1,  16,   0],
       [  7, 961,   4,  16,   5,   1,   5,   0,   1,   0],
       [ 26,   4, 675,  18, 141,   1, 130,   1,   3,   1],
       [ 42,  39,  17, 800,  43,   2,  46,   0,  11,   0],
       [  9,   3, 139,  44, 659,   1, 134,   0,  10,   1],
       [  2,   1,   0,   2,   3, 875,   4,  66,  13,  34],
       [168,   6, 100,  35, 104,   3, 563,   0,  19,   2],
       [  0,   0,   0,   0,   0,  62,   0, 863,   5,  70],
       [  5,   2,  19,  13,  11,  16,  19,  10, 904,   1],
       [  0,   1,   0,   0,   1,  18,   0,  58,   3, 919]], dtype=int64)

## Max Leaf Nodes

### 2 Leaf Nodes Max

In [68]:
clf7 = DecisionTreeClassifier(random_state=42, max_leaf_nodes=2)

In [69]:
clf7.fit(X_train, y_train)

DecisionTreeClassifier(max_leaf_nodes=2, random_state=42)

In [70]:
y_test_pred7 = clf7.predict(X_test)

In [71]:
cfm7 = confusion_matrix(y_test, y_test_pred7)
confusion_matrix(y_test, y_test_pred7)

array([[  0, 973,   0,   0,   0,   0,   0,  27,   0,   0],
       [  0, 995,   0,   0,   0,   0,   0,   5,   0,   0],
       [  0, 983,   0,   0,   0,   0,   0,  17,   0,   0],
       [  0, 975,   0,   0,   0,   0,   0,  25,   0,   0],
       [  0, 997,   0,   0,   0,   0,   0,   3,   0,   0],
       [  0,  74,   0,   0,   0,   0,   0, 926,   0,   0],
       [  0, 986,   0,   0,   0,   0,   0,  14,   0,   0],
       [  0,   3,   0,   0,   0,   0,   0, 997,   0,   0],
       [  0, 693,   0,   0,   0,   0,   0, 307,   0,   0],
       [  0,  50,   0,   0,   0,   0,   0, 950,   0,   0]], dtype=int64)

Only classifies as either class 1 or class 7, this is because only 2 leaf nodes are allowed.

### 200000 Leaf Nodes Max

In [72]:
clf8 = DecisionTreeClassifier(random_state=42, max_leaf_nodes=200000)

In [73]:
clf8.fit(X_train, y_train)

DecisionTreeClassifier(max_leaf_nodes=200000, random_state=42)

In [74]:
y_test_pred8 = clf8.predict(X_test)

In [75]:
cfm8 = confusion_matrix(y_test, y_test_pred8)
confusion_matrix(y_test, y_test_pred8)

array([[738,   6,  18,  41,   8,   2, 169,   1,  17,   0],
       [  9, 963,   4,  16,   2,   0,   4,   0,   2,   0],
       [ 25,   3, 675,  17, 144,   2, 120,   0,  12,   2],
       [ 49,  24,  20, 810,  53,   3,  36,   0,   5,   0],
       [  7,   5, 147,  47, 685,   0, 103,   0,   5,   1],
       [  3,   1,   1,   1,   2, 872,   2,  60,  22,  36],
       [171,   9, 104,  40, 104,   1, 554,   0,  17,   0],
       [  0,   0,   0,   2,   0,  53,   1, 867,   2,  75],
       [ 17,   1,  17,   3,  13,  12,  23,  10, 902,   2],
       [  3,   0,   0,   1,   1,  19,   2,  71,   7, 896]], dtype=int64)

### 200K leaf nodes conclusion

I initially thought that it would be more accurate, but the matrix looks similar to that of the original algorithm. This will be because the max_split default is 2, and therefore the child nodes allow for the correct path, the only decision it has to make is on the leaf nodes, and it probably isn't using the full 200k leaf nodes allowed. If I were to change the max_split in combination with max leaf nodes to a really high number, the results would be bad.

# Test + 30% of Training set

In [76]:
import pandas as pd

batch_size = 512
train_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_train.csv'))
test_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_test.csv'))

X_train = train_data[:, 1:785]
y_train = train_data[:, 0]
X_test = test_data[:, 1:785]
y_test = test_data[:, 0]

In [77]:
thirty_percent_X_train = X_train[0:18000]
thirty_percent_y_train = y_train[0:18000]

X_test_train = np.concatenate((X_test, thirty_percent_X_train), axis=0)
y_test_train = np.concatenate((y_test, thirty_percent_y_train), axis=0)

seventy_percent_X_train = X_train[18000:]
seventy_percent_y_train = y_train[18000:]

In [78]:
print(X_test_train.shape)
print(y_test_train.shape)
print(seventy_percent_X_train.shape)
print(seventy_percent_y_train.shape)

(28000, 784)
(28000,)
(42000, 784)
(42000,)


In [79]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(seventy_percent_X_train, seventy_percent_y_train)

DecisionTreeClassifier(random_state=42)

In [80]:
cross_val_y_train_pred = cross_val_predict(clf, seventy_percent_X_train, seventy_percent_y_train, cv=10)

In [81]:
cv_cnf_matrix = confusion_matrix(seventy_percent_y_train, cross_val_y_train_pred)

### Accuracy of 70% of Train

In [82]:
from sklearn.metrics import accuracy_score

print(accuracy_score(seventy_percent_y_train, cross_val_y_train_pred))

print(accuracy_score(seventy_percent_y_train, cross_val_y_train_pred, normalize=False))

accuracy = accuracy_score(seventy_percent_y_train, cross_val_y_train_pred)

0.7838571428571428
32922


### Accuracy of Test + 30% of Train

In [83]:
y_test_train_pred = clf.predict(X_test_train)

In [84]:
cfm = confusion_matrix(y_test_train, y_test_train_pred)
confusion_matrix(y_test_train, y_test_train_pred)

array([[2114,   13,   84,  140,   37,    3,  429,    0,   47,    1],
       [  19, 2664,   12,   54,   21,    0,   26,    0,    4,    0],
       [  76,   14, 1872,   54,  386,    5,  321,    0,   30,    1],
       [ 135,  106,   65, 2210,  146,    4,  126,    0,   22,    1],
       [  26,   10,  383,  145, 1852,    2,  293,    0,   26,    1],
       [   6,    3,    4,    9,    1, 2479,    4,  171,   33,   85],
       [ 435,   18,  389,  106,  279,    7, 1510,    0,   61,    2],
       [   0,    0,    0,    0,    1,  155,    0, 2467,   15,  217],
       [  45,    6,   36,   34,   22,   22,   59,   24, 2515,   19],
       [   4,    0,    3,    5,    0,   85,    6,  183,   14, 2481]],
      dtype=int64)

In [85]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test_train, y_test_train_pred))

print(accuracy_score(y_test_train, y_test_train_pred, normalize=False))

accuracy = accuracy_score(y_test_train, y_test_train_pred)

0.7915714285714286
22164


### Conclusions

Accuracy was slightly higher on the test + train set using decision tree classifier in comparison to 70% of train set with cross validation. That will be because the cross-validation has less images to train on now, and it will get worse as it has less and less to train on, as it will lead to overfitting, since it has less information to go off. The decision tree classifier saw no change in accuracy in comparison to the original test set.

# Test + 60% of Train

In [86]:
import pandas as pd

batch_size = 512
train_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_train.csv'))
test_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_test.csv'))

X_train = train_data[:, 1:785]
y_train = train_data[:, 0]
X_test = test_data[:, 1:785]
y_test = test_data[:, 0]

In [87]:
sixty_percent_X_train = X_train[0:36000]
sixty_percent_y_train = y_train[0:36000]

X_test_train = np.concatenate((X_test, sixty_percent_X_train), axis=0)
y_test_train = np.concatenate((y_test, sixty_percent_y_train), axis=0)

fourty_percent_X_train = X_train[36000:]
fourty_percent_y_train = y_train[36000:]

In [88]:
print(X_test_train.shape)
print(y_test_train.shape)
print(fourty_percent_X_train.shape)
print(fourty_percent_y_train.shape)

(46000, 784)
(46000,)
(24000, 784)
(24000,)


In [89]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(fourty_percent_X_train, fourty_percent_y_train)

DecisionTreeClassifier(random_state=42)

In [90]:
cross_val_y_train_pred = cross_val_predict(clf, fourty_percent_X_train, fourty_percent_y_train, cv=10)

In [91]:
cv_cnf_matrix = confusion_matrix(fourty_percent_y_train, cross_val_y_train_pred)

### Accuracy of 40% Train

In [92]:
from sklearn.metrics import accuracy_score

print(accuracy_score(fourty_percent_y_train, cross_val_y_train_pred))

print(accuracy_score(fourty_percent_y_train, cross_val_y_train_pred, normalize=False))

accuracy = accuracy_score(fourty_percent_y_train, cross_val_y_train_pred)

0.7712083333333334
18509


### Accuracy of Test + 60% Train

In [93]:
y_test_train_pred = clf.predict(X_test_train)

In [94]:
cfm = confusion_matrix(y_test_train, y_test_train_pred)
confusion_matrix(y_test_train, y_test_train_pred)

array([[3322,   43,  113,  226,   45,   17,  764,    6,   73,    5],
       [  43, 4324,   20,  145,   20,    3,   30,    0,   10,    0],
       [ 133,   28, 3050,  101,  671,    9,  531,    3,   73,    6],
       [ 245,  140,   92, 3623,  240,   14,  201,    0,   35,    4],
       [  72,   31,  679,  266, 2931,    4,  523,    2,   40,    3],
       [  23,    7,    2,   24,    3, 3913,   18,  328,   86,  157],
       [ 672,   31,  596,  231,  528,   13, 2463,    5,  110,    4],
       [   0,    0,    1,    2,    0,  312,    1, 3933,   24,  372],
       [  89,   13,   62,   43,   46,   57,  119,   33, 4092,   15],
       [  12,    3,    4,    6,    4,  130,    5,  329,   28, 4092]],
      dtype=int64)

In [95]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test_train, y_test_train_pred))

print(accuracy_score(y_test_train, y_test_train_pred, normalize=False))

accuracy = accuracy_score(y_test_train, y_test_train_pred)

0.7770217391304348
35743


### Conclusions

Accuracy of cv on 40% of training set saw a further decrease in accuracy, almost to the point now where it wrongly classifies every 1 in 4. Accuracy of Test + 60% train also saw a decrease in accuracy, this is because the classifier was fitted using the unseen other 40% of the training set, and therefore, overfitting is beginning to occur because there is way more unseen data that hasnt came up in training.

# Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
import pandas as pd
import numpy as np

batch_size = 512
train_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_train.csv'))
test_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_test.csv'))

X_train = train_data[:, 1:785]
y_train = train_data[:, 0]
X_test = test_data[:, 1:785]
y_test = test_data[:, 0]

In [24]:
clf = RandomForestClassifier(random_state=42)

In [25]:
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [26]:
y_train_pred = clf.predict(X_train)

In [27]:
y_test_pred = clf.predict(X_test)

In [28]:
cross_val_y_train_pred = cross_val_predict(clf, X_train, y_train, cv=10)

## Cross-validation Matrix

In [29]:
confusion_matrix(y_train, cross_val_y_train_pred)

array([[5208,    2,   72,  197,   23,    2,  450,    0,   46,    0],
       [  17, 5794,   24,  128,   11,    1,   22,    0,    3,    0],
       [  50,    1, 4913,   56,  662,    2,  283,    0,   33,    0],
       [ 108,   15,   45, 5507,  190,    0,  123,    0,   12,    0],
       [  10,    7,  471,  228, 4991,    1,  273,    0,   19,    0],
       [   0,    0,    0,    1,    0, 5770,    1,  154,   17,   57],
       [ 925,    8,  686,  152,  555,    3, 3583,    0,   88,    0],
       [   0,    0,    0,    0,    0,   93,    0, 5660,    8,  239],
       [  10,    3,   20,   23,   24,   21,   57,   14, 5825,    3],
       [   0,    0,    2,    2,    0,   60,    2,  220,    6, 5708]],
      dtype=int64)

In [30]:
cv_cnf_matrix = confusion_matrix(y_train, cross_val_y_train_pred)

In [31]:
CV_FP = cv_cnf_matrix.sum(axis=0) - np.diag(cv_cnf_matrix) 
CV_FN = cv_cnf_matrix.sum(axis=1) - np.diag(cv_cnf_matrix)
CV_TP = np.diag(cv_cnf_matrix)
CV_TN = cv_cnf_matrix.sum() - (CV_FP + CV_FN + CV_TP)
CV_FP = CV_FP.astype(float)
CV_FN = CV_FN.astype(float)
CV_TP = CV_TP.astype(float)
CV_TN = CV_TN.astype(float)

avg_cv_FP=0
avg_cv_FN=0
avg_cv_TP=0
avg_cv_TN=0

for i in CV_FP:
    avg_cv_FP+=i
    
for i in CV_FN:
    avg_cv_FN+=i
    
for i in CV_TP:
    avg_cv_TP+=i

for i in CV_TN:
    avg_cv_TN+=i
    
avg_cv_FP = avg_cv_FP/10
avg_cv_FN = avg_cv_FN/10
avg_cv_TP = avg_cv_TP/10
avg_cv_TN = avg_cv_TN/10

## Training Set Matrix

In [32]:
confusion_matrix(y_train, y_train_pred)

array([[6000,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, 6000,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0, 6000,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0, 6000,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 6000,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0, 6000,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0, 6000,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 6000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0, 6000,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 6000]],
      dtype=int64)

Can see that it has 60000 TP because its the set it trained on

In [33]:
train_cnf_matrix = confusion_matrix(y_train, y_train_pred)

In [34]:
train_FP = train_cnf_matrix.sum(axis=0) - np.diag(train_cnf_matrix) 
train_FN = train_cnf_matrix.sum(axis=1) - np.diag(train_cnf_matrix)
train_TP = np.diag(train_cnf_matrix)
train_TN = train_cnf_matrix.sum() - (train_FP + train_FN + train_TP)
train_FP = train_FP.astype(float)
train_FN = train_FN.astype(float)
train_TP = train_TP.astype(float)
train_TN = train_TN.astype(float)

avg_train_FP=0
avg_train_FN=0
avg_train_TP=0
avg_train_TN=0

for i in train_FP:
    avg_train_FP+=i
    
for i in train_FN:
    avg_train_FN+=i
    
for i in train_TP:
    avg_train_TP+=i

for i in train_TN:
    avg_train_TN+=i
    
avg_train_FP = avg_train_FP/10
avg_train_FN = avg_train_FN/10
avg_train_TP = avg_train_TP/10
avg_train_TN = avg_train_TN/10

## Test Set Matrix

In [35]:
cfm = confusion_matrix(y_test, y_test_pred)
confusion_matrix(y_test, y_test_pred)

array([[861,   0,  12,  31,   4,   1,  77,   0,  14,   0],
       [  2, 971,   5,  17,   1,   1,   3,   0,   0,   0],
       [  7,   2, 798,  13, 115,   0,  57,   0,   8,   0],
       [ 17,   6,   8, 937,  18,   0,  14,   0,   0,   0],
       [  1,   1,  64,  27, 862,   0,  42,   0,   3,   0],
       [  0,   0,   0,   0,   0, 947,   0,  37,   5,  11],
       [166,   1, 100,  28,  75,   0, 611,   0,  19,   0],
       [  0,   0,   0,   0,   0,  13,   0, 937,   0,  50],
       [  1,   1,   7,   0,   3,   1,  10,   3, 974,   0],
       [  0,   0,   0,   0,   0,   7,   1,  41,   2, 949]], dtype=int64)

In [36]:
test_cnf_matrix = confusion_matrix(y_test, y_test_pred)

In [37]:
test_FP = test_cnf_matrix.sum(axis=0) - np.diag(test_cnf_matrix) 
test_FN = test_cnf_matrix.sum(axis=1) - np.diag(test_cnf_matrix)
test_TP = np.diag(test_cnf_matrix)
test_TN = test_cnf_matrix.sum() - (test_FP + test_FN + test_TP)
test_FP = test_FP.astype(float)
test_FN = test_FN.astype(float)
test_TP = test_TP.astype(float)
test_TN = test_TN.astype(float)

avg_test_FP=0
avg_test_FN=0
avg_test_TP=0
avg_test_TN=0

for i in test_FP:
    avg_test_FP+=i
    
for i in test_FN:
    avg_test_FN+=i
    
for i in test_TP:
    avg_test_TP+=i

for i in test_TN:
    avg_test_TN+=i
    
avg_test_FP = avg_test_FP/10
avg_test_FN = avg_test_FN/10
avg_test_TP = avg_test_TP/10
avg_test_TN = avg_test_TN/10

## Accuracy

80% accuracy for cross validation, 80% accuracy with decision tree, both worse than Categorical Naive Bayes which had 87% accuracy.

Random forest returns 88% accuracy from cv, and also 88% accuracy for the test set alone with the random forest classifier. RFC is better at not overfitting.

### Cross-validation

In [38]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_train, cross_val_y_train_pred))

print(accuracy_score(y_train, cross_val_y_train_pred, normalize=False))

accuracy = accuracy_score(y_train, cross_val_y_train_pred)

0.88265
52959


### Decision Tree Train

In [39]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_train, y_train_pred))

print(accuracy_score(y_train, y_train_pred, normalize=False))

accuracy = accuracy_score(y_train, y_train_pred)


1.0
60000


### Decision Tree Test

In [40]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_test_pred))

print(accuracy_score(y_test, y_test_pred, normalize=False))

accuracy = accuracy_score(y_test, y_test_pred)

0.8847
8847


## Specificity

TN / (TN + FP) How sensitive is the classifier to the negative cases? - A highly specific test for cancer: if "YES" then you can be sure it's "YES". Closer to 1 means less false positives. 100% Specificity misses some true positives but in return provides no false positives.

### Cross-validation

In [41]:
avg_cv_TN/(avg_cv_TN+avg_cv_FP)

0.9869611111111112

### Decision Tree Train

In [42]:
avg_train_TN/(avg_train_TN+avg_train_FP)

1.0

### Decision Tree Test

In [43]:
avg_test_TN/(avg_test_TN+avg_test_FP)

0.987188888888889

J48: 98% specificity for cv is incredible in comparison to 78% for Categorical Naive Bayes. Decision Tree Test also has 98%.

RFC: 99% sensitivity for cv and test, showing to be a better decision tree than the j48 because the accuracy is also higher.

## Precision and Recall
Recall is intuitevely the ability of the classifier to find all the positive samples.

The best value is 1, and the worst value is 0.

### precision = TP / (TP+FP)

43% for Categorical Naive Bayes, a big jump to 80% for J48 in both cross validation and test decision tree. A further jump to 88% for both cv and test set for RFC, random clssifier is doing really well.

In [44]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_train, cross_val_y_train_pred, average='micro')

0.88265

In [45]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_train, y_train_pred, average='micro')

1.0

In [46]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_test, y_test_pred, average='micro')

0.8847

### recall = TP / (TP+FN)
Similar to sensitivity (how sensitive is our classifier to the true cases? closer to 1 when theres no false negatives) with 100% sensitivity you catch all cases that could be true, so there will be more false postitives

In [47]:
recall_score(y_train, cross_val_y_train_pred, average='micro')

0.88265

In [48]:
recall_score(y_test, y_test_pred, average='micro')

0.8847

recall for Categorical Naive Bayes was .95 which was a really sensitive classifier, J48 it was .75 so less sensitive. This time around it has gone back up but not as high as naive bayes, to .88. This classifier is on the whole very good.

F1 = 2 / ((1 / precision) + (1 / recall)) = 2 x ((precision x recall) / (precision + recall)) = TP / (TP + ((FN + FP) / 2))

In [49]:
from sklearn.metrics import f1_score

f1_score(y_train, cross_val_y_train_pred, average='micro')

0.88265

In [50]:
from sklearn.metrics import f1_score

f1_score(y_test, y_test_pred, average='micro')

0.8847

In [51]:
y_scores = cross_val_score(clf, X_train, y_train, cv=10)

In [52]:
y_score = clf.predict_proba(X_test)

In [53]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_score, average='macro', multi_class='ovo')

0.9912469500000001

# Does Random Forest generalize well to the new data?

The RFC is better than the J48 algorithm across the board. This is because its training is more randomised, and therefore it generalises better, as it isn't relying on a particular dataset. RFC doesn't overfit or underfit.

# Experimenting with various tree parameters

## Shallow Depth

In [54]:
clf2 = RandomForestClassifier(random_state=42, max_depth=2)

In [55]:
clf2.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=42)

In [56]:
y_test_pred2 = clf2.predict(X_test)

In [57]:
cfm2 = confusion_matrix(y_test, y_test_pred2)
confusion_matrix(y_test, y_test_pred2)

array([[802,  81,  70,   1,   9,   1,   0,   1,  33,   2],
       [ 10, 965,  21,   1,   0,   2,   0,   0,   1,   0],
       [ 18,   3, 956,   0,   9,   0,   0,   1,  13,   0],
       [282, 466,  63, 140,  28,   1,   0,   5,   2,  13],
       [ 87,  28, 830,   1,  48,   0,   0,   1,   5,   0],
       [  0,   8,   0,   2,   0, 138,   0, 580,  10, 262],
       [271,  51, 621,   1,  13,   0,   1,   0,  42,   0],
       [  0,   0,   0,   0,   0,   1,   0, 897,   1, 101],
       [  0,  12,  63,   4,   0,   6,   0,  27, 772, 116],
       [  0,   1,   2,   1,   0,   0,   0,  82,   4, 910]], dtype=int64)

In [58]:
test_cnf_matrix2 = confusion_matrix(y_test, y_test_pred2)

In [59]:
test_FP2 = test_cnf_matrix2.sum(axis=0) - np.diag(test_cnf_matrix2) 
test_FN2 = test_cnf_matrix2.sum(axis=1) - np.diag(test_cnf_matrix2)
test_TP2 = np.diag(test_cnf_matrix2)
test_TN2 = test_cnf_matrix2.sum() - (test_FP2 + test_FN2 + test_TP2)
test_FP2 = test_FP2.astype(float)
test_FN2 = test_FN2.astype(float)
test_TP2 = test_TP2.astype(float)
test_TN2 = test_TN2.astype(float)

avg_test_FP2=0
avg_test_FN2=0
avg_test_TP2=0
avg_test_TN2=0

for i in test_FP2:
    avg_test_FP2+=i
    
for i in test_FN2:
    avg_test_FN2+=i
    
for i in test_TP2:
    avg_test_TP2+=i

for i in test_TN2:
    avg_test_TN2+=i
    
avg_test_FP2 = avg_test_FP2/10
avg_test_FN2 = avg_test_FN2/10
avg_test_TP2 = avg_test_TP2/10
avg_test_TN2 = avg_test_TN2/10

### Accuracy

In [60]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_test_pred2))

print(accuracy_score(y_test, y_test_pred2, normalize=False))

accuracy = accuracy_score(y_test, y_test_pred2)

0.5629
5629


Accuracy is low but still better than decision tree j48

### Specificity

In [61]:
avg_test_TN2/(avg_test_TN2+avg_test_FP2)

0.9514333333333332

### Precision

In [62]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_test, y_test_pred2, average='micro')

0.5629

### Recall

In [63]:
recall_score(y_test, y_test_pred2, average='micro')

0.5629

In [64]:
from sklearn.metrics import f1_score

f1_score(y_test, y_test_pred2, average='micro')

0.5629

### Shallow Depth Conclusion

It can be seen from the results of the major metrics that reducing the max depth of the decision tree to 2 still has a negative impact on the results even with RFC.

## Confidence Threshold for Pruning

### Threshold 1.0

In [65]:
clf3 = RandomForestClassifier(random_state=42, ccp_alpha=1.0)

In [66]:
clf3.fit(X_train, y_train)

RandomForestClassifier(ccp_alpha=1.0, random_state=42)

In [67]:
y_test_pred3 = clf3.predict(X_test)

In [68]:
cfm3 = confusion_matrix(y_test, y_test_pred3)
confusion_matrix(y_test, y_test_pred3)

array([[   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0]],
      dtype=int64)

### Confidence Threshold for Pruning Conclusion

Classifies as 7 everytime.

## Splitting Criteria

Splitting nodes (children not leafs) into x nodes, so a child node will be split into 6 here, and each of them 6 will either be split into another 6 or have 2 leaf nodes. My initial understanding tells me that a high split will return poor results because it will have too many to choose from.

## Min split = 6 (deafault is 2)

In [69]:
clf4 = RandomForestClassifier(random_state=42, min_samples_split=6)

In [70]:
clf4.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=6, random_state=42)

In [71]:
y_test_pred4 = clf4.predict(X_test)

In [72]:
cfm4 = confusion_matrix(y_test, y_test_pred4)
confusion_matrix(y_test, y_test_pred4)

array([[855,   0,   8,  29,   2,   1,  94,   0,  11,   0],
       [  3, 968,   6,  18,   1,   1,   2,   0,   1,   0],
       [ 10,   1, 803,  10, 110,   0,  55,   0,  11,   0],
       [ 19,   8,   8, 928,  22,   0,  15,   0,   0,   0],
       [  0,   1,  62,  33, 858,   0,  43,   0,   3,   0],
       [  0,   0,   0,   0,   0, 948,   0,  37,   5,  10],
       [164,   1,  98,  24,  77,   0, 622,   0,  14,   0],
       [  0,   0,   0,   0,   0,  18,   0, 932,   0,  50],
       [  1,   1,   7,   0,   3,   1,   9,   2, 975,   1],
       [  0,   0,   1,   0,   0,   6,   0,  40,   3, 950]], dtype=int64)

In [73]:
test_cnf_matrix4 = confusion_matrix(y_test, y_test_pred4)

In [74]:
test_FP4 = test_cnf_matrix4.sum(axis=0) - np.diag(test_cnf_matrix4) 
test_FN4 = test_cnf_matrix4.sum(axis=1) - np.diag(test_cnf_matrix4)
test_TP4 = np.diag(test_cnf_matrix4)
test_TN4 = test_cnf_matrix4.sum() - (test_FP4 + test_FN4 + test_TP4)
test_FP4 = test_FP4.astype(float)
test_FN4 = test_FN4.astype(float)
test_TP4 = test_TP4.astype(float)
test_TN4 = test_TN4.astype(float)

avg_test_FP4=0
avg_test_FN4=0
avg_test_TP4=0
avg_test_TN4=0

for i in test_FP4:
    avg_test_FP4+=i
    
for i in test_FN4:
    avg_test_FN4+=i
    
for i in test_TP4:
    avg_test_TP4+=i

for i in test_TN4:
    avg_test_TN4+=i
    
avg_test_FP4 = avg_test_FP4/10
avg_test_FN4 = avg_test_FN4/10
avg_test_TP4 = avg_test_TP4/10
avg_test_TN4 = avg_test_TN4/10

In [75]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_test_pred4))

print(accuracy_score(y_test, y_test_pred4, normalize=False))

accuracy = accuracy_score(y_test, y_test_pred4)

0.8839
8839


In [76]:
avg_test_TN4/(avg_test_TN4+avg_test_FP4)

0.9871

In [77]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_test, y_test_pred4, average='micro')

0.8839

In [78]:
recall_score(y_test, y_test_pred4, average='micro')

0.8839

In [79]:
from sklearn.metrics import f1_score

f1_score(y_test, y_test_pred4, average='micro')

0.8839

### Min Split = 6 Conclusion

it can be seen that with a sample split of 6, the accuracy is as good as it matches the decision tree accuracy, but is worse than the default for RFC.

## Min Split = 40000

In [80]:
clf5 = RandomForestClassifier(random_state=42, min_samples_split=40000)

In [81]:
clf5.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=40000, random_state=42)

In [82]:
y_test_pred5 = clf5.predict(X_test)

In [83]:
cfm5 = confusion_matrix(y_test, y_test_pred5)
confusion_matrix(y_test, y_test_pred5)

array([[   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0]],
      dtype=int64)

### Min Split = 40000 conclusion

It can be seen from the above matrix that with a split of 40000 the algorithm performs poorly, as there are 10 classes and it is only classifying images as: class 7.

## Max Leaf Nodes

### 2 Leaf Nodes Max

In [85]:
clf7 = RandomForestClassifier(random_state=42, max_leaf_nodes=2)

In [86]:
clf7.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=2, random_state=42)

In [87]:
y_test_pred7 = clf7.predict(X_test)

In [88]:
cfm7 = confusion_matrix(y_test, y_test_pred7)
confusion_matrix(y_test, y_test_pred7)

array([[  1,  26,  26,   0, 872,   0,  36,  11,  28,   0],
       [  0, 715,   0,   3, 273,   2,   0,   0,   0,   7],
       [  0,   2,  45,   0, 926,   0,  21,   1,   5,   0],
       [  0, 432,  15,  73, 410,   1,   9,   2,   4,  54],
       [  0,   7,  11,   0, 973,   0,   3,   3,   3,   0],
       [  0,   2,   0,   0,   0,   2,   0, 978,   4,  14],
       [  0,  14,  33,   0, 875,   0,  44,   4,  28,   2],
       [  0,   0,   0,   0,   0,   0,   0, 999,   0,   1],
       [  0,   9,  25,   1,  45,  22,  42, 482, 372,   2],
       [  0,   3,   0,   3,   1,   0,   1, 940,   3,  49]], dtype=int64)

only 1 correct class 0 prediction.

### 200000 Leaf Nodes Max

In [89]:
clf8 = RandomForestClassifier(random_state=42, max_leaf_nodes=200000)

In [90]:
clf8.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=200000, random_state=42)

In [91]:
y_test_pred8 = clf8.predict(X_test)

In [92]:
cfm8 = confusion_matrix(y_test, y_test_pred8)
confusion_matrix(y_test, y_test_pred8)

array([[860,   0,  13,  35,   1,   1,  78,   0,  12,   0],
       [  4, 971,   5,  15,   1,   1,   3,   0,   0,   0],
       [  9,   1, 800,   8, 116,   0,  57,   0,   9,   0],
       [ 21,   7,   8, 926,  21,   0,  17,   0,   0,   0],
       [  1,   0,  62,  26, 865,   0,  43,   0,   3,   0],
       [  0,   0,   0,   0,   0, 946,   0,  38,   5,  11],
       [170,   1, 107,  32,  80,   0, 596,   0,  14,   0],
       [  0,   0,   0,   0,   0,  16,   0, 930,   0,  54],
       [  2,   1,   7,   0,   3,   2,   6,   2, 976,   1],
       [  0,   0,   0,   0,   0,   7,   1,  39,   2, 951]], dtype=int64)

### 200K leaf nodes conclusion

I initially thought that it would be more accurate, but the matrix looks similar to that of the original algorithm. This will be because the max_split default is 2, and therefore the child nodes allow for the correct path, the only decision it has to make is on the leaf nodes, and it probably isn't using the full 200k leaf nodes allowed. If I were to change the max_split in combination with max leaf nodes to a really high number, the results would be bad.

# Test + 30% of Training set

In [93]:
import pandas as pd

batch_size = 512
train_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_train.csv'))
test_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_test.csv'))

X_train = train_data[:, 1:785]
y_train = train_data[:, 0]
X_test = test_data[:, 1:785]
y_test = test_data[:, 0]

In [94]:
thirty_percent_X_train = X_train[0:18000]
thirty_percent_y_train = y_train[0:18000]

X_test_train = np.concatenate((X_test, thirty_percent_X_train), axis=0)
y_test_train = np.concatenate((y_test, thirty_percent_y_train), axis=0)

seventy_percent_X_train = X_train[18000:]
seventy_percent_y_train = y_train[18000:]

In [95]:
print(X_test_train.shape)
print(y_test_train.shape)
print(seventy_percent_X_train.shape)
print(seventy_percent_y_train.shape)

(28000, 784)
(28000,)
(42000, 784)
(42000,)


In [96]:
clf = RandomForestClassifier(random_state=42)
clf.fit(seventy_percent_X_train, seventy_percent_y_train)

RandomForestClassifier(random_state=42)

In [97]:
cross_val_y_train_pred = cross_val_predict(clf, seventy_percent_X_train, seventy_percent_y_train, cv=10)

In [98]:
cv_cnf_matrix = confusion_matrix(seventy_percent_y_train, cross_val_y_train_pred)

### Accuracy of 70% of Train

In [99]:
from sklearn.metrics import accuracy_score

print(accuracy_score(seventy_percent_y_train, cross_val_y_train_pred))

print(accuracy_score(seventy_percent_y_train, cross_val_y_train_pred, normalize=False))

accuracy = accuracy_score(seventy_percent_y_train, cross_val_y_train_pred)

0.8781190476190476
36881


### Accuracy of Test + 30% of Train

In [100]:
y_test_train_pred = clf.predict(X_test_train)

In [101]:
cfm = confusion_matrix(y_test_train, y_test_train_pred)
confusion_matrix(y_test_train, y_test_train_pred)

array([[2480,    3,   37,   96,    3,    5,  225,    0,   19,    0],
       [   3, 2698,   14,   52,    7,    1,   21,    0,    4,    0],
       [  17,    1, 2234,   28,  313,    0,  146,    0,   20,    0],
       [  53,   13,   24, 2596,   69,    0,   55,    0,    5,    0],
       [   2,    5,  178,  122, 2302,    1,  118,    0,   10,    0],
       [   0,    0,    0,    1,    0, 2675,    0,   86,    7,   26],
       [ 433,    4,  349,   78,  256,    1, 1634,    0,   52,    0],
       [   0,    0,    0,    0,    0,   49,    0, 2680,    4,  122],
       [   5,    3,   14,    7,   11,   10,   31,    7, 2690,    4],
       [   0,    0,    0,    0,    0,   30,    3,  104,    5, 2639]],
      dtype=int64)

In [102]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test_train, y_test_train_pred))

print(accuracy_score(y_test_train, y_test_train_pred, normalize=False))

accuracy = accuracy_score(y_test_train, y_test_train_pred)

0.8795714285714286
24628


### Conclusions

Accuracy was the same as the original datasets. This shows the ability of the classifier to classify with less data.

# Test + 60% of Train

In [103]:
import pandas as pd

batch_size = 512
train_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_train.csv'))
test_data = np.array(pd.read_csv(r'C:\Users\ryand\Desktop\Data Mining & ML\Fashion\fashion-mnist_test.csv'))

X_train = train_data[:, 1:785]
y_train = train_data[:, 0]
X_test = test_data[:, 1:785]
y_test = test_data[:, 0]

In [104]:
sixty_percent_X_train = X_train[0:36000]
sixty_percent_y_train = y_train[0:36000]

X_test_train = np.concatenate((X_test, sixty_percent_X_train), axis=0)
y_test_train = np.concatenate((y_test, sixty_percent_y_train), axis=0)

fourty_percent_X_train = X_train[36000:]
fourty_percent_y_train = y_train[36000:]

In [105]:
print(X_test_train.shape)
print(y_test_train.shape)
print(fourty_percent_X_train.shape)
print(fourty_percent_y_train.shape)

(46000, 784)
(46000,)
(24000, 784)
(24000,)


In [106]:
clf = RandomForestClassifier(random_state=42)
clf.fit(fourty_percent_X_train, fourty_percent_y_train)

RandomForestClassifier(random_state=42)

In [107]:
cross_val_y_train_pred = cross_val_predict(clf, fourty_percent_X_train, fourty_percent_y_train, cv=10)

In [108]:
cv_cnf_matrix = confusion_matrix(fourty_percent_y_train, cross_val_y_train_pred)

### Accuracy of 40% Train

In [109]:
from sklearn.metrics import accuracy_score

print(accuracy_score(fourty_percent_y_train, cross_val_y_train_pred))

print(accuracy_score(fourty_percent_y_train, cross_val_y_train_pred, normalize=False))

accuracy = accuracy_score(fourty_percent_y_train, cross_val_y_train_pred)

0.870625
20895


### Accuracy of Test + 60% Train

In [110]:
y_test_train_pred = clf.predict(X_test_train)

In [111]:
cfm = confusion_matrix(y_test_train, y_test_train_pred)
confusion_matrix(y_test_train, y_test_train_pred)

array([[3981,    4,   59,  176,   16,    6,  327,    0,   45,    0],
       [   9, 4411,   20,  115,    6,    1,   30,    0,    3,    0],
       [  27,    2, 3691,   43,  584,    1,  216,    0,   41,    0],
       [  98,   18,   44, 4221,  108,    0,   96,    0,    9,    0],
       [   6,    6,  341,  214, 3778,    1,  182,    0,   23,    0],
       [   0,    0,    0,    0,    0, 4338,    0,  147,   22,   54],
       [ 761,    7,  604,  129,  476,    2, 2576,    0,   98,    0],
       [   0,    0,    0,    0,    0,  101,    0, 4304,    6,  234],
       [   8,    5,   24,   13,   22,   19,   44,    9, 4421,    4],
       [   0,    0,    2,    0,    0,   54,    1,  165,    9, 4382]],
      dtype=int64)

In [112]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test_train, y_test_train_pred))

print(accuracy_score(y_test_train, y_test_train_pred, normalize=False))

accuracy = accuracy_score(y_test_train, y_test_train_pred)

0.871804347826087
40103


### Conclusions

Accuracy is similar again, slightly lower, but very much similar. Again, this shows the power of RFC.

# Categorical Naive Bayes

In [None]:
import numpy as np

rng = np.random.RandomState(1)

X = X_train
Y = y_train_0

from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()
clf.fit(X, Y)
CategoricalNB()
CategoricalPredictions = clf.predict(X_train)

In [None]:
from tabulate import tabulate

countFalse = 0
countTrue = 0

for i in CategoricalPredictions:
    if i == False:
        countFalse += 1
    elif i == True:
        countTrue += 1
        
print(tabulate([['False', countFalse],['True', countTrue]]))

## Accuracy
87% accuracy - decent

In [10]:
from sklearn.metrics import accuracy_score
y_pred = CategoricalPredictions
y_true = y_train_0
print(accuracy_score(y_true, y_pred))

print(accuracy_score(y_true, y_pred, normalize=False))

accuracy = accuracy_score(y_true, y_pred)


0.8706833333333334
52241


## Confusion Matrix 

In [11]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true, y_pred)

array([[46528,  7472],
       [  287,  5713]], dtype=int64)

True negatives C0,0 is 46528 | False negatives C1,0 is 287 | True postivies C1,1 is 5713 | False postivies C0,1 is 7472. This is overly predicting true cases, but also predicts a good portion of negative cases with good accuracy. I'd say these results are neither amazing nor bad. My observation is that two items of clothing are very similar and so it is predicting them as the same item.

## Specificity

In [46]:
46528 / (52380 + 7472)

0.777384214395509

## Precision

In [12]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_true, y_pred)

0.433295411452408

## Recall

In [13]:
recall_score(y_true, y_pred)

0.9521666666666667

## F1

In [14]:
from sklearn.metrics import f1_score

f1_score(y_true, y_pred)

0.5955694553036226