In [1]:
import tensorflow as tf

In [2]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import precision_score, recall_score, fbeta_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif


In [3]:
fashion_mnist=tf.keras.datasets.fashion_mnist

In [5]:
(X_train,y_train),(X_test,y_test)=fashion_mnist.load_data()

In [6]:
test=np.array(X_test)

In [7]:
test.shape=(10000,28*28)

In [7]:
test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [8]:
train=np.array(X_train)

In [9]:
train.shape=(60000,784)

In [10]:
train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [11]:
df=pd.DataFrame(train)

In [12]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,119,114,130,76,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,22,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,33,96,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,0,0,0,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
59998,0,0,0,0,0,0,0,0,0,0,...,66,54,50,5,0,1,0,0,0,0


In [13]:
def confusion_matrix_show(cf_matrix):
    group_names = ['True Neg','False Pos','False Neg','True Pos']

    group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix.flatten()]

    group_percentages = ["{0:.2%}".format(value) for value in
                     cf_matrix.flatten()/np.sum(cf_matrix)]

    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]

    labels = np.asarray(labels).reshape(2,2)

    ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')

    ax.set_title('Model Confusion Matrix\n\n');
    ax.set_xlabel('\nPredictions')
    ax.set_ylabel('Actual');

    ## Ticket labels - List must be in alphabetical order
    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])
    
    
    ## Display the visualization of the Confusion Matrix.
    plt.show()
    
def roc_show(y_test,y_predict):
    plt.figure(dpi=100)
    fpr, tpr, thresholds = roc_curve(y_test, y_predict)
    plt.plot(fpr,tpr,label = "%.2f" %sklearn.metrics.roc_auc_score(y_test,y_predict))
    plt.plot([0, 1], [0, 1], color='crimson', linestyle='--')
    plt.legend(loc = 'lower right')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.title('ROC Curve for Diabetes for Model')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)

In [14]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(train,y_train, test_size=0.2, random_state=25)

# Set the figure size
plt.figure(figsize=(10,10))
input_data=df.iloc[:,1:]/255
# Show only the first 30 pictures
for i in range(30):
    plt.subplot(6,5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(np.array(input_data.iloc[i, :]).reshape(28,28), cmap=plt.cm.binary)
    plt.xlabel(label[target.label.iloc[i]])

In [33]:
# dummy model, to see what a "bad score" would look like

dummy_model = DummyClassifier()
dummy_model.fit(X_train2,y_train2)

y_predict = dummy_model.predict(X_test2)

y_train_pred = cross_val_predict(dummy_model, X_train2, y_train2, cv=10)

precision = precision_score(y_train2, y_train_pred, average='micro')

recall = recall_score(y_train2, y_train_pred, average='micro')

accuracy = dummy_model.score(X_test2, y_test2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy:{accuracy}")


Precision: 0.09933333333333333
Recall: 0.09933333333333333
Accuracy:0.10175




In [14]:
dummy_model.score(test,y_test)

0.1

In [42]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(train,y_train, test_size=0.2, random_state=25)

not good

In [34]:
decision_tree = DecisionTreeClassifier()
decision_tree =decision_tree.fit(X_train2,y_train2)

In [35]:
decision_tree.score(X_test2,y_test2)

y_train_pred = cross_val_predict(decision_tree, X_train2, y_train2, cv=3)

precision = precision_score(y_train2, y_train_pred, average='micro')

recall = recall_score(y_train2, y_train_pred, average='micro')

accuracy = decision_tree.score(X_test2, y_test2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy:{accuracy}")

Precision: 0.7936458333333334
Recall: 0.7936458333333334
Accuracy:0.79625


In [15]:
def select_features(X_train, y_train, X_test,i):
    
    # configure to select a subset of features
    fs = SelectKBest(score_func=f_classif, k=i)
    
    # learn relationship from training data
    fs.fit(X_train, y_train)
    
    # transform train input data
    X_train_fs = fs.transform(X_train)
    
    # transform test input data
    X_test_fs = fs.transform(X_test)
        
    return X_train_fs, X_test_fs, fs

In [14]:
X_train_fs, X_test_fs, fs = select_features(X_train2, y_train2, X_test2,600)

In [38]:
decision_tree = DecisionTreeClassifier()
decision_tree =decision_tree.fit(X_train_fs,y_train2)

after some manual tests, we recognized that the k-best number of features would be 600!

0.79875-700

0.7999166666666667-600

0.7940833333333334-500

0.7936666666666666-550

0.79475-575


In [39]:
y_train_pred = cross_val_predict(decision_tree, X_train_fs, y_train2, cv=3)

precision = precision_score(y_train2, y_train_pred, average='micro')

recall = recall_score(y_train2, y_train_pred, average='micro')

accuracy = decision_tree.score(X_test_fs, y_test2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy:{accuracy}")

Precision: 0.7849375
Recall: 0.7849375
Accuracy:0.8003333333333333


In [50]:
param_dist = {"max_depth": [3, 10, 20, None],
              "max_features": ['auto', 'sqrt'],
              "min_samples_leaf": [1, 2, 4],
              "criterion": ["gini", "entropy"]}
dt_random = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dist, n_iter = 100, cv = 3, verbose=3, random_state=42, n_jobs = -1, scoring='accuracy')
# Fit the random search model
dt_random.fit(X_train_fs, y_train2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   39.8s finished


RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 10, 20, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4]},
                   random_state=42, scoring='accuracy', verbose=3)

In [51]:
dt_random.best_params_

{'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'criterion': 'entropy'}

these are the best features for decision trees! we will now see how our tuned model will score

In [56]:
dt_tuned = DecisionTreeClassifier(min_samples_leaf=4,max_features='sqrt',
                                  max_depth=10,criterion='entropy')

dt_tuned.fit(X_train_fs,y_train2)

y_train_pred = cross_val_predict(dt_tuned, X_train_fs, y_train2, cv=3)

precision = precision_score(y_train2, y_train_pred, average='micro')

recall = recall_score(y_train2, y_train_pred, average='micro')

accuracy = dt_tuned.score(X_test_fs, y_test2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy:{accuracy}")

Precision: 0.7840416666666666
Recall: 0.7840416666666666
Accuracy:0.788


The hyperparameter tuning did not help our model. Our best decision tree model overall is the untouched one, but it is best to remove some unnecessary features (and the accuracy improved slightly), so our winner would be the decision tree model with feature selection

In [17]:
X_train_fs, X_test_fs, fs = select_features(X_train2, y_train2, X_test2,600)

In [14]:
knn = KNeighborsClassifier()

knn.fit(X_train2,y_train2)

KNeighborsClassifier()

In [15]:
y_train_pred = cross_val_predict(knn, X_train2, y_train2, cv=3)

precision = precision_score(y_train2, y_train_pred, average='micro')

recall = recall_score(y_train2, y_train_pred, average='micro')

accuracy = knn.score(X_test2, y_test2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy:{accuracy}")

Precision: 0.8478125
Recall: 0.8478125
Accuracy:0.85125


In [15]:
knn2 = KNeighborsClassifier()

knn2.fit(X_train_fs,y_train2)



KNeighborsClassifier()

In [16]:
y_train_pred = cross_val_predict(knn2, X_train_fs, y_train2, cv=3)

precision = precision_score(y_train2, y_train_pred, average='micro')

recall = recall_score(y_train2, y_train_pred, average='micro')

accuracy = knn2.score(X_test_fs, y_test2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy:{accuracy}")

Precision: 0.8492916666666667
Recall: 0.8492916666666667
Accuracy:0.8514166666666667


we have slight improvement, therefore we will be using the feature selected model

In [101]:
knn2.score(X_test_fs,y_test2)

0.8514166666666667

In [44]:
cross_val_score(knn,train,y_train,cv=5)

array([0.85741667, 0.84875   , 0.85358333, 0.85875   , 0.8515    ])

In [98]:
knn.score(X_test2,y_test2)

0.85125

In [None]:
k2 = []
ind=[]
ind=range(1,100)

for  i in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=i)
    score=cross_val_score(knn,X_train_fs,y_train2,cv=3,scoring='accuracy')
    k2.append(score.mean())
d2 = {'K': ind, 'Accuracy': k2}
dfd2 = pd.DataFrame(data=d2)
dfd2.sort_values('Accuracy',ascending=False,inplace=True,ignore_index=True)
dfd2.head(10)

In [47]:
model = AdaBoostClassifier()
# evaluate the model
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
model.fit(train,y_train)

AdaBoostClassifier()

https://medium.com/@chaudhurysrijani/tuning-of-adaboost-with-computational-complexity-8727d01a9d20#:~:text=An%20important%20hyperparameter%20for%20Adaboost,often%20hundreds%2C%20if%20not%20thousands.

In [49]:
model.score(test,y_test)

y_train_pred = cross_val_predict(model, X_train2, y_train2, cv=3)

precision = precision_score(y_train2, y_train_pred, average='micro')

recall = recall_score(y_train2, y_train_pred, average='micro')

accuracy = model.score(X_test2, y_test2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy:{accuracy}")


0.5425