In [2]:
import os
import random
import struct
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import catboost as cb 
from sklearn import datasets,svm
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

# 数据装填

In [3]:
def load_mnist( kind='train'):
    """Load MNIST data from `path`"""
    labels_path = ('%s-labels.idx1-ubyte' % kind)
    images_path = ('%s-images.idx3-ubyte' % kind)

    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II', lbpath.read(8))
        labels = np.fromfile(lbpath, dtype=np.uint8)
        #print(magic, n,labels)

    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack(">IIII", imgpath.read(16))
        #print(magic, num, rows, cols)
        images = np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), 784)  #28*28=784
    return images, labels


In [4]:
train_ori,y_train=load_mnist(kind='train')
train_plot=train_ori
test_ori,y_test=load_mnist(kind='t10k')
test_plot=test_ori

In [5]:
train_ori=pd.DataFrame(train_ori)
test_plot=pd.DataFrame(train_ori)
y_train=pd.DataFrame(y_train)
test_ori=pd.DataFrame(test_ori)
test_plot=pd.DataFrame(test_ori)
y_test=pd.DataFrame(y_test)
print(train_ori,y_train,test_ori,y_test)

       0    1    2    3    4    5    6    7    8    9    ...  774  775  776  \
0        0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
1        0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
2        0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
3        0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
4        0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
59995    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
59996    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
59997    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
59998    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
59999    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   

       777  778  779  780  781  782  783  
0       

# 检查有无用（即灰度一直不变）的像素并去除

In [6]:
feats_counts = train_ori.nunique(dropna = False)
feats_counts.sort_values()

0        1
560      1
476      1
782      1
168      1
      ... 
410    256
409    256
408    256
159    256
411    256
Length: 784, dtype: int64

In [7]:
feats_counts_exchange1=pd.DataFrame(feats_counts)
feats_counts_exchange2=feats_counts_exchange1[feats_counts_exchange1==1]
print(feats_counts_exchange2.isnull().sum())
feats_drop=feats_counts_exchange2[feats_counts_exchange2.notnull().all(axis=1)]
print(feats_drop.index)

0    717
dtype: int64
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  16,
             17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
             30,  31,  52,  53,  54,  55,  56,  57,  82,  83,  84,  85, 111,
            112, 140, 141, 168, 476, 560, 644, 645, 671, 672, 673, 699, 700,
            701, 727, 728, 729, 730, 754, 755, 756, 757, 758, 759, 780, 781,
            782, 783],
           dtype='int64')


很明显都是一些处在图像边角的像素点，所以把这些边角点去掉

In [8]:
train_ori=train_ori.drop(labels=feats_drop.index,axis = 1)
train_ori

Unnamed: 0,12,13,14,15,32,33,34,35,36,37,...,770,771,772,773,774,775,776,777,778,779
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
test_ori=test_ori.drop(labels=feats_drop.index,axis = 1)
test_ori

Unnamed: 0,12,13,14,15,32,33,34,35,36,37,...,770,771,772,773,774,775,776,777,778,779
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#二值化

train_ori[train_ori>0]=1
test_ori[test_ori>0]=1

In [9]:
train=train_ori#train_ori,train_pca
test=test_ori#test=test_ori,test_pca

In [11]:
model_M = MultinomialNB()
model_G = GaussianNB()
model_B = BernoulliNB()

model_M.fit(train,y_train)#8365
preds_M = model_M.predict(test)
print(confusion_matrix(y_test, preds_M))
print(accuracy_score(y_test,preds_M))

model_G.fit(train,y_train)
preds_G = model_G.predict(test)
print(confusion_matrix(y_test, preds_G))
print(accuracy_score(y_test,preds_G))

model_B.fit(train,y_train)#8413
preds_B = model_B.predict(test)
print(confusion_matrix(y_test, preds_B))
print(accuracy_score(y_test,preds_B))

  return f(**kwargs)


[[ 912    0    2    6    1    8   14    1   36    0]
 [   0 1061    5    9    0    2    6    0   51    1]
 [  15   11  858   24   10    3   33   11   66    1]
 [   4   11   34  851    1   21    7   14   40   27]
 [   2    2    6    0  732    0   25    1   38  176]
 [  23   11    6  107   18  590   17    6   78   36]
 [  17   13   17    1    7   25  860    0   18    0]
 [   1   21   11    5   19    0    1  861   40   69]
 [   6   26   13   54   14   27    8    9  777   40]
 [   6    7    3   10   66   10    0   17   27  863]]
0.8365


  return f(**kwargs)


[[ 870    0    3    5    2    5   31    1   35   28]
 [   0 1079    2    1    0    0   10    0   38    5]
 [  79   25  266   91    5    2  269    4  271   20]
 [  32   39    6  353    2    3   51    8  409  107]
 [  19    2    5    4  168    7   63    7  210  497]
 [  71   25    1   20    3   44   40    2  586  100]
 [  12   12    3    1    1    7  895    0   26    1]
 [   0   15    2   10    5    1    5  280   39  671]
 [  13   72    3    7    3   11   12    4  648  201]
 [   5    7    3    6    1    0    1   13   18  955]]
0.5558


  return f(**kwargs)


[[ 887    0    4    7    2   41   16    1   22    0]
 [   0 1085   10    5    0    9    6    0   19    1]
 [  19    8  852   29   17    4   32   14   55    2]
 [   5   15   34  844    0   13    9   15   49   26]
 [   2    6    4    0  795    4   21    1   23  126]
 [  23   12    7  129   30  627   16    8   21   19]
 [  18   18   15    2   13   35  851    0    6    0]
 [   1   24   14    4   15    0    0  871   27   72]
 [  16   23   13   76   17   22    7    6  758   36]
 [   9   13    5    9   74    8    0   24   24  843]]
0.8413


In [12]:
model = BernoulliNB()
param_dist = {"binarize": [0.5],
              "alpha" : [0,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09],
              "fit_prior" : [True,False]}

grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
                                   verbose=10, n_jobs=-1)
grid_search.fit(train, y_train)
grid_search.best_estimator_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:  3.6min remaining:   33.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.9min finished
  return f(**kwargs)


BernoulliNB(alpha=0, binarize=0.5)

In [13]:
model_Best = BernoulliNB()#(alpha=0, binarize=0.5)
model_Best.fit(train,y_train)#8419
preds = model_Best.predict(test)
print(confusion_matrix(y_test, preds))
print(accuracy_score(y_test,preds))

  return f(**kwargs)


[[ 887    0    4    7    2   41   16    1   22    0]
 [   0 1085   10    5    0    9    6    0   19    1]
 [  19    8  852   29   17    4   32   14   55    2]
 [   5   15   34  844    0   13    9   15   49   26]
 [   2    6    4    0  795    4   21    1   23  126]
 [  23   12    7  129   30  627   16    8   21   19]
 [  18   18   15    2   13   35  851    0    6    0]
 [   1   24   14    4   15    0    0  871   27   72]
 [  16   23   13   76   17   22    7    6  758   36]
 [   9   13    5    9   74    8    0   24   24  843]]
0.8413


In [37]:
%matplotlib
fig, ax = plt.subplots(
    nrows=2,
    ncols=10,
    sharex=True,
    sharey=True, )

ax = ax.flatten()
k=random.randint(0,9980)
n=0
for i in range(10):
    for j in range(2):
        print("perdict:%d, True:%d" %(preds[k],y_test.values[k]))
        img = np.array(test_plot)[k,:].reshape(28, 28)
        ax[n].imshow(img, cmap='Greys', interpolation='nearest')
        k=k+1
        n=n+1

ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
plt.show()

Using matplotlib backend: Qt5Agg
perdict:6, True:6
perdict:7, True:7
perdict:8, True:8
perdict:9, True:9
perdict:0, True:0
perdict:1, True:1
perdict:2, True:2
perdict:3, True:3
perdict:4, True:4
perdict:5, True:5
perdict:6, True:6
perdict:7, True:7
perdict:8, True:8
perdict:9, True:9
perdict:8, True:8
perdict:0, True:0
perdict:5, True:5
perdict:6, True:6
perdict:6, True:6
perdict:0, True:0


# 降维

In [29]:
pca = PCA(n_components=717)
train_pca = pca.fit_transform(train_ori.values[:,0:718]) 
test_pca = pca.fit_transform(test_ori.values[:,0:718]) 
print(pca.explained_variance_ratio_)#方差百分比，越大越重要

[1.00476637e-01 7.54448687e-02 6.14051640e-02 5.42580757e-02
 5.03124907e-02 4.24636351e-02 3.31140388e-02 2.95028844e-02
 2.72985783e-02 2.27804148e-02 2.13389874e-02 2.09520381e-02
 1.72589817e-02 1.68575787e-02 1.57607739e-02 1.49456231e-02
 1.30250648e-02 1.27846749e-02 1.18011852e-02 1.14829974e-02
 1.06640981e-02 1.02717794e-02 9.79714914e-03 9.01867308e-03
 8.88230598e-03 8.26572580e-03 7.99743217e-03 7.65837906e-03
 7.17513682e-03 6.87255762e-03 6.49144290e-03 6.27159038e-03
 5.90284431e-03 5.81758998e-03 5.44737613e-03 5.41655176e-03
 5.07242566e-03 4.81120974e-03 4.60166113e-03 4.51522794e-03
 4.37027009e-03 4.24364020e-03 4.12963974e-03 3.89631088e-03
 3.78241088e-03 3.70460718e-03 3.58284288e-03 3.31726366e-03
 3.26815238e-03 3.11841988e-03 3.04071146e-03 2.97838557e-03
 2.88486522e-03 2.78701693e-03 2.73021218e-03 2.62964970e-03
 2.54962808e-03 2.53014061e-03 2.44552889e-03 2.41445395e-03
 2.34363158e-03 2.33338799e-03 2.23071018e-03 2.15530296e-03
 2.08822408e-03 2.023286

In [30]:
def count(array_input,delta):
    m = 0
    for i in range(len(array_input)):
        m += array_input[i]
        if(m>1-delta):
            return i

In [31]:
count(pca.explained_variance_ratio_,0.01)

322

前322个像素点方差百分比之和大于99%，故降成322维

In [32]:
pca = PCA(n_components=322)
train_pca = pca.fit_transform(train_ori.values[:,0:718]) 
test_pca = pca.fit_transform(test_ori.values[:,0:718]) 

In [33]:
train=train_pca#train_ori,train_pca
test=test_pca#test=test_ori,test_pca

In [34]:
model_B.fit(train,y_train)#8413
preds_B = model_B.predict(test)
print(confusion_matrix(y_test, preds_B))
print(accuracy_score(y_test,preds_B))

  return f(**kwargs)


[[286   1  37  32  76 129 147 149  56  67]
 [  1   3 108   5 255  18  46 554   2 143]
 [ 46  33  57  11 183  36 454  53  28 131]
 [ 88  16  25  22 233  89  51 121  43 322]
 [ 70 148 133 209  11  82  86   4 227  12]
 [148  53  34 129 130  68  18 142  15 155]
 [ 96  19 422   1 106  23 141  49  60  41]
 [ 93 354  25 100  18 173  76  25 122  42]
 [ 16   6  49 203 205  91  43 147  38 176]
 [ 51  91 101 306   8 115  56  26 243  12]]
0.0663


In [97]:
train=train_ori#train_ori,train_pca
test=test_ori#test=test_ori,test_pca

In [98]:
#cb

model_cb = cb.CatBoostClassifier(eval_metric="AUC", depth=7,l2_leaf_reg= 9, learning_rate= 0.04)
model_cb.fit(train,y_train)
preds = model_cb.predict(test)
print(confusion_matrix(y_test, preds))#9680

0:	total: 3.65s	remaining: 1h 46s
1:	total: 7.14s	remaining: 59m 20s
2:	total: 10.6s	remaining: 58m 44s
3:	total: 14.2s	remaining: 58m 51s
4:	total: 17.7s	remaining: 58m 49s
5:	total: 21.3s	remaining: 58m 41s
6:	total: 24.8s	remaining: 58m 38s
7:	total: 28.4s	remaining: 58m 37s
8:	total: 32.2s	remaining: 59m 6s
9:	total: 35.8s	remaining: 59m 2s
10:	total: 39.3s	remaining: 58m 52s
11:	total: 42.8s	remaining: 58m 43s
12:	total: 46.5s	remaining: 58m 50s
13:	total: 50.2s	remaining: 58m 53s
14:	total: 53.7s	remaining: 58m 45s
15:	total: 57.2s	remaining: 58m 38s
16:	total: 1m	remaining: 58m 42s
17:	total: 1m 4s	remaining: 58m 44s
18:	total: 1m 8s	remaining: 58m 42s
19:	total: 1m 11s	remaining: 58m 37s
20:	total: 1m 15s	remaining: 58m 36s
21:	total: 1m 19s	remaining: 58m 35s
22:	total: 1m 22s	remaining: 58m 25s
23:	total: 1m 25s	remaining: 58m 14s
24:	total: 1m 29s	remaining: 58m 13s
25:	total: 1m 33s	remaining: 58m 9s
26:	total: 1m 36s	remaining: 58m 5s
27:	total: 1m 40s	remaining: 57m 56s
2

221:	total: 13m 27s	remaining: 47m 11s
222:	total: 13m 31s	remaining: 47m 7s
223:	total: 13m 35s	remaining: 47m 4s
224:	total: 13m 38s	remaining: 47m
225:	total: 13m 42s	remaining: 46m 56s
226:	total: 13m 46s	remaining: 46m 52s
227:	total: 13m 49s	remaining: 46m 49s
228:	total: 13m 53s	remaining: 46m 45s
229:	total: 13m 56s	remaining: 46m 40s
230:	total: 13m 59s	remaining: 46m 36s
231:	total: 14m 3s	remaining: 46m 32s
232:	total: 14m 7s	remaining: 46m 28s
233:	total: 14m 10s	remaining: 46m 24s
234:	total: 14m 14s	remaining: 46m 20s
235:	total: 14m 17s	remaining: 46m 16s
236:	total: 14m 21s	remaining: 46m 12s
237:	total: 14m 24s	remaining: 46m 8s
238:	total: 14m 28s	remaining: 46m 4s
239:	total: 14m 31s	remaining: 46m 1s
240:	total: 14m 35s	remaining: 45m 57s
241:	total: 14m 38s	remaining: 45m 52s
242:	total: 14m 42s	remaining: 45m 48s
243:	total: 14m 45s	remaining: 45m 43s
244:	total: 14m 49s	remaining: 45m 40s
245:	total: 14m 52s	remaining: 45m 36s
246:	total: 14m 56s	remaining: 45m 3

434:	total: 25m 30s	remaining: 33m 7s
435:	total: 25m 33s	remaining: 33m 4s
436:	total: 25m 37s	remaining: 33m
437:	total: 25m 40s	remaining: 32m 56s
438:	total: 25m 43s	remaining: 32m 52s
439:	total: 25m 46s	remaining: 32m 48s
440:	total: 25m 50s	remaining: 32m 45s
441:	total: 25m 53s	remaining: 32m 41s
442:	total: 25m 56s	remaining: 32m 37s
443:	total: 26m	remaining: 32m 33s
444:	total: 26m 3s	remaining: 32m 29s
445:	total: 26m 6s	remaining: 32m 26s
446:	total: 26m 10s	remaining: 32m 22s
447:	total: 26m 13s	remaining: 32m 18s
448:	total: 26m 16s	remaining: 32m 14s
449:	total: 26m 20s	remaining: 32m 11s
450:	total: 26m 23s	remaining: 32m 7s
451:	total: 26m 27s	remaining: 32m 4s
452:	total: 26m 30s	remaining: 32m
453:	total: 26m 34s	remaining: 31m 57s
454:	total: 26m 38s	remaining: 31m 54s
455:	total: 26m 41s	remaining: 31m 50s
456:	total: 26m 45s	remaining: 31m 47s
457:	total: 26m 49s	remaining: 31m 44s
458:	total: 26m 52s	remaining: 31m 40s
459:	total: 26m 56s	remaining: 31m 37s
460:

647:	total: 37m 17s	remaining: 20m 15s
648:	total: 37m 20s	remaining: 20m 11s
649:	total: 37m 23s	remaining: 20m 8s
650:	total: 37m 26s	remaining: 20m 4s
651:	total: 37m 30s	remaining: 20m 1s
652:	total: 37m 33s	remaining: 19m 57s
653:	total: 37m 36s	remaining: 19m 53s
654:	total: 37m 39s	remaining: 19m 50s
655:	total: 37m 43s	remaining: 19m 46s
656:	total: 37m 46s	remaining: 19m 43s
657:	total: 37m 49s	remaining: 19m 39s
658:	total: 37m 52s	remaining: 19m 36s
659:	total: 37m 56s	remaining: 19m 32s
660:	total: 37m 59s	remaining: 19m 29s
661:	total: 38m 2s	remaining: 19m 25s
662:	total: 38m 6s	remaining: 19m 22s
663:	total: 38m 9s	remaining: 19m 18s
664:	total: 38m 12s	remaining: 19m 14s
665:	total: 38m 15s	remaining: 19m 11s
666:	total: 38m 19s	remaining: 19m 7s
667:	total: 38m 22s	remaining: 19m 4s
668:	total: 38m 25s	remaining: 19m
669:	total: 38m 28s	remaining: 18m 57s
670:	total: 38m 31s	remaining: 18m 53s
671:	total: 38m 35s	remaining: 18m 50s
672:	total: 38m 39s	remaining: 18m 46

861:	total: 48m 24s	remaining: 7m 45s
862:	total: 48m 27s	remaining: 7m 41s
863:	total: 48m 31s	remaining: 7m 38s
864:	total: 48m 34s	remaining: 7m 34s
865:	total: 48m 37s	remaining: 7m 31s
866:	total: 48m 40s	remaining: 7m 27s
867:	total: 48m 43s	remaining: 7m 24s
868:	total: 48m 46s	remaining: 7m 21s
869:	total: 48m 49s	remaining: 7m 17s
870:	total: 48m 52s	remaining: 7m 14s
871:	total: 48m 55s	remaining: 7m 10s
872:	total: 48m 58s	remaining: 7m 7s
873:	total: 49m 2s	remaining: 7m 4s
874:	total: 49m 5s	remaining: 7m
875:	total: 49m 8s	remaining: 6m 57s
876:	total: 49m 11s	remaining: 6m 54s
877:	total: 49m 15s	remaining: 6m 50s
878:	total: 49m 18s	remaining: 6m 47s
879:	total: 49m 22s	remaining: 6m 43s
880:	total: 49m 25s	remaining: 6m 40s
881:	total: 49m 28s	remaining: 6m 37s
882:	total: 49m 32s	remaining: 6m 33s
883:	total: 49m 35s	remaining: 6m 30s
884:	total: 49m 38s	remaining: 6m 27s
885:	total: 49m 42s	remaining: 6m 23s
886:	total: 49m 46s	remaining: 6m 20s
887:	total: 49m 49s	r

In [101]:
print(accuracy_score(y_test,preds))

0.968


In [10]:
train_ori[train_ori>0]=1
test_ori[test_ori>0]=1

In [39]:
train_ori

Unnamed: 0,12,13,14,15,32,33,34,35,36,37,...,770,771,772,773,774,775,776,777,778,779
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
test_ori

Unnamed: 0,12,13,14,15,32,33,34,35,36,37,...,770,771,772,773,774,775,776,777,778,779
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
train=train_ori#train_ori,train_pca
test=test_ori#test=test_ori,test_pca

In [12]:
cat_features_index = np.where(train[train.columns].dtypes!=float)[0]

#With Categorical features
model_cb_coded = cb.CatBoostClassifier(eval_metric="AUC", depth=7,l2_leaf_reg= 9, learning_rate= 0.04)
model_cb_coded.fit(train,y_train, cat_features=cat_features_index)
preds_cb_coded= model_cb_coded.predict(test)
print(confusion_matrix(y_test, preds_cb_coded))
print(accuracy_score(y_test,preds_cb_coded))

0:	total: 820ms	remaining: 13m 38s
1:	total: 1.47s	remaining: 12m 15s
2:	total: 2.05s	remaining: 11m 21s
3:	total: 2.65s	remaining: 11m
4:	total: 3.25s	remaining: 10m 46s
5:	total: 3.85s	remaining: 10m 37s
6:	total: 4.49s	remaining: 10m 36s
7:	total: 5.07s	remaining: 10m 28s
8:	total: 5.66s	remaining: 10m 23s
9:	total: 6.24s	remaining: 10m 18s
10:	total: 6.93s	remaining: 10m 23s
11:	total: 7.5s	remaining: 10m 17s
12:	total: 8.09s	remaining: 10m 14s
13:	total: 8.68s	remaining: 10m 11s
14:	total: 9.29s	remaining: 10m 9s
15:	total: 9.9s	remaining: 10m 8s
16:	total: 10.5s	remaining: 10m 7s
17:	total: 11.1s	remaining: 10m 6s
18:	total: 11.7s	remaining: 10m 5s
19:	total: 12.4s	remaining: 10m 7s
20:	total: 13.1s	remaining: 10m 12s
21:	total: 13.7s	remaining: 10m 10s
22:	total: 14.3s	remaining: 10m 8s
23:	total: 14.9s	remaining: 10m 7s
24:	total: 15.5s	remaining: 10m 6s
25:	total: 16.1s	remaining: 10m 4s
26:	total: 16.7s	remaining: 10m 2s
27:	total: 17.4s	remaining: 10m 4s
28:	total: 18.1s	rem

229:	total: 2m 22s	remaining: 7m 57s
230:	total: 2m 23s	remaining: 7m 56s
231:	total: 2m 23s	remaining: 7m 56s
232:	total: 2m 24s	remaining: 7m 55s
233:	total: 2m 25s	remaining: 7m 54s
234:	total: 2m 25s	remaining: 7m 54s
235:	total: 2m 26s	remaining: 7m 53s
236:	total: 2m 26s	remaining: 7m 53s
237:	total: 2m 27s	remaining: 7m 52s
238:	total: 2m 28s	remaining: 7m 51s
239:	total: 2m 28s	remaining: 7m 50s
240:	total: 2m 29s	remaining: 7m 50s
241:	total: 2m 29s	remaining: 7m 49s
242:	total: 2m 30s	remaining: 7m 48s
243:	total: 2m 31s	remaining: 7m 48s
244:	total: 2m 31s	remaining: 7m 47s
245:	total: 2m 32s	remaining: 7m 47s
246:	total: 2m 33s	remaining: 7m 47s
247:	total: 2m 33s	remaining: 7m 46s
248:	total: 2m 34s	remaining: 7m 45s
249:	total: 2m 35s	remaining: 7m 45s
250:	total: 2m 35s	remaining: 7m 44s
251:	total: 2m 36s	remaining: 7m 43s
252:	total: 2m 36s	remaining: 7m 42s
253:	total: 2m 37s	remaining: 7m 42s
254:	total: 2m 37s	remaining: 7m 41s
255:	total: 2m 38s	remaining: 7m 40s
2

453:	total: 4m 36s	remaining: 5m 32s
454:	total: 4m 37s	remaining: 5m 31s
455:	total: 4m 37s	remaining: 5m 31s
456:	total: 4m 38s	remaining: 5m 30s
457:	total: 4m 38s	remaining: 5m 30s
458:	total: 4m 39s	remaining: 5m 29s
459:	total: 4m 40s	remaining: 5m 28s
460:	total: 4m 40s	remaining: 5m 28s
461:	total: 4m 41s	remaining: 5m 27s
462:	total: 4m 41s	remaining: 5m 26s
463:	total: 4m 42s	remaining: 5m 26s
464:	total: 4m 42s	remaining: 5m 25s
465:	total: 4m 43s	remaining: 5m 24s
466:	total: 4m 43s	remaining: 5m 24s
467:	total: 4m 44s	remaining: 5m 23s
468:	total: 4m 44s	remaining: 5m 22s
469:	total: 4m 45s	remaining: 5m 22s
470:	total: 4m 46s	remaining: 5m 21s
471:	total: 4m 46s	remaining: 5m 20s
472:	total: 4m 47s	remaining: 5m 20s
473:	total: 4m 47s	remaining: 5m 19s
474:	total: 4m 48s	remaining: 5m 18s
475:	total: 4m 48s	remaining: 5m 18s
476:	total: 4m 49s	remaining: 5m 17s
477:	total: 4m 50s	remaining: 5m 16s
478:	total: 4m 50s	remaining: 5m 16s
479:	total: 4m 51s	remaining: 5m 15s
4

677:	total: 6m 48s	remaining: 3m 14s
678:	total: 6m 49s	remaining: 3m 13s
679:	total: 6m 49s	remaining: 3m 12s
680:	total: 6m 50s	remaining: 3m 12s
681:	total: 6m 51s	remaining: 3m 11s
682:	total: 6m 51s	remaining: 3m 11s
683:	total: 6m 52s	remaining: 3m 10s
684:	total: 6m 52s	remaining: 3m 9s
685:	total: 6m 53s	remaining: 3m 9s
686:	total: 6m 53s	remaining: 3m 8s
687:	total: 6m 54s	remaining: 3m 7s
688:	total: 6m 55s	remaining: 3m 7s
689:	total: 6m 55s	remaining: 3m 6s
690:	total: 6m 56s	remaining: 3m 6s
691:	total: 6m 56s	remaining: 3m 5s
692:	total: 6m 57s	remaining: 3m 4s
693:	total: 6m 57s	remaining: 3m 4s
694:	total: 6m 58s	remaining: 3m 3s
695:	total: 6m 58s	remaining: 3m 2s
696:	total: 6m 59s	remaining: 3m 2s
697:	total: 7m	remaining: 3m 1s
698:	total: 7m	remaining: 3m 1s
699:	total: 7m 1s	remaining: 3m
700:	total: 7m 1s	remaining: 2m 59s
701:	total: 7m 2s	remaining: 2m 59s
702:	total: 7m 2s	remaining: 2m 58s
703:	total: 7m 3s	remaining: 2m 58s
704:	total: 7m 4s	remaining: 2m 5

902:	total: 9m 2s	remaining: 58.3s
903:	total: 9m 3s	remaining: 57.7s
904:	total: 9m 3s	remaining: 57.1s
905:	total: 9m 4s	remaining: 56.5s
906:	total: 9m 4s	remaining: 55.9s
907:	total: 9m 5s	remaining: 55.3s
908:	total: 9m 5s	remaining: 54.7s
909:	total: 9m 6s	remaining: 54s
910:	total: 9m 7s	remaining: 53.4s
911:	total: 9m 7s	remaining: 52.8s
912:	total: 9m 8s	remaining: 52.2s
913:	total: 9m 8s	remaining: 51.6s
914:	total: 9m 9s	remaining: 51s
915:	total: 9m 9s	remaining: 50.4s
916:	total: 9m 10s	remaining: 49.8s
917:	total: 9m 11s	remaining: 49.2s
918:	total: 9m 11s	remaining: 48.6s
919:	total: 9m 12s	remaining: 48s
920:	total: 9m 12s	remaining: 47.4s
921:	total: 9m 13s	remaining: 46.8s
922:	total: 9m 13s	remaining: 46.2s
923:	total: 9m 14s	remaining: 45.6s
924:	total: 9m 15s	remaining: 45s
925:	total: 9m 16s	remaining: 44.4s
926:	total: 9m 16s	remaining: 43.8s
927:	total: 9m 17s	remaining: 43.2s
928:	total: 9m 17s	remaining: 42.6s
929:	total: 9m 18s	remaining: 42.1s
930:	total: 9m

In [38]:
%matplotlib
fig, ax = plt.subplots(
    nrows=2,
    ncols=10,
    sharex=True,
    sharey=True, )

ax = ax.flatten()
k=random.randint(0,9980)
n=0
for i in range(10):
    for j in range(2):
        print("perdict:%d, True:%d" %(preds_cb_coded[k],y_test.values[k]))
        img = np.array(test_plot)[k,:].reshape(28, 28)
        ax[n].imshow(img, cmap='Greys', interpolation='nearest')
        k=k+1
        n=n+1

ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
plt.show()

Using matplotlib backend: Qt5Agg
perdict:7, True:7
perdict:9, True:9
perdict:4, True:4
perdict:6, True:6
perdict:7, True:7
perdict:1, True:1
perdict:3, True:3
perdict:1, True:7
perdict:3, True:3
perdict:6, True:6
perdict:6, True:6
perdict:0, True:0
perdict:9, True:9
perdict:0, True:0
perdict:1, True:1
perdict:9, True:9
perdict:9, True:9
perdict:2, True:2
perdict:8, True:8
perdict:8, True:8
