In [44]:
import numpy as np
from sklearn.decomposition import pca
import time
from sklearn import metrics
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation, Convolution2D, MaxPooling2D, Flatten
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adam
from keras.optimizers import RMSprop

## 读取MNIST数据
* 两组数据
* 1. train 1-2000, test 2001-4000
* 2. train 2001-4000, test 1-2000
* 每组（2000个样本）内的数据按0-9的顺序排列，每个数字有两百个

In [3]:
def load_data(flag=1,normalize=False):
    """
    Load the 4000-sample digit data set and split it into two halves.

    flag: 1 -> train on rows 0..1999 and test on rows 2000..3999;
          any other value -> the two halves are swapped.
    normalize: when True, divide the feature vectors by 255
          (presumably 0-255 pixel intensities -- confirm against the data
          files). Labels are class ids and are never rescaled.

    return: X_train, Y_train, X_test, Y_test (NumPy arrays)
    """
    labels = get_data('digits4000_txt/digits4000_digits_labels.txt')
    digits = get_data('digits4000_txt/digits4000_digits_vec.txt')

    first_half = slice(None, 2000)
    second_half = slice(2000, None)
    if flag == 1:
        train_rows, test_rows = first_half, second_half
    else:
        train_rows, test_rows = second_half, first_half

    X_train, Y_train = digits[train_rows], labels[train_rows]
    X_test, Y_test = digits[test_rows], labels[test_rows]

    if normalize:
        # Scale the features only. Dividing the labels by 255 as well (as
        # this function used to do) turns the class ids into fractions and
        # breaks every classifier trained on them.
        X_train = X_train / 255
        X_test = X_test / 255

    return np.array(X_train),np.array(Y_train),np.array(X_test),np.array(Y_test)

def get_data(filename):
    """Read a numeric text file with np.loadtxt and return the resulting array."""
    return np.loadtxt(filename)

## 各种Classifier的实现

### KNN Classifier

In [4]:
def knn_classifier(train_x,train_y):
    """Fit a KNeighborsClassifier with library-default settings and return it.

    train_x: training feature matrix passed straight to ``fit``.
    train_y: training labels passed straight to ``fit``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)
    return knn

### Logistic Classifier

In [5]:
def logistic_regression_classifier(train_x, train_y):
    """Fit an L2-penalised LogisticRegression and return the fitted estimator.

    train_x: training feature matrix passed straight to ``fit``.
    train_y: training labels passed straight to ``fit``.
    """
    from sklearn.linear_model import LogisticRegression

    settings = {'penalty': 'l2'}
    logreg = LogisticRegression(**settings)
    logreg.fit(train_x, train_y)
    return logreg

### Decision Tree Classifier

In [6]:
def decision_tree_classifier(train_x, train_y):
    """Fit a DecisionTreeClassifier with library-default settings and return it.

    No random_state is passed to the estimator.

    train_x: training feature matrix passed straight to ``fit``.
    train_y: training labels passed straight to ``fit``.
    """
    from sklearn.tree import DecisionTreeClassifier

    dtree = DecisionTreeClassifier()
    dtree.fit(train_x, train_y)
    return dtree

### SVM Classifier

In [7]:
def svm_classifier(train_x, train_y):
    """Fit a linear-kernel SVC (C=1) and return the fitted estimator.

    train_x: training feature matrix passed straight to ``fit``.
    train_y: training labels passed straight to ``fit``.
    """
    from sklearn.svm import SVC

    # NOTE(review): scikit-learn documents gamma as the coefficient for the
    # 'rbf', 'poly' and 'sigmoid' kernels, so it presumably has no effect with
    # kernel='linear' -- confirm before tuning it. Kept to preserve the call.
    svc_settings = {'kernel': 'linear', 'gamma': 0.01, 'C': 1}
    svc = SVC(**svc_settings)
    svc.fit(train_x, train_y)
    return svc

### CNN model
* 模型参数直接在这个里面改
* CNN不需要进行PCA

In [21]:
def cnn_classifier(train_x, train_y):
    """Build, train and return a two-stage convolutional network.

    train_x: training features; reshaped here to (-1, 1, 28, 28), i.e. one
        channel-first 28x28 image per sample (assumes each row holds 784
        values -- confirm against the data file).
    train_y: integer class labels; one-hot encoded into 10 classes.

    Architecture: conv(32, 5x5) -> relu -> maxpool(2) -> conv(64, 5x5) -> relu
    -> maxpool(2) -> flatten -> dense(1024) -> relu -> dense(10) -> softmax.
    Trained with Adam (lr=1e-4) on categorical cross-entropy for 20 epochs
    with batch size 64. Hyper-parameters are meant to be edited in place.
    """
    images = train_x.reshape(-1, 1, 28, 28)
    one_hot_labels = np_utils.to_categorical(train_y, num_classes=10)

    # Layers are created in the same order in which they are stacked.
    layer_stack = [
        Convolution2D(filters=32,
                      kernel_size=5,
                      strides=1,
                      padding='same',
                      batch_input_shape=(None, 1, 28, 28),
                      data_format='channels_first'),
        Activation('relu'),
        MaxPooling2D(pool_size=2,
                     strides=2,
                     padding='same',
                     data_format='channels_first'),
        Convolution2D(64, 5,
                      strides=1,
                      padding='same',
                      data_format='channels_first'),
        Activation('relu'),
        MaxPooling2D(2, 2, 'same', data_format='channels_first'),
        Flatten(),
        Dense(1024),
        Activation('relu'),
        Dense(10),
        Activation('softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer=Adam(lr=1e-4),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(images, one_hot_labels, epochs=20, batch_size=64)

    return model
    

### DNN model
* 和CNN model相似，结构不一样
* DNN可以进行PCA

In [51]:
def dnn_classifier(train_x, train_y):
    """Build, train and return a fully connected network.

    train_x: training features; ``train_x.shape[1]`` is used as the input
        width, so a 2-D (samples, features) array is assumed.
    train_y: integer class labels; one-hot encoded into 10 classes.

    Architecture: dense(512) -> relu -> dropout(0.2) -> dense(512) -> relu
    -> dropout(0.2) -> dense(10) -> softmax. Trained with RMSprop on
    categorical cross-entropy for 20 epochs with batch size 100.
    """
    n_classes = 10
    hidden_units = 512
    drop_rate = 0.2
    n_epochs = 20
    samples_per_batch = 100

    targets = np_utils.to_categorical(train_y, num_classes=n_classes)
    n_features = train_x.shape[1]

    model = Sequential()

    # First hidden block; it also fixes the input width.
    model.add(Dense(hidden_units, input_shape=(n_features,)))
    model.add(Activation('relu'))
    model.add(Dropout(drop_rate))

    # Second hidden block.
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(drop_rate))

    # Output layer: one probability per class.
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
    model.fit(train_x, targets, batch_size=samples_per_batch, epochs=n_epochs)

    return model

## 进行分析

### 使用PCA降维

### PCA model

In [10]:
def PCA_model(train_x,components=600):
    """Fit a PCA model on the training features and return it.

    train_x: feature matrix the PCA is fitted on.
    components: value passed to PCA as ``n_components`` (default 600).

    return: the fitted PCA object; call ``.transform`` on it to project data.
    """
    # Import PCA from the public package namespace. The private module
    # `sklearn.decomposition.pca` was deprecated and later removed in newer
    # scikit-learn releases, so `pca.PCA` is not a stable way to reach it.
    from sklearn.decomposition import PCA

    model = PCA(n_components=components).fit(train_x)
    return model

### training和testing的部分

In [39]:
# Names of the classifier functions to run. 'cnn_classifier' is also
# available but is left out of this test run.
classifiers = [
    'knn_classifier',
    'logistic_regression_classifier',
    'decision_tree_classifier',
    'svm_classifier',
    'dnn_classifier',
]
# Data split 1, features left unscaled.
x_train, y_train, x_test, y_test = load_data(1, False)

In [53]:
def training_process(classifiers, x_train, y_train, x_test, y_test):
    """Train each classifier and report its accuracy on the test split.

    classifiers: iterable of classifier factories. Each entry is either the
        name of a module-level function (e.g. 'knn_classifier') or the
        function itself; it is called as ``factory(x_train, y_train)`` and
        must return a fitted model.
    x_train, y_train: training features and labels.
    x_test, y_test: test features and labels.

    Models built by 'cnn_classifier' and 'dnn_classifier' are scored with
    ``model.evaluate`` on one-hot labels (the CNN test features are first
    reshaped to (-1, 1, 28, 28)). Every other model is scored with
    ``predict`` + ``metrics.accuracy_score``, and its training accuracy is
    printed as well.

    A classifier that raises an Exception is reported with its traceback and
    skipped. KeyboardInterrupt and SystemExit are not intercepted, so a long
    run can still be aborted.

    return: dict mapping classifier name -> test accuracy (failed
        classifiers are absent).
    """
    import traceback

    result = {}
    for classifier in classifiers:
        if isinstance(classifier, str):
            name = classifier
        else:
            name = getattr(classifier, '__name__', repr(classifier))
        try:
            print("=======================")
            print('Classifier: {}'.format(name))
            # Resolve names through the module namespace instead of eval(),
            # so an entry can only ever refer to an existing global.
            factory = classifier if callable(classifier) else globals()[classifier]
            temp_model = factory(x_train, y_train)

            if name == 'cnn_classifier':
                # The CNN expects channel-first images and one-hot labels.
                x_test_reshape = x_test.reshape(-1, 1, 28, 28)
                y_test_reshape = np_utils.to_categorical(y_test, num_classes=10)
                _, accuracy = temp_model.evaluate(x_test_reshape, y_test_reshape)
            elif name == 'dnn_classifier':
                y_test_reshape = np_utils.to_categorical(y_test, num_classes=10)
                _, accuracy = temp_model.evaluate(x_test, y_test_reshape)
            else:
                y_train_predict = temp_model.predict(x_train)
                training_accuracy = metrics.accuracy_score(y_train, y_train_predict)
                print('training accuracy: {}'.format("%.2f%%" % (100 * training_accuracy)))
                y_predict = temp_model.predict(x_test)
                accuracy = metrics.accuracy_score(y_test, y_predict)

            print('testing accuracy: {}'.format("%.2f%%" % (100 * accuracy)))
            result[name] = accuracy
        except Exception:
            # Best effort: report the failure and move on to the next model.
            print('+++++++++++++++++++++++++')
            print('Error with {}.'.format(name))
            print(traceback.format_exc())
            print('+++++++++++++++++++++++++')
    return result

### 没有normalization和PCA

In [54]:
# Baseline run: raw features, no scaling and no PCA.
training_process(
    classifiers=classifiers,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

Classifier: knn_classifier
training accuracy: 94.50%
testing accuracy: 91.70%
Classifier: logistic_regression_classifier
training accuracy: 100.00%
testing accuracy: 83.45%
Classifier: decision_tree_classifier
training accuracy: 100.00%
testing accuracy: 72.80%
Classifier: svm_classifier
training accuracy: 100.00%
testing accuracy: 90.10%
Classifier: dnn_classifier
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing accuracy: 37.10%


{'knn_classifier': 0.917,
 'logistic_regression_classifier': 0.8345,
 'decision_tree_classifier': 0.728,
 'svm_classifier': 0.901,
 'dnn_classifier': 0.371}

### 有normalization，没有PCA

In [55]:
# Features divided by 255 before training; labels are passed unchanged.
training_process(
    classifiers=classifiers,
    x_train=x_train / 255,
    y_train=y_train,
    x_test=x_test / 255,
    y_test=y_test,
)

Classifier: knn_classifier
training accuracy: 94.50%
testing accuracy: 91.70%
Classifier: logistic_regression_classifier
training accuracy: 98.60%
testing accuracy: 88.50%
Classifier: decision_tree_classifier
training accuracy: 100.00%
testing accuracy: 72.85%
Classifier: svm_classifier
training accuracy: 100.00%
testing accuracy: 90.10%
Classifier: dnn_classifier
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing accuracy: 93.05%


{'knn_classifier': 0.917,
 'logistic_regression_classifier': 0.885,
 'decision_tree_classifier': 0.7285,
 'svm_classifier': 0.901,
 'dnn_classifier': 0.9305}

### 没有normalization，有PCA
* CNN不适用于PCA

### 训练PCA model

In [32]:
# Fit PCA on the unscaled training features (default number of components).
pca_model = PCA_model(train_x=x_train)

In [34]:
# Project both splits with the PCA fitted on the training features.
x_train_pca, x_test_pca = (pca_model.transform(split) for split in (x_train, x_test))

# Only the scikit-learn classifiers are run here (no CNN, no DNN).
temp_classifiers = [
    'knn_classifier',
    'logistic_regression_classifier',
    'decision_tree_classifier',
    'svm_classifier',
]
training_process(
    classifiers=temp_classifiers,
    x_train=x_train_pca,
    y_train=y_train,
    x_test=x_test_pca,
    y_test=y_test,
)

Classifier: knn_classifier
training accuracy: 94.50%
testing accuracy: 91.70%
Classifier: logistic_regression_classifier
training accuracy: 99.90%
testing accuracy: 76.75%
Classifier: decision_tree_classifier
training accuracy: 100.00%
testing accuracy: 63.90%
Classifier: svm_classifier
training accuracy: 100.00%
testing accuracy: 90.10%


{'knn_classifier': 0.917,
 'logistic_regression_classifier': 0.7675,
 'decision_tree_classifier': 0.639,
 'svm_classifier': 0.901}

### 有normalization和PCA

In [35]:
# Refit PCA (400 components) on the training features divided by 255.
# This rebinds `pca_model`, replacing the model fitted on unscaled features.
pca_model = PCA_model(train_x=x_train / 255, components=400)
x_train_normal_pca, x_test_normal_pca = (
    pca_model.transform(split / 255) for split in (x_train, x_test)
)

# Only the scikit-learn classifiers are run here (no CNN, no DNN).
temp_classifiers = [
    'knn_classifier',
    'logistic_regression_classifier',
    'decision_tree_classifier',
    'svm_classifier',
]
training_process(
    classifiers=temp_classifiers,
    x_train=x_train_normal_pca,
    y_train=y_train,
    x_test=x_test_normal_pca,
    y_test=y_test,
)

Classifier: knn_classifier
training accuracy: 94.55%
testing accuracy: 91.75%
Classifier: logistic_regression_classifier
training accuracy: 98.30%
testing accuracy: 87.80%
Classifier: decision_tree_classifier
training accuracy: 100.00%
testing accuracy: 65.70%
Classifier: svm_classifier
training accuracy: 100.00%
testing accuracy: 90.10%


{'knn_classifier': 0.9175,
 'logistic_regression_classifier': 0.878,
 'decision_tree_classifier': 0.657,
 'svm_classifier': 0.901}