In [1]:
import numpy as np
from sklearn.decomposition import pca
import time
from sklearn import metrics
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation, Convolution2D, MaxPooling2D, Flatten
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adam
from keras.optimizers import RMSprop

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


## 读取MNIST数据
* 两组数据
* 1. train 1-2000, test 2001-4000
* 2. train 2001-4000, test 1-2000
* 数据是按0-9的顺序排列的，每个数字有两百个

In [2]:
def load_data(flag=1, normalize=False):
    """
    Load the digit data set and cut it into a training and a test half.

    flag: selects one of the two splits. 1 trains on the first 2000 rows and
          tests on the rows from 2000 onward; any other value swaps the two
          halves.
    normalize: when true, the feature arrays are divided by 255 (labels are
               left as they are). Original author's note: this seemed to
               cause problems when training some models; still to be
               investigated.

    return: X_train, Y_train, X_test, Y_test (numpy arrays)
    """
    split_at = 2000
    labels = get_data('digits4000_txt/digits4000_digits_labels.txt')
    digits = get_data('digits4000_txt/digits4000_digits_vec.txt')

    # Each half is a (features, labels) pair; the flag only decides which
    # half plays the training role.
    first_half = (digits[:split_at], labels[:split_at])
    second_half = (digits[split_at:], labels[split_at:])
    if flag == 1:
        (X_train, Y_train), (X_test, Y_test) = first_half, second_half
    else:
        (X_train, Y_train), (X_test, Y_test) = second_half, first_half

    if normalize:
        X_train, X_test = X_train / 255, X_test / 255

    return tuple(np.array(part) for part in (X_train, Y_train, X_test, Y_test))

def get_data(filename):
    """Read the numeric text file at `filename` with numpy and return the array."""
    loaded = np.loadtxt(fname=filename)
    return loaded

In [3]:
# Split 1 (train on the first 2000 rows, test on the rest), features left unscaled.
x_train, y_train, x_test, y_test = load_data(
    flag=1,
    normalize=False,
)

## 各种Classifier的实现

### KNN Classifier

In [4]:
def knn_classifier(train_x, train_y):
    """
    Fit a k-nearest-neighbours classifier on the given training data.

    The estimator is created with scikit-learn's default settings.

    return: the fitted KNeighborsClassifier
    """
    from sklearn.neighbors import KNeighborsClassifier

    # fit() hands back the estimator itself, so it can be returned directly.
    return KNeighborsClassifier().fit(train_x, train_y)

### Logistic Classifier

In [5]:
def logistic_regression_classifier(train_x, train_y):
    """
    Fit an L2-penalised logistic regression on the given training data.

    return: the fitted LogisticRegression model
    """
    from sklearn.linear_model import LogisticRegression

    settings = {'penalty': 'l2'}
    classifier = LogisticRegression(**settings)
    classifier.fit(train_x, train_y)
    return classifier

### Decision Tree Classifier

In [6]:
def decision_tree_classifier(train_x, train_y, random_state=None):
    """
    Fit a decision tree classifier on the given training data.

    train_x, train_y: training features and class labels
    random_state: optional seed forwarded to DecisionTreeClassifier. The
                  default (None) keeps scikit-learn's unseeded behaviour;
                  pass an int to make repeated fits, and therefore the
                  reported accuracies, reproducible.

    return: the fitted DecisionTreeClassifier
    """
    from sklearn import tree
    model = tree.DecisionTreeClassifier(random_state=random_state)
    model.fit(train_x, train_y)
    return model

### SVM Classifier

In [7]:
def svm_classifier(train_x, train_y):
    """
    Fit a support vector classifier (RBF kernel, gamma=0.01, C=1).

    return: the fitted SVC model
    """
    from sklearn.svm import SVC

    # Original author's note: rbf scored higher than the other kernels that
    # were tried; gamma and C have not been tuned yet.
    # In this notebook's recorded runs the model only generalised once the
    # features were divided by 255 (test accuracy 0.1 on the raw features).
    hyper_params = dict(kernel='rbf', gamma=0.01, C=1)
    return SVC(**hyper_params).fit(train_x, train_y)

### Random Forest Classifier

In [8]:
def random_forest_classifier(train_x, train_y, num_classes=10, random_state=None):
    """
    Fit a random forest classifier on the given training data.

    train_x, train_y: training features and class labels
    num_classes: not used by this function; kept so existing callers that
                 pass it keep working.
    random_state: optional seed forwarded to RandomForestClassifier. The
                  default (None) keeps scikit-learn's unseeded behaviour;
                  pass an int to make repeated fits, and therefore the
                  reported accuracies, reproducible.

    return: the fitted RandomForestClassifier
    """
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(random_state=random_state)
    model.fit(train_x, train_y)
    return model

### Ada Boost Classifier

In [9]:
def ada_boost_classifier(train_x, train_y, num_classes=10, random_state=None):
    """
    Fit an AdaBoost classifier on the given training data.

    train_x, train_y: training features and class labels
    num_classes: not used by this function; kept so existing callers that
                 pass it keep working.
    random_state: optional seed forwarded to AdaBoostClassifier. The default
                  (None) keeps scikit-learn's unseeded behaviour; pass an int
                  to make repeated fits reproducible.

    return: the fitted AdaBoostClassifier
    """
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier(random_state=random_state)
    model.fit(train_x, train_y)
    return model

### CNN model
* 模型参数直接在这个里面改
* CNN不需要进行PCA

In [10]:
def cnn_classifier(train_x, train_y):
    """
    Build and train a small convolutional network on the given data.

    The features are reshaped to (-1, 1, 28, 28) (channels first) and the
    labels are one-hot encoded over 10 classes. The network is two
    conv(5x5) + relu + max-pool stages (32 then 64 filters), followed by a
    1024-unit dense layer and a 10-way softmax. It is trained for 20 epochs
    with Adam (lr=1e-4), batch size 64 and categorical cross-entropy.

    return: the trained Keras Sequential model
    """
    channel_order = 'channels_first'
    images = train_x.reshape(-1, 1, 28, 28)
    one_hot_labels = np_utils.to_categorical(train_y, num_classes=10)

    # The layer stack, listed in the order it is added to the model.
    layer_stack = [
        Convolution2D(filters=32,
                      kernel_size=5,
                      strides=1,
                      padding='same',
                      batch_input_shape=(None, 1, 28, 28),
                      data_format=channel_order),
        Activation('relu'),
        MaxPooling2D(pool_size=2,
                     strides=2,
                     padding='same',
                     data_format=channel_order),
        Convolution2D(filters=64,
                      kernel_size=5,
                      strides=1,
                      padding='same',
                      data_format=channel_order),
        Activation('relu'),
        MaxPooling2D(pool_size=2,
                     strides=2,
                     padding='same',
                     data_format=channel_order),
        Flatten(),
        Dense(1024),
        Activation('relu'),
        Dense(10),
        Activation('softmax'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(optimizer=Adam(lr=1e-4),
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

    network.fit(images, one_hot_labels, epochs=20, batch_size=64)

    return network
    

### DNN model
* 和CNN model相似，结构不一样
* DNN可以进行PCA

In [11]:
def dnn_classifier(train_x, train_y):
    """
    Build and train a fully connected network on the given data.

    The labels are one-hot encoded; the input width is taken from
    train_x.shape[1], so the model also accepts PCA-reduced features. The
    network is two 512-unit relu layers, each followed by 20% dropout, and a
    softmax output layer. It is trained with RMSprop and categorical
    cross-entropy.

    return: the trained Keras Sequential model
    """
    batch_size = 100
    nb_classes = 10
    nb_epoch = 20

    # nb_classes drives both the label encoding and the width of the output
    # layer, so the two cannot drift apart.
    one_hot_labels = np_utils.to_categorical(train_y, num_classes=nb_classes)

    input_dim = train_x.shape[1]

    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
    model.fit(train_x, one_hot_labels, batch_size=batch_size, epochs=nb_epoch)

    return model

## 进行分析

### 使用PCA降维

### PCA model

In [39]:
def PCA_model(train_x,components=150):
    """
    Fit a PCA projection on the training features.

    train_x: features the projection is fitted on
    components: number of principal components to keep (default 150)

    return: the fitted PCA object; call its transform() to project data
    """
    # Use the public import path. The module-level `pca` name comes from the
    # private sklearn.decomposition.pca module, which later scikit-learn
    # releases no longer provide.
    from sklearn.decomposition import PCA
    model = PCA(n_components=components).fit(train_x)
    return model

### training和testing的部分

In [50]:
# Names of the classifier builder functions to compare; training_process
# resolves each name to the function of the same name.
classifiers = [
    'knn_classifier',
    'logistic_regression_classifier',
    'decision_tree_classifier',
    'svm_classifier',
    'random_forest_classifier',
    'ada_boost_classifier',
    'cnn_classifier',
    'dnn_classifier',
]
# Shorter list for a quick test run:
# classifiers = ['knn_classifier','logistic_regression_classifier','decision_tree_classifier','svm_classifier','dnn_classifier']
x_train, y_train, x_test, y_test = load_data(
    flag=1,
    normalize=False,
)

In [46]:
def training_process(classifiers, x_train, y_train, x_test, y_test):
    """
    Train every requested classifier and collect its accuracy on the test set.

    classifiers: iterable of builder functions, each given either as the
                 function object itself or as the name (string) of a function
                 visible from this module, e.g. 'knn_classifier'. A builder
                 is called as builder(x_train, y_train) and must return a
                 fitted model.
    x_train, y_train: training features and class labels
    x_test, y_test: held-out features and class labels used for scoring

    Builders named 'cnn_classifier' and 'dnn_classifier' are scored with the
    model's evaluate() on one-hot labels (the CNN additionally gets its test
    features reshaped to (-1, 1, 28, 28)). Every other model is scored with
    predict() and metrics.accuracy_score, and its training accuracy is
    printed as well.

    A classifier that raises an Exception is reported with its traceback and
    skipped, so one failure does not abort the whole comparison.
    KeyboardInterrupt and SystemExit are deliberately not caught, so a long
    run can still be stopped.

    return: dict mapping classifier name -> test accuracy
    """
    import traceback

    result = {}
    for classifier in classifiers:
        if isinstance(classifier, str):
            name = classifier
        else:
            name = getattr(classifier, '__name__', repr(classifier))
        try:
            print("=======================")
            print('Classifier: {}'.format(name))
            if isinstance(classifier, str):
                # NOTE(review): eval() runs arbitrary expressions. Only pass
                # trusted names here, or pass the function object instead.
                builder = eval(classifier)
            else:
                builder = classifier
            temp_model = builder(x_train, y_train)

            if name == 'cnn_classifier':
                # The CNN expects channels-first images and one-hot labels.
                x_test_reshape = x_test.reshape(-1, 1, 28, 28)
                y_test_reshape = np_utils.to_categorical(y_test, num_classes=10)
                loss, accuracy = temp_model.evaluate(x_test_reshape, y_test_reshape)
            elif name == 'dnn_classifier':
                y_test_reshape = np_utils.to_categorical(y_test, num_classes=10)
                loss, accuracy = temp_model.evaluate(x_test, y_test_reshape)
            else:
                y_train_predict = temp_model.predict(x_train)
                training_accuracy = metrics.accuracy_score(y_train, y_train_predict)
                print('training accuracy: {}'.format(training_accuracy))
                y_predict = temp_model.predict(x_test)
                accuracy = metrics.accuracy_score(y_test, y_predict)

            print('testing accuracy: {}'.format(accuracy))
            result[name] = accuracy
        except Exception:
            # A bare `except:` would also swallow KeyboardInterrupt, making
            # it impossible to stop a long training run.
            print('+++++++++++++++++++++++++')
            print('Error with {}.'.format(name))
            print(traceback.format_exc())
            print('+++++++++++++++++++++++++')
    return result

### 没有normalization和PCA

In [34]:
    training_process(classifiers, x_train, y_train, x_test, y_test)

Classifier: knn_classifier
training accuracy: 0.9465
testing accuracy: 0.9095
Classifier: logistic_regression_classifier
training accuracy: 1.0
testing accuracy: 0.8395
Classifier: decision_tree_classifier
training accuracy: 1.0
testing accuracy: 0.6845
Classifier: svm_classifier
training accuracy: 1.0
testing accuracy: 0.1
Classifier: random_forest_classifier
training accuracy: 0.9995
testing accuracy: 0.823
Classifier: ada_boost_classifier
training accuracy: 0.4485
testing accuracy: 0.4085
Classifier: cnn_classifier
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing accuracy: 0.3895
Classifier: dnn_classifier
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing accuracy: 0.1

{'knn_classifier': 0.9095,
 'logistic_regression_classifier': 0.8395,
 'decision_tree_classifier': 0.6845,
 'svm_classifier': 0.1,
 'random_forest_classifier': 0.823,
 'ada_boost_classifier': 0.4085,
 'cnn_classifier': 0.3895,
 'dnn_classifier': 0.1}

### 有normalization，没有PCA

In [35]:
# Same comparison with the features divided by 255; labels are unchanged.
training_process(
    classifiers,
    x_train / 255,
    y_train,
    x_test / 255,
    y_test,
)

Classifier: knn_classifier
training accuracy: 0.9465
testing accuracy: 0.9095
Classifier: logistic_regression_classifier
training accuracy: 0.989
testing accuracy: 0.8755
Classifier: decision_tree_classifier
training accuracy: 1.0
testing accuracy: 0.681
Classifier: svm_classifier
training accuracy: 0.9795
testing accuracy: 0.9275
Classifier: random_forest_classifier
training accuracy: 0.999
testing accuracy: 0.8465
Classifier: ada_boost_classifier
training accuracy: 0.4485
testing accuracy: 0.4085
Classifier: cnn_classifier
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing accuracy: 0.949
Classifier: dnn_classifier
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing accuracy: 0.919

{'knn_classifier': 0.9095,
 'logistic_regression_classifier': 0.8755,
 'decision_tree_classifier': 0.681,
 'svm_classifier': 0.9275,
 'random_forest_classifier': 0.8465,
 'ada_boost_classifier': 0.4085,
 'cnn_classifier': 0.949,
 'dnn_classifier': 0.919}

### 没有normalization，有PCA
* CNN不适用于PCA

### 训练PCA model

In [51]:
# Fit the PCA (default 150 components) on the raw training features only.
pca_model = PCA_model(train_x=x_train)

In [52]:
# Project both splits with the PCA that was fitted on the training data.
x_train_pca, x_test_pca = pca_model.transform(x_train), pca_model.transform(x_test)

# Every classifier except the CNN, which is not run on PCA features.
temp_classifiers = [
    'knn_classifier',
    'logistic_regression_classifier',
    'decision_tree_classifier',
    'svm_classifier',
    'random_forest_classifier',
    'ada_boost_classifier',
    'dnn_classifier',
]
training_process(
    temp_classifiers,
    x_train_pca,
    y_train,
    x_test_pca,
    y_test,
)

Classifier: knn_classifier
training accuracy: 0.95
testing accuracy: 0.9195
Classifier: logistic_regression_classifier
training accuracy: 0.9775
testing accuracy: 0.8555
Classifier: decision_tree_classifier
training accuracy: 1.0
testing accuracy: 0.6835
Classifier: svm_classifier
training accuracy: 1.0
testing accuracy: 0.1005
Classifier: random_forest_classifier
training accuracy: 0.9995
testing accuracy: 0.7175
Classifier: ada_boost_classifier
training accuracy: 0.309
testing accuracy: 0.2995
Classifier: dnn_classifier
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing accuracy: 0.476


{'knn_classifier': 0.9195,
 'logistic_regression_classifier': 0.8555,
 'decision_tree_classifier': 0.6835,
 'svm_classifier': 0.1005,
 'random_forest_classifier': 0.7175,
 'ada_boost_classifier': 0.2995,
 'dnn_classifier': 0.476}

### 有normalization和PCA

In [53]:
# Refit the PCA on the features divided by 255. This rebinds `pca_model`, so
# the PCA fitted on the raw features is no longer available afterwards.
pca_model = PCA_model(x_train / 255)
x_train_normal_pca, x_test_normal_pca = (
    pca_model.transform(x_train / 255),
    pca_model.transform(x_test / 255),
)

# Every classifier except the CNN, which is not run on PCA features.
temp_classifiers = [
    'knn_classifier',
    'logistic_regression_classifier',
    'decision_tree_classifier',
    'svm_classifier',
    'random_forest_classifier',
    'ada_boost_classifier',
    'dnn_classifier',
]
training_process(
    temp_classifiers,
    x_train_normal_pca,
    y_train,
    x_test_normal_pca,
    y_test,
)

Classifier: knn_classifier
training accuracy: 0.949
testing accuracy: 0.9185
Classifier: logistic_regression_classifier
training accuracy: 0.9605
testing accuracy: 0.885
Classifier: decision_tree_classifier
training accuracy: 1.0
testing accuracy: 0.6765
Classifier: svm_classifier
training accuracy: 0.9785
testing accuracy: 0.933
Classifier: random_forest_classifier
training accuracy: 0.997
testing accuracy: 0.7165
Classifier: ada_boost_classifier
training accuracy: 0.309
testing accuracy: 0.2995
Classifier: dnn_classifier
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing accuracy: 0.922


{'knn_classifier': 0.9185,
 'logistic_regression_classifier': 0.885,
 'decision_tree_classifier': 0.6765,
 'svm_classifier': 0.933,
 'random_forest_classifier': 0.7165,
 'ada_boost_classifier': 0.2995,
 'dnn_classifier': 0.922}