In [54]:
import numpy as np
from sklearn.decomposition import pca
import time
from sklearn import metrics

## 读取MNIST数据
* 两组数据
* 1. train 1-2000, test 2001-4000
* 2. train 2001-4000, test 1-2000
* 每组（2000个样本）内的数据按0-9的顺序排列，每个数字各有两百个

In [2]:
def load_data(flag=1,normalize=False):
    """
    Load the digits4000 text files and split them into a training and a test half.

    flag: selects which of the two splits to use.
        flag == 1   -> train on rows [0, 2000), test on rows [2000, end)
        otherwise   -> train on rows [2000, end), test on rows [0, 2000)
    normalize: when True, the feature matrices are divided by 255
        (assumes the features are 0-255 intensities -- TODO confirm).
        The label vectors are never rescaled.

    return: X_train, Y_train, X_test, Y_test (numpy arrays)
    """
    labels = get_data('digits4000_txt/digits4000_digits_labels.txt')
    digits = get_data('digits4000_txt/digits4000_digits_vec.txt')

    first_half, second_half = slice(None, 2000), slice(2000, None)
    if flag == 1:
        train_rows, test_rows = first_half, second_half
    else:
        train_rows, test_rows = second_half, first_half

    X_train, Y_train = digits[train_rows], labels[train_rows]
    X_test, Y_test = digits[test_rows], labels[test_rows]

    if normalize:
        # Scale the features only. The previous version also divided the labels
        # by 255, which turned the class ids into fractional values; that is
        # presumably the training "bug" the old docstring mentioned.
        X_train = X_train / 255
        X_test = X_test / 255

    return np.array(X_train),np.array(Y_train),np.array(X_test),np.array(Y_test)

def get_data(filename):
    """Read a numeric text file with numpy's loadtxt (default settings) and return the array."""
    loaded = np.loadtxt(filename)
    return loaded

## 各种Classifier的实现

### KNN Classifier

In [3]:
def knn_classifier(train_x,train_y):
    """Build a KNeighborsClassifier with its default settings, fit it on the
    given training data and return the fitted estimator object."""
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)
    return knn

### Logistic Classifier

In [4]:
def logistic_regression_classifier(train_x, train_y):
    """Build a LogisticRegression estimator with an L2 penalty, fit it on the
    given training data and return the fitted estimator object."""
    from sklearn.linear_model import LogisticRegression

    settings = {'penalty': 'l2'}
    logreg = LogisticRegression(**settings)
    logreg.fit(train_x, train_y)
    return logreg

### Decision Tree Classifier

In [5]:
def decision_tree_classifier(train_x, train_y):
    """Build a DecisionTreeClassifier with its default settings, fit it on the
    given training data and return the fitted estimator object."""
    from sklearn.tree import DecisionTreeClassifier

    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(train_x, train_y)
    return decision_tree

### SVM Classifier

In [6]:
def svm_classifier(train_x, train_y):
    """Build an SVC with a linear kernel (C=1, gamma=0.01), fit it on the given
    training data and return the fitted estimator object."""
    from sklearn.svm import SVC

    # NOTE(review): scikit-learn documents gamma as the coefficient for the
    # rbf/poly/sigmoid kernels, so it presumably has no effect with
    # kernel='linear' -- confirm before relying on it.
    settings = {'kernel': 'linear', 'gamma': 0.01, 'C': 1}
    svc = SVC(**settings)
    svc.fit(train_x, train_y)
    return svc

## 进行分析

### 使用PCA降维

### PCA model

In [55]:
def PCA_model(train_x,components=600):
    """Fit a PCA on the training features and return the fitted model.

    train_x: data the PCA is fitted on.
    components: value passed to PCA as n_components (default 600).

    return: the object returned by PCA(...).fit(train_x); use its
        transform() method to project data onto the fitted components.
    """
    # Import the public class rather than going through the private
    # `sklearn.decomposition.pca` module, which newer scikit-learn releases
    # no longer expose. The local import mirrors the classifier builders.
    from sklearn.decomposition import PCA

    return PCA(n_components=components).fit(train_x)

### training和testing的部分

In [8]:
# Names of the classifier builder functions; training_process resolves each one by name.
classifiers = [
    'knn_classifier',
    'logistic_regression_classifier',
    'decision_tree_classifier',
    'svm_classifier',
]
# First split (flag=1), features left unscaled.
x_train, y_train, x_test, y_test = load_data(normalize=False, flag=1)

In [9]:
def training_process(classifiers, x_train, y_train, x_test, y_test):
    """Train every classifier, score it on the test data and print its accuracy.

    classifiers: iterable of builder functions, given either as the name (str)
        of a module-level function or as the callable itself. Each builder is
        called as builder(x_train, y_train) and must return an object with a
        predict() method.
    x_train, y_train: data passed to each builder.
    x_test, y_test: data used for predict() and for the accuracy score.

    return: dict mapping classifier name -> accuracy. A classifier that raises
        an ordinary exception is reported on stdout and left out of the dict;
        KeyboardInterrupt / SystemExit are not swallowed.
    """
    result = {}
    for classifier in classifiers:
        is_callable = callable(classifier)
        name = getattr(classifier, '__name__', repr(classifier)) if is_callable else classifier
        try:
            # Resolve names with a plain namespace lookup instead of eval(),
            # so a string can never execute arbitrary code.
            build_model = classifier if is_callable else globals()[classifier]
            temp_model = build_model(x_train, y_train)
            y_predict = temp_model.predict(x_test)
            accuracy = metrics.accuracy_score(y_test, y_predict)
            print("=======================")
            print('Classifier: {}'.format(name))
            print('accuracy: {:.2f}%'.format(100 * accuracy))
            result[name] = accuracy
        except Exception as error:
            # Keep going with the remaining classifiers, but say why this one failed.
            print('Error with {}: {!r}'.format(name, error))
    return result

### 没有normalization和PCA

In [10]:
# Baseline run: features exactly as loaded, no PCA.
training_process(
    classifiers=classifiers,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

Classifier: knn_classifier
accuracy: 91.70%
Classifier: logistic_regression_classifier
accuracy: 83.45%
Classifier: decision_tree_classifier
accuracy: 71.90%
Classifier: svm_classifier
accuracy: 90.10%


{'knn_classifier': 0.917,
 'logistic_regression_classifier': 0.8345,
 'decision_tree_classifier': 0.719,
 'svm_classifier': 0.901}

### 有normalization，没有PCA

In [11]:
# Same run with the features divided by 255; the labels are passed unchanged.
training_process(
    classifiers=classifiers,
    x_train=x_train / 255,
    y_train=y_train,
    x_test=x_test / 255,
    y_test=y_test,
)

Classifier: knn_classifier
accuracy: 91.70%
Classifier: logistic_regression_classifier
accuracy: 88.50%
Classifier: decision_tree_classifier
accuracy: 72.20%
Classifier: svm_classifier
accuracy: 90.10%


{'knn_classifier': 0.917,
 'logistic_regression_classifier': 0.885,
 'decision_tree_classifier': 0.722,
 'svm_classifier': 0.901}

### 没有normalization，有PCA

### 训练PCA model

In [56]:
# Fit the PCA on the unscaled training features, using PCA_model's default component count.
pca_model = PCA_model(train_x=x_train)

In [57]:
# PCA.transform only takes the data: the number of components was fixed when
# the model was fitted, so train and test are projected the same way.
x_train_pca = pca_model.transform(x_train)
x_test_pca = pca_model.transform(x_test)
training_process(classifiers, x_train_pca, y_train, x_test_pca, y_test)

Classifier: knn_classifier
accuracy: 91.70%
Classifier: logistic_regression_classifier
accuracy: 75.70%
Classifier: decision_tree_classifier
accuracy: 63.30%
Classifier: svm_classifier
accuracy: 90.10%


{'knn_classifier': 0.917,
 'logistic_regression_classifier': 0.757,
 'decision_tree_classifier': 0.633,
 'svm_classifier': 0.901}

### 有normalization和PCA

In [62]:
# Fit a 400-component PCA on the /255-scaled training features, then project both splits with it.
pca_model = PCA_model(train_x=x_train / 255, components=400)
x_train_normal_pca = pca_model.transform(x_train / 255)
x_test_normal_pca = pca_model.transform(x_test / 255)

training_process(
    classifiers=classifiers,
    x_train=x_train_normal_pca,
    y_train=y_train,
    x_test=x_test_normal_pca,
    y_test=y_test,
)

Classifier: knn_classifier
accuracy: 91.70%
Classifier: logistic_regression_classifier
accuracy: 87.80%
Classifier: decision_tree_classifier
accuracy: 64.40%
Classifier: svm_classifier
accuracy: 90.10%


{'knn_classifier': 0.917,
 'logistic_regression_classifier': 0.878,
 'decision_tree_classifier': 0.644,
 'svm_classifier': 0.901}