In [1]:
# %% 1
# Package imports
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import copy

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train',  categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',  categories=categories)

# pprint(newsgroups_train.data[0])

num_train = len(newsgroups_train.data)
num_test  = len(newsgroups_test.data)

vectorizer = TfidfVectorizer(max_features=20)

X = vectorizer.fit_transform( newsgroups_train.data + newsgroups_test.data )
X_train = X[0:num_train, :]
X_test = X[num_train:num_train+num_test,:]

Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(2034, 20) (2034,)
(1353, 20) (1353,)


In [2]:
# # Generate a dataset and plot it
# np.random.seed(0)
# X, y = sklearn.datasets.make_moons(1000, noise=0.20)
# print('输入：',X.shape)
# print('输出',y.shape)
# plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral)

In [3]:
# # Train the logistic regression classifier
# clf = sklearn.linear_model.LogisticRegressionCV()
# clf.fit(X, y)

In [4]:
# # Helper function to plot a decision boundary.
# # If you don't fully understand this function don't worry, it just generates the contour plot below.
# def plot_decision_boundary(pred_func):
#     # Set min and max values and give it some padding
#     x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
#     y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
#     h = 0.01
#     # Generate a grid of points with distance h between them
#     xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
#     # Predict the function value for the whole gid
#     Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
#     Z = Z.reshape(xx.shape)
#     # Plot the contour and training examples
#     plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
#     plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)

In [5]:
# # Plot the decision boundary
# plot_decision_boundary(lambda x: clf.predict(x))
# plt.title("Logistic Regression")

初始化层

In [6]:
def init_layer(input_dim, output_dim, actFunc):
    np.random.seed(0)
    W = np.random.randn(input_dim, output_dim) / np.sqrt(input_dim)
    b = np.zeros((1,output_dim))
    print('w:',W.shape)
    print('b:',b.shape)
    layer = {'W': W, 'b': b, 'actFunc': actFunc}
    return layer

In [7]:
def init_layers(nn_architecture):
    layers = []
    for l in nn_architecture:
        layer = init_layer(l['input_dim'], l['output_dim'], l['actFunc'])
        layers.append(layer)
    return layers

激活函数

In [8]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def relu(z):
    return np.maximum(0, z)


softmax

In [9]:
def softmax(Z):
    exp_scores = np.exp(Z)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return probs

反向传播

In [10]:
def sigmoid_backward(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1-sig)

def relu_backward(dA, Z):
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0] = 0
    return dZ

def tanh_backward(dA, Z):
    t = np.tanh(Z)
    res = (1 - t * t)
#     print('res:', res.shape)
#     print('dA:', dA.shape)
    return res * dA

损失函数

In [11]:
def loss(Z, y):
    # 计算损失
    probs = softmax(Z)
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs)
    #在损失上加上正则项（可选）
    # data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./num_examples * data_loss

前向传播

In [12]:
def single_layer_forward_prop(X, layer):
    W = layer['W']
#     print(W.shape)
    Z = X.dot(layer['W']) + layer['b']
    if layer['actFunc'] is 'relu':
        actFunction = relu
    elif layer['actFunc'] is 'sigmoid':
        actFunction = sigmoid
    else:
        actFunction = np.tanh
    return actFunction(Z), Z

In [13]:
def full_layers_forward_prop(X, layers):
    memory_forward = []
    Z_out = X
    memo_forward = {
        'Z_out': X
    }
    memory_forward.append(memo_forward)
    layers_now = 0
    for layer in layers:
#         print('forward layers_now:',layers_now)
        Z_out, Z_hide = single_layer_forward_prop(Z_out, layer)
        memo_forward = {
            'Z_out': Z_out,
            'Z_hide': Z_hide
        }
        memory_forward.append(memo_forward)
        layers_now += 1

    # 返回最终的Z_out => actFunc(Z=X*W + b)
    # memory_forward记录每一层的Z_out=actFunc(Z_hide)和Z_hide=W*X+b
#     print('Z_out: ',Z_out.shape)
    return Z_out, memory_forward

反向传播

In [14]:
def single_layer_backward_prop(memo_forward_now, memo_forward_pre, dA_now, layer):
    # 前向神经元个数
    # dA_now为由下一层传回的梯度
    # memo_forward_pre 记录上一层计算结果， Z_hide=X*w+b和Z_out => X_pre
    # memo_forward_now 记录当前层的计算结果，Z_hide => Z_now和Z_out
    X_pre = memo_forward_pre['Z_out']
    Z_now = memo_forward_now['Z_hide']
    back_dim = X_pre.shape[0]

    if layer['actFunc'] is 'sigmoid':
        actFuncBack = sigmoid_backward
    elif layer['actFunc'] is 'relu':
        actFuncBack = relu_backward
    else:
        actFuncBack = tanh_backward

    # 计算当前层外层导数
    # dZ_now = actFunc'(Z_hide)
    dZ_now = actFuncBack(dA_now, Z_now)
    # dW_now = actFunc'(Z_hide) * (X=Z_hide*dW)
#     print('X_pre',X_pre.shape)
#     print('dZ_now',dZ_now.shape)
#     print('dA_now',dA_now.shape)
#     print('Z_now',Z_now.shape)
    dW_now = X_pre.T.dot(dZ_now) / back_dim
    # db_now = actFunc'(Z_hide) * (1=Z_hide*db); 维度转换
    db_now = np.sum(dZ_now, axis=0, keepdims=True) / back_dim
#     print('dW_now:',dW_now.shape)
#     print('db_now',db_now.shape)
    # dA_pre为向前一层传递的梯度；对上一层的Z_out即本层的X求导结果
    # dA_pre = actFunc'(Z_hide) * (W=Z_hide*dX)
    W_now = copy.deepcopy(layer['W'])
    dA_pre = dZ_now.dot(W_now.T)
#     print('dA_pre',dA_pre.shape)
    
    return dA_pre,dW_now, db_now

In [15]:
def full_layers_backward_prop(Z_out, memory_forward, layers, X, y):
#     Z_out, memo_forward = full_layers_forward_prop(X, layers)
    # 反向传播
    probs = softmax(Z_out)
    probs[range(num_examples), y] -= 1
    dA_pre = probs
#     print('dA_now:', dA_now.shape)
#     print('probs:', probs.shape)
    memory_backward = []
    layers.reverse()
    memory_forward.reverse()

    length = len(layers)
    for idx in range(length):
#         print('layer_now:', idx)
        dA_pre, dW_now, db_now = single_layer_backward_prop(memory_forward[idx],memory_forward[idx+1],dA_pre,layers[idx])
        memo_backward = {
            'dW_now': dW_now,
            'db_now': db_now
        }
        memory_backward.append(memo_backward)

    return memory_backward

更新网络

In [16]:
def update(layers, memory_backward):
#     print('layers: ',len(layers)
#     print('memory_backward: ', len(memory_backward))
    length = len(layers)
#     print(memory_backward)
#     print(layers)
#     print(memory_backward)
    for idx in range(length):
        dW = memory_backward[idx]['dW_now']
#         print('dW.shape: ', dW.shape)
        layers[idx]['W'] -= epsilon * memory_backward[idx]['dW_now']
        layers[idx]['b'] -= epsilon * memory_backward[idx]['db_now']
        
#     print(memory_backward)
#     print(layers)
    return layers

预测函数

In [17]:
def predict(X, layers):
    Z_out, memory_forward = full_layers_forward_prop(X,layers)
    probs = softmax(Z_out)
    return np.argmax(probs, axis=1)

计算准确率

In [18]:
def get_acc(X, layers):
    Z_out, memory_forward = full_layers_forward_prop(X,layers)
    probs = softmax(Z_out)
#     print(probs)
#     return np.argmax(probs, axis=1)
    acc = np.mean(Y_test==np.argmax(probs, axis=1))
    return acc

训练函数

In [19]:
def train(X, y, nn_architcture, epochs):
    layers = init_layers(nn_architcture)
    cost_history = []
    accuracy_history = []
    best_acc = 0

    for i in range(epochs):
        Z_out, memory_forward = full_layers_forward_prop(X,layers)
#         print(Z_out.shape)
        cost = loss(Z_out, y)
        acc = get_acc(X_test, layers)
        cost_history.append(cost)
        accuracy_history.append(acc)
        if best_acc < acc :
            best_acc = acc
            
        if i % 50 == 0:
            print('||best_acc => ', best_acc, '||cost => ', cost, '||acc => ', acc)
            print('cost: ', cost)
            print('acc: ', acc)


        memory_backward = full_layers_backward_prop(Z_out, memory_forward, layers, X, y)
        layers = update(layers, memory_backward)
        
        layers.reverse()
        memory_forward.reverse()

    return layers, cost_history, accuracy_history

In [20]:
num_examples = X_train.shape[0] # 训练样本的数量
nn_input_dim = X_train.shape[1] # 输入层的维度
nn_output_dim = 4 # 输出层的维度

# 梯度下降的参数（我直接手动赋值）
epsilon = 0.1 # 梯度下降的学习率
reg_lambda = 0.01 # 正则化的强度
epochs = 500

print('样本数量：', num_examples)
print('输入样本维度：',nn_input_dim)
print('输出数量：',nn_output_dim)

样本数量： 2034
输入样本维度： 20
输出数量： 4


输出

In [21]:
nn_architcture = [
#     {'input_dim': nn_input_dim, 'output_dim': 4, 'actFunc': 'tanh'},
#     {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': nn_input_dim, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history = train(X_train, Y_train, nn_architcture, epochs)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 1.")
print("Decision Boundary for hidden layer size 1. acc: ", acc)

w: (20, 4)
b: (1, 4)
||best_acc =>  0.20916481892091648 ||cost =>  1.425108324743589 ||acc =>  0.20916481892091648
cost:  1.425108324743589
acc:  0.20916481892091648
||best_acc =>  0.2727272727272727 ||cost =>  1.359792129615471 ||acc =>  0.2727272727272727
cost:  1.359792129615471
acc:  0.2727272727272727
||best_acc =>  0.31337767923133775 ||cost =>  1.3462430321413155 ||acc =>  0.31337767923133775
cost:  1.3462430321413155
acc:  0.31337767923133775
||best_acc =>  0.352549889135255 ||cost =>  1.3350477237431118 ||acc =>  0.352549889135255
cost:  1.3350477237431118
acc:  0.352549889135255
||best_acc =>  0.3991130820399113 ||cost =>  1.3249253488879633 ||acc =>  0.3991130820399113
cost:  1.3249253488879633
acc:  0.3991130820399113
||best_acc =>  0.42424242424242425 ||cost =>  1.315737045844439 ||acc =>  0.42424242424242425
cost:  1.315737045844439
acc:  0.42424242424242425
||best_acc =>  0.43754619364375463 ||cost =>  1.3073874617082992 ||acc =>  0.43754619364375463
cost:  1.30738746170

In [22]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 8, 'actFunc': 'tanh'},
#     {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 8, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 2.")
print("Decision Boundary for hidden layer size 2, acc: ", acc)

w: (20, 8)
b: (1, 8)
w: (8, 4)
b: (1, 4)
||best_acc =>  0.24611973392461198 ||cost =>  1.396465543343874 ||acc =>  0.24611973392461198
cost:  1.396465543343874
acc:  0.24611973392461198
||best_acc =>  0.31929046563192903 ||cost =>  1.35794925102426 ||acc =>  0.31929046563192903
cost:  1.35794925102426
acc:  0.31929046563192903
||best_acc =>  0.36954915003695493 ||cost =>  1.3400043177270022 ||acc =>  0.36954915003695493
cost:  1.3400043177270022
acc:  0.36954915003695493
||best_acc =>  0.4042867701404287 ||cost =>  1.3225403500385093 ||acc =>  0.4042867701404287
cost:  1.3225403500385093
acc:  0.4042867701404287
||best_acc =>  0.4205469327420547 ||cost =>  1.3049645718869975 ||acc =>  0.4198078344419808
cost:  1.3049645718869975
acc:  0.4198078344419808
||best_acc =>  0.43385070214338506 ||cost =>  1.2876367747771897 ||acc =>  0.43163340724316335
cost:  1.2876367747771897
acc:  0.43163340724316335
||best_acc =>  0.43754619364375463 ||cost =>  1.2712837947226325 ||acc =>  0.437546193643

In [23]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 20, 'actFunc': 'tanh'},
    {'input_dim': 20, 'output_dim': 20, 'actFunc': 'tanh'},
#     {'input_dim': 20, 'output_dim': 20, 'actFunc': 'tanh'},
#     {'input_dim': 20, 'output_dim': 8, 'actFunc': 'tanh'},
    {'input_dim': 20, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 3")
print("Decision Boundary for hidden layer size 3. acc: ", acc)

w: (20, 20)
b: (1, 20)
w: (20, 20)
b: (1, 20)
w: (20, 4)
b: (1, 4)
||best_acc =>  0.2402069475240207 ||cost =>  1.404561893587037 ||acc =>  0.2402069475240207
cost:  1.404561893587037
acc:  0.2402069475240207
||best_acc =>  0.3821138211382114 ||cost =>  1.3391960117285997 ||acc =>  0.38137472283813745
cost:  1.3391960117285997
acc:  0.38137472283813745
||best_acc =>  0.44567627494456763 ||cost =>  1.301085662828957 ||acc =>  0.4449371766444937
cost:  1.301085662828957
acc:  0.4449371766444937
||best_acc =>  0.4575018477457502 ||cost =>  1.2677228345896416 ||acc =>  0.45528455284552843
cost:  1.2677228345896416
acc:  0.45528455284552843
||best_acc =>  0.4575018477457502 ||cost =>  1.24138498344743 ||acc =>  0.4530672579453067
cost:  1.24138498344743
acc:  0.4530672579453067
||best_acc =>  0.4575018477457502 ||cost =>  1.222188838623472 ||acc =>  0.4530672579453067
cost:  1.222188838623472
acc:  0.4530672579453067
||best_acc =>  0.4575018477457502 ||cost =>  1.2085526612234936 ||acc =>  

In [24]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 4")
print("Decision Boundary for hidden layer size 4. acc: ", acc)

w: (20, 4)
b: (1, 4)
w: (4, 6)
b: (1, 6)
w: (6, 4)
b: (1, 4)
w: (4, 4)
b: (1, 4)
||best_acc =>  0.19660014781966 ||cost =>  1.399085128895874 ||acc =>  0.19660014781966
cost:  1.399085128895874
acc:  0.19660014781966
||best_acc =>  0.2838137472283814 ||cost =>  1.3723064931815914 ||acc =>  0.2838137472283814
cost:  1.3723064931815914
acc:  0.2838137472283814
||best_acc =>  0.3155949741315595 ||cost =>  1.3652055561990348 ||acc =>  0.3155949741315595
cost:  1.3652055561990348
acc:  0.3155949741315595
||best_acc =>  0.3399852180339985 ||cost =>  1.3465000461528684 ||acc =>  0.3399852180339985
cost:  1.3465000461528684
acc:  0.3399852180339985
||best_acc =>  0.36881005173688103 ||cost =>  1.3188478635829892 ||acc =>  0.36881005173688103
cost:  1.3188478635829892
acc:  0.36881005173688103
||best_acc =>  0.3917220990391722 ||cost =>  1.2891180941147171 ||acc =>  0.3909830007390983
cost:  1.2891180941147171
acc:  0.3909830007390983
||best_acc =>  0.4072431633407243 ||cost =>  1.2647672587684

In [25]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 4")
print("Decision Boundary for hidden layer size 4. acc: ", acc)

w: (20, 4)
b: (1, 4)
w: (4, 6)
b: (1, 6)
w: (6, 6)
b: (1, 6)
w: (6, 4)
b: (1, 4)
w: (4, 4)
b: (1, 4)
||best_acc =>  0.21581670362158167 ||cost =>  1.3978359110802032 ||acc =>  0.21581670362158167
cost:  1.3978359110802032
acc:  0.21581670362158167
||best_acc =>  0.2749445676274945 ||cost =>  1.3724994102252872 ||acc =>  0.2749445676274945
cost:  1.3724994102252872
acc:  0.2749445676274945
||best_acc =>  0.2934220251293422 ||cost =>  1.3707427201207483 ||acc =>  0.2926829268292683
cost:  1.3707427201207483
acc:  0.2926829268292683
||best_acc =>  0.30303030303030304 ||cost =>  1.3699802359861153 ||acc =>  0.30229120473022913
cost:  1.3699802359861153
acc:  0.30229120473022913
||best_acc =>  0.3082039911308204 ||cost =>  1.3692390315854386 ||acc =>  0.3074648928307465
cost:  1.3692390315854386
acc:  0.3074648928307465
||best_acc =>  0.3170731707317073 ||cost =>  1.3681552455414112 ||acc =>  0.3170731707317073
cost:  1.3681552455414112
acc:  0.3170731707317073
||best_acc =>  0.331116038433