In [1]:
# %% 1
# Package imports
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import copy

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train',  categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',  categories=categories)

# pprint(newsgroups_train.data[0])

num_train = len(newsgroups_train.data)
num_test  = len(newsgroups_test.data)

vectorizer = TfidfVectorizer(max_features=100)

X = vectorizer.fit_transform( newsgroups_train.data + newsgroups_test.data )
X_train = X[0:num_train, :]
X_test = X[num_train:num_train+num_test,:]

Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(2034, 100) (2034,)
(1353, 100) (1353,)


初始化层

In [6]:
def init_layer(input_dim, output_dim, actFunc):
    np.random.seed(0)
    W = np.random.randn(input_dim, output_dim) / np.sqrt(input_dim)
    b = np.zeros((1,output_dim))
#     print('w:',W.shape)
#     print('b:',b.shape)
    layer = {'W': W, 'b': b, 'actFunc': actFunc}
    return layer

In [7]:
def init_layers(nn_architecture):
    layers = []
    for l in nn_architecture:
        layer = init_layer(l['input_dim'], l['output_dim'], l['actFunc'])
        layers.append(layer)
    return layers

激活函数

In [8]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def relu(z):
    return np.maximum(0, z)


softmax

In [9]:
def softmax(Z):
    exp_scores = np.exp(Z)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return probs

反向传播

In [10]:
def sigmoid_backward(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1-sig)

def relu_backward(dA, Z):
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0] = 0
    return dZ

def tanh_backward(dA, Z):
    t = np.tanh(Z)
    res = (1 - t * t)
#     print('res:', res.shape)
#     print('dA:', dA.shape)
    return res * dA

损失函数

In [11]:
def loss(Z, y):
    # 计算损失
    probs = softmax(Z)
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs)
    #在损失上加上正则项（可选）
    # data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./num_examples * data_loss

前向传播

In [12]:
def single_layer_forward_prop(X, layer):
    W = layer['W']
#     print(W.shape)
    Z = X.dot(layer['W']) + layer['b']
    if layer['actFunc'] is 'relu':
        actFunction = relu
    elif layer['actFunc'] is 'sigmoid':
        actFunction = sigmoid
    else:
        actFunction = np.tanh
    return actFunction(Z), Z

In [13]:
def full_layers_forward_prop(X, layers):
    memory_forward = []
    Z_out = X
    memo_forward = {
        'Z_out': X
    }
    memory_forward.append(memo_forward)
    layers_now = 0
    for layer in layers:
#         print('forward layers_now:',layers_now)
        Z_out, Z_hide = single_layer_forward_prop(Z_out, layer)
        memo_forward = {
            'Z_out': Z_out,
            'Z_hide': Z_hide
        }
        memory_forward.append(memo_forward)
        layers_now += 1

    # 返回最终的Z_out => actFunc(Z=X*W + b)
    # memory_forward记录每一层的Z_out=actFunc(Z_hide)和Z_hide=W*X+b
#     print('Z_out: ',Z_out.shape)
    return Z_out, memory_forward

反向传播

In [14]:
def single_layer_backward_prop(memo_forward_now, memo_forward_pre, dA_now, layer):
    # 前向神经元个数
    # dA_now为由下一层传回的梯度
    # memo_forward_pre 记录上一层计算结果， Z_hide=X*w+b和Z_out => X_pre
    # memo_forward_now 记录当前层的计算结果，Z_hide => Z_now和Z_out
    X_pre = memo_forward_pre['Z_out']
    Z_now = memo_forward_now['Z_hide']
    back_dim = X_pre.shape[0]

    if layer['actFunc'] is 'sigmoid':
        actFuncBack = sigmoid_backward
    elif layer['actFunc'] is 'relu':
        actFuncBack = relu_backward
    else:
        actFuncBack = tanh_backward

    # 计算当前层外层导数
    # dZ_now = actFunc'(Z_hide)
    dZ_now = actFuncBack(dA_now, Z_now)
    # dW_now = actFunc'(Z_hide) * (X=Z_hide*dW)
#     print('X_pre',X_pre.shape)
#     print('dZ_now',dZ_now.shape)
#     print('dA_now',dA_now.shape)
#     print('Z_now',Z_now.shape)
    dW_now = X_pre.T.dot(dZ_now) / back_dim
    # db_now = actFunc'(Z_hide) * (1=Z_hide*db); 维度转换
    db_now = np.sum(dZ_now, axis=0, keepdims=True) / back_dim
#     print('dW_now:',dW_now.shape)
#     print('db_now',db_now.shape)
    # dA_pre为向前一层传递的梯度；对上一层的Z_out即本层的X求导结果
    # dA_pre = actFunc'(Z_hide) * (W=Z_hide*dX)
    W_now = copy.deepcopy(layer['W'])
    dA_pre = dZ_now.dot(W_now.T)
#     print('dA_pre',dA_pre.shape)
    
    return dA_pre,dW_now, db_now

In [15]:
def full_layers_backward_prop(Z_out, memory_forward, layers, X, y):
#     Z_out, memo_forward = full_layers_forward_prop(X, layers)
    # 反向传播
    probs = softmax(Z_out)
    probs[range(num_examples), y] -= 1
    dA_pre = probs
#     print('dA_now:', dA_now.shape)
#     print('probs:', probs.shape)
    memory_backward = []
    layers.reverse()
    memory_forward.reverse()

    length = len(layers)
    for idx in range(length):
#         print('layer_now:', idx)
        dA_pre, dW_now, db_now = single_layer_backward_prop(memory_forward[idx],memory_forward[idx+1],dA_pre,layers[idx])
        memo_backward = {
            'dW_now': dW_now,
            'db_now': db_now
        }
        memory_backward.append(memo_backward)

    return memory_backward

更新网络

In [16]:
def update(layers, memory_backward,learning_rate):
#     print('layers: ',len(layers)
#     print('memory_backward: ', len(memory_backward))
    length = len(layers)
#     print(memory_backward)
#     print(layers)
#     print(memory_backward)
    for idx in range(length):
        dW = memory_backward[idx]['dW_now']
#         print('dW.shape: ', dW.shape)
        layers[idx]['W'] -= learning_rate * memory_backward[idx]['dW_now']
        layers[idx]['b'] -= learning_rate * memory_backward[idx]['db_now']
        
#     print(memory_backward)
#     print(layers)
    return layers

预测函数

In [17]:
def predict(X, layers):
    Z_out, memory_forward = full_layers_forward_prop(X,layers)
    probs = softmax(Z_out)
    return np.argmax(probs, axis=1)

计算准确率

In [18]:
def get_acc(X, layers):
    acc = np.mean(Y_test==predict(X, layers))
    return acc

In [19]:
def model():
    pass

训练函数

In [20]:
def train(X, y, nn_architcture, epochs,base_lr):
    layers = init_layers(nn_architcture)
    cost_history = []
    accuracy_history = []
    best_acc = 0
    lr=base_lr
    
    for i in range(epochs):
        Z_out, memory_forward = full_layers_forward_prop(X,layers)
#         print(Z_out.shape)
        cost = loss(Z_out, y)
        acc = get_acc(X_test, layers)
        cost_history.append(cost)
        accuracy_history.append(acc)
        if best_acc < acc :
            best_acc = acc
            
        if i % 1000 == 0:
            print('||best_acc => ', best_acc, '||cost => ', cost, '||acc => ', acc)
            print('cost: ', cost)
            print('acc: ', acc)
            print('learning_rate',lr)

        #lr=base_lr/(1+1e-6*(i+1))
        memory_backward = full_layers_backward_prop(Z_out, memory_forward, layers, X, y)
        layers = update(layers, memory_backward,lr)
        
        layers.reverse()
        memory_forward.reverse()

    return layers, cost_history, accuracy_history

In [None]:
num_examples = X_train.shape[0] # 训练样本的数量
nn_input_dim = X_train.shape[1] # 输入层的维度
nn_output_dim = 4 # 输出层的维度

# 梯度下降的参数（我直接手动赋值）
epsilon = 0.05 # 初始的学习率
reg_lambda = 0.01 # 正则化的强度
epochs = 2000

print('样本数量：', num_examples)
print('输入样本维度：',nn_input_dim)
print('输出数量：',nn_output_dim)

输出

In [None]:
nn_architcture = [
     {'input_dim': nn_input_dim, 'output_dim': 20, 'actFunc': 'relu'},
    {'input_dim': 20, 'output_dim': 20, 'actFunc': 'relu'},
#     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 20, 'output_dim': nn_output_dim, 'actFunc': 'relu'},
]

layers, cost_history,accuracy_history = train(X_train, Y_train, nn_architcture, epochs,epsilon)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 1.")
print("Decision Boundary for hidden layer size 1. acc: ", acc)

In [23]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 20, 'actFunc': 'tanh'},
    {'input_dim': 20, 'output_dim': 20, 'actFunc': 'tanh'},
#     {'input_dim': 20, 'output_dim': 20, 'actFunc': 'tanh'},
#     {'input_dim': 20, 'output_dim': 8, 'actFunc': 'tanh'},
    {'input_dim': 20, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs,epsilon)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 3")
print("Decision Boundary for hidden layer size 3. acc: ", acc)

||best_acc =>  0.18551367331855137 ||cost =>  1.3935356280445361 ||acc =>  0.18551367331855137
cost:  1.3935356280445361
acc:  0.18551367331855137
learning_rate 0.01
||best_acc =>  0.22838137472283815 ||cost =>  1.37645351472104 ||acc =>  0.2246858832224686
cost:  1.37645351472104
acc:  0.2246858832224686
learning_rate 0.009949628981268435
||best_acc =>  0.25942350332594233 ||cost =>  1.3661297178539016 ||acc =>  0.25942350332594233
cost:  1.3661297178539016
acc:  0.25942350332594233
learning_rate 0.009801019748273843
||best_acc =>  0.28972653362897266 ||cost =>  1.357654499160218 ||acc =>  0.28898743532889876
cost:  1.357654499160218
acc:  0.28898743532889876
learning_rate 0.009558584151996714
||best_acc =>  0.3148558758314856 ||cost =>  1.3497588007067556 ||acc =>  0.3148558758314856
cost:  1.3497588007067556
acc:  0.3148558758314856
learning_rate 0.009229416212063095
||best_acc =>  0.3451589061345159 ||cost =>  1.342023928950117 ||acc =>  0.3451589061345159
cost:  1.342023928950117


||best_acc =>  0.5351071692535108 ||cost =>  1.2460807309753403 ||acc =>  0.5351071692535108
cost:  1.2460807309753403
acc:  0.5351071692535108
learning_rate 2.5774903588173645e-07
||best_acc =>  0.5351071692535108 ||cost =>  1.2460803862504852 ||acc =>  0.5351071692535108
cost:  1.2460803862504852
acc:  0.5351071692535108
learning_rate 1.6206774717369139e-07
||best_acc =>  0.5351071692535108 ||cost =>  1.2460801704969977 ||acc =>  0.5351071692535108
cost:  1.2460801704969977
acc:  0.5351071692535108
learning_rate 1.0089589908624819e-07
||best_acc =>  0.5351071692535108 ||cost =>  1.246080036799271 ||acc =>  0.5351071692535108
cost:  1.246080036799271
acc:  0.5351071692535108
learning_rate 6.21910999424082e-08
||best_acc =>  0.5351071692535108 ||cost =>  1.2460799547694674 ||acc =>  0.5351071692535108
cost:  1.2460799547694674
acc:  0.5351071692535108
learning_rate 3.795431829180078e-08
||best_acc =>  0.5351071692535108 ||cost =>  1.2460799049382802 ||acc =>  0.5351071692535108
cost:  

||best_acc =>  0.5351071692535108 ||cost =>  1.2460798322812288 ||acc =>  0.5351071692535108
cost:  1.2460798322812288
acc:  0.5351071692535108
learning_rate 4.729055091374465e-21
||best_acc =>  0.5351071692535108 ||cost =>  1.2460798322812288 ||acc =>  0.5351071692535108
cost:  1.2460798322812288
acc:  0.5351071692535108
learning_rate 1.883115973193672e-21
||best_acc =>  0.5351071692535108 ||cost =>  1.2460798322812288 ||acc =>  0.5351071692535108
cost:  1.2460798322812288
acc:  0.5351071692535108
learning_rate 7.424664529129591e-22
||best_acc =>  0.5351071692535108 ||cost =>  1.2460798322812288 ||acc =>  0.5351071692535108
cost:  1.2460798322812288
acc:  0.5351071692535108
learning_rate 2.8985053395457594e-22
||best_acc =>  0.5351071692535108 ||cost =>  1.2460798322812288 ||acc =>  0.5351071692535108
cost:  1.2460798322812288
acc:  0.5351071692535108
learning_rate 1.1203901988019215e-22
||best_acc =>  0.5351071692535108 ||cost =>  1.2460798322812288 ||acc =>  0.5351071692535108
cost:

In [24]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 8, 'actFunc': 'tanh'},
#     {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 8, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs,epsilon)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 2.")
print("Decision Boundary for hidden layer size 2, acc: ", acc)

||best_acc =>  0.29637841832963785 ||cost =>  1.373042324108176 ||acc =>  0.29637841832963785
cost:  1.373042324108176
acc:  0.29637841832963785
learning_rate 0.01
||best_acc =>  0.31263858093126384 ||cost =>  1.3580467054868781 ||acc =>  0.31263858093126384
cost:  1.3580467054868781
acc:  0.31263858093126384
learning_rate 0.009949628981268435
||best_acc =>  0.3340724316334072 ||cost =>  1.3479520935995346 ||acc =>  0.3340724316334072
cost:  1.3479520935995346
acc:  0.3340724316334072
learning_rate 0.009801019748273843
||best_acc =>  0.35550628233555065 ||cost =>  1.3399167305924713 ||acc =>  0.35476718403547675
cost:  1.3399167305924713
acc:  0.35476718403547675
learning_rate 0.009558584151996714
||best_acc =>  0.3821138211382114 ||cost =>  1.3328393361808368 ||acc =>  0.3821138211382114
cost:  1.3328393361808368
acc:  0.3821138211382114
learning_rate 0.009229416212063095
||best_acc =>  0.41611234294161126 ||cost =>  1.3262966869486423 ||acc =>  0.41611234294161126
cost:  1.3262966869

||best_acc =>  0.532150776053215 ||cost =>  1.2680615972586744 ||acc =>  0.532150776053215
cost:  1.2680615972586744
acc:  0.532150776053215
learning_rate 2.5774903588173645e-07
||best_acc =>  0.532150776053215 ||cost =>  1.2680614312847989 ||acc =>  0.532150776053215
cost:  1.2680614312847989
acc:  0.532150776053215
learning_rate 1.6206774717369139e-07
||best_acc =>  0.532150776053215 ||cost =>  1.2680613274065338 ||acc =>  0.532150776053215
cost:  1.2680613274065338
acc:  0.532150776053215
learning_rate 1.0089589908624819e-07
||best_acc =>  0.532150776053215 ||cost =>  1.268061263035466 ||acc =>  0.532150776053215
cost:  1.268061263035466
acc:  0.532150776053215
learning_rate 6.21910999424082e-08
||best_acc =>  0.532150776053215 ||cost =>  1.268061223540817 ||acc =>  0.532150776053215
cost:  1.268061223540817
acc:  0.532150776053215
learning_rate 3.795431829180078e-08
||best_acc =>  0.532150776053215 ||cost =>  1.268061199548745 ||acc =>  0.532150776053215
cost:  1.268061199548745
ac

||best_acc =>  0.532150776053215 ||cost =>  1.268061164566777 ||acc =>  0.532150776053215
cost:  1.268061164566777
acc:  0.532150776053215
learning_rate 1.883115973193672e-21
||best_acc =>  0.532150776053215 ||cost =>  1.268061164566777 ||acc =>  0.532150776053215
cost:  1.268061164566777
acc:  0.532150776053215
learning_rate 7.424664529129591e-22
||best_acc =>  0.532150776053215 ||cost =>  1.268061164566777 ||acc =>  0.532150776053215
cost:  1.268061164566777
acc:  0.532150776053215
learning_rate 2.8985053395457594e-22
||best_acc =>  0.532150776053215 ||cost =>  1.268061164566777 ||acc =>  0.532150776053215
cost:  1.268061164566777
acc:  0.532150776053215
learning_rate 1.1203901988019215e-22
||best_acc =>  0.532150776053215 ||cost =>  1.268061164566777 ||acc =>  0.532150776053215
cost:  1.268061164566777
acc:  0.532150776053215
learning_rate 4.288079715289919e-23
||best_acc =>  0.532150776053215 ||cost =>  1.268061164566777 ||acc =>  0.532150776053215
cost:  1.268061164566777
acc:  0.

In [25]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs,epsilon)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 4")
print("Decision Boundary for hidden layer size 4. acc: ", acc)

TypeError: train() missing 1 required positional argument: 'base_lr'

In [26]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs,epsilon)
acc = get_acc(X_test, layers)
# plot_decision_boundary(lambda x: predict(x, layers))
# plt.title("Decision Boundary for hidden layer size 4")
print("Decision Boundary for hidden layer size 4. acc: ", acc)

||best_acc =>  0.2564671101256467 ||cost =>  1.3928003874755799 ||acc =>  0.2564671101256467
cost:  1.3928003874755799
acc:  0.2564671101256467
learning_rate 0.05
||best_acc =>  0.5824094604582409 ||cost =>  0.9812202225650672 ||acc =>  0.5691056910569106
cost:  0.9812202225650672
acc:  0.5691056910569106
learning_rate 0.05
||best_acc =>  0.6452328159645233 ||cost =>  0.789289300993758 ||acc =>  0.6415373244641537
cost:  0.789289300993758
acc:  0.6415373244641537
learning_rate 0.05
||best_acc =>  0.6548410938654841 ||cost =>  0.7579981813165525 ||acc =>  0.6452328159645233
cost:  0.7579981813165525
acc:  0.6452328159645233
learning_rate 0.05
||best_acc =>  0.6548410938654841 ||cost =>  0.7495181534481822 ||acc =>  0.6444937176644494
cost:  0.7495181534481822
acc:  0.6444937176644494
learning_rate 0.05
Decision Boundary for hidden layer size 4. acc:  0.6496674057649667
