In [1]:
# %% 1
# Package imports
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import copy

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train',  categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',  categories=categories)

# pprint(newsgroups_train.data[0])

num_train = len(newsgroups_train.data)
num_test  = len(newsgroups_test.data)

vectorizer = TfidfVectorizer(max_features=100)

X = vectorizer.fit_transform( newsgroups_train.data + newsgroups_test.data )
X_train = X[0:num_train, :]
X_test = X[num_train:num_train+num_test,:]

Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(2034, 100) (2034,)
(1353, 100) (1353,)


初始化层

In [2]:
def init_layer(input_dim, output_dim, actFunc):
    np.random.seed(0)
    W = np.random.randn(input_dim, output_dim) / np.sqrt(input_dim)
    b = np.zeros((1,output_dim))
#     print('w:',W.shape)
#     print('b:',b.shape)
    layer = {'W': W, 'b': b, 'actFunc': actFunc}
    return layer

In [3]:
def init_layers(nn_architecture):
    layers = []
    for l in nn_architecture:
        layer = init_layer(l['input_dim'], l['output_dim'], l['actFunc'])
        layers.append(layer)
    return layers

激活函数

In [4]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def relu(z):
    return np.maximum(0, z)


softmax

In [5]:
def softmax(Z):
    exp_scores = np.exp(Z)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return probs

反向传播

In [6]:
def sigmoid_backward(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1-sig)

def relu_backward(dA, Z):
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0] = 0
    return dZ

def tanh_backward(dA, Z):
    t = np.tanh(Z)
    res = (1 - t * t)
#     print('res:', res.shape)
#     print('dA:', dA.shape)
    return res * dA

损失函数

In [7]:
def loss(Z, y):
    # 计算损失
    probs = softmax(Z)
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs)
    #在损失上加上正则项（可选）
    # data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./num_examples * data_loss

前向传播

In [8]:
def single_layer_forward_prop(X, layer):
    W = layer['W']
#     print(W.shape)
    Z = X.dot(layer['W']) + layer['b']
    if layer['actFunc'] is 'relu':
        actFunction = relu
    elif layer['actFunc'] is 'sigmoid':
        actFunction = sigmoid
    else:
        actFunction = np.tanh
    return actFunction(Z), Z

In [9]:
def full_layers_forward_prop(X, layers):
    memory_forward = []
    Z_out = X
    memo_forward = {
        'Z_out': X
    }
    memory_forward.append(memo_forward)
    layers_now = 0
    for layer in layers:
#         print('forward layers_now:',layers_now)
        Z_out, Z_hide = single_layer_forward_prop(Z_out, layer)
        memo_forward = {
            'Z_out': Z_out,
            'Z_hide': Z_hide
        }
        memory_forward.append(memo_forward)
        layers_now += 1

    # 返回最终的Z_out => actFunc(Z=X*W + b)
    # memory_forward记录每一层的Z_out=actFunc(Z_hide)和Z_hide=W*X+b
#     print('Z_out: ',Z_out.shape)
    return Z_out, memory_forward

反向传播

In [10]:
def single_layer_backward_prop(memo_forward_now, memo_forward_pre, dA_now, layer):
    # 前向神经元个数
    # dA_now为由下一层传回的梯度
    # memo_forward_pre 记录上一层计算结果， Z_hide=X*w+b和Z_out => X_pre
    # memo_forward_now 记录当前层的计算结果，Z_hide => Z_now和Z_out
    X_pre = memo_forward_pre['Z_out']
    Z_now = memo_forward_now['Z_hide']
    back_dim = X_pre.shape[0]

    if layer['actFunc'] is 'sigmoid':
        actFuncBack = sigmoid_backward
    elif layer['actFunc'] is 'relu':
        actFuncBack = relu_backward
    else:
        actFuncBack = tanh_backward

    # 计算当前层外层导数
    # dZ_now = actFunc'(Z_hide)
    dZ_now = actFuncBack(dA_now, Z_now)
    # dW_now = actFunc'(Z_hide) * (X=Z_hide*dW)
#     print('X_pre',X_pre.shape)
#     print('dZ_now',dZ_now.shape)
#     print('dA_now',dA_now.shape)
#     print('Z_now',Z_now.shape)
    dW_now = X_pre.T.dot(dZ_now) / back_dim
    # db_now = actFunc'(Z_hide) * (1=Z_hide*db); 维度转换
    db_now = np.sum(dZ_now, axis=0, keepdims=True) / back_dim
#     print('dW_now:',dW_now.shape)
#     print('db_now',db_now.shape)
    # dA_pre为向前一层传递的梯度；对上一层的Z_out即本层的X求导结果
    # dA_pre = actFunc'(Z_hide) * (W=Z_hide*dX)
    W_now = copy.deepcopy(layer['W'])
    dA_pre = dZ_now.dot(W_now.T)
#     print('dA_pre',dA_pre.shape)
    
    return dA_pre,dW_now, db_now

In [11]:
def full_layers_backward_prop(Z_out, memory_forward, layers, X, y):
#     Z_out, memo_forward = full_layers_forward_prop(X, layers)
    # 反向传播
    probs = softmax(Z_out)
    probs[range(num_examples), y] -= 1
    dA_pre = probs
#     print('dA_now:', dA_now.shape)
#     print('probs:', probs.shape)
    memory_backward = []
    layers.reverse()
    memory_forward.reverse()

    length = len(layers)
    for idx in range(length):
#         print('layer_now:', idx)
        dA_pre, dW_now, db_now = single_layer_backward_prop(memory_forward[idx],memory_forward[idx+1],dA_pre,layers[idx])
        memo_backward = {
            'dW_now': dW_now,
            'db_now': db_now
        }
        memory_backward.append(memo_backward)

    return memory_backward

更新梯度

In [12]:
def update(layers, memory_backward,learning_rate):
#     print('layers: ',len(layers)
#     print('memory_backward: ', len(memory_backward))
    length = len(layers)
#     print(memory_backward)
#     print(layers)
#     print(memory_backward)
    for idx in range(length):
        dW = memory_backward[idx]['dW_now']
#         print('dW.shape: ', dW.shape)
        layers[idx]['W'] -= learning_rate * memory_backward[idx]['dW_now']
        layers[idx]['b'] -= learning_rate * memory_backward[idx]['db_now']
        
#         sgd_momentum(w, dw, config=None)
        
        
#     print(memory_backward)
#     print(layers)
    return layers

预测函数

In [13]:
def predict(X, layers):
    Z_out, memory_forward = full_layers_forward_prop(X,layers)
    probs = softmax(Z_out)
    return np.argmax(probs, axis=1)

计算准确率

In [14]:
def get_acc(X, layers):
    acc = np.mean(Y_test==predict(X, layers))
    return acc

In [15]:
def model():
    pass

训练函数

In [134]:
import math
def train(X, y, nn_architcture, epochs,base_lr):
    layers = init_layers(nn_architcture)
    cost_history = []
    accuracy_history = []
    best_acc = 0
    lr=base_lr
    
#     gamma = 0.4
    gamma = 0.99998
    power = 0.9998
    stepsize = 100
    
    # sigmoid
    gamma = 0.05
    stepsize = 200  
    i_lr = 1
    
    for i in range(epochs):
        Z_out, memory_forward = full_layers_forward_prop(X,layers)
#         print(Z_out.shape)
        cost = loss(Z_out, y)
        acc = get_acc(X_test, layers)
        cost_history.append(cost)
        accuracy_history.append(acc)
        if best_acc < acc :
            best_acc = acc
            
        if i % 100 == 0:
#             print('||best_acc => ', best_acc, '||cost => ', cost, '||acc => ', acc)
#             print('cost: ', cost)
            print('acc: ', acc)
            print('learning_rate',lr)
        
        if i % 100 == 0:
#             lr=lr * gamma **( math.floor ( i_lr / stepsize ) )
#               lr=lr * gamma ** i_lr
#               lr=lr * ( 1 - i_lr / 20 ) ** ( power )
#               lr=lr * ( 1/ ( 1 + np.exp( gamma * ( i_lr - stepsize ) )) )
              lr=lr * (1 + gamma * i_lr) ** (- power)
              i_lr +=1
    
    
#             print('i_lr',i_lr)
#             print('learning_rate',lr)
            
        memory_backward = full_layers_backward_prop(Z_out, memory_forward, layers, X, y)
        layers = update(layers, memory_backward,lr)
        
        
        layers.reverse()
        memory_forward.reverse()

    return layers, cost_history, accuracy_history

In [135]:
num_examples, nn_input_dim = X_train.shape  # 训练样本的数量  输入层的维度
nn_output_dim = 4 # 输出层的维度

# 梯度下降的参数（我直接手动赋值）
epsilon = 0.05 # 初始的学习率
reg_lambda = 0.01 # 正则化的强度
epochs = 2000

print('样本数量：', num_examples)
print('输入样本维度：',nn_input_dim)
print('输出数量：',nn_output_dim)

样本数量： 2034
输入样本维度： 100
输出数量： 4


输出

In [136]:
# nn_architcture = [
#      {'input_dim': nn_input_dim, 'output_dim': 20, 'actFunc': 'relu'},
#     {'input_dim': 20, 'output_dim': 20, 'actFunc': 'relu'},
# #     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
# #     {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
#     {'input_dim': 20, 'output_dim': nn_output_dim, 'actFunc': 'relu'},
# ]

# layers, cost_history,accuracy_history = train(X_train, Y_train, nn_architcture, epochs,epsilon)
# acc = get_acc(X_test, layers)

# print("hidden layer size 1. acc: ", acc)

In [137]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 20, 'actFunc': 'tanh'},
    {'input_dim': 20, 'output_dim': 20, 'actFunc': 'tanh'},
#     {'input_dim': 20, 'output_dim': 20, 'actFunc': 'tanh'},
#     {'input_dim': 20, 'output_dim': 8, 'actFunc': 'tanh'},
    {'input_dim': 20, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs,epsilon)
acc = get_acc(X_test, layers)

print("hidden layer size 3. acc: ", acc)

acc:  0.18551367331855137
learning_rate 0.05
acc:  0.3436807095343681
learning_rate 0.04761951228954494
acc:  0.4715447154471545
learning_rate 0.04329129093004749
acc:  0.5395417590539542
learning_rate 0.03764565307994942
acc:  0.5639320029563932
learning_rate 0.031372521523160536
acc:  0.5831485587583148
learning_rate 0.02509913733566155
acc:  0.5920177383592018
learning_rate 0.01930804184119793
acc:  0.5912786400591279
learning_rate 0.014303111675839177
acc:  0.5971914264597191
learning_rate 0.010217195877300909
acc:  0.5971914264597191
learning_rate 0.007046865636579731
acc:  0.6001478196600147
learning_rate 0.00469829140758544
acc:  0.6008869179600886
learning_rate 0.0030314214422638725
acc:  0.6001478196600147
learning_rate 0.0018948165071707788
acc:  0.6008869179600886
learning_rate 0.0011484886618975552
acc:  0.6008869179600886
learning_rate 0.0006756532661595908
acc:  0.6008869179600886
learning_rate 0.0003861307952220661
acc:  0.6008869179600886
learning_rate 0.000214542327998

In [142]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 8, 'actFunc': 'tanh'},
#     {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 8, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs,epsilon)
acc = get_acc(X_test, layers)

print("hidden layer size 2, acc: ", acc)

acc:  0.29637841832963785
learning_rate 0.05
acc:  0.4138950480413895
learning_rate 0.04761951228954494
acc:  0.49223946784922396
learning_rate 0.04329129093004749
acc:  0.5343680709534369
learning_rate 0.03764565307994942
acc:  0.5535846267553585
learning_rate 0.031372521523160536
acc:  0.5691056910569106
learning_rate 0.02509913733566155
acc:  0.5794530672579453
learning_rate 0.01930804184119793
acc:  0.5853658536585366
learning_rate 0.014303111675839177
acc:  0.5868440502586844
learning_rate 0.010217195877300909
acc:  0.5868440502586844
learning_rate 0.007046865636579731
acc:  0.5898004434589801
learning_rate 0.00469829140758544
acc:  0.590539541759054
learning_rate 0.0030314214422638725
acc:  0.5920177383592018
learning_rate 0.0018948165071707788
acc:  0.5920177383592018
learning_rate 0.0011484886618975552
acc:  0.5927568366592757
learning_rate 0.0006756532661595908
acc:  0.5927568366592757
learning_rate 0.0003861307952220661
acc:  0.5927568366592757
learning_rate 0.000214542327998

In [139]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
#     {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  = train(X_train, Y_train, nn_architcture, epochs,epsilon)
acc = get_acc(X_test, layers)

print("hidden layer size 4. acc: ", acc)

acc:  0.28085735402808576
learning_rate 0.05
acc:  0.42350332594235035
learning_rate 0.04761951228954494
acc:  0.49002217294900224
learning_rate 0.04329129093004749
acc:  0.5210643015521065
learning_rate 0.03764565307994942
acc:  0.5336289726533628
learning_rate 0.031372521523160536
acc:  0.5424981522542498
learning_rate 0.02509913733566155
acc:  0.5424981522542498
learning_rate 0.01930804184119793
acc:  0.5439763488543976
learning_rate 0.014303111675839177
acc:  0.5484109386548411
learning_rate 0.010217195877300909
acc:  0.5513673318551368
learning_rate 0.007046865636579731
acc:  0.5521064301552107
learning_rate 0.00469829140758544
acc:  0.5543237250554324
learning_rate 0.0030314214422638725
acc:  0.5558019216555802
learning_rate 0.0018948165071707788
acc:  0.5558019216555802
learning_rate 0.0011484886618975552
acc:  0.5565410199556541
learning_rate 0.0006756532661595908
acc:  0.5565410199556541
learning_rate 0.0003861307952220661
acc:  0.557280118255728
learning_rate 0.00021454232799

In [144]:
nn_architcture = [
    {'input_dim': nn_input_dim, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 6, 'actFunc': 'tanh'},
    {'input_dim': 6, 'output_dim': 4, 'actFunc': 'tanh'},
    {'input_dim': 4, 'output_dim': nn_output_dim, 'actFunc': 'tanh'},
]

layers, cost_history,accuracy_history  =  train(X_train, Y_train, nn_architcture, epochs, epsilon)
acc = get_acc(X_test, layers)

print("hidden layer size 4. acc: ", acc)

acc:  0.19807834441980784
learning_rate 0.05
acc:  0.3296378418329638
learning_rate 0.04761951228954494
acc:  0.43089430894308944
learning_rate 0.04329129093004749
acc:  0.5040650406504065
learning_rate 0.03764565307994942
acc:  0.5262379896526238
learning_rate 0.031372521523160536
acc:  0.5388026607538803
learning_rate 0.02509913733566155
acc:  0.549150036954915
learning_rate 0.01930804184119793
acc:  0.5506282335550629
learning_rate 0.014303111675839177
acc:  0.5528455284552846
learning_rate 0.010217195877300909
acc:  0.5543237250554324
learning_rate 0.007046865636579731
acc:  0.5558019216555802
learning_rate 0.00469829140758544
acc:  0.5580192165558019
learning_rate 0.0030314214422638725
acc:  0.5587583148558758
learning_rate 0.0018948165071707788
acc:  0.5580192165558019
learning_rate 0.0011484886618975552
acc:  0.5587583148558758
learning_rate 0.0006756532661595908
acc:  0.5580192165558019
learning_rate 0.0003861307952220661
acc:  0.5580192165558019
learning_rate 0.000214542327998

In [141]:
def sgd_momentum(w, dw, config=None):
    """
    带动量的sgd实现
    """
    if config is None: config = {}
    learning_rate=config.setdefault('learning_rate', 1e-2)
    mu=config.setdefault('momentum', 0.9)
    v = config.get('velocity', np.zeros_like(w))
 
    next_w = None 
    v = mu * v - learning_rate * dw 
    next_w = w + v
    
    print(learning_rate)

    config['velocity'] = v
 
    return next_w, config

In [52]:

w = 0.12
dw = 0.01
config={}
next_w,cofig=sgd_momentum(w, dw, config)
print(next_w)
print(config)


0.01
0.11989999999999999
{'learning_rate': 0.01, 'momentum': 0.9, 'velocity': -0.0001}
