# Part 1

In [14]:
import numpy as np

# data I/O
# NOTE(review): machine-specific absolute path; point it at your local copy of the corpus.
with open(r'D:\Deep_Learning\hw5\shakespeare_train.txt', 'r') as corpus_file:
    data = corpus_file.read() # should be simple plain text file
# Distinct characters of the corpus. Sorted so that the char <-> index mapping is
# the same on every run (the iteration order of a set is not guaranteed to be).
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars) # corpus length and number of distinct characters
print ('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = {ch: index for index, ch in enumerate(chars)} # character -> integer index
ix_to_char = {index: ch for index, ch in enumerate(chars)} # integer index -> character

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters: weights drawn from a standard normal scaled by 0.01, biases zero
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias



data has 268330 characters, 62 unique.


In [40]:
def lossFun(inputs, targets, hprev):
  """
  Run one forward and one backward pass of the RNN over a character sequence.

  inputs, targets: equal-length lists of integer character indices; targets[t]
    is the character the model should predict after reading inputs[t].
  hprev: Hx1 array, the hidden state that precedes inputs[0].

  Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

  Returns (loss, dWxh, dWhh, dWhy, dbh, dby, h_last):
    loss   - cross-entropy summed over the sequence (a 1-element array for a
             non-empty sequence, the integer 0 for an empty one)
    d*     - gradient of the loss w.r.t. each parameter, clipped to [-5, 5]
    h_last - hidden state after the last input (a copy of hprev if inputs is empty)
  """
  xs, hs, ps = {}, {}, {}
  hs[-1] = np.copy(hprev)  # t starts at 0, so the state "before" step 0 lives at index -1
  loss = 0

  # forward pass
  for t in range(len(inputs)):
    # 1-of-k (one-hot) encoding of the input character, shape (vocab_size, 1)
    xs[t] = np.zeros((vocab_size, 1))
    xs[t][inputs[t]] = 1

    # hidden state: tanh(Wxh*x_t + Whh*h_(t-1) + bh)
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
    # unnormalized log probabilities for the next character
    y = np.dot(Why, hs[t]) + by
    # Softmax with the maximum subtracted first: mathematically identical, but
    # np.exp can no longer overflow to inf and turn the probabilities into NaN.
    shifted = y - np.max(y)
    exp_shifted = np.exp(shifted)
    norm = np.sum(exp_shifted)
    ps[t] = exp_shifted / norm
    # cross-entropy -log(ps[t][target]) written in log-sum-exp form, which stays
    # finite even when the target probability underflows to 0
    loss += np.log(norm) - shifted[targets[t]]

  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[-1])  # hs[-1] always exists, even for an empty sequence
  for t in reversed(range(len(inputs))):
    # gradient of softmax + cross-entropy w.r.t. the logits: probabilities minus
    # the one-hot target (see https://blog.csdn.net/u014313009/article/details/51045303)
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    # backprop into the hidden state; the gradient flowing back from step t+1
    # (dhnext) is added on top (see https://blog.csdn.net/wjc1182511338/article/details/79191099)
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
  """
  Sample a sequence of n character indices from the model.

  h is the memory (hidden) state to start from, seed_ix is the index of the
  character fed in at the first time step. Every sampled character is fed back
  as the next input. Reads the module-level Wxh, Whh, Why, bh, by, vocab_size.

  Returns the list of sampled indices (the seed itself is not included).
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    # softmax with the maximum subtracted first, so np.exp cannot overflow to inf
    # and hand NaN probabilities to np.random.choice
    exp_y = np.exp(y - np.max(y))
    p = exp_y / np.sum(exp_y)
    # p is a (vocab_size, 1) column; np.random.choice needs a 1-D probability vector
    ix = np.random.choice(vocab_size, p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

n, p = 0, 0 # iteration counter and position of the data pointer
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad: running sums of squared gradients
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
max_iters = 100000 # number of parameter updates to perform
sample_every = 0 # set to e.g. 100 to print a 200-character sample every that many iterations; 0 disables
while n < max_iters:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0: # first iteration, or the corpus has been swept once: start over
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] # each target is the character following its input

  # sample from the model now and then
  if sample_every and n % sample_every == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print ('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  # lossFun hands the loss back as a 1-element array; collapse it to a plain float so
  # that '%f' below does not rely on the deprecated array -> scalar conversion
  loss = float(np.sum(loss))
  # exponential moving average of the loss: a single 25-character batch is too
  # noisy to read progress from, so each new loss only contributes 0.1%
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam # element-wise
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter

# Part 2

In [310]:
import pickle
# NOTE(review): pickle.load can run arbitrary code while unpickling; only load
# snapshot files that come from a source you trust.
# encoding="latin-1" is the setting the pickle documentation gives for reading
# NumPy arrays pickled by Python 2 (presumably how this snapshot was written).
with open("char-rnn-snapshot.pkl", 'rb') as f:
    snapshot = pickle.load(f, encoding="latin-1")
Wxh = snapshot["Wxh"]  # weights
Whh = snapshot["Whh"]
Why = snapshot["Why"]
bh = snapshot["bh"]   # biases
by = snapshot["by"]
mWxh, mWhh, mWhy = snapshot["mWxh"], snapshot["mWhh"], snapshot["mWhy"]
mbh, mby = snapshot["mbh"], snapshot["mby"]
# the vocabulary metadata is stored wrapped in arrays; .tolist() unwraps each entry
chars = snapshot["chars"].tolist()
data_size = snapshot["data_size"].tolist()
vocab_size = snapshot["vocab_size"].tolist()
char_to_ix = snapshot["char_to_ix"].tolist()
ix_to_char = snapshot["ix_to_char"].tolist()
# take the hidden size from the loaded weights instead of hard-coding it, so it
# can never disagree with the snapshot
hidden_size = np.shape(Whh)[0]

In [295]:
def sampleWithTemperature(h, seed_ix, n, tpt):
  """
  Sample n character indices from the model with a softmax temperature.

  h is the hidden state to start from, seed_ix the index of the first input
  character, and tpt the temperature the logits are divided by before the
  softmax. Every sampled character is fed back as the next input. Reads the
  module-level Wxh, Whh, Why, bh, by, vocab_size.

  Returns (ixes, h): the list of sampled indices and the hidden state after the
  last step.
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    # temperature-scaled softmax with the maximum subtracted first, so np.exp
    # cannot overflow to inf and hand NaN probabilities to np.random.choice
    scaled = y / tpt
    exp_scaled = np.exp(scaled - np.max(scaled))
    p = exp_scaled / np.sum(exp_scaled)
    # p is a (vocab_size, 1) column; np.random.choice needs a 1-D probability vector
    ix = np.random.choice(vocab_size, p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes,h

In [389]:
# Draw 50 characters at temperature 10, seeded with 's' and an all-zero hidden
# state; only the sampled indices (first element of the result) are needed.
startChar = 's'
hprev = np.zeros((hidden_size, 1))
sample_ix = sampleWithTemperature(h=hprev, seed_ix=char_to_ix[startChar], n=50, tpt=10)[0]
txt = startChar + ''.join([ix_to_char[ix] for ix in sample_ix])
print(txt)

suwOUo&;iSplU&NBWtuWBSwEta,tYEvMLVF MHPNINBGiICihG'


# Part 3

In [87]:
def sampleFromString(starter, n, tpt):
    """
    Continue the string `starter` with n characters sampled from the model.

    starter: prompt text; every character must be a key of char_to_ix.
    n: number of characters to generate.
    tpt: softmax temperature the logits are divided by.

    Reads the module-level Wxh, Whh, Why, bh, by, vocab_size, hidden_size,
    char_to_ix and ix_to_char.

    Returns starter followed by the generated continuation.
    Raises ValueError if starter is empty while n > 0, because there is then no
    character to seed the generation with.
    """
    starterIx = [char_to_ix[ch] for ch in starter]
    if n > 0 and not starterIx:
        raise ValueError("starter must contain at least one character")

    # Warm the hidden state up on every starter character except the last one.
    # The last character is the first input of the generation loop below, which
    # performs its state update itself; updating on it here as well would feed
    # it to the network twice.
    h = np.zeros((hidden_size,1))
    for ix in starterIx[:-1]:
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)

    # first generation input: the last starter character
    x = np.zeros((vocab_size, 1))
    if starterIx:
        x[starterIx[-1]] = 1

    # generate text
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # temperature-scaled softmax with the maximum subtracted first, so
        # np.exp cannot overflow to inf
        scaled = y / tpt
        exp_scaled = np.exp(scaled - np.max(scaled))
        p = exp_scaled / np.sum(exp_scaled)
        # p is a (vocab_size, 1) column; np.random.choice needs a 1-D vector
        ix = np.random.choice(vocab_size, p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)

    continuation = "".join([ix_to_char[ix] for ix in ixes])
    fullText = starter+continuation
    return fullText

In [391]:
# Continue the prompt 'she' with 50 sampled characters at temperature 1, then
# print the settings used, a separator line and the resulting text.
starter, n, tpt = 'she', 50, 1
text = sampleFromString(starter=starter, n=n, tpt=tpt)
print("starter = ", starter, "   n = ", n, "   temperature = ", tpt)
print("----------------------------------------------")
print(text)

starter =  she    n =  50    temperature =  1
----------------------------------------------
she blawn,
Will kees tire the fire ceem of seare to o


# Part 4

In [376]:
def findRelevantWeight(init, next, samples=500):
    """
    Find the hidden units that contribute most to predicting `next` after `init`.

    init: the character that is fed in (e.g. ':').
    next: the character whose prediction is analysed (e.g. '\n').
    samples: number of randomised hidden states to average over, in addition to
        the state obtained when `init` is the very first character.

    The hidden state right after reading `init` is averaged over samples + 1
    contexts and multiplied element-wise with the row of Why that produces the
    logit of `next`. The 10 units with the largest product are printed; those
    among them whose input weight Wxh[i, init_ix] is positive and above the
    mean of that column are printed and returned.

    Reads the module-level Wxh, Whh, Why, bh, vocab_size, hidden_size,
    char_to_ix, ix_to_char and calls sampleWithTemperature.

    Returns the list of selected hidden-unit indices.
    Raises ValueError if samples is negative.
    """
    if samples < 0:
        # samples == -1 would divide by zero below and quietly rank inf/NaN values
        raise ValueError("samples must be non-negative")

    init_ix = char_to_ix[init]
    next_ix = char_to_ix[next]

    x_init = np.zeros((vocab_size, 1))
    x_init[init_ix] = 1

    # hidden state when `init` is the first character of the sequence
    hprev = np.zeros((hidden_size,1))
    hprev = np.tanh(np.dot(Wxh, x_init) + np.dot(Whh, hprev) + bh)
    # accumulate in a separate array so the in-place += below can never modify
    # an array that hprev still refers to
    hprev_avg = np.copy(hprev)

    # hidden states when `init` is not the first character
    for i in range(samples):
        # "reshuffle" the RNN state: sample 10 characters starting from a random
        # seed character and keep only the resulting hidden state
        hprev = sampleWithTemperature(hprev, np.random.randint(0, len(ix_to_char)), 10, 1)[1]
        # state after feeding `init` in that context
        hprev = np.tanh(np.dot(Wxh, x_init) + np.dot(Whh, hprev) + bh)
        hprev_avg += hprev
    hprev_avg /= samples+1    # average hidden state right after `init`

    # element-wise product (not a dot product): per-unit contribution to the logit of `next`
    pred = hprev_avg.ravel()*Why[next_ix,:]
    ibprev = np.argsort(pred)[::-1][:10]   # indices of the 10 largest contributions, largest first
    print('ibprev = ',ibprev)
    avg_wxh = np.mean(Wxh[:, init_ix])
    # keep the units whose input weight for `init` is positive and above the column mean
    best_weights = [i for i in ibprev if Wxh[i, init_ix] > 0 and Wxh[i, init_ix] > avg_wxh]

    print("Weights Involved:")
    print("\tWxh at [:,%d]" % init_ix)
    print("\tWhy at [%d,:]" % next_ix)
    print("Most relevant weights:")
    print("\tWxh at [[%s], %d]" %(', '.join((str(s) for s in best_weights)), init_ix))
    print("\tWhy at [%d, [%s]]" %(next_ix, ', '.join((str(s) for s in best_weights))))
    return best_weights

In [393]:
# Which hidden units drive the prediction of ' ' right after ':'?
init = ':'
next_char = ' '  # deliberately not called `next`: that would shadow the builtin for the rest of the session
findRelevantWeight(init, next_char, samples=500)

ibprev =  [ 73 100 143 210 108  84 125 187 114 237]
' ' with probability (83.31%) after ':'
' ' with probability (83.31%) after ':'
Weights Involved:
	Wxh at [:,9]
	Why at [2,:]
Most relevant weighs:
	Wxh at [[100, 108, 125, 187, 114], 9]
	Why at [2, [100, 108, 125, 187, 114]]


[100, 108, 125, 187, 114]