<a href="https://colab.research.google.com/github/Alecia113/NLP-Ex/blob/main/E8_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import numpy as np 

from numpy import array
from numpy import argmax
from numpy import log

In [39]:
import re

# Pad sequences to the max length
def pad_sequences_pre(input_sequences, maxlen):
    output = []
    for inp in input_sequences:
        if len(inp)< maxlen:
            output.append([0]*(maxlen-len(inp)) + inp)
        else:
            output.append(inp[:maxlen])
    return output

# Prepare the data
def dataset_preparation(data):
    corpus = data.lower().split("\n")
    normalized_text=[]
    for string in corpus:
        tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
        normalized_text.append(tokens)
    tokenized_sentences=[sentence.strip().split(" ") for sentence in normalized_text]

    word_list_dict ={}
    for sent in tokenized_sentences:
        for word in sent:
            if word != "":
                word_list_dict[word] = 1
    word_list = list(word_list_dict.keys())
    word_to_index = {word:word_list.index(word) for word in word_list}

    total_words = len(word_list)+1

    # create input sequences using list of tokens
    input_sequences = []
    for line in tokenized_sentences:
        token_list = []
        for word in line:
            if word!="":
                token_list.append(word_to_index[word])
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    # pad sequences 
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences_pre(input_sequences, maxlen=max_sequence_len))

    # create predictors and label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

    return predictors, np.array(label), max_sequence_len, total_words, word_list, word_to_index

data = '''The cat and her kittens
They put on their mittens
To eat a christmas pie
The poor little kittens
They lost their mittens
And then they began to cry.

O mother dear, we sadly fear
We cannot go to-day,
For we have lost our mittens
If it be so, ye shall not go
For ye are naughty kittens'''

predictors, label, max_sequence_len, total_words, word_list, word_to_index = dataset_preparation(data)


In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

# Define the model
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim_1, hidden_dim_2, total_words):
        super(LSTMTagger, self).__init__()
        self.hidden_dim_1 = hidden_dim_1
        self.hidden_dim_2 = hidden_dim_2
        self.word_embeddings = nn.Embedding(total_words, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim_1, batch_first=True)  
        self.lstm2 = nn.LSTM(hidden_dim_1, hidden_dim_2, batch_first=True)  
        self.hidden2tag = nn.Linear(hidden_dim_2, total_words)


    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out_1, _ = self.lstm1(embeds)
        lstm_out_2, _ = self.lstm2(lstm_out_1)
        tag_space = self.hidden2tag(lstm_out_2[:,-1,:])
        # The reason we are using log_softmax here is that we want to calculate -log(p) and find the minimum score                    
        tag_scores = F.log_softmax(tag_space, dim=1)      
        return tag_scores

# Parameter setting
EMBEDDING_DIM = 10
HIDDEN_DIM_1 = 150
HIDDEN_DIM_2 = 100
batch_size=predictors.shape[0]

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM_1, HIDDEN_DIM_2, total_words).cuda()
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


sentence =torch.from_numpy(predictors).cuda().to(torch.int64)
targets = torch.from_numpy(label).cuda().to(torch.int64)


# Training
for epoch in range(100):  

    model.train()
    model.zero_grad()       
    tag_scores = model(sentence)
    loss = loss_function(tag_scores, targets)
    loss.backward()
    optimizer.step()


    if epoch % 10 == 9:
        model.eval()
        _, predicted = torch.max(tag_scores, 1)
        prediction = predicted.view(-1).cpu().numpy()
        t = targets.view(-1).cpu().numpy()
        acc = accuracy_score(prediction,t)
        print('Epoch: %d, training loss: %.4f, training acc: %.2f%%'%(epoch+1,loss.item(),100*acc))



Epoch: 10, training loss: 3.6821, training acc: 10.42%
Epoch: 20, training loss: 3.4591, training acc: 14.58%
Epoch: 30, training loss: 3.0505, training acc: 14.58%
Epoch: 40, training loss: 2.6286, training acc: 20.83%
Epoch: 50, training loss: 2.2385, training acc: 45.83%
Epoch: 60, training loss: 1.9233, training acc: 62.50%
Epoch: 70, training loss: 1.6583, training acc: 70.83%
Epoch: 80, training loss: 1.4220, training acc: 79.17%
Epoch: 90, training loss: 1.2126, training acc: 85.42%
Epoch: 100, training loss: 1.0376, training acc: 89.58%


In [41]:
# convert index to word
def ind_to_word(predicted_ind):
    for word, index in word_to_index.items():
        if index == predicted_ind:
            return word
    return ""    


# get the top k most predicted results
def get_topK(predicted, k=1):
    
    # Get the index of the highest k index
    # Since the input is just one sentence, we can use [0] to extract the prediction result
    top_k = np.argsort(predicted[0])[-k:]

    # return a list of tuple
    # tuple[0]:word_id, tuple[1]:log(p)
    return [(id, predicted[0][id]) for id in top_k]


# To-Do: modify this function
# Generate text, currently only works with k=1 
# Hint: The easist way is modifying the code from line 40-47, but it is not compulsory

def generate_text(seed_text, next_words, max_sequence_len, k=1):

    seed_candidates = [(seed_text, .0)]
    for _ in range(next_words):
        successives = []
        # if k = 1, len(seed_candidates) will always be 1
        for i in range(len(seed_candidates)):
            seed_text, score = seed_candidates[i]
            token_list = [word_to_index[word] for word in seed_text.split()]
            token_list = pad_sequences_pre([token_list], maxlen=max_sequence_len-1)

            seed_input = torch.from_numpy(np.array(token_list)).cuda().to(torch.int64)
            predicted = model(seed_input).cpu().detach().numpy()

            if k == 1:
              id, s = get_topK(predicted, k)[0]
              # get the output word
              output_word = ind_to_word(id)
              # put the word into the sentence input
              # calcualte the accumulated score by -log(p)
              successives.append((seed_text + ' ' + output_word, score - s)) 
            else:
              # Since it it only works with k = 1, we can simply use [0] to get the word id and log(p)
              # However, if k = 3, you can't simply use [0] to get the candidates
        
              id_n =[]
              s_n = []
              for j in range(k):            
              #j = 1,2 3
                #get_topK(predicted, k)[0]  #取三个最大的【（）（）（）】list【0】第一个【1】第二个【2】第三个 id 和s 应该是list
                id_n.append(get_topK(predicted, k)[j][0])
                s_n.append(get_topK(predicted, k)[j][1])
                #get_topK(predicted, 3)[0][1]    ===s  [0][0]==id
                #id, s = get_topK(predicted, k)[k] 取出【0】第一个输入反馈个index和值
              # get the output word

                output_word = ind_to_word(id_n[j])   #给出个单词
              # put the word into the sentence input
              # calcualte the accumulated score by -log(p)
                successives.append((seed_text + ' ' + output_word, score - s_n[j])) #给出个结果句子+分数。
                #是把这句话添加到里面去了
            # max_s = successives[0][1]
            # for m in range(k):
            #   if max_s <= successives[m][1]:
            #     max_s = successives 
            #     M = m
            #   else:
            #     M = 0
                

        # Get the lowest k accumulated scores (highest k accumulated probabilities)
        # Then, make them as the seed_candidate for the next word to predict
        ordered = sorted(successives, key=lambda tup: tup[1])
        seed_candidates = ordered[:k]

    return seed_candidates[0][0]    #一句话， 数值 取前三个


print(generate_text("we naughty", 3, max_sequence_len, k=1))
print(generate_text("we naughty", 3, max_sequence_len, k=3))

# Please note that it can happen that k=1 and k=3 have the same output because this is only a small dataset.


we naughty lost their mittens
we naughty lost their mittens


In [None]:
# convert index to word
def ind_to_word(predicted_ind):
    for word, index in word_to_index.items():
        if index == predicted_ind:
            return word
    return ""    


# get the top k most predicted results
def get_topK(predicted, k=1):
    
    # Get the index of the highest k index
    # Since the input is just one sentence, we can use [0] to extract the prediction result
    top_k = np.argsort(predicted[0])[-k:]

    # return a list of tuple
    # tuple[0]:word_id, tuple[1]:log(p)
    return [(id, predicted[0][id]) for id in top_k]


# To-Do: modify this function
# Generate text, currently only works with k=1 
# Hint: The easist way is modifying the code from line 40-47, but it is not compulsory

def generate_text(seed_text, next_words, max_sequence_len, k=1):

    seed_candidates = [(seed_text, .0)]
    for _ in range(next_words):
        successives = []
        for i in range(len(seed_candidates)):
            seed_text, score = seed_candidates[i]
            token_list = [word_to_index[word] for word in seed_text.split()]
            token_list = pad_sequences_pre([token_list], maxlen=max_sequence_len-1)

            seed_input = torch.from_numpy(np.array(token_list)).cuda().to(torch.int64)
            predicted = model(seed_input).cpu().detach().numpy()

            if k == 1:
              id, s = get_topK(predicted, k)[0]
              output_word = ind_to_word(id)
              successives.append((seed_text + ' ' + output_word, score - s)) 
            else:
              id_n =[]
              s_n = []
              for j in range(k):            
                id_n.append(get_topK(predicted, k)[j][0])
                s_n.append(get_topK(predicted, k)[j][1])
                output_word = ind_to_word(id_n[j])   
                successives.append((seed_text + ' ' + output_word, score - s_n[j])) 

         ordered = sorted(successives, key=lambda tup: tup[1])
        seed_candidates = ordered[:k]

    return seed_candidates[0][0]   


print(generate_text("we naughty", 3, max_sequence_len, k=1))
print(generate_text("we naughty", 3, max_sequence_len, k=3))

# Please note that it can happen that k=1 and k=3 have the same output because this is only a small dataset.


In [None]:

'''
            # Since it it only works with k = 1, we can simply use [0] to get the word id and log(p)
            # However, if k = 3, you can't simply use [0] to get the candidates
       
            id_n =[]
            s_n = []
           
          #j = 1,2 3
            #get_topK(predicted, k)[0]  #取三个最大的【（）（）（）】list【0】第一个【1】第二个【2】第三个 id 和s 应该是list
            id_n.append(get_topK(predicted, k)[0][0])
            s_n.append(get_topK(predicted, k)[0][1])
            #get_topK(predicted, 3)[0][1]    ===s  [0][0]==id
            #id, s = get_topK(predicted, k)[k] 取出【0】第一个输入反馈个index和值
          # get the output word
            output_word = ind_to_word(id)   #给出个单词
          # put the word into the sentence input
          # calcualte the accumulated score by -log(p)
            successives.append((seed_text + ' ' + output_word, score - s)) #给出个结果句子+分数。
            #是把这句话添加到里面去了
          # max_s = successives[0][1]
          # for m in range(k):
          #   if max_s <= successives[m][1]:
          #     max_s = successives 
          #     M = m
          #   else:
          #     M = 0
'''

        # Get the lowest k accumulated scores (highest k accumulated probabilities)
        # Then, make them as the seed_candidate for the next word to predict
        ordered = sorted(successives, key=lambda tup: tup[1]) #根据分数排这句话
        seed_candidates = ordered[:k]


In [33]:
get_topK(predicted, 3)[0][1]

-3.245199

In [21]:
get_topK(predicted, 3)  #[(29, -0.23189081)]
#[(9, -3.245199), (10, -2.3391106), (29, -0.23189081)] 
#list 里有k个数

[(9, -3.245199), (10, -2.3391106), (29, -0.23189081)]

In [29]:
get_topK(predicted, 0)  #[(29, -0.23189081)]

[(16, -10.416989),
 (15, -10.166246),
 (34, -10.161328),
 (35, -10.045282),
 (18, -9.923558),
 (40, -9.908958),
 (30, -9.807699),
 (21, -9.688912),
 (33, -9.55206),
 (1, -9.49195),
 (42, -8.943313),
 (0, -8.87492),
 (31, -8.700661),
 (36, -8.690514),
 (2, -8.549226),
 (6, -8.16085),
 (41, -7.982132),
 (3, -7.925426),
 (22, -7.8716936),
 (27, -7.779893),
 (38, -7.6586666),
 (14, -7.6326613),
 (12, -7.499868),
 (11, -7.3750477),
 (5, -7.2503753),
 (39, -7.1362243),
 (13, -7.058437),
 (17, -6.8277965),
 (37, -6.6968994),
 (20, -6.5761285),
 (23, -6.5300627),
 (32, -6.5007334),
 (7, -6.4987125),
 (4, -6.4331303),
 (19, -6.2710686),
 (24, -5.8766117),
 (8, -4.8873186),
 (25, -4.5415936),
 (28, -4.414175),
 (26, -3.9523525),
 (9, -3.245199),
 (10, -2.3391106),
 (29, -0.23189081)]

In [7]:
seed_text = "we naughty"
next_words = 3
max_sequence_len = max_sequence_len
#k = 3 这就恒定了
#目前是初始化，[('we naughty', 0.0)]
seed_candidates = [(seed_text, .0)] #"we naughty"  [('we naughty they our kittens', 8.618810653686523)]把里面的数据取出来
for _ in range(next_words): #3 #3  _ = 0,1,2
    successives = [] #逐次导数？
    # if k = 1, len(seed_candidates) will always be 1 # 如果k = 1，len(seed_candidates)将总是1
    for i in range(len(seed_candidates)): #len(sequences) 目前里面一句话1 这里要改。
        seed_text, score = seed_candidates[i]  #之前是文本和分数。初级然后慢慢迭代更新。
     
        token_list = [word_to_index[word] for word in seed_text.split()]  #[24, 41] 目前只有两个词。所以分开找we': 24,'naughty': 41,， 变成index
        token_list = pad_sequences_pre([token_list], maxlen=max_sequence_len-1)   #7扩充到最大长度 list

        seed_input = torch.from_numpy(np.array(token_list)).cuda().to(torch.int64)  #然后变成tensor 长度还是7 里面还是那两个单词。排在倒数的位置
        predicted = model(seed_input).cpu().detach().numpy()  #跑了那个模型RNN变array了 变成（1，43） 这个向量维度是43，现有句子。

        # Since it it only works with k = 1, we can simply use [0] to get the word id and log(p)
          # 因为它只对k = 1起作用，我们可以简单地用[0]来获得单词id和log(p)
        # However, if k = 3, you can't simply use [0] to get the candidates
          # 然而，如果k = 3，就不能简单地用[0]来获得候选者了
        id, s = get_topK(predicted, 1)[0]  #把这句话的array都丢进去做gettopK了
        # get the output word #获得输出字数
        output_word = ind_to_word(id)
        # put the word into the sentence input#把这个词放到句子的输入中
        # calcualte the accumulated score by -log(p)#用-log(p)计算累计得分。 每行的第几个 的log值 
        successives.append((seed_text + ' ' + output_word, score - s))  #candidate = [seq + [j], score + (-log(row[j])) ]、、all_candidates.append(candidate)
  #这后面一样 #每行的第几个 的log值  #score最开始的得分
    # Get the lowest k accumulated scores (highest k accumulated probabilities)
    ## 获得最低的k个累积分数（最高的k个累积概率）。
    # Then, make them as the seed_candidate for the next word to predict
    # 然后，把它们作为下一个要预测的词的种子_候选者
    ordered = sorted(successives, key=lambda tup: tup[1]) #[('we naughty go to day', 1.2721786051988602)]
    seed_candidates = ordered[:1]     #最优的那个
print(seed_candidates[0][0])  #we naughty go to day


we naughty go to day


In [13]:
seed_candidates = ordered[:1]
seed_candidates

[('we naughty go to day', 1.2721786051988602)]

In [8]:
ordered = sorted(successives, key=lambda tup: tup[1])

In [14]:
# convert index to word
def ind_to_word(predicted_ind):
    for word, index in word_to_index.items():
        if index == predicted_ind:
            return word
    return ""    


# get the top k most predicted results
def get_topK(predicted, k=1):
    
    # Get the index of the highest k index
    # Since the input is just one sentence, we can use [0] to extract the prediction result
    top_k = np.argsort(predicted[0])[-k:]

    # return a list of tuple
    # tuple[0]:word_id, tuple[1]:log(p)
    return [(id, predicted[0][id]) for id in top_k]


# To-Do: modify this function
# Generate text, currently only works with k=1 
# Hint: The easist way is modifying the code from line 40-47, but it is not compulsory

def generate_text(seed_text, next_words, max_sequence_len, k=1):

    seed_candidates = [(seed_text, .0)]
    for _ in range(next_words):
        successives = []
        # if k = 1, len(seed_candidates) will always be 1
        for i in range(len(seed_candidates)):
            seed_text, score = seed_candidates[i]
            token_list = [word_to_index[word] for word in seed_text.split()]
            token_list = pad_sequences_pre([token_list], maxlen=max_sequence_len-1)

            seed_input = torch.from_numpy(np.array(token_list)).cuda().to(torch.int64)
            predicted = model(seed_input).cpu().detach().numpy()


            # Since it it only works with k = 1, we can simply use [0] to get the word id and log(p)
            # However, if k = 3, you can't simply use [0] to get the candidates
            # for j in range(k):
            id, s = get_topK(predicted, k)[0]
            # get the output word
            output_word = ind_to_word(id)
            # put the word into the sentence input
            # calcualte the accumulated score by -log(p)
            successives.append((seed_text + ' ' + output_word, score - s)) 

        # Get the lowest k accumulated scores (highest k accumulated probabilities)
        # Then, make them as the seed_candidate for the next word to predict
        ordered = sorted(successives, key=lambda tup: tup[1])
        seed_candidates = ordered[:k]

    return seed_candidates[0][0]


print(generate_text("we naughty", 3, max_sequence_len, k=1))
print(generate_text("we naughty", 3, max_sequence_len, k=3))

# Please note that it can happen that k=1 and k=3 have the same output because this is only a small dataset.


we naughty go to pie
we naughty their mittens ye


**Sample Output** (Your output would be different, it is based on the trained model)


```
we naughty lost their mittens
```

