In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense

Using TensorFlow backend.


In [2]:
def load_data():
    data = open('./data_02.txt', mode='r', encoding='UTF-8').read()
    data = data.replace('\n', '').replace('\r', '')
    return data

In [3]:
# 滑动窗口提取数据
def extract_data(data, slide):
    x = []
    y = []
    for i in range(len(data) - slide):
        x.append([a for a in data[i:i + slide]])
        y.append(data[i + slide])
    return x, y

In [4]:
# 字符到数字的批量转化
def char_to_int_data(x, y, char_to_int):
    x_to_int = []
    y_to_int = []
    for i in range(len(x)):
        x_to_int.append([char_to_int[char] for char in x[i]])
        y_to_int.append([char_to_int[char] for char in y[i]])
    return x_to_int, y_to_int

In [5]:
# 实现输入字符文章的批量处理，输入整个字符、滑动窗口大小、转化字典
def data_preprocessing(data, slide, num_letters, char_to_int):
    char_Data = extract_data(data, slide)
    int_Data = char_to_int_data(char_Data[0], char_Data[1], char_to_int)
    Input = int_Data[0]
    Output = list(np.array(int_Data[1]).flatten())
    Input_RESHAPED = np.array(Input).reshape(len(Input), slide)
    new = np.random.randint(0, 10, size=[Input_RESHAPED.shape[0], Input_RESHAPED.shape[1], num_letters])
    for i in range(Input_RESHAPED.shape[0]):
        for j in range(Input_RESHAPED.shape[1]):
            new[i, j, :] = to_categorical(Input_RESHAPED[i, j], num_classes=num_letters)
    return new, Output

In [6]:
# 文本数据
data = load_data()

In [7]:
# 形成字典
letters = list(set(data))
print('letters: {}'.format(letters))
int_to_char_dictionary = {a: b for a, b in enumerate(letters)}
char_int_to_dictionary = {b: a for a, b in enumerate(letters)}
print('int_to_char_dictionary: {}'.format(int_to_char_dictionary))
print('char_int_to_dictionary: {}'.format(char_int_to_dictionary))

letters: ['j', 'a', 'r', 'H', 'q', 'c', 'v', 'A', '0', '!', '2', '4', 'S', 'y', 'h', '’', 'N', 'l', 'p', 'b', '9', 'w', 'e', ',', 'd', 'C', 't', 'U', 'M', 'i', '"', 'u', 'B', ' ', 'x', 'F', 'O', 'm', 'g', 'z', 'I', 'T', 'k', "'", '5', 'D', 'f', '-', 'n', 's', 'P', 'W', '1', ';', 'o', '.']
int_to_char_dictionary: {0: 'j', 1: 'a', 2: 'r', 3: 'H', 4: 'q', 5: 'c', 6: 'v', 7: 'A', 8: '0', 9: '!', 10: '2', 11: '4', 12: 'S', 13: 'y', 14: 'h', 15: '’', 16: 'N', 17: 'l', 18: 'p', 19: 'b', 20: '9', 21: 'w', 22: 'e', 23: ',', 24: 'd', 25: 'C', 26: 't', 27: 'U', 28: 'M', 29: 'i', 30: '"', 31: 'u', 32: 'B', 33: ' ', 34: 'x', 35: 'F', 36: 'O', 37: 'm', 38: 'g', 39: 'z', 40: 'I', 41: 'T', 42: 'k', 43: "'", 44: '5', 45: 'D', 46: 'f', 47: '-', 48: 'n', 49: 's', 50: 'P', 51: 'W', 52: '1', 53: ';', 54: 'o', 55: '.'}
char_int_to_dictionary: {'j': 0, 'a': 1, 'r': 2, 'H': 3, 'q': 4, 'c': 5, 'v': 6, 'A': 7, '0': 8, '!': 9, '2': 10, '4': 11, 'S': 12, 'y': 13, 'h': 14, '’': 15, 'N': 16, 'l': 17, 'p': 18, 'b': 

In [8]:
# 依据前20个字符，预测第21个字符
count = 20
X, y = data_preprocessing(
    data=data,
    slide=count,
    num_letters=len(letters),
    char_to_int=char_int_to_dictionary
)
print(X.shape, len(y))

(64348, 20, 56) 64348


In [9]:
# 训练-测试 数据集分离
X_train, X_test, y_train, y_test = train_test_split(X, np.array(y))

In [10]:
# 将y_train转为One-hot格式
y_train_category = to_categorical(y_train, num_classes=len(letters))

In [11]:
# 建立模型
lstm = Sequential()
lstm.add(
    LSTM(
        units=count,
        input_shape=(X_train.shape[1], X_train.shape[2]),
        activation='relu'
    )
)
lstm.add(
    Dense(
        units=len(letters),
        activation='softmax'
    )
)
# 模型配置
lstm.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
# 查看模型
lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 20)                6160      
_________________________________________________________________
dense_1 (Dense)              (None, 56)                1176      
Total params: 7,336
Trainable params: 7,336
Non-trainable params: 0
_________________________________________________________________


In [18]:
# 模型训练
lstm.fit(
    X_train, y_train_category,
    batch_size=3000,
    epochs=10,
    verbose=1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x26adae7c908>

In [19]:
# train score
y_train_predict = lstm.predict_classes(X_train)
train_score = accuracy_score(y_true=y_train, y_pred=y_train_predict)
print(train_score)

# test score
y_test_predict = lstm.predict_classes(X_test)
test_score = accuracy_score(y_true=y_test, y_pred=y_test_predict)
print(test_score)

0.4538447193385964
0.4445825822092373


In [20]:
# predict score
words = 'My name is Blair. My father bought a computer for me last year. I learned to search the Internet.'
X_predict, y_predict_true = data_preprocessing(
    data=words,
    slide=count,
    num_letters=len(letters),
    char_to_int=char_int_to_dictionary
)
y_predict = lstm.predict_classes(X_predict)
predict_score = accuracy_score(y_true=y_predict_true, y_pred=y_predict)
print(predict_score)

0.3116883116883117


In [21]:
# write article contains 1000 letters
words = 'My name is Blair. My father bought a computer for me last year. I learned to search the Internet.'
word_count = 1000
while len(words) < word_count:
    words = words + ' '
    X, y = data_preprocessing(
        data=words,
        slide=count,
        num_letters=len(letters),
        char_to_int=char_int_to_dictionary
    )
    X = X[len(X) - 1, :, :]
    X = X.reshape(1, 20, 56)
    y_predict = lstm.predict_classes(X)
    y_predict = y_predict[0]
    y_predict = int_to_char_dictionary.get(y_predict)
    words = words[0:(len(words) - 1):1] + y_predict
print(words)

My name is Blair. My father bought a computer for me last year. I learned to search the Internet.
My name is Blair. My father bought a computer for me last year. I learned to search the Internet. I will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool some the chool stor in the choole have they will store the chool s