In [None]:
import numpy as np
import tensorflow as tf
import os
import urllib.request
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LayerNormalization, MultiHeadAttention, Reshape

# Download the Tiny Shakespeare corpus (or point file_path at your own dataset).
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
file_path = 'input.txt'
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

# Read the corpus and lower-case it. The encoding is given explicitly so the
# result does not depend on the platform's default locale.
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read().lower()

# Show the first 500 characters as a sanity check.
print(text[:500])

# Build the character <-> integer id vocabulary.
chars = sorted(set(text))
char_to_index = {char: index for index, char in enumerate(chars)}
index_to_char = {index: char for index, char in enumerate(chars)}

# Encode the whole corpus as integer ids.
text_as_int = np.array([char_to_index[char] for char in text])

# Cut the corpus into non-overlapping windows of seq_length characters.
# The target of each window is the same window shifted one character ahead.
seq_length = 100  # input sequence length
window_starts = range(0, len(text_as_int) - seq_length, seq_length)
x_data = [text_as_int[i:i + seq_length] for i in window_starts]
y_data = [text_as_int[i + 1:i + seq_length + 1] for i in window_starts]

# Stack the windows into arrays.
# X deliberately stays as raw integer ids: the model's first layer is an
# Embedding lookup, which truncates its input to int. Scaling the ids into
# [0, 1) would turn every character into id 0 and erase the input.
X = np.array(x_data)
y = np.array(y_data)

# One-hot encode the targets -> (num_sequences, seq_length, num_classes).
y = tf.keras.utils.to_categorical(y, num_classes=len(chars))

# Check the shape of y (no squeeze needed; it is already 3-D).
print(f"目标数据y形状: {y.shape}")

# Split into training and validation sets (80% / 20%).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the resulting shapes.
print(f"训练集 X 的形状: {X_train.shape}")
print(f"验证集 X 的形状: {X_val.shape}")
print(f"训练集 y 的形状: {y_train.shape}")
print(f"验证集 y 的形状: {y_val.shape}")


# Define the Transformer model
def transformer_model(seq_length, num_features, num_classes):
    """Build a one-block, causally masked Transformer for next-character prediction.

    Args:
        seq_length: Number of time steps in every input sequence.
        num_features: Values per time step. Must be 1 (a single character id);
            the trailing axis is dropped before the embedding lookup.
        num_classes: Vocabulary size. Used both as the embedding table size
            and as the width of the softmax output.

    Returns:
        An uncompiled Keras ``Model`` mapping inputs of shape
        ``(batch, seq_length, num_features)`` to per-step probabilities of
        shape ``(batch, seq_length, num_classes)``.

    Raises:
        ValueError: If ``num_features`` is not 1.
    """
    if num_features != 1:
        raise ValueError(
            f"num_features must be 1 (one character id per step), got {num_features}"
        )

    inputs = Input(shape=(seq_length, num_features))  # (batch, seq_length, 1)

    # Drop the size-1 feature axis so the Embedding receives (batch, seq_length)
    # and returns a 3-D tensor, instead of a 4-D one that has to be reshaped
    # back at the output.
    token_ids = Reshape((seq_length,))(inputs)

    # Token embedding, width 256.
    # NOTE(review): no positional encoding is added; order information only
    # enters through the causal mask below.
    embedding = Embedding(input_dim=num_classes, output_dim=256)(token_ids)

    # Multi-head self-attention. The causal mask is essential here: the target
    # at step t is the input at step t + 1, so without it every position could
    # simply read its own label from the next input position.
    attention = MultiHeadAttention(num_heads=8, key_dim=256)(
        embedding, embedding, use_causal_mask=True
    )
    attention = LayerNormalization()(embedding + attention)  # residual + norm

    # Position-wise feed-forward network, again with a residual connection.
    ff = Dense(512, activation='relu')(attention)
    ff = Dense(256)(ff)
    ff = LayerNormalization()(attention + ff)

    # Dropout for regularisation.
    ff = Dropout(0.2)(ff)

    # Per-step softmax over the vocabulary: (batch, seq_length, num_classes).
    output = Dense(num_classes, activation='softmax')(ff)

    # Build and return the model.
    return Model(inputs, output)



# Model hyper-parameters
seq_length = 100          # input sequence length
num_features = 1          # values per time step (1 for character-level text)
num_classes = len(chars)  # vocabulary size

# Build the Transformer model
model = transformer_model(seq_length, num_features, num_classes)

# Compile with Adam; the loss is categorical cross-entropy because the
# targets are one-hot encoded.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# Train, reporting validation loss after every epoch
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=128,
    verbose=1,
)




# Text generation
def generate_text_transformer(model, seed_text, num_chars, temperature=1.0):
    """Sample ``num_chars`` characters from ``model``, continuing ``seed_text``.

    The running context is encoded with the module-level ``char_to_index``
    mapping and fed as raw integer ids (not scaled), because the model's first
    layer is an Embedding lookup. If the model has a fixed window
    (``model.input_shape[1]``), the context is cropped to its most recent
    characters and right-padded with id 0 up to that length. The distribution
    is read at the last *real* position, so a causally masked model never
    sees the padding.

    Args:
        model: Keras model returning per-step probabilities of shape
            ``(1, steps, vocab)`` (or ``(1, vocab)``) from ``predict``.
        seed_text: Non-empty starting text; every character must be a key of
            ``char_to_index``.
        num_chars: Number of characters to generate.
        temperature: Sampling temperature (> 0). Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        ValueError: If ``seed_text`` is empty or ``temperature`` is not positive.
        KeyError: If ``seed_text`` contains a character outside the vocabulary.
    """
    if not seed_text:
        raise ValueError("seed_text must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    print(f"生成文本种子: {seed_text}")

    input_shape = model.input_shape
    window = input_shape[1]  # None means the model accepts any length
    context = [char_to_index[char] for char in seed_text]
    generated = []

    for _ in range(num_chars):
        # Keep only the most recent characters that fit in the model window.
        if window is not None:
            context = context[-window:]
        last_position = len(context) - 1

        # Right-pad to the fixed window length when the context is shorter.
        padded = context
        if window is not None and len(context) < window:
            padded = context + [0] * (window - len(context))

        x_input = np.asarray(padded, dtype=np.int64)[np.newaxis, :]
        if len(input_shape) == 3:
            # The model expects an explicit trailing feature axis.
            x_input = x_input[..., np.newaxis]

        # Per-step distributions for the single batch element.
        predicted_prob = np.asarray(model.predict(x_input, verbose=0))[0]
        if predicted_prob.ndim == 2:
            # Only the distribution after the last real character is needed.
            predicted_prob = predicted_prob[last_position]

        # Temperature is applied in log space and then re-normalised; dividing
        # the probabilities themselves would no longer sum to 1.
        logits = np.log(np.clip(predicted_prob.astype(np.float64), 1e-12, None))
        logits = logits / temperature
        logits = logits - logits.max()  # numerical stability
        probs = np.exp(logits)
        probs = probs / probs.sum()

        # Sample the next character id and extend the context with it.
        predicted_index = int(np.random.choice(len(probs), p=probs))
        context.append(predicted_index)
        generated.append(index_to_char[predicted_index])

    return seed_text + "".join(generated)


# Generate 500 characters continuing a fixed seed, then print the result
seed_text = "shall i compare thee to a summer's day"
generated_text = generate_text_transformer(
    model,
    seed_text,
    num_chars=500,
    temperature=1.0,
)

print(f"\n生成的文本:\n{generated_text}")


first citizen:
before we proceed any further, hear me speak.

all:
speak, speak.

first citizen:
you are all resolved rather to die than to famish?

all:
resolved. resolved.

first citizen:
first, you know caius marcius is chief enemy to the people.

all:
we know't, we know't.

first citizen:
let us kill him, and we'll have corn at our own price.
is't a verdict?

all:
no more talking on't; let it be done: away, away!

second citizen:
one word, good citizens.

first citizen:
we are accounted poor
目标数据y形状: (11153, 100, 39)
训练集 X 的形状: (8922, 100)
验证集 X 的形状: (2231, 100)
训练集 y 的形状: (8922, 100, 39)
验证集 y 的形状: (2231, 100, 39)
Epoch 1/50
[1m29/70[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m2:40[0m 4s/step - loss: 3.8224