In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 假设有一个文本分类任务，数据集包含文本和对应的标签
# 定义样本数据
data = {
    'text': [
        'I love this movie',
        'This book is amazing',
        'The weather is nice today',
        'I hate Mondays',
        'The food tastes delicious',
        'I enjoy playing sports'
    ],
    'labels': [1, 0, 1, 1, 1, 0]  # 假设1表示正面情绪，0表示负面情绪
}

# 将数据转换为DataFrame格式
df = pd.DataFrame(data)

# 划分数据集为训练集、验证集和测试集
train_data, test_data, train_labels, test_labels = train_test_split(df['text'], df['labels'], test_size=0.2, random_state=42)
train_data, valid_data, train_labels, valid_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# 创建训练集、验证集和测试集的DataFrame
train_df = pd.DataFrame({'text': train_data, 'labels': train_labels})
valid_df = pd.DataFrame({'text': valid_data, 'labels': valid_labels})
test_df = pd.DataFrame({'text': test_data, 'labels': test_labels})

# 保存数据集为CSV文件
train_df.to_csv('train_dataset.csv', index=False)
valid_df.to_csv('valid_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)


In [19]:
# 导入必要的库和模块
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# 加载BERT模型和Tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name)

# 准备数据集
train_data = pd.read_csv('train_dataset.csv') # 准备训练数据
valid_data = pd.read_csv('valid_dataset.csv')  # 准备验证数据
test_data = pd.read_csv('test_dataset.csv')   # 准备测试数据

# 对数据进行编码和转换
train_encodings = tokenizer(train_data['text'].tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_data['text'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_data['text'].tolist(), truncation=True, padding=True)


train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_data['labels']
)).shuffle(100).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_data['labels']
)).shuffle(100).batch(16)
valid_dataset = tf.data.Dataset.from_tensor_slices((
    dict(valid_encodings),
    valid_data['labels']
)).batch(16)

# 定义微调过程的损失函数和优化器
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# 定义微调过程的评估指标
accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# 定义微调过程的训练步骤
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        outputs = model(inputs, training=True)
        logits = outputs.logits
        train_loss = loss(labels, logits)

    grads = tape.gradient(train_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    accuracy(labels, logits)

    return train_loss

# 开始微调训练过程
for epoch in range(20):
    print('Epoch:', epoch+1)
    for inputs, labels in train_dataset:
        train_loss = train_step(inputs, labels)

    for inputs, labels in valid_dataset:
        outputs = model(inputs, training=False)
        logits = outputs.logits
        accuracy(labels, logits)

    print('Training Loss:', train_loss)
    print('Validation Accuracy:', accuracy.result())

# 在测试集上评估模型性能
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

for inputs, labels in test_dataset:
    outputs = model(inputs, training=False)
    logits = outputs.logits
    test_accuracy(labels, logits)

print('Test Accuracy:', test_accuracy.result())


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1
Training Loss: tf.Tensor(0.7400327, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.25, shape=(), dtype=float32)
Epoch: 2
Training Loss: tf.Tensor(0.6724488, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.375, shape=(), dtype=float32)
Epoch: 3
Training Loss: tf.Tensor(0.6705637, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.41666666, shape=(), dtype=float32)
Epoch: 4
Training Loss: tf.Tensor(0.6443115, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.4375, shape=(), dtype=float32)
Epoch: 5
Training Loss: tf.Tensor(0.5931341, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.45, shape=(), dtype=float32)
Epoch: 6
Training Loss: tf.Tensor(0.56010824, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.45833334, shape=(), dtype=float32)
Epoch: 7
Training Loss: tf.Tensor(0.45180225, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.4642857, shape=(), dtype=float32)
Epoch: 8
Training Loss: tf.Tensor(0.5483914, shape=(),

In [13]:
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2ForSequenceClassification

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = TFGPT2ForSequenceClassification.from_pretrained(model_name)

# 准备数据集
train_data = pd.read_csv('train_dataset.csv') # 准备训练数据
valid_data = pd.read_csv('valid_dataset.csv')  # 准备验证数据
test_data = pd.read_csv('test_dataset.csv')   # 准备测试数据

# 对数据进行编码和转换
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
train_encodings = tokenizer(train_data['text'].tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_data['text'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_data['text'].tolist(), truncation=True, padding=True)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_data['labels']
)).shuffle(100).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_data['labels']
)).shuffle(100).batch(16)
valid_dataset = tf.data.Dataset.from_tensor_slices((
    dict(valid_encodings),
    valid_data['labels']
)).batch(16)

# 定义微调过程的损失函数和优化器
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# 定义微调过程的评估指标
accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# 定义微调过程的训练步骤
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        # 将输入序列和目标序列拼接起来
        input_sequence = tf.concat([inputs['input_ids'], labels[:, :-1]], axis=1)
        target_sequence = labels[:, 1:]

        # 计算模型输出
        outputs = model(input_sequence, training=True)
        logits = outputs.logits

        # 计算损失
        train_loss = loss(target_sequence, logits)

    grads = tape.gradient(train_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    accuracy(target_sequence, logits)

    return train_loss


# 开始微调训练过程
for epoch in range(10):
    print('Epoch:', epoch+1)
    for inputs, labels in train_dataset:
        train_loss = train_step(inputs, labels)

    for inputs, labels in valid_dataset:
        outputs = model(inputs, training=False)
        logits = outputs.logits
        accuracy(labels, logits)

    print('Training Loss:', train_loss)
    print('Validation Accuracy:', accuracy.result())

# 在测试集上评估模型性能
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

for inputs, labels in test_dataset:
    outputs = model(inputs, training=False)[0]
    logits = outputs.logits
    test_accuracy(labels, logits)

print('Test Accuracy:', test_accuracy.result())


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1
Training Loss: tf.Tensor(0.6976159, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.33333334, shape=(), dtype=float32)
Epoch: 2
Training Loss: tf.Tensor(0.64643514, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.33333334, shape=(), dtype=float32)
Epoch: 3
Training Loss: tf.Tensor(0.58924294, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.44444445, shape=(), dtype=float32)
Epoch: 4
Training Loss: tf.Tensor(0.5715023, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.5, shape=(), dtype=float32)
Epoch: 5
Training Loss: tf.Tensor(0.5096499, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.53333336, shape=(), dtype=float32)
Epoch: 6
Training Loss: tf.Tensor(0.50171643, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.5555556, shape=(), dtype=float32)
Epoch: 7
Training Loss: tf.Tensor(0.40239158, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.5714286, shape=(), dtype=float32)
Epoch: 8
Training Loss: tf.Tensor(0.370

In [25]:
import tensorflow as tf
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# 加载T5模型和Tokenizer
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = TFT5ForConditionalGeneration.from_pretrained(model_name)

# 准备数据集
train_data = pd.read_csv('train_dataset.csv') # 准备训练数据
valid_data = pd.read_csv('valid_dataset.csv')  # 准备验证数据
test_data = pd.read_csv('test_dataset.csv')   # 准备测试数据

# 对数据进行编码和转换
train_encodings = tokenizer(train_data, truncation=True, padding=True)
valid_encodings = tokenizer(valid_data, truncation=True, padding=True)
test_encodings = tokenizer(test_data, truncation=True, padding=True)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(100).batch(16)

valid_dataset = tf.data.Dataset.from_tensor_slices((
    dict(valid_encodings),
    valid_labels
)).batch(16)

# 定义微调过程的损失函数和优化器
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# 定义微调过程的评估指标
accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# 定义微调过程的训练步骤
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        outputs = model(inputs['input_ids'], labels=labels, training=True)
        logits = outputs.logits
        train_loss = loss(labels, logits)

    grads = tape.gradient(train_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    accuracy(labels, logits)

    return train_loss

# 开始微调训练过程
for epoch in range(10):
    print('Epoch:', epoch+1)
    for inputs, labels in train_dataset:
        train_loss = train_step(inputs, labels)

    for inputs, labels in valid_dataset:
        outputs = model(inputs['input_ids'], labels=labels, training=False)
        logits = outputs.logits
        accuracy(labels, logits)

    print('Training Loss:', train_loss)
    print('Validation Accuracy:', accuracy.result())

# 在测试集上评估模型性能
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

for inputs, labels in test_dataset:
    outputs = model(inputs['input_ids'], training=False)
    logits = outputs.logits
    test_accuracy(labels, logits)

print('Test Accuracy:', test_accuracy.result())


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1
Training Loss: tf.Tensor(0.73439735, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.5, shape=(), dtype=float32)
Epoch: 2
Training Loss: tf.Tensor(0.60361856, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.625, shape=(), dtype=float32)
Epoch: 3
Training Loss: tf.Tensor(0.59139115, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.6666667, shape=(), dtype=float32)
Epoch: 4
Training Loss: tf.Tensor(0.6323643, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.6875, shape=(), dtype=float32)
Epoch: 5
Training Loss: tf.Tensor(0.6248029, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.7, shape=(), dtype=float32)
Epoch: 6
Training Loss: tf.Tensor(0.49607703, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.7083333, shape=(), dtype=float32)
Epoch: 7
Training Loss: tf.Tensor(0.52961534, shape=(), dtype=float32)
Validation Accuracy: tf.Tensor(0.71428573, shape=(), dtype=float32)
Epoch: 8
Training Loss: tf.Tensor(0.44395247, shape=()