In [1]:
pip install transformers==4.31.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [3]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import pandas as pd
import datetime

2023-07-31 19:39:16.899900: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 启用混合精度训练（修改处）
from tensorflow.python.keras.mixed_precision.policy import Policy, set_global_policy
policy = Policy('mixed_float16')
set_global_policy(policy)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


2023-07-31 19:39:20.551864: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [5]:
# 加载预训练的BERT模型和分词器，并指定输出层有5个神经元
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# 加载数据集
dataset_path = "processed_amazon_reviews_us_Camera_v1_00.tsv"
chunksize = 5000  # 修改处：分块加载数据

In [7]:
# 准备输入数据
def generate_examples():
    for chunk in pd.read_csv(dataset_path, sep='\t', chunksize=chunksize):
        for index, row in chunk.iterrows():
            if index > 50000:
                break
            example = InputExample(guid=None,
                                   text_a=row["review_body"],
                                   text_b=None,
                                   label=row["star_rating"]-1)  # 减1使得标签值在0到4之间
            yield example

In [8]:
# 将输入数据转换为模型需要的格式
def generate_features():
    for e in generate_examples():
        try:
            input_features = tokenizer.encode_plus(e.text_a,
                                                   add_special_tokens=True,
                                                   max_length=128,
                                                   truncation=True,  # 修改处：添加截断
                                                   padding='max_length',  # 修改处：更改填充方式
                                                   return_attention_mask=True)
            yield InputFeatures(input_ids=input_features["input_ids"],
                                attention_mask=input_features["attention_mask"],
                                token_type_ids=input_features["token_type_ids"],
                                label=e.label)
        except Exception as error:
            pass

In [9]:
# 创建TensorFlow数据集
def gen():
    for f in generate_features():
        yield ({'input_ids': f.input_ids, 'attention_mask': f.attention_mask, 'token_type_ids': f.token_type_ids}, f.label)

dataset = tf.data.Dataset.from_generator(gen,
                                         ({'input_ids': tf.int32, 'attention_mask': tf.int32, 'token_type_ids': tf.int32}, tf.int64),
                                         ({'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None]), 'token_type_ids': tf.TensorShape([None])}, tf.TensorShape([])))


In [10]:
# 分割训练集和验证集
DATASET_SIZE = len(list(generate_features()))  # 修改处：获取特征长度
train_size = int(0.9 * DATASET_SIZE)
val_size = int(0.1 * DATASET_SIZE)
dataset = dataset.shuffle(DATASET_SIZE)
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [11]:
# 编译模型
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

In [12]:
# 设置TensorBoard回调
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [13]:
# 训练模型
model.fit(train_dataset.shuffle(100).batch(16),  # 修改处：减小批次大小
          epochs=2,
          validation_data=val_dataset.batch(16),  # 修改处：减小批次大小
          callbacks=[tensorboard_callback])

Epoch 1/2
Epoch 2/2


2023-07-31 19:41:13.233042: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 6971 of 49990
2023-07-31 19:41:23.232856: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 14388 of 49990
2023-07-31 19:41:33.232933: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 21575 of 49990
2023-07-31 19:41:43.232912: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 29062 of 49990
2023-07-31 19:41:53.232928: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 36056 of 49990
2023-07-31 19:42:03.233005: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 43285 of 49990
2023-07-31 19:42:12.359430: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] Shu

<keras.src.callbacks.History at 0x7fe898d94370>

In [14]:
# 保存模型（这部分是保存模型的代码）
model_save_path = "Users/Model18W"
model.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to Users/Model18W


In [15]:
from datetime import datetime
from datetime import timedelta
from datetime import timezone

In [16]:
SHA_TZ = timezone(
    timedelta(hours=8),
    name='Asia/Shanghai',
)

# 协调世界时
utc_now = datetime.utcnow().replace(tzinfo=timezone.utc)
print(utc_now, utc_now.tzname())
print(utc_now.date(), utc_now.tzname())

# 北京时间
beijing_now = utc_now.astimezone(SHA_TZ)
print("完成时间为：")
print(beijing_now, beijing_now.tzname())
print(beijing_now.date(), beijing_now.tzname())

2023-08-01 01:02:05.917872+00:00 UTC
2023-08-01 UTC
完成时间为：
2023-08-01 09:02:05.917872+08:00 Asia/Shanghai
2023-08-01 Asia/Shanghai
