In [3]:
import torch
from transformers import pipeline

# 打印 PyTorch 版本，确认 PyTorch 是否安装正常
print(torch.__version__)

# 使用 Hugging Face transformers 加载情感分析 pipeline，指定模型为 distilbert-base-uncased
clf = pipeline("sentiment-analysis", model="distilbert-base-uncased")

# 测试情感分析，传入一个简单的句子
print(clf("This course explains NLP very clearly."))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2.2.2


Device set to use cpu


[{'label': 'LABEL_0', 'score': 0.5807062387466431}]


In [2]:
import torch
print(torch.__version__)

2.2.2


In [3]:
import thinc
import blis

print(thinc.__version__)
!pip show blis

8.3.2
Name: blis
Version: 1.3.3
Summary: The Blis BLAS-like linear algebra library, as a self-contained C-extension.
Home-page: https://github.com/explosion/cython-blis
Author: Matthew Honnibal
Author-email: matt@explosion.ai
License: BSD
Location: E:\Anaconda\Lib\site-packages
Requires: numpy
Required-by: thinc




In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# -------------------------
# 1. 加载模型和分词器
# -------------------------
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 使用 safetensors 加载模型
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    use_safetensors=True
)

# -------------------------
# 2. 加载数据集
# -------------------------
dataset = load_dataset("imdb")

# -------------------------
# 3. 数据预处理
# -------------------------
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

encoded_dataset = dataset.map(preprocess_function, batched=True)

# -------------------------
# 4. 设置训练参数 (兼容 4.57.3)
# -------------------------
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",
    eval_steps=1000  # 每 1000 步评估一次
    # 注意：不要传 evaluation_strategy
)

# -------------------------
# 5. 创建 Trainer
# -------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"]
)

# -------------------------
# 6. 开始训练
# -------------------------
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Step,Training Loss
50,0.6886
100,0.6422


KeyboardInterrupt: 

In [8]:
import transformers
print(transformers.__version__)

4.57.3


In [2]:
raw_docs = [
    "Patient presents with tooth pain.",
    "No evidence of caries.",
    "Molar 17 has decay, patient reports pain.",
    "Patient is healthy.",
    "Mild sensitivity in upper molar."
]

# 标签：1=有症状, 0=无症状
labels = [1, 0, 1, 0, 1]

from transformers import DistilBertTokenizer, DistilBertModel
import torch

# 使用轻量级 DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

embeddings = []

for text in raw_docs:
    # 编码
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    # outputs.last_hidden_state shape = [1, seq_len, hidden_size]
    # 取平均作为句子向量
    sentence_embedding = outputs.last_hidden_state.mean(dim=1)
    embeddings.append(sentence_embedding[0].numpy())

# 转成 numpy 数组
import numpy as np
X = np.vstack(embeddings)
y = np.array(labels)

print("句子向量 shape:", X.shape)


句子向量 shape: (5, 768)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 划分训练/测试
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# 训练分类器
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# 测试预测
y_pred = clf.predict(X_test)
print("预测结果:", y_pred)
print("准确率:", accuracy_score(y_test, y_pred))

new_sentences = [
    "Patient complains of severe tooth pain.",
    "Patient has healthy teeth."
]

# 转 embedding
new_embeddings = []
for text in new_sentences:
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    new_embeddings.append(outputs.last_hidden_state.mean(dim=1)[0].numpy())

new_X = np.vstack(new_embeddings)

# 预测
preds = clf.predict(new_X)
print("新句子预测结果:", preds)

预测结果: [1 1]
准确率: 0.5
新句子预测结果: [1 1]
