In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

# 本地路径
data_path = "/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/alzheimers_disease_data.csv"

# 读取数据
df = pd.read_csv(data_path)

# 检查前几行
df.head()

# 移除不需要的列
df_clean = df.drop(columns=["PatientID", "DoctorInCharge"])

# 设置第一阶段的目标变量，例如预测 Forgetfulness 症状
target_variable = "Forgetfulness"

# 拆分特征和标签
X = df_clean.drop(columns=[target_variable])
y = df_clean[target_variable]

# 80% 训练集，20% 测试集，保持正负样本比例一致
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 合并特征和标签，保存为 DataFrame
train_df = X_train.copy()
train_df[target_variable] = y_train

test_df = X_test.copy()
test_df[target_variable] = y_test

# 展示训练集和测试集大小
import os
output_dir = "/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets"
train_path = os.path.join(output_dir, "train_forgetfulness.csv")
test_path = os.path.join(output_dir, "test_forgetfulness.csv")

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

(train_df.shape, test_df.shape, train_path, test_path)



((1719, 33),
 (430, 33),
 '/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/train_forgetfulness.csv',
 '/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/test_forgetfulness.csv')

In [4]:
! pip install biopython


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-macosx_11_0_arm64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-macosx_11_0_arm64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [22]:
from Bio import Entrez

# 设置邮箱（NCBI 要求）
Entrez.email = "annabian1122@gmail.com"

# 第一步：用关键词搜索文章（比如 Alzheimer's）
search_term = "Alzheimer's disease AND (symptoms OR diagnosis) AND English[lang]"
handle = Entrez.esearch(db="pubmed", term=search_term, retmax=2000)
record = Entrez.read(handle)
handle.close()

# 获取 PubMed ID 列表
id_list = record["IdList"]
print(f"找到文献数量: {len(id_list)}")

# 更干净的方式：用 XML 格式获取摘要数据
handle = Entrez.efetch(db="pubmed", id=",".join(id_list), rettype="abstract", retmode="xml")
records = Entrez.read(handle)
handle.close()

abstracts = []
pmids = []

for article in records['PubmedArticle']:
    try:
        abstract_text = article['MedlineCitation']['Article']['Abstract']['AbstractText']
        # 处理成纯字符串
        abstract_str = " ".join(abstract_text)
        pmid = article['MedlineCitation']['PMID']
        abstracts.append(abstract_str)
        pmids.append(pmid)
    except:
        # 没有摘要就跳过
        continue

# 保存成 DataFrame（确保只包含有摘要的文章）
df = pd.DataFrame({
    "PMID": pmids,
    "Abstract": abstracts
})


# 保存为 CSV 文件（指定路径）
csv_path = "/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/alzheimers_abstracts.csv"
df.to_csv(csv_path, index=False)

print(f"已保存 {len(df)} 篇摘要到 csv 文件：{csv_path}")



找到文献数量: 2000
已保存 1985 篇摘要到 csv 文件：/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/alzheimers_abstracts.csv


In [23]:
print(f"len(id_list): {len(id_list)}")
print(f"len(abstracts): {len(abstracts)}")


len(id_list): 2000
len(abstracts): 1985


In [25]:
import pandas as pd

# 读取你清洗保存的摘要数据
df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/alzheimers_abstracts.csv")

# 自定义症状关键词列表（可扩展）
symptom_keywords = [
    "memory loss", "forgetfulness", "cognitive decline", "confusion",
    "disorientation", "language impairment", "attention deficit", "behavioral change",
    "mild cognitive impairment", "cognitive symptoms"
]

# 添加新列 MentionsSymptom：是否包含任一关键词（1/0）
df["MentionsSymptom"] = df["Abstract"].apply(
    lambda text: int(any(kw.lower() in text.lower() for kw in symptom_keywords))
)

# 保存为新文件
df.to_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/alzheimers_abstracts_labeled.csv", index=False)

print("✅ 已完成关键词标签打标，保存新文件。")


✅ 已完成关键词标签打标，保存新文件。


In [27]:
# 安装 spaCy
! pip install spacy

# 下载英文小模型
! python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [29]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler

# 🔹 1. 加载 spaCy 英文模型
nlp = spacy.load("en_core_web_sm")

# 🔹 2. 添加 EntityRuler 自定义实体规则
ruler = nlp.add_pipe("entity_ruler", before="ner")

# 症状关键词列表（你可以随时扩展）
symptom_keywords = [
    "memory loss", "forgetfulness", "cognitive decline", "confusion",
    "disorientation", "language impairment", "attention deficit"
]

# 构建规则
patterns = [{"label": "SYMPTOM", "pattern": kw} for kw in symptom_keywords]
ruler.add_patterns(patterns)

# ✅ 测试单句效果（可选）
doc = nlp("The patient experienced memory loss and confusion over time.")
print("\n🧪 测试句子实体识别:")
for ent in doc.ents:
    print(f" - {ent.text} ({ent.label_})")

# 🔹 3. 读取摘要数据
df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/alzheimers_abstracts.csv")

# 🔹 4. 对前 5 条摘要执行实体识别（可以改成全部）
print("\n📋 批量处理摘要中的实体识别结果：")
for i in range(5):  # 改成 len(df) 可以处理全部
    text = df.loc[i, "Abstract"]
    print(f"\n📄 Abstract #{i+1}:\n{text}")
    
    doc = nlp(text)
    
    print("🔍 Recognized Entities:")
    for ent in doc.ents:
        print(f" - {ent.text} ({ent.label_})")



🧪 测试句子实体识别:
 - memory loss (SYMPTOM)
 - confusion (SYMPTOM)

📋 批量处理摘要中的实体识别结果：

📄 Abstract #1:
Lesion-symptom mapping methods assess the relationship between lesions caused by cerebral small vessel disease and cognition, but current technology like support vector regression (SVR)) primarily provide group-level results. We propose a novel lesion-symptom mapping approach that can indicate how lesion patterns contribute to cognitive impairment on an individual level. A convolutional neural network (CNN) predicts cognitive scores and is combined with explainable artificial intelligence (XAI) to map the relation between cognition and vascular lesions. This method was evaluated primarily using real white matter hyperintensity maps of 821 memory clinic patients and simulated cognitive data, with weighted lesions and noise levels. Simulated data provided ground truth locations to assess predictive performance of the CNN and accuracy of strategic lesion identification by XAI, using an establis

In [31]:
from sklearn.model_selection import train_test_split

# 读取你处理过的数据集（带标签）
df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/alzheimers_abstracts_labeled.csv")

# 拆分成训练 + 临时（dev+test）
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["MentionsSymptom"])

# 再从 temp 中拆出 dev 和 test（50/50）
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["MentionsSymptom"])

# 保存三个文件
train_df.to_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Abstracts_train.csv", index=False)
dev_df.to_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Abstracts_dev.csv", index=False)
test_df.to_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Abstracts_test.csv", index=False)

print("✅ 数据集拆分完成！训练/验证/测试集已保存。")


✅ 数据集拆分完成！训练/验证/测试集已保存。


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# 加载训练和测试数据
train_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Abstracts_train.csv")
test_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Abstracts_test.csv")

# 提取文本和标签
X_train = train_df["Abstract"]
y_train = train_df["MentionsSymptom"]
X_test = test_df["Abstract"]
y_test = test_df["MentionsSymptom"]

# TF-IDF 向量化
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 训练逻辑回归模型
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_vec, y_train)

# 预测 + 输出结果
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.80      0.96      0.87       199
           1       0.88      0.51      0.64        99

    accuracy                           0.81       298
   macro avg       0.84      0.73      0.76       298
weighted avg       0.82      0.81      0.80       298



In [34]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 预测测试集
y_pred = clf.predict(X_test_vec)
y_prob = clf.predict_proba(X_test_vec)[:, 1]

# 分类评估报告（包含 precision, recall, F1, accuracy）
print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

# ROC-AUC 分数
auc = roc_auc_score(y_test, y_prob)
print(f"🧠 ROC-AUC: {auc:.4f}")

from sklearn.metrics import confusion_matrix

# 计算混淆矩阵
cm = confusion_matrix(y_test, y_pred)
print("🧩 Confusion Matrix:")
print(cm)



📊 Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.96      0.87       199
           1       0.88      0.51      0.64        99

    accuracy                           0.81       298
   macro avg       0.84      0.73      0.76       298
weighted avg       0.82      0.81      0.80       298

🧠 ROC-AUC: 0.9168
🧩 Confusion Matrix:
[[192   7]
 [ 49  50]]


In [37]:
! pip install --upgrade transformers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [44]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score

# 1. 读取数据
df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Abstracts_train.csv")
df_test = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Abstracts_test.csv")

# 2. 载入 tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# 3. 预处理（tokenize）
train_dataset = Dataset.from_pandas(df[["Abstract", "MentionsSymptom"]].rename(columns={"Abstract": "text", "MentionsSymptom": "label"}))
test_dataset = Dataset.from_pandas(df_test[["Abstract", "MentionsSymptom"]].rename(columns={"Abstract": "text", "MentionsSymptom": "label"}))

train_dataset = train_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length"), batched=True)
test_dataset = test_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length"), batched=True)

# Define label map and load model
id2label = {0: "NEGATIVE", 1: "NEUTRAL"}
label2id = {"NEGATIVE": 0, "NEUTRAL": 1}
# 4. 加载模型
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)


# 5. 设置训练参数
from transformers import TrainingArguments


from transformers import (
    AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
training_args = TrainingArguments(
    output_dir="/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",                      # run eval at the end of each epoch
    save_strategy="epoch", 
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
    # fp16=True  # 只在支持 GPU 的时候开启
)


# 6. 自定义指标
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    auc = roc_auc_score(labels, pred.predictions[:, 1])
    report = classification_report(labels, preds, output_dict=True)
    return {
        "accuracy": report["accuracy"],
        "precision": report["1"]["precision"],
        "recall": report["1"]["recall"],
        "f1": report["1"]["f1-score"],
        "roc_auc": auc
    }

# 7. 训练模型
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()


Map: 100%|██████████| 1389/1389 [00:00<00:00, 3183.85 examples/s]
Map: 100%|██████████| 298/298 [00:00<00:00, 2819.22 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,No log,0.2736,0.885906,0.782609,0.909091,0.841121,0.972387
2,No log,0.131512,0.963087,0.968085,0.919192,0.943005,0.974164
3,No log,0.124372,0.969799,0.98913,0.919192,0.95288,0.968885
4,No log,0.131684,0.966443,0.968421,0.929293,0.948454,0.974773


{'eval_loss': 0.12437228858470917,
 'eval_accuracy': 0.9697986577181208,
 'eval_precision': 0.9891304347826086,
 'eval_recall': 0.9191919191919192,
 'eval_f1': 0.9528795811518325,
 'eval_roc_auc': 0.9688848281813105,
 'eval_runtime': 32.516,
 'eval_samples_per_second': 9.165,
 'eval_steps_per_second': 0.584,
 'epoch': 4.0}

In [42]:
import transformers
print(transformers.__version__)


4.51.3


In [45]:
# Define evaluation metric
import evaluate
accuracy = evaluate.load("accuracy")

In [46]:

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=preds, references=labels)

In [48]:
predictions = trainer.predict(test_dataset)


In [49]:
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids
# Define evaluation metric
import evaluate
accuracy = evaluate.load("accuracy")

In [50]:
print("\nClassification Report:")
report = classification_report(
    y_true, y_pred,
    labels=[0, 1],
    target_names=["No Symptom", "Mentions Symptom"],
    digits=4
)
print(report)

from sklearn.metrics import confusion_matrix, accuracy_score

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=[0, 1]))

print("\nOverall Accuracy:", accuracy_score(y_true, y_pred))



Classification Report:
                  precision    recall  f1-score   support

      No Symptom     0.9612    0.9950    0.9778       199
Mentions Symptom     0.9891    0.9192    0.9529        99

        accuracy                         0.9698       298
       macro avg     0.9751    0.9571    0.9653       298
    weighted avg     0.9705    0.9698    0.9695       298


Confusion Matrix:
[[198   1]
 [  8  91]]

Overall Accuracy: 0.9697986577181208
