In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd

# 读取测试集数据
test_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/Symptom_Forgetfulness_Datasets/symptom_test.csv")
X_test = test_df["Abstract"]
y_test = test_df["MentionsSymptom"]

# TF-IDF向量化
vectorizer = TfidfVectorizer(max_features=5000)
X_test_vec = vectorizer.fit_transform(X_test)

# 加载Logistic回归模型
clf = LogisticRegression(max_iter=200)
clf.fit(X_test_vec, y_test)

# 生成预测结果
y_pred_logistic = clf.predict(X_test_vec)

# 保存预测结果为CSV
logistic_pred_df = pd.DataFrame({"Abstract": X_test, "Prediction": y_pred_logistic})
logistic_pred_df.to_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/logistic_predictions.csv", index=False, encoding='utf-8')

In [21]:
from transformers import DistilBertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import pandas as pd

# 读取测试集数据
df_test = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/Symptom_Forgetfulness_Datasets/symptom_test.csv")

# 加载DistilBERT模型和Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# 预处理测试集数据
test_dataset = Dataset.from_pandas(df_test[["Abstract", "MentionsSymptom"]].rename(columns={"Abstract": "text", "MentionsSymptom": "label"}))
test_dataset = test_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length"), batched=True)

# 更改输出路径为桌面
output_dir = "/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/"  # 修改路径

# 检查输出路径是否存在，不存在则创建
import os
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"创建输出目录：{output_dir}")

# 设置训练参数
training_args = TrainingArguments(
    output_dir=output_dir,  # 使用绝对路径，避免相对路径权限问题
    per_device_eval_batch_size=16,
)

# 创建Trainer对象
trainer = Trainer(
    model=model,
    args=training_args,
)

# 生成预测结果
predictions = trainer.predict(test_dataset)
y_pred_distilbert = np.argmax(predictions.predictions, axis=1)

# 保存预测结果为CSV
distilbert_pred_df = pd.DataFrame({"Abstract": df_test["Abstract"], "Prediction": y_pred_distilbert})
distilbert_pred_df.to_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/distilbert_predictions.csv", index=False, encoding='utf-8')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1442/1442 [00:00<00:00, 2663.50 examples/s]


In [22]:
import pandas as pd
import numpy as np

# 加载测试集数据和模型预测结果
true_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/Symptom_Forgetfulness_Datasets/symptom_test.csv")   # 包含真实标签
logistic_pred_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/logistic_predictions.csv")  # Logistic回归预测结果
distilbert_pred_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/distilbert_predictions.csv")  # DistilBERT预测结果

# Check if the data format is correct
print("Sample from the test dataset:")
print(true_df.head())

print("\nSample from Logistic regression predictions:")
print(logistic_pred_df.head())

print("\nSample from DistilBERT predictions:")
print(distilbert_pred_df.head())

# Extract texts, true labels, and predicted labels
texts = true_df["Abstract"]
y_true = true_df["MentionsSymptom"]
y_pred_logistic = logistic_pred_df["Prediction"]
y_pred_distilbert = distilbert_pred_df["Prediction"]


Sample from the test dataset:
       PMID                                           Abstract  \
0  39956389  Aging is a major risk factor for neurodegenera...   
1  40276322  Alzheimer's disease (AD) clinical trials with ...   
2  39858593  A notion of the continuous production of amylo...   
3  39751865  The complex set of interactions between the im...   
4  39846482  The neuropathologies of Alzheimer's disease (A...   

   MentionsSymptom  
0                1  
1                0  
2                0  
3                0  
4                0  

Sample from Logistic regression predictions:
                                            Abstract  Prediction
0  Aging is a major risk factor for neurodegenera...           0
1  Alzheimer's disease (AD) clinical trials with ...           0
2  A notion of the continuous production of amylo...           0
3  The complex set of interactions between the im...           0
4  The neuropathologies of Alzheimer's disease (A...           0

Sample fro

In [23]:
# Load true labels and prediction labels
true_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/Symptom_Forgetfulness_Datasets/symptom_test.csv")
logistic_pred_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/logistic_predictions.csv")
distilbert_pred_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/distilbert_predictions.csv")

# Extract texts, true labels, and predicted labels
texts = true_df["Abstract"]
y_true = true_df["MentionsSymptom"]
y_pred_logistic = logistic_pred_df["Prediction"]
y_pred_distilbert = distilbert_pred_df["Prediction"]

# Store success and failure cases
success_cases = []
failure_cases = []

for i in range(len(y_true)):
    text = texts[i]
    true_label = y_true[i]
    pred_log = y_pred_logistic[i]
    pred_distilbert = y_pred_distilbert[i]

    # Success cases: both models predict correctly
    if pred_log == true_label and pred_distilbert == true_label:
        success_cases.append((text, true_label, pred_log, pred_distilbert))
    
    # Failure cases: either model predicts incorrectl
    if pred_log != true_label or pred_distilbert != true_label:
        failure_cases.append((text, true_label, pred_log, pred_distilbert))

# Print case statistics
print(f"Number of success cases: {len(success_cases)}")
print(f"Number of failure cases: {len(failure_cases)}")

# Create dataframes for success and failure cases
success_df = pd.DataFrame(success_cases, columns=['Text', 'True Label', 'Logistic Prediction', 'DistilBERT Prediction'])
failure_df = pd.DataFrame(failure_cases, columns=['Text', 'True Label', 'Logistic Prediction', 'DistilBERT Prediction'])

# Save success and failure cases as CSV files
success_df.to_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/success_cases.csv", index=False, encoding='utf-8')
failure_df.to_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/failure_cases.csv", index=False, encoding='utf-8')

print("\nSuccess and failure cases have been saved to:")
print("1. success_cases.csv")
print("2. failure_cases.csv")


Number of success cases: 263
Number of failure cases: 1179

Success and failure cases have been saved to:
1. success_cases.csv
2. failure_cases.csv


In [24]:
import pandas as pd

# Load success and failure case data
success_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/success_cases.csv")
failure_df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/failure_cases.csv")

# Print data format examples
print("Sample from success cases:")
print(success_df.head(), "\n")

print("Sample from failure cases:")
print(failure_df.head(), "\n")


Sample from success cases:
                                                Text  True Label  \
0  The neuropathologies of Alzheimer's disease (A...           0   
1  Bipolar disorder (BD) is a prevalent mood diso...           1   
2  "The eyes are a window to the brain," promptin...           1   
3  Longitudinal investigation of the Apolipoprote...           1   
4  Geriatric patients have difficulty to comply t...           0   

   Logistic Prediction  DistilBERT Prediction  
0                    0                      0  
1                    1                      1  
2                    1                      1  
3                    1                      1  
4                    0                      0   

Sample from failure cases:
                                                Text  True Label  \
0  Aging is a major risk factor for neurodegenera...           1   
1  Alzheimer's disease (AD) clinical trials with ...           0   
2  A notion of the continuous production of

In [25]:
from collections import Counter

# Extract all success case texts
success_texts = success_df["Text"].tolist()

# Word frequency statistics in successful cases
word_counts = Counter(" ".join(success_texts).split())
top_words = word_counts.most_common(20)

print("Most common words in successful cases:")
for word, freq in top_words:
    print(f"{word}: {freq}")

# Proportion of cases where both models predict correctly
both_correct = len(success_df[(success_df['Logistic Prediction'] == success_df['True Label']) & 
                              (success_df['DistilBERT Prediction'] == success_df['True Label'])])
print(f"\nProportion of success cases where both models predict correctly: {both_correct / len(success_df) * 100:.2f}%")


Most common words in successful cases:
and: 2953
the: 2116
of: 1965
in: 1379
to: 1188
with: 1002
for: 796
a: 787
cognitive: 544
were: 534
The: 357
on: 349
was: 348
=: 331
that: 329
is: 315
AD: 300
from: 296
as: 289
Alzheimer's: 287

Proportion of success cases where both models predict correctly: 100.00%


In [26]:
# Extract text features from failure cases
failure_texts = failure_df["Text"].tolist()

# Word frequency statistics in failure cases
word_counts_fail = Counter(" ".join(failure_texts).split())
top_words_fail = word_counts_fail.most_common(20)

print("\nMost common words in failure cases:")
for word, freq in top_words_fail:
    print(f"{word}: {freq}")

# Proportion of conflict cases where both models predict differently
conflict_cases = len(failure_df[(failure_df['Logistic Prediction'] != failure_df['DistilBERT Prediction'])])
print(f"\nProportion of failure cases with conflicting predictions between both models: {conflict_cases / len(failure_df) * 100:.2f}%")



Most common words in failure cases:
and: 11380
the: 9996
of: 8892
in: 6555
to: 4646
a: 3286
with: 3234
for: 2607
is: 1781
as: 1699
AD: 1632
were: 1631
that: 1610
The: 1406
disease: 1384
by: 1339
Alzheimer's: 1314
was: 1309
on: 1159
cognitive: 967

Proportion of failure cases with conflicting predictions between both models: 97.37%


In [27]:
# Compare error types between the two models
logistic_errors = len(failure_df[failure_df['Logistic Prediction'] != failure_df['True Label']])
distilbert_errors = len(failure_df[failure_df['DistilBERT Prediction'] != failure_df['True Label']])

print(f"\nNumber of Logistic regression prediction errors: {logistic_errors}")
print(f"Number of DistilBERT prediction errors: {distilbert_errors}")

# Calculate the proportion of conflict cases between the models
conflict_ratio = conflict_cases / len(failure_df)
print(f"\nModel conflict ratio: {conflict_ratio * 100:.2f}%")



Number of Logistic regression prediction errors: 182
Number of DistilBERT prediction errors: 1028

Model conflict ratio: 97.37%
