In [None]:
! pip install biopython # Install biopython package to interact with PubMed

In [None]:
# Import the Entrez module from biopython, plus the helpers this cell needs
from Bio import Entrez
import time

import pandas as pd # Import pandas for data manipulation

# Contact email sent along with every E-utilities request
Entrez.email = "annabian1122@gmail.com"

# Search for articles using a keyword
search_term = "Alzheimer's disease AND English[lang]"
# NOTE(review): NCBI's ESearch documentation describes a 10,000-record ceiling
# for PubMed, so retmax=20000 may be capped server-side -- confirm against the
# printed count below.
handle = Entrez.esearch(db="pubmed", term=search_term, retmax=20000) # Search PubMed with a limit of 20,000 results
try:
    record = Entrez.read(handle) # Read the search results
finally:
    handle.close() # Always release the connection, even if parsing fails

# Get the list of PubMed IDs (PMIDs) from the search results
id_list = record["IdList"]
print(f"Number of articles found: {len(id_list)}")

# Abstracts are downloaded in batches below. (A previous one-shot efetch of
# every ID at once was removed: its result was never used, and it duplicated
# the entire download in a single oversized request.)
batch_size = 500

# Initialize lists to store abstracts and corresponding PMIDs
abstracts = []
pmids = []

for start in range(0, len(id_list), batch_size):
    end = min(start + batch_size, len(id_list))
    batch_ids = id_list[start:end]

    print(f"Fetching records {start+1} to {end}...")

    handle = Entrez.efetch(db="pubmed", id=",".join(batch_ids), rettype="abstract", retmode="xml") # Fetch abstracts in XML format
    try:
        batch_records = Entrez.read(handle) # Read the fetched data
    finally:
        handle.close() # Always release the connection, even if parsing fails

    # Loop through the fetched articles and extract abstract text and PMIDs
    for article in batch_records['PubmedArticle']:
        try:
            citation = article['MedlineCitation']
            abstract_text = citation['Article']['Abstract']['AbstractText']
            pmid = citation['PMID']
        except KeyError:
            # Skip articles without an abstract. Only the missing-key case is
            # swallowed, so interrupts and genuine bugs still surface.
            continue

        # Join the abstract sections into a single string
        abstract_str = " ".join(abstract_text)
        if not abstract_str.strip():
            # An empty abstract would be written as an empty CSV field and read
            # back as NaN by pandas, so treat it like a missing abstract.
            continue

        abstracts.append(abstract_str)
        pmids.append(str(pmid))

    # Pause between batches to stay polite to the NCBI servers
    time.sleep(0.5)


# Save the extracted data into a DataFrame (only articles with abstracts are included)
df = pd.DataFrame({
    "PMID": pmids,
    "Abstract": abstracts
})

# Save the DataFrame to a CSV file
csv_path = "alzheimers_abstracts.csv"
df.to_csv(csv_path, index=False)

print(f"Saved {len(df)} abstracts to the CSV file: {csv_path}") # Print the number of abstracts saved


In [None]:
# Sanity check: number of PMIDs returned by the search vs. abstracts actually kept
print(
    f"len(id_list): {len(id_list)}",
    f"len(abstracts): {len(abstracts)}",
    sep="\n",
)


In [None]:
import re

import pandas as pd

# Read the saved abstracts data. The path is relative so that it matches where
# the fetch step wrote the file ("alzheimers_abstracts.csv") regardless of the
# working directory.
df = pd.read_csv("alzheimers_abstracts.csv")

# An empty abstract is read back from CSV as NaN; drop such rows so that the
# tagging below (and every later step that consumes this file) only sees text.
df = df.dropna(subset=["Abstract"]).reset_index(drop=True)

# Define a custom list of symptom-related keywords
symptom_keywords = [
    "memory loss", "forgetfulness", "cognitive decline", "confusion",
    "disorientation", "language impairment", "attention deficit", "behavioral change",
    "mild cognitive impairment", "cognitive symptoms"
]

# One alternation of the escaped, lower-cased keywords: matching it against the
# lower-cased abstract is the same case-insensitive substring test as checking
# each keyword in turn.
symptom_pattern = "|".join(re.escape(kw.lower()) for kw in symptom_keywords)

# Add a new column 'MentionsSymptom': mark 1 if any keyword is found in the abstract, else 0
df["MentionsSymptom"] = (
    df["Abstract"]
    .astype(str)
    .str.lower()
    .str.contains(symptom_pattern, regex=True)
    .astype(int)
)

# Save the updated DataFrame as a new CSV file
df.to_csv("alzheimers_abstracts__symptoms.csv", index=False)
print("Successfully completed keyword tagging and saved the new file.")


In [None]:
! pip install spacy

! python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler

# Load the English spaCy model
nlp = spacy.load("en_core_web_sm")

# Add an EntityRuler to insert custom entity recognition rules before the built-in NER
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Define a list of symptom keywords.
# NOTE(review): this list omits "behavioral change", "mild cognitive impairment"
# and "cognitive symptoms", which the MentionsSymptom tagging list includes --
# confirm whether the difference is intentional.
symptom_keywords = [
    "memory loss", "forgetfulness", "cognitive decline", "confusion",
    "disorientation", "language impairment", "attention deficit"
]

# Build patterns for the EntityRuler (one SYMPTOM phrase pattern per keyword).
# NOTE(review): string phrase patterns presumably match exact text only, unlike
# the case-insensitive tagging step -- confirm.
patterns = [{"label": "SYMPTOM", "pattern": kw} for kw in symptom_keywords]
ruler.add_patterns(patterns)

# Test entity recognition on a single sample sentence
doc = nlp("The patient experienced memory loss and confusion over time.")
print("\nTesting entity recognition on a sample sentence:")
for ent in doc.ents:
    print(f" - {ent.text} ({ent.label_})")

# Read the abstracts dataset (relative path, matching where the tagging step wrote it)
df = pd.read_csv("alzheimers_abstracts__symptoms.csv")

# Perform entity recognition on up to the first 5 abstracts. head(5) copes with
# a dataset smaller than five rows, and dropna() keeps non-text values away
# from the pipeline.
sample_abstracts = df["Abstract"].dropna().head(5)

print("\nBatch processing entity recognition results for abstracts: ")
for number, text in enumerate(sample_abstracts, start=1):
    print(f"\nAbstract #{number}:\n{text}")

    doc = nlp(text)

    print("Recognized Entities:")
    for ent in doc.ents:
        print(f" - {ent.text} ({ent.label_})")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read preprocessed dataset (with symptom labels). The path is relative so it
# matches where the tagging step wrote the file.
df = pd.read_csv("alzheimers_abstracts__symptoms.csv")

# Split the data into training set and temporary set (dev + test):
# 30% of data goes to dev + test, 70% to train. Stratifying keeps the
# MentionsSymptom class balance the same in every split.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["MentionsSymptom"])

# Further split the temporary set into dev (validation) and test sets (50% each)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["MentionsSymptom"])

# Every row must land in exactly one split
assert len(train_df) + len(dev_df) + len(test_df) == len(df)

# Save the splits into separate CSV files
train_df.to_csv("symptom_train.csv", index=False)
dev_df.to_csv("symptom_dev.csv", index=False)
test_df.to_csv("symptom_test.csv", index=False)

print("Symptom recognition train/validation/test sets have been saved.")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# Read the previously saved training and testing datasets (relative paths,
# matching where the split step wrote them)
train_df = pd.read_csv("symptom_train.csv")
test_df = pd.read_csv("symptom_test.csv")

# Split features and labels.
# NOTE(review): MentionsSymptom was produced by keyword matching on this same
# Abstract text, so the model is learning to rediscover those keywords; read
# the scores below with that in mind.
X_train = train_df["Abstract"] # Extract the abstract text for training
y_train = train_df["MentionsSymptom"] # Extract the label (whether symptom is mentioned: 0 or 1) for training
X_test = test_df["Abstract"] # Extract the abstract text for testing
y_test = test_df["MentionsSymptom"] # Extract the label for testing

# Apply TF-IDF vectorization to the text
vectorizer = TfidfVectorizer(max_features=5000) # Keep only the top 5000 most frequent words
X_train_vec = vectorizer.fit_transform(X_train) # Fit on training data and transform it
X_test_vec = vectorizer.transform(X_test) # Reuse the training vocabulary; never refit on test data

# Train a Logistic Regression model
clf = LogisticRegression(max_iter=200) # Initialize the model (allow up to 200 iterations to converge)
clf.fit(X_train_vec, y_train) # Train the model on the vectorized training data

# Predict and output results
y_pred = clf.predict(X_test_vec) # Predict the labels for test data
print(classification_report(y_test, y_pred)) # Print precision, recall, f1-score, and support for each class

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Score the test set once: positive-class probabilities first, then hard 0/1 labels
y_prob = clf.predict_proba(X_test_vec)[:, 1] # Probability of class 1 for each test sample
y_pred = clf.predict(X_test_vec) # Predicted class (0 or 1) for each test sample

# Per-class precision, recall, F1-score and support
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Area under the ROC curve, computed from the class-1 probabilities
auc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC: {auc:.4f}")

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)



In [None]:
! pip install datasets

In [None]:
# All imports for the DistilBERT fine-tuning cell, gathered in one place
# (every name the original cell imported is still imported here).
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report, roc_auc_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Load training and testing datasets (relative paths, matching where the split
# step wrote them)
df = pd.read_csv("symptom_train.csv")
df_test = pd.read_csv("symptom_test.csv")

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def _to_tokenized_dataset(frame):
    """Convert a DataFrame with Abstract/MentionsSymptom columns into a tokenized Dataset.

    The columns are renamed to 'text' and 'label' for Huggingface compatibility,
    then every example is tokenized with truncation and fixed-length padding.
    The fixed length matters: the Trainer built later in this cell is given no
    padding collator, so all examples must already share one length.
    """
    renamed = frame[["Abstract", "MentionsSymptom"]].rename(
        columns={"Abstract": "text", "MentionsSymptom": "label"}
    )
    return Dataset.from_pandas(renamed).map(
        lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length"),
        batched=True,
    )


# Data Preprocessing: prepare tokenized Huggingface Dataset objects
train_dataset = _to_tokenized_dataset(df)
test_dataset = _to_tokenized_dataset(df_test)

# Define label map and load model
id2label = {0: "No Symptom", 1: "Mentions Symptom"}
label2id = {"No Symptom": 0, "Mentions Symptom": 1}

# Load DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2, # Binary classification (0 or 1)
    id2label=id2label,
    label2id=label2id
)


# Set training arguments
training_args = TrainingArguments(
    output_dir="text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",                      # run eval at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available()
)


# Define custom evaluation metrics
def compute_metrics(pred):
    """Compute binary-classification metrics from a Trainer prediction object.

    Args:
        pred: object exposing ``predictions`` (two columns of logits, indexed
            below as column 0 and column 1) and ``label_ids`` (true 0/1 labels).

    Returns:
        dict with ``accuracy``, plus ``precision``, ``recall`` and ``f1`` for
        class 1 (Mentions Symptom), and ``roc_auc``. ``roc_auc`` is NaN when
        it cannot be computed (for example only one class among the labels).
    """
    labels = np.asarray(pred.label_ids)
    logits = np.asarray(pred.predictions)
    preds = np.argmax(logits, axis=1)

    # Rank samples by the logit margin (class 1 minus class 0). The softmax
    # probability of class 1 is a monotonic function of this margin, whereas
    # the raw class-1 logit alone can order samples differently.
    scores = logits[:, 1] - logits[:, 0]
    try:
        auc = roc_auc_score(labels, scores)
    except ValueError:
        # Raised when ROC-AUC is undefined for the given labels
        auc = float("nan")

    # Passing labels=[0, 1] guarantees a "1" entry in the report even when
    # class 1 is absent from both labels and predictions; zero_division=0
    # reports 0.0 for that case instead of emitting warnings.
    report = classification_report(
        labels, preds, labels=[0, 1], output_dict=True, zero_division=0
    )
    return {
        "accuracy": float(np.mean(preds == labels)),  # Overall accuracy
        "precision": report["1"]["precision"],        # Precision for class '1' (Mentions Symptom)
        "recall": report["1"]["recall"],              # Recall for class '1'
        "f1": report["1"]["f1-score"],                # F1-score for class '1'
        "roc_auc": auc                                # ROC-AUC score
    }


# Tokenize the validation (dev) split saved by the train/dev/test split step.
# Per-epoch evaluation drives checkpoint selection (load_best_model_at_end), so
# it has to run on held-out data that is NOT the test set; otherwise the test
# score reported at the end is selected-for and optimistic.
dev_frame = pd.read_csv("symptom_dev.csv")
dev_dataset = Dataset.from_pandas(
    dev_frame[["Abstract", "MentionsSymptom"]].rename(columns={"Abstract": "text", "MentionsSymptom": "label"})
)
dev_dataset = dev_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length"), batched=True)

# Initialize Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,  # used for the per-epoch evaluation / best-checkpoint choice
    compute_metrics=compute_metrics
)

trainer.train() # Start training
trainer.evaluate(eval_dataset=test_dataset) # Final evaluation on the test set


In [None]:
# Print the installed transformers version
import transformers
print(transformers.__version__)

In [None]:
! pip install evaluate

In [None]:
# Define evaluation metric: load the "accuracy" metric through the `evaluate` library
import evaluate
accuracy = evaluate.load("accuracy")

In [None]:
# Custom function to compute metrics during evaluation
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions.

    Args:
        eval_pred: a pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per sample and ``labels`` the true classes.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes.

    Note:
        Defining this rebinds the name ``compute_metrics``. A Trainer that was
        constructed before this cell ran keeps the callable it was given then.
    """
    class_scores, true_labels = eval_pred
    # The predicted class is the column holding the largest score
    predicted_classes = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=true_labels)

In [None]:
# Use Trainer to make final predictions on the test dataset.
# The result is read in the following cells through its .predictions,
# .label_ids and .metrics attributes.
predictions = trainer.predict(test_dataset)

In [None]:
# Show, in order: the predicted logits (2D array: samples x 2 classes),
# the true labels (1D array), and the evaluation metrics.
for prediction_field in (predictions.predictions, predictions.label_ids, predictions.metrics):
    print(prediction_field)

In [None]:
# Convert logits to final class predictions
y_true_accuracy = predictions.label_ids # True labels carried along with the predictions
# For each sample, the predicted class is the index of the largest logit (0 or 1)
y_pred_accuracy = np.asarray(predictions.predictions).argmax(axis=1)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Fixed label order and display names shared by the report and the confusion matrix
label_order = [0, 1]
label_names = ["No Symptom", "Mentions Symptom"]

# Print a detailed classification report (4 decimal places)
print("\nClassification Report:")
report = classification_report(
    y_true_accuracy,
    y_pred_accuracy,
    labels=label_order,
    target_names=label_names,
    digits=4,
)
print(report)

# Print confusion matrix and overall accuracy
print("\nConfusion Matrix:")
print(confusion_matrix(y_true_accuracy, y_pred_accuracy, labels=label_order))

print("\nOverall Accuracy:", accuracy_score(y_true_accuracy, y_pred_accuracy))
