In [1]:
import pandas as pd

# Read your cleaned and saved abstracts dataset
df = pd.read_csv("/content/alzheimers_abstracts.csv")

# Define a list of risk factor keywords
risk_keywords = [
    "air pollution", "PM2.5", "particulate matter", "environmental exposure",
    "toxins", "neuroinflammation", "smoking", "hypertension", "cholesterol",
    "diet", "sleep quality", "obesity", "pesticides"
]

# Add a new column 'Factors': mark 1 if any keyword is found in the abstract, else 0
df["Factors"] = df["Abstract"].apply(
    lambda text: int(any(kw.lower() in text.lower() for kw in risk_keywords))
)

# Save the updated DataFrame to a new CSV file
df.to_csv("alzheimers_abstracts_risk.csv", index=False)

print("Successfully completed keyword tagging for risk factors and saved the new file.")


Successfully completed keyword tagging for risk factors and saved the new file.


In [2]:
from sklearn.model_selection import train_test_split

# Read your processed dataset (already labeled with 'Factors')
df = pd.read_csv("/content/alzheimers_abstracts_risk.csv")

# Split the dataset into Training set + Temporary set (dev + test)
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["Factors"])

# Further split the Temporary set into Dev and Test sets (50% each)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["Factors"])

# Save the resulting datasets to new CSV files
train_df.to_csv("risk_train.csv", index=False)
dev_df.to_csv("risk_dev.csv", index=False)
test_df.to_csv("risk_test.csv", index=False)

print("Train/Dev/Test sets have been saved.")


Train/Dev/Test sets have been saved.


In [3]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")

# Add an EntityRuler to insert custom entity recognition patterns before the built-in NER
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Define a list of risk factor keywords
risk_keywords = [
    "air pollution", "PM2.5", "particulate matter", "environmental exposure",
    "toxins", "neuroinflammation", "smoking", "hypertension", "cholesterol",
    "diet", "sleep quality", "obesity", "pesticides"
]

# Build custom patterns with label 'RISK'
patterns = [{"label": "RISK", "pattern": kw} for kw in risk_keywords]
ruler.add_patterns(patterns)

# Test entity recognition on a single simple sentence
doc = nlp("The patient experienced air pollution.")
print("\nTesting entity recognition on a sample sentence:")
for ent in doc.ents:
    print(f" - {ent.text} ({ent.label_})")

# Read the abstracts dataset
df = pd.read_csv("/content/alzheimers_abstracts.csv")

# Perform entity recognition on the first 5 abstracts
print("\nBatch processing entity recognition results:")
for i in range(5):
    text = df.loc[i, "Abstract"]
    print(f"\nAbstract #{i+1}:\n{text}")

    doc = nlp(text) # Apply NLP pipeline

    print("Recognized Entities:")
    for ent in doc.ents:
        print(f" - {ent.text} ({ent.label_})")



Testing entity recognition on a sample sentence:
 - air pollution (RISK)

Batch processing entity recognition results:

Abstract #1:
Whether or not neuropsychiatric symptoms (NPS) in advance of dementia are associated with Alzheimer disease (AD) and/or other neurodegenerative dementias remains to be determined. The mild behavioural impairment (MBI) construct selects persons with NPS that are later-life emergent and persistent to identify a high-risk group for cognitive decline and incident dementia. Here, in older adults without dementia at baseline, we examined whether postmortem AD and other neurodegenerative pathologies were associated with MBI in the five years before death. National Alzheimer's Coordinating Center study autopsy participants (n=1016, 82.6 years, 48.7% female, 60% normal cognition) were included in the analyses. Using the Neuropsychiatric Inventory-Questionnaire, MBI+ status was operationalized as NPS persistence at >2/3 of pre-dementia study visits; otherwise, sta

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# Load the training and testing datasets
train_df = pd.read_csv("/content/risk_train.csv")
test_df = pd.read_csv("/content/risk_test.csv")

# Extract features (text) and labels
X_train2 = train_df["Abstract"]
y_train2 = train_df["Factors"]
X_test2 = test_df["Abstract"]
y_test2 = test_df["Factors"]

# Apply TF-IDF vectorization to the text
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec2 = vectorizer.fit_transform(X_train2)
X_test_vec2 = vectorizer.transform(X_test2)

# Train a Logistic Regression model
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_vec2, y_train2)

# Predict and print the classification report
y_pred2 = clf.predict(X_test_vec2)
print(classification_report(y_test2, y_pred2))


              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1202
           1       0.97      0.36      0.53       240

    accuracy                           0.89      1442
   macro avg       0.93      0.68      0.73      1442
weighted avg       0.90      0.89      0.87      1442



In [5]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Predict on the test set
y_pred2 = clf.predict(X_test_vec2) # Predicted class labels (0 or 1)
y_prob2 = clf.predict_proba(X_test_vec2)[:, 1]

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test2, y_pred2))

# Calculate and print ROC-AUC score
auc = roc_auc_score(y_test2, y_prob2)
print(f"ROC-AUC: {auc:.4f}")

from sklearn.metrics import confusion_matrix

# Compute and print the confusion matrix
cm = confusion_matrix(y_test2, y_pred2)
print("Confusion Matrix:")
print(cm)

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1202
           1       0.97      0.36      0.53       240

    accuracy                           0.89      1442
   macro avg       0.93      0.68      0.73      1442
weighted avg       0.90      0.89      0.87      1442

ROC-AUC: 0.9456
Confusion Matrix:
[[1199    3]
 [ 153   87]]


In [6]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [7]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score

# Load the train and test datasets
df = pd.read_csv("/content/risk_train.csv")
df_test = pd.read_csv("/content/risk_test.csv")

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Preprocessing: prepare Huggingface Dataset objects
# Rename columns to match Huggingface expected names ("text", "label")
train_dataset2 = Dataset.from_pandas(df[["Abstract", "Factors"]].rename(columns={"Abstract": "text", "Factors": "label"}))
test_dataset2 = Dataset.from_pandas(df_test[["Abstract", "Factors"]].rename(columns={"Abstract": "text", "Factors": "label"}))
# Apply tokenization
train_dataset2 = train_dataset2.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length"), batched=True)
test_dataset2 = test_dataset2.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length"), batched=True)

# Define label map and load model
id2label = {0: "No risk", 1: "Mentions risk"}
label2id = {"No risk": 0, "Mentions risk": 1}

# load model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)


# Set training parameters
from transformers import TrainingArguments


from transformers import (
    AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
training_args = TrainingArguments(
    output_dir="text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",                      # run eval at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
    fp16=True  # using a GPU with FP16 support with Colab
)


# Define custom evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    auc = roc_auc_score(labels, pred.predictions[:, 1])
    report = classification_report(labels, preds, output_dict=True)
    return {
        "accuracy": report["accuracy"],
        "precision": report["1"]["precision"],
        "recall": report["1"]["recall"],
        "f1": report["1"]["f1-score"],
        "roc_auc": auc
    }

# Initialize Trainer and train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset2,
    eval_dataset=test_dataset2,
    compute_metrics=compute_metrics
)

trainer.train() # Start model training
trainer.evaluate() # Evaluate model on the test set


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/6727 [00:00<?, ? examples/s]

Map:   0%|          | 0/1442 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,No log,0.049677,0.990291,1.0,0.941667,0.969957,0.986171
2,0.186800,0.036235,0.993759,0.995708,0.966667,0.980973,0.995447
3,0.022100,0.036605,0.993065,0.987288,0.970833,0.978992,0.994731
4,0.015300,0.043715,0.993065,0.987288,0.970833,0.978992,0.995852


{'eval_loss': 0.03623471036553383,
 'eval_accuracy': 0.9937586685159501,
 'eval_precision': 0.9957081545064378,
 'eval_recall': 0.9666666666666667,
 'eval_f1': 0.9809725158562368,
 'eval_roc_auc': 0.9954468247365502,
 'eval_runtime': 5.7688,
 'eval_samples_per_second': 249.964,
 'eval_steps_per_second': 15.774,
 'epoch': 4.0}

In [8]:
import transformers
print(transformers.__version__)


4.51.3


In [9]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [10]:
# Define evaluation metric
import evaluate
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [11]:
# Custom function to compute metrics during evaluation
def compute_metrics(eval_pred):
    predictions2, labels = eval_pred
    preds2 = np.argmax(predictions2, axis=1)
    return accuracy.compute(predictions=preds2, references=labels)

In [12]:
# Make predictions on the test dataset using the trained model
predictions2 = trainer.predict(test_dataset2)

In [13]:
# Extract predicted labels and true labels
y_pred2 = np.argmax(predictions2.predictions, axis=1)
y_true2 = predictions2.label_ids
# Define evaluation metric
import evaluate
accuracy = evaluate.load("accuracy")

In [14]:
# Print detailed classification report
print("\nClassification Report:")
report = classification_report(
    y_true2, y_pred2,
    labels=[0, 1], # Define the order of labels
    target_names=["No risk", "Mentions risk"], # Define label names
    digits=4 # Display results with 4 decimal places
)
print(report)

from sklearn.metrics import confusion_matrix, accuracy_score
# Print confusion matrix and overall accuracy
print("\nConfusion Matrix:")
print(confusion_matrix(y_true2, y_pred2, labels=[0, 1]))

print("\nOverall Accuracy:", accuracy_score(y_true2, y_pred2))



Classification Report:
               precision    recall  f1-score   support

      No risk     0.9934    0.9992    0.9963      1202
Mentions risk     0.9957    0.9667    0.9810       240

     accuracy                         0.9938      1442
    macro avg     0.9945    0.9829    0.9886      1442
 weighted avg     0.9938    0.9938    0.9937      1442


Confusion Matrix:
[[1201    1]
 [   8  232]]

Overall Accuracy: 0.9937586685159501
