In [None]:
import pandas as pd

# Read the cleaned abstracts dataset
df = pd.read_csv("/content/alzheimers_abstracts.csv")

# A missing abstract is read back as a float NaN, which has no .lower() and
# would abort the tagging step. Such rows carry no text to label, so drop them
# up front and report how many were removed.
n_rows_before = len(df)
df = df.dropna(subset=["Abstract"])
n_rows_dropped = n_rows_before - len(df)
if n_rows_dropped:
    print(f"Dropped {n_rows_dropped} row(s) with a missing abstract.")

# Define a list of risk factor keywords
risk_keywords = [
    "air pollution", "PM2.5", "particulate matter", "environmental exposure",
    "toxins", "neuroinflammation", "smoking", "hypertension", "cholesterol",
    "diet", "sleep quality", "obesity", "pesticides"
]

# Lower-case the keywords once instead of once per abstract
risk_keywords_lower = [kw.lower() for kw in risk_keywords]


def mentions_risk_factor(text):
    """Return 1 if any risk keyword occurs in `text`, else 0.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word (e.g. "diet" matches "dietary").
    """
    text_lower = str(text).lower()
    return int(any(kw in text_lower for kw in risk_keywords_lower))


# Add a new column 'Factors': 1 if any keyword is found in the abstract, else 0
df["Factors"] = df["Abstract"].apply(mentions_risk_factor)

# Save the labelled DataFrame at the absolute path the split cell reads it
# from, so the hand-off does not depend on the current working directory.
df.to_csv("/content/alzheimers_abstracts_risk.csv", index=False)

print("Successfully completed keyword tagging for risk factors and saved the new file.")


In [None]:
from sklearn.model_selection import train_test_split

# Read the processed dataset (already labeled with 'Factors')
df = pd.read_csv("/content/alzheimers_abstracts_risk.csv")

# Hold out 30% of the rows; stratify on the label so the class ratio is kept
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["Factors"]
)

# Split the held-out rows evenly into Dev and Test sets, again stratified
dev_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df["Factors"]
)

# Save the splits under /content, the directory the later cells read them
# from, so the files are found regardless of the current working directory.
train_df.to_csv("/content/risk_train.csv", index=False)
dev_df.to_csv("/content/risk_dev.csv", index=False)
test_df.to_csv("/content/risk_test.csv", index=False)

print("Train/Dev/Test sets have been saved.")


In [None]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")

# Add an EntityRuler that runs before the built-in NER component.
# phrase_matcher_attr="LOWER" makes the string patterns match on lower-cased
# token text; without it the patterns are case-sensitive, so "Smoking" or
# "Air pollution" at the start of a sentence would not be recognised.
ruler = nlp.add_pipe(
    "entity_ruler",
    before="ner",
    config={"phrase_matcher_attr": "LOWER"},
)

# Define a list of risk factor keywords
risk_keywords = [
    "air pollution", "PM2.5", "particulate matter", "environmental exposure",
    "toxins", "neuroinflammation", "smoking", "hypertension", "cholesterol",
    "diet", "sleep quality", "obesity", "pesticides"
]

# Build one phrase pattern per keyword, all labelled 'RISK'
patterns = [{"label": "RISK", "pattern": kw} for kw in risk_keywords]
ruler.add_patterns(patterns)

# Smoke test: run the pipeline on one simple sentence and list its entities
doc = nlp("The patient experienced air pollution.")
print("\nTesting entity recognition on a sample sentence:")
for ent in doc.ents:
    print(f" - {ent.text} ({ent.label_})")

# Read the abstracts dataset
df = pd.read_csv("/content/alzheimers_abstracts.csv")

# Perform entity recognition on the first 5 abstracts (or fewer when the
# dataset is shorter, instead of failing on a row that does not exist)
print("\nBatch processing entity recognition results:")
for i in range(min(5, len(df))):
    text = df["Abstract"].iloc[i]
    print(f"\nAbstract #{i+1}:\n{text}")

    # A missing abstract is read as NaN (a float), which the pipeline cannot
    # process; skip it rather than abort the whole loop.
    if not isinstance(text, str):
        print("Skipped: abstract text is missing.")
        continue

    doc = nlp(text) # Apply NLP pipeline

    print("Recognized Entities:")
    for ent in doc.ents:
        print(f" - {ent.text} ({ent.label_})")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# Read the train and test splits from disk
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/risk_train.csv", "/content/risk_test.csv")
)

# Text goes in as the feature, the binary 'Factors' column is the target
X_train2, y_train2 = train_df["Abstract"], train_df["Factors"]
X_test2, y_test2 = test_df["Abstract"], test_df["Factors"]

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training text only,
# then reuse that fitted vocabulary to transform the test text
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec2 = vectorizer.fit_transform(X_train2)
X_test_vec2 = vectorizer.transform(X_test2)

# Fit a Logistic Regression classifier on the TF-IDF features
clf = LogisticRegression(max_iter=200).fit(X_train_vec2, y_train2)

# Predict the test labels and print the per-class report
y_pred2 = clf.predict(X_test_vec2)
print(classification_report(y_test2, y_pred2))


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from sklearn.metrics import confusion_matrix

# Hard class labels (0 or 1) and the probability assigned to class 1
y_pred2 = clf.predict(X_test_vec2)
y_prob2 = clf.predict_proba(X_test_vec2)[:, 1]

# Per-class precision / recall / F1
print("Classification Report:", classification_report(y_test2, y_pred2), sep="\n")

# ROC-AUC is computed from the class-1 probabilities, not the hard labels
auc = roc_auc_score(y_test2, y_prob2)
print(f"ROC-AUC: {auc:.4f}")

# Counts of true vs. predicted labels
cm = confusion_matrix(y_test2, y_pred2)
print("Confusion Matrix:", cm, sep="\n")

In [None]:
# Install the `datasets` package; the next cell imports `Dataset` from it.
! pip install datasets

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score

# Read the train and test splits
df = pd.read_csv("/content/risk_train.csv")
df_test = pd.read_csv("/content/risk_test.csv")

# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def _to_tokenized_dataset(frame):
    """Convert a split DataFrame into a tokenized Hugging Face Dataset.

    Keeps only the 'Abstract' and 'Factors' columns, renames them to "text"
    and "label", then tokenizes the text in batches with truncation and
    padding="max_length".
    """
    renamed = frame[["Abstract", "Factors"]].rename(
        columns={"Abstract": "text", "Factors": "label"}
    )
    dataset = Dataset.from_pandas(renamed)
    return dataset.map(
        lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length"),
        batched=True,
    )


# Build the tokenized train and test datasets with the shared helper
train_dataset2 = _to_tokenized_dataset(df)
test_dataset2 = _to_tokenized_dataset(df_test)

# Class id -> display name, and the reverse mapping derived from it
id2label = {0: "No risk", 1: "Mentions risk"}
label2id = {name: idx for idx, name in id2label.items()}

from transformers import AutoModelForSequenceClassification

# Load the pretrained DistilBERT checkpoint with a two-class classification
# head and attach the label mappings to it
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


# Set training parameters
import torch
from transformers import (
    AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)

# fp16 mixed precision needs a CUDA GPU. Requesting it unconditionally makes
# the run fail on a CPU-only runtime, so enable it only when a GPU is present
# (e.g. a Colab GPU runtime) and fall back to full precision otherwise.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",                      # run eval at the end of each epoch
    save_strategy="epoch",                      # checkpoint at the same cadence as eval
    load_best_model_at_end=True,                # restore the best checkpoint after training
    push_to_hub=False,
    report_to="none",
    fp16=use_fp16,
)


# Define custom evaluation metrics
def compute_metrics(pred):
    """Compute accuracy, positive-class precision/recall/F1 and ROC-AUC.

    Args:
        pred: object exposing `predictions` (per-class logits, one row per
            example, column 1 = positive class) and `label_ids` (true labels).

    Returns:
        dict with keys "accuracy", "precision", "recall", "f1" (the last
        three for class 1) and "roc_auc".
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = np.argmax(logits, axis=1)

    # For two classes, softmax P(class 1) = sigmoid(logit_1 - logit_0), so the
    # logit margin ranks examples exactly like the predicted probability.
    # The raw class-1 logit alone does not (a large logit_1 can still lose to
    # a larger logit_0), which would distort the AUC.
    positive_margin = logits[:, 1] - logits[:, 0]
    auc = roc_auc_score(labels, positive_margin)

    report = classification_report(labels, preds, output_dict=True)
    return {
        "accuracy": report["accuracy"],
        "precision": report["1"]["precision"],
        "recall": report["1"]["recall"],
        "f1": report["1"]["f1-score"],
        "roc_auc": auc
    }

# Per-epoch evaluation drives checkpoint selection (load_best_model_at_end),
# so it must not see the test set. Use the dev split for that and keep the
# test split for the final, one-off evaluation.
dev_df = pd.read_csv("/content/risk_dev.csv")
dev_dataset2 = Dataset.from_pandas(
    dev_df[["Abstract", "Factors"]].rename(columns={"Abstract": "text", "Factors": "label"})
)
# Tokenize exactly like the train/test datasets
dev_dataset2 = dev_dataset2.map(
    lambda x: tokenizer(x["text"], truncation=True, padding="max_length"),
    batched=True,
)

# Initialize Trainer and train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset2,
    eval_dataset=dev_dataset2,
    compute_metrics=compute_metrics
)

trainer.train() # Start model training
trainer.evaluate(eval_dataset=test_dataset2) # Evaluate the selected model on the held-out test set


In [None]:
import transformers
# Print the installed transformers version for reference.
print(transformers.__version__)


In [None]:
# Install the `evaluate` package, which the following cells import.
! pip install evaluate

In [None]:
# Define evaluation metric: load the "accuracy" metric object from the
# `evaluate` library and bind it to the module-level name `accuracy`.
import evaluate
accuracy = evaluate.load("accuracy")

In [None]:
# Accuracy-only metric callback for evaluation
def compute_metrics(eval_pred):
    """Return the accuracy of arg-max predictions against the true labels.

    Args:
        eval_pred: a pair `(scores, labels)`; `scores` holds one row of
            per-class scores per example.

    Returns:
        Whatever `accuracy.compute` returns for the arg-max class ids and
        the reference labels.

    Note: defining this rebinds the name `compute_metrics`. A Trainer that
    was already constructed with an earlier function keeps using that one.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Make predictions on the test dataset using the trained model.
# The result's `.predictions` and `.label_ids` are read in the next cell.
predictions2 = trainer.predict(test_dataset2)

In [None]:
# Per-class scores returned by the trainer for every test example
test_scores2 = predictions2.predictions

# Predicted label = index of the highest-scoring class; true labels come
# straight from the prediction output
y_pred2 = np.argmax(test_scores2, axis=1)
y_true2 = predictions2.label_ids

# Load the accuracy metric from the `evaluate` library
import evaluate
accuracy = evaluate.load("accuracy")

In [None]:
# Fixed label order and the display name shown for each label
label_order = [0, 1]
label_names = ["No risk", "Mentions risk"]

# Per-class precision / recall / F1, shown with 4 decimal places
print("\nClassification Report:")
report = classification_report(
    y_true2, y_pred2, labels=label_order, target_names=label_names, digits=4
)
print(report)

from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix in the same label order, followed by overall accuracy
print("\nConfusion Matrix:")
print(confusion_matrix(y_true2, y_pred2, labels=label_order))

print("\nOverall Accuracy:", accuracy_score(y_true2, y_pred2))
