Prepare Data:
    - ohsumed 88 - 91 contains the normal ohsumed data 
    - qrels.ohsu has the relevance labels for the corresponding UIs


In [35]:
def parse_relevance_file(filename):
    relevance_dict = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                query_id, medline_id, relevance = line.strip().split()
                # Store by medline_id since we'll join on this
                if medline_id not in relevance_dict:
                    relevance_dict[medline_id] = []
                relevance_dict[medline_id].append({
                    'query_id': query_id,
                    'relevance': int(relevance)
                })
            except ValueError:
                continue  # Skip malformed lines
    
    print(f"Total medline IDs with relevance judgments: {len(relevance_dict)}")
    return relevance_dict

In [36]:
from datasets import load_dataset

# Load both train and test splits
dataset = load_dataset("community-datasets/ohsumed")
print("Available splits:", dataset.keys())

# Combine train and test if both exist
all_docs = []
if 'train' in dataset:
    all_docs.extend(dataset['train'])
if 'test' in dataset:
    all_docs.extend(dataset['test'])

print(f"Total entries in combined dataset: {len(all_docs)}")

Available splits: dict_keys(['train', 'test'])


KeyboardInterrupt: 

In [None]:
relevance_dict = parse_relevance_file('qrels.ohsu.88-91')

# Combine with relevance information
docs_with_relevance = []
docs_without_relevance = []

for doc in all_docs:
    medline_id = str(doc['medline_ui'])
    
    if medline_id in relevance_dict:
        for rel_info in relevance_dict[medline_id]:
            doc_with_relevance = doc.copy()
            doc_with_relevance['query_id'] = rel_info['query_id']
            doc_with_relevance['relevance'] = rel_info['relevance']
            docs_with_relevance.append(doc_with_relevance)
    else:
        docs_without_relevance.append(doc)

print(f"Documents with relevance judgments: {len(docs_with_relevance)}")
print(f"Documents without relevance judgments: {len(docs_without_relevance)}")

Total medline IDs with relevance judgments: 3121
Documents with relevance judgments: 3205
Documents without relevance judgments: 345443


In [27]:
import pandas as pd

# Convert to DataFrames and save as CSVs
# First the documents with relevance
df_with_relevance = pd.DataFrame(docs_with_relevance)
df_without_relevance = pd.DataFrame(docs_without_relevance)

# Save with relevance dataset
df_with_relevance.to_csv('ohsumed_with_relevance.csv', index=False)

# Save without relevance dataset
df_without_relevance.to_csv('ohsumed_without_relevance.csv', index=False)

print("Saved files:")
print("1. ohsumed_with_relevance.csv")
print("2. ohsumed_without_relevance.csv")

# Print sample of columns to verify
print("\nColumns in with_relevance dataset:")
print(df_with_relevance.columns.tolist())
print("\nColumns in without_relevance dataset:")
print(df_without_relevance.columns.tolist())

Saved files:
1. ohsumed_with_relevance.csv
2. ohsumed_without_relevance.csv

Columns in with_relevance dataset:
['seq_id', 'medline_ui', 'mesh_terms', 'title', 'publication_type', 'abstract', 'author', 'source', 'query_id', 'relevance']

Columns in without_relevance dataset:
['seq_id', 'medline_ui', 'mesh_terms', 'title', 'publication_type', 'abstract', 'author', 'source']


# PRepare data 

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split

# First, let's add sensitivity labels to both datasets
def is_sensitive(mesh_terms):
    """Check if document contains C12 or C13 related terms"""
    sensitive_terms = ['urogenital', 'urinary', 'genital', 'reproductive', 
                      'pregnancy', 'gynecologic', 'obstetric', 'sexual', 
                      'fertility', 'prostate', 'testicular', 'ovarian', 
                      'uterine', 'vaginal', 'prenatal']
    return 1 if any(term.lower() in str(mesh_terms).lower() for term in sensitive_terms) else 0

# Load the CSVs
docs_with_relevance = pd.read_csv('ohsumed_with_relevance.csv')
docs_without_relevance = pd.read_csv('ohsumed_without_relevance.csv')

# Add sensitivity labels
docs_with_relevance['sensitive'] = docs_with_relevance['mesh_terms'].apply(is_sensitive)
docs_without_relevance['sensitive'] = docs_without_relevance['mesh_terms'].apply(is_sensitive)

# Following the paper's methodology:
# 1. Test set = documents with relevance judgments
test_set = docs_with_relevance

# 2. Split remaining documents into train (85%) and validation (15%)
train_set, val_set = train_test_split(docs_without_relevance, test_size=0.15, random_state=42)

print("Dataset sizes:")
print(f"Training set: {len(train_set)}")
print(f"Validation set: {len(val_set)}")
print(f"Test set: {len(test_set)}")

print("\nSensitivity distribution:")
print("Train set sensitive ratio:", train_set['sensitive'].mean())
print("Val set sensitive ratio:", val_set['sensitive'].mean())
print("Test set sensitive ratio:", test_set['sensitive'].mean())

# Save the splits
train_set.to_csv('ohsumed_train.csv', index=False)
val_set.to_csv('ohsumed_val.csv', index=False)
test_set.to_csv('ohsumed_test.csv', index=False)

Dataset sizes:
Training set: 293626
Validation set: 51817
Test set: 3205

Sensitivity distribution:
Train set sensitive ratio: 0.07719343654853453
Val set sensitive ratio: 0.07898952081363259
Test set sensitive ratio: 0.09797191887675508


# Try Distilbert 

In [39]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, fbeta_score
from torch.optim import AdamW
import torch
import pandas as pd

# Load our split datasets
train_docs = pd.read_csv('ohsumed_train.csv')
val_docs = pd.read_csv('ohsumed_val.csv')
test_docs = pd.read_csv('ohsumed_test.csv')

class OHSUMEDDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Fixed prepare_data function
def prepare_data(docs):
    # Using pandas DataFrame methods correctly
    texts = [f"{str(title)} {str(abstract)}" for title, abstract in zip(docs['title'], docs['abstract'])]
    labels = docs['sensitive'].tolist()
    return texts, labels

# Initialize model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Prepare datasets
train_texts, train_labels = prepare_data(train_docs)
val_texts, val_labels = prepare_data(val_docs)
test_texts, test_labels = prepare_data(test_docs)

train_dataset = OHSUMEDDataset(train_texts, train_labels, tokenizer)
val_dataset = OHSUMEDDataset(val_texts, val_labels, tokenizer)
test_dataset = OHSUMEDDataset(test_texts, test_labels, tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Training function
def train_epoch(model, train_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    f2 = fbeta_score(true_labels, predictions, beta=2)
    accuracy = accuracy_score(true_labels, predictions)
    
    return {
        'precision': precision * 100,
        'recall': recall * 100,
        'f1': f1 * 100,
        'f2': f2 * 100,
        'accuracy': accuracy * 100
    }

# Training loop
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

for epoch in range(num_epochs):
    loss = train_epoch(model, train_loader, optimizer)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")
    
    if (epoch + 1) % 1 == 0:
        metrics = evaluate(model, val_loader)
        print(f"Validation metrics:", metrics)

# Final evaluation on test set
test_metrics = evaluate(model, test_loader)
print("\nTest metrics:", test_metrics)

# Compare with paper results
paper_results = {
    'precision': 82.75,
    'recall': 80.08,
    'f1': 81.39,
    'f2': 80.60,
    'accuracy': 95.52
}

print("\nPaper results:", paper_results)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 