In [1]:
import json
import torch
import datetime
from transformers import BertTokenizer, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import tqdm as tqdm

In [3]:
# Load pre-trained bioBERT model from local storage
MODEL_PATH = "../models/biobert_healthspan"  # Update with your model path

In [None]:
# Load JSON file containing document information
INPUT_JSON = "../data/preprocessed_pdf_info_list.json"  # Update with your file name
OUTPUT_JSON = "results/04040900_classified_documents.json"

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    # Load model and tokenizer
    model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
    tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

    # Check model config
    print("✅ Model Loaded Successfully!")
    print(f"Model Configuration: {model.config}")

    # Run a test inference
    test_text = "Aging is associated with various biological changes."
    inputs = tokenizer(test_text, return_tensors="pt", truncation=True, padding=True)

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    print("✅ Test Inference Successful!")
    print(f"Raw Model Output: {outputs.logits}")

except Exception as e:
    print(f"❌ Error loading model: {e}")

model.to(device)
model.eval()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Model Loaded Successfully!
Model Configuration: BertConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.50.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

✅ Test Inference Successful!
Raw Model Output: tensor([[-0.4735,  0.5654]])


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
# Read documents from JSON
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    documents = json.load(f)

In [7]:
# Initialize counters
relevant_documents = []
processed_documents = []
total_docs = len(documents)

In [8]:
# Normalize date function
def normalize_date(doc_date):
    """ Convert date to a normalized score (0 to 1), newer = closer to 1 """
    try:
        if not doc_date or doc_date.lower() == "unknown":
            return 0  # No date, lowest weight
        parsed_date = datetime.datetime.strptime(doc_date, "%Y-%m-%d")  # Ensure format is YYYY-MM-DD
        days_since = (datetime.datetime.today() - parsed_date).days
        return max(0, 1 - (days_since / 3650))  # Normalize (last 10 years max influence)
    except:
        return 0  # If date parsing fails, treat as unknown

In [9]:
# Chunking function to split large documents
def chunk_text(text, max_tokens=512):
    """ Split text into chunks that fit within model input limits """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = [tokens[i:i+max_tokens] for i in range(0, len(tokens), max_tokens)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

In [34]:
# ✅ **Relevance Keywords Failsafe**
RELEVANCE_KEYWORDS = {
    "longevity", "healthspan", "lifespan extension", "aging", "senescence", 
    "anti-aging", "caloric restriction", "resveratrol", "rapamycin", "mTOR", 
     "biomarkers of aging"
}

# ✅ **Keyword Matching Failsafe**
def keyword_boost(text):
    """ Boost relevance if key longevity-related words are found """
    if any(keyword.lower() in text.lower() for keyword in RELEVANCE_KEYWORDS):
        return 0.95  # Boost relevance to 95% if it was too low
    return 0.0 

In [11]:
def classify_text(text):
    """Classifies a given text using the bioBERT model."""
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction  # 0 or 1 depending on classification

In [35]:
#running time 603 min

# Initialize progress bar
with tqdm.tqdm(total=len(documents), desc="Processing Documents", unit="doc") as pbar:

    # Classify and process each document
    for doc in documents:
        try:
            text = doc.get("processed_text", "").strip()
            if not text:
                continue  # Skip empty documents
            
            chunks = chunk_text(text)  # Split text into chunks
            chunk_scores = []

            for chunk in chunks:
                inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
                with torch.no_grad():
                    outputs = model(**inputs)
                softmax_probs = torch.nn.functional.softmax(outputs.logits, dim=1)
                chunk_scores.append(softmax_probs[0][1].item())  # Probability of relevance
            
            relevance_score = sum(chunk_scores) / len(chunk_scores)  # Average over chunks

            # **Failsafe Keyword Boosting**
            if relevance_score < 0.30:
                keyword_score = keyword_boost(text)
                relevance_score = max(relevance_score, keyword_score) 

            # **New:** Apply a **dynamic threshold**
            threshold = 0.49  # Set a lower threshold for tolerance
            if relevance_score < threshold:
                # If no documents meet the threshold, gradually decrease it
                threshold = max(0.2, relevance_score)

            # Get normalized date score
            date_score = normalize_date(doc.get("date", "Unknown"))

            # Compute weighted score (75% relevance, 25% date)
            final_score = (0.75 * relevance_score) + (0.25 * date_score)

            # Store the document
            processed_documents.append({
                "title": doc.get("title", "Unknown"),
                "author": doc.get("author", "Unknown"),
                "date": doc.get("date", "Unknown"),
                "filename": doc.get("filename", "Unknown"),
                "processed_text": text,
                "relevance_score": round(relevance_score, 4),
                "date_score": round(date_score, 4),
                "final_score": round(final_score, 4)
            })

        except Exception as e:
            print(f"❌ Error processing document '{doc.get('filename', 'Unknown')}': {e}")

        pbar.update(1)  # Update progress bar after processing each document  

Processing Documents: 100%|██████████| 709/709 [8:52:00<00:00, 45.02s/doc]    


In [36]:
# Sort by final weighted score (highest first), then by date
processed_documents.sort(key=lambda doc: (-doc["final_score"], -doc["date_score"]))

In [37]:
# Count relevant documents
num_relevant = sum(1 for doc in processed_documents if doc["relevance_score"] >= threshold)


In [38]:
# Print and store results
print(f"✅ Total Documents: {total_docs}")
print(f"✅ Relevant Documents Found: {num_relevant}")

✅ Total Documents: 709
✅ Relevant Documents Found: 1343


In [39]:
# Save results to a file
with open(OUTPUT_JSON, "w", encoding="utf-8") as file:
    json.dump(processed_documents, file, indent=4)
    
print(f"✅ Results saved to {OUTPUT_JSON}")

✅ Results saved to results/03040923_classified_documents.json


In [40]:
import json
from datetime import datetime
from fpdf import FPDF
import pandas as pd

# Helper: Convert date string to datetime object
def parse_date(date_str):
    for fmt in ("%Y-%m-%d", "%d-%m-%Y", "%Y/%m/%d", "%B %d, %Y"):
        try:
            return datetime.strptime(date_str, fmt)
        except:
            continue
    return None

# Step 1: Add a numeric sortable timestamp
for doc in processed_documents:
    doc_date = parse_date(doc.get("date", ""))
    doc["date_numeric"] = doc_date.timestamp() if doc_date else 0

# Step 2: Sort by relevance_score (desc), then by date_numeric (desc)
top_documents = sorted(
    processed_documents,
    key=lambda d: (d.get("relevance_score", 0), d.get("date_numeric", 0)),
    reverse=True
)[:15]

# Step 3: Save to JSON
json_path = "top_15_by_relevance_then_date.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(top_documents, f, indent=2, ensure_ascii=False)
print(f"✅ Saved top 15 documents to JSON: '{json_path}'")

# Step 4: Save to PDF
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
pdf.set_font("Arial", size=12)

for i, doc in enumerate(top_documents, 1):
    pdf.set_font("Arial", 'B', 12)
    pdf.multi_cell(0, 10, f"{i}. Title: {doc['title']}")
    pdf.set_font("Arial", size=11)
    pdf.multi_cell(0, 10, f"Author: {doc['author']}")
    pdf.multi_cell(0, 10, f"Date: {doc['date']}")
    pdf.multi_cell(0, 10, f"Filename: {doc['filename']}")
    pdf.multi_cell(0, 10, f"Relevance Score: {doc['relevance_score']}")
    pdf.multi_cell(0, 10, f"Date Numeric: {doc['date_numeric']}")
    pdf.set_font("Arial", style='I', size=10)
    pdf.multi_cell(0, 10, "-" * 60)
    pdf.set_font("Arial", size=11)
    text = doc['processed_text'][:10000]
    pdf.multi_cell(0, 8, text)
    pdf.ln(5)

#pdf_path = "top_15_by_relevance_then_date.pdf"
#pdf.output(pdf_path)
#print(f"✅ Saved top 15 documents to PDF: '{pdf_path}'")

# Step 5: Save to DataFrame
df = pd.DataFrame(top_documents)
df_path = "top_15_by_relevance_then_date.csv"
df.to_csv(df_path, index=False)
print(f"✅ Saved top 15 documents to DataFrame (CSV): '{df_path}'")


✅ Saved top 15 documents to JSON: 'top_15_by_relevance_then_date.json'
✅ Saved top 15 documents to DataFrame (CSV): 'top_15_by_relevance_then_date.csv'
