### Bert model v 01
---

In [None]:
import os
import re
from PyPDF2 import PdfReader
from datetime import datetime
from PyPDF2.generic import IndirectObject
import json


1. Data Collection and Preparation:
---

In [None]:
# Helping functions to extract metadata from PDF files
def resolve_metadata_value(value):
    if isinstance(value, IndirectObject):
        value = value.get_object()
    return str(value) if value is not None else None

In [None]:
# Helping function
def extract_pdf_info(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        text = ''
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text

        metadata = reader.metadata
        if metadata is not None:
            title = resolve_metadata_value(metadata.get('/Title')) or 'Unknown Title'
            author = resolve_metadata_value(metadata.get('/Author')) or 'Unknown Author'
            creation_date = resolve_metadata_value(metadata.get('/CreationDate')) or ''
            mod_date = resolve_metadata_value(metadata.get('/ModDate')) or ''
        else:
            title = 'Unknown Title'
            author = 'Unknown Author'
            creation_date = ''
            mod_date = ''

        # Extract date from metadata
        date_str = creation_date or mod_date
        date = None
        if date_str:
            date_match = re.search(r"D:(\d{14})", date_str)
            if date_match:
                try:
                    date = datetime.strptime(date_match.group(1), '%Y%m%d%H%M%S')
                except ValueError:
                    pass  # Handle invalid date format gracefully

        return {'title': title, 'author': author, 'date': date, 'text': text}



*Create a dictionary from the Public database, with keys: title, author, date, text, and filename*

In [None]:
# Directory containing data as PDFs
pdf_dir = '../data/AWS/'

In [5]:
pdf_info_list = []

for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith('.pdf'):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_info = extract_pdf_info(pdf_path)
        pdf_info['filename'] = pdf_file
        pdf_info_list.append(pdf_info)


PdfReadError("Invalid Elementary Object starting with b'\\\\' @508629: b'r (pdfTeX-1.40.21)\\n \\\\par /Author()/Title()/Subject()/Creator(LaTeX with hyperref'")


*Second, Create a dictionary from the podcasts and book, with keys: title, author, date, text, and filename*

In [15]:
# Directory containing additional PDFs
pdf_dir = '../data/Training_docs/'

In [None]:
#Aditional list with proven docs
pdf_proven_list = []

for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith('.pdf'):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_info = extract_pdf_info(pdf_path)
        pdf_info['filename'] = pdf_file
        pdf_proven_list.append(pdf_info)


*Finally, create a dictionary from the sections of the book, with keys: title, author, date, text, and filename*

In [34]:
# Directory containing additional PDFs
pdf_dir = '../data/Training_docs/tokenized_sections'

In [35]:
#Aditional list with proven docs
pdf_book_sections_list = []

for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith('.pdf'):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_info = extract_pdf_info(pdf_path)
        pdf_info['filename'] = pdf_file
        pdf_book_sections_list.append(pdf_info)


In [36]:
# Combine with existing pdf_info_list
combined_data = pdf_info_list + pdf_proven_list + pdf_book_sections_list

*let's save them for future use if needed*

In [37]:
# save the combined_data to a JSON file
# Define a serialization function for datetime objects
def serialize_datetime(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

# Assuming pdf_info_list is your list of dictionaries
with open('../data/combined_data.json', 'w') as json_file:
    json.dump(combined_data, json_file, default=serialize_datetime, indent=4)

In [None]:
# save the pdf_info_list to a JSON file
# Define a serialization function for datetime objects
def serialize_datetime(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

# Assuming pdf_info_list is your list of dictionaries
with open('../data/pdf_info_list.json', 'w') as json_file:
    json.dump(pdf_info_list, json_file, default=serialize_datetime, indent=4)


*Assign labels and save only the combined data*

In [20]:
# Define your list of healthspan-related keywords
healthspan_keywords = ['longevity', 'aging', 'senescence', 'lifespan', 'healthspan', 'caloric restriction', 'telomere', 'autophagy', 'gerontology', 'anti-aging', 'resveratrol', 'sirtuins', 'mTOR', 'NAD+', 'oxidative stress', 'inflammation', 'mitochondria', 'genomics', 'epigenetics', 'stem cells', 'regeneration', 'DNA repair', 'protein folding', 'calorie restriction', 'intermittent fasting', 'blue zones', 'hormesis', 'geroprotector', 'rapamycin', 'metformin', 'amyloids', 'proteostasis', 'senolytics', 'leptin', 'circadian rhythm', 'sleep', 'exercise', 'diet', 'nutrition', 'microbiome', 'gut health', 'probiotics', 'prebiotics', 'polyphenols', 'flavonoids', 'antioxidants','hormone replacement', 'testosterone', 'stress management', 'mindfulness', 'meditation', 'cognitive function', 'neuroplasticity', 'brain health']

In [38]:
healthspan_keywords.__len__()

54

In [39]:
def label_document(text, keywords, threshold=3):
 """Assign a label based on the presence of at least 3 healthspan-related keywords."""
 text_lower = text.lower()
 count = sum(1 for keyword in keywords if keyword in text_lower)
 return 1 if count >= threshold else 0



In [40]:
# Apply labeling to each document
for pdf in combined_data:
 pdf['label'] = label_document(pdf['text'], healthspan_keywords)

# renundantly, unneded but it may be useful
for pdf in pdf_info_list:
    pdf['label'] = label_document(pdf['text'], healthspan_keywords)

In [41]:
# Save the labeled data to a new JSON file
# Define a serialization function for datetime objects
def serialize_datetime(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

# this is our list of dictionaries
with open('../data/combined_data.json', 'w') as json_file:
    json.dump(combined_data, json_file, default=serialize_datetime, indent=4)

---
2. Text preprocessing.
---

In [42]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\email\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\email\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)



In [44]:
!pip install tqdm
from tqdm import tqdm



In [30]:
# Apply preprocessing to each document with a progress bar
for pdf in tqdm(pdf_info_list, desc="Processing PDFs"):
    pdf['processed_text'] = preprocess_text(pdf['text'])

Processing PDFs: 100%|██████████| 709/709 [52:18<00:00,  4.43s/it]  


In [45]:
# Apply preprocessing to each document with a progress bar
for pdf in tqdm(combined_data, desc="Processing PDFs"):
    pdf['processed_text'] = preprocess_text(pdf['text'])

Processing PDFs:   0%|          | 0/740 [00:00<?, ?it/s]

Processing PDFs: 100%|██████████| 740/740 [59:00<00:00,  4.78s/it]  


In [46]:
# save to file
with open('../data/preprocessed_combined_data.json', 'w') as json_file:
    json.dump(combined_data, json_file, default=serialize_datetime, indent=4)

with open('../data/preprocessed_pdf_info_list.json', 'w') as json_file:
    json.dump(pdf_info_list, json_file, default=serialize_datetime, indent=4)

In [49]:
len(combined_data)

740

---
3. Split dataset between train and eval
---

In [51]:
import json

# Load the dataset from the JSON file
with open('../data/preprocessed_combined_data.json', 'r') as file:
    data = json.load(file)

# Extract texts and labels
texts = [entry['processed_text'] for entry in data]
labels = [entry['label'] for entry in data]


In [52]:
from sklearn.model_selection import train_test_split

# Split the data: 80% for training, 20% for evaluation
texts_train, texts_eval, labels_train, labels_eval = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)


---
4. Model Selection and Training
---

Let's try with BioBERT

In [54]:
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    get_scheduler,
    EarlyStoppingCallback
)


In [55]:
from torch.optim import AdamW


In [72]:
class HealthspanDataset(Dataset):
   def __init__(self, texts, labels, tokenizer, max_len, default_text="default"):
        # Ensure all elements in texts are strings; convert or replace with default_text if not
        self.texts = [str(text) if isinstance(text, (str, int, float)) else default_text for text in texts]
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

   def __len__(self):
       return len(self.texts)

   def __getitem__(self, idx):
       text = self.texts[idx]
       label = self.labels[idx]
       encoding = self.tokenizer(
           text,
           max_length=self.max_len,
           padding='max_length',
           truncation=True,
           return_tensors='pt',
       )
       return {
           'input_ids': encoding['input_ids'].squeeze(),
           'attention_mask': encoding['attention_mask'].squeeze(),
           'labels': torch.tensor(label, dtype=torch.long),
       }

In [73]:
# Define maximum sequence length
MAX_LEN = 512

In [74]:
# Initialize BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")



In [75]:
# Create dataset instances
train_dataset = HealthspanDataset(texts_train, labels_train, tokenizer, max_len=MAX_LEN)
eval_dataset = HealthspanDataset(texts_eval, labels_eval, tokenizer, max_len=MAX_LEN)

In [None]:
# Prepare data
#texts = [pdf['processed_text'] for pdf in pdf_info_list]
#labels = [pdf['label'] for pdf in pdf_info_list]



In [None]:
# Create dataset
#dataset = HealthspanDataset(texts, labels, tokenizer, max_len=512)



In [76]:
# Initialize BioBERT model
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=2)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
# Define training arguments
training_args = TrainingArguments(
   output_dir='./results',
   num_train_epochs=4,
   per_device_train_batch_size=8,
   per_device_eval_batch_size=8,
   warmup_steps=500,
   weight_decay=0.01,
   logging_dir='./logs',
   evaluation_strategy="epoch",
   save_strategy="epoch",  # Ensure this matches evaluation_strategy
   load_best_model_at_end=True,
   metric_for_best_model="eval_loss",  # Or another metric you're monitoring
   greater_is_better=False,  # Set to True if a higher metric is better
)





In [78]:
# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

num_training_steps = len(train_dataset) * training_args.num_train_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [79]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Ensure you have an evaluation dataset
    optimizers=(optimizer, lr_scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


In [None]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss
