In [7]:
import os
import json
import pandas as pd

# Path to the dataset
dataset_path = '/content/test'

# List all JSON files
json_files = [os.path.join(dataset_path, file) for file in os.listdir(dataset_path) if file.endswith('.json')]

# Load data into a DataFrame
data = []
for file in json_files:
    with open(file, 'r') as f:
        content = json.load(f)
        data.append(content)

df = pd.DataFrame(data)
print(df.head())  # Inspect the first few rows
print(df.columns)

                                            paper_id  \
0  xml_output_grobid_python_client\Assisted-repro...   
1  xml_output_cochrane\Angel-M-ller_et_al-2018-Co...   
2  xml_output_cochrane\Boomsma_et_al-2022-Cochran...   
3  xml_output_cochrane\Brito_et_al-2019-Cochrane_...   
4  xml_output_grobid_python_client\Antim&#xfc;lle...   

                                              header  \
0  {'generated_with': 'S2ORC 1.0.0', 'date_genera...   
1  {'generated_with': 'S2ORC 1.0.0', 'date_genera...   
2  {'generated_with': 'S2ORC 1.0.0', 'date_genera...   
3  {'generated_with': 'S2ORC 1.0.0', 'date_genera...   
4  {'generated_with': 'S2ORC 1.0.0', 'date_genera...   

                                               title  \
0  Assisted reproductive technology: consideratio...   
1  Point of care rapid test for diagnosis of syph...   
2  Peri-implantation glucocorticoid administratio...   
3  Interventions for uterine fibroids: an overvie...   
4  Antim√ºllerian hormone is not associated wi

In [8]:
import re
import pandas as pd
from transformers import BertTokenizer

# Manually define a set of stopwords
stop_words = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the",
    "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by",
    "for", "with", "about", "against", "between", "into", "through", "during",
    "before", "after", "above", "below", "to", "from", "up", "down", "in", "out",
    "on", "off", "over", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few",
    "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own",
    "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don",
    "should", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn",
    "didn", "doesn", "hadn", "hasn", "haven", "isn", "ma", "mightn", "mustn",
    "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn"
])

# Initialize the BERT tokenizer (make sure it's locally available)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to clean text
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Optionally, convert text to lowercase
    text = text.lower()

    return text

# Function to remove stopwords
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

# Preprocess 'abstract' or relevant field
df['cleaned_abstract'] = df['abstract'].apply(lambda x: clean_text(str(x)))
df['cleaned_abstract'] = df['cleaned_abstract'].apply(lambda x: remove_stopwords(x))

# Tokenize the text using BERT's tokenizer
def tokenize_text(text):
    return tokenizer.encode(text, add_special_tokens=True)

df['tokenized_abstract'] = df['cleaned_abstract'].apply(lambda x: tokenize_text(x))

# Inspect the processed data
print(df[['cleaned_abstract', 'tokenized_abstract']].head())


                                    cleaned_abstract  \
0  study aimed discuss fertility concerns unique ...   
1  protocol cochrane review diagnostic test accur...   
2  findings glucocorticoids compared glucocortico...   
3  protocol cochrane review overviewthe objective...   
4  objective assess association antim ullerian ho...   

                                  tokenized_abstract  
0  [101, 2817, 6461, 6848, 17376, 5936, 4310, 116...  
1  [101, 8778, 22329, 3319, 16474, 3231, 10640, 1...  
2  [101, 9556, 1043, 7630, 3597, 27108, 4588, 170...  
3  [101, 8778, 22329, 3319, 19184, 10760, 11100, ...  
4  [101, 7863, 14358, 2523, 3424, 2213, 17359, 39...  


In [9]:
!pip install torch torchvision transformers datasets
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Adjust `num_labels` as per your task

# Load tokenizer (same one used during preprocessing)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import numpy as np

# Assume df is your DataFrame with a 'cleaned_abstract' and 'tokenized_abstract' column
# Ensure the necessary columns exist
if 'cleaned_abstract' not in df.columns or 'tokenized_abstract' not in df.columns:
    raise ValueError("The dataset must contain 'cleaned_abstract' and 'tokenized_abstract' columns.")

# Check if 'label' column exists; if not, create it for demonstration purposes
if 'label' not in df.columns:
    # Example: Generate dummy labels (binary classification) for demonstration
    df['label'] = np.random.randint(0, 2, size=len(df))

# Convert tokenized abstracts into PyTorch tensors
df['input_ids'] = df['tokenized_abstract'].apply(lambda x: torch.tensor(x, dtype=torch.long))

# Create attention masks (all ones, assuming no padding)
df['attention_masks'] = df['input_ids'].apply(lambda x: torch.ones_like(x, dtype=torch.long))

# Extract features and labels
input_ids = df['input_ids'].tolist()
attention_masks = df['attention_masks'].tolist()
labels = df['label'].tolist()

# Split the dataset into training and validation sets
train_texts, val_texts, train_masks, val_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.1, random_state=42
)

# Convert lists to PyTorch tensors
# Pad input tensors to the same length
from torch.nn.utils.rnn import pad_sequence

train_texts = pad_sequence(train_texts, batch_first=True, padding_value=0)
val_texts = pad_sequence(val_texts, batch_first=True, padding_value=0)
train_masks = pad_sequence([torch.tensor(mask) for mask in train_masks], batch_first=True, padding_value=0)
val_masks = pad_sequence([torch.tensor(mask) for mask in val_masks], batch_first=True, padding_value=0)

train_labels = torch.tensor(train_labels, dtype=torch.long)
val_labels = torch.tensor(val_labels, dtype=torch.long)

# Create TensorDatasets
train_dataset = torch.utils.data.TensorDataset(train_texts, train_masks, train_labels)
val_dataset = torch.utils.data.TensorDataset(val_texts, val_masks, val_labels)

# Verify dataset shapes
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Training dataset size: 45
Validation dataset size: 5


  train_masks = pad_sequence([torch.tensor(mask) for mask in train_masks], batch_first=True, padding_value=0)
  val_masks = pad_sequence([torch.tensor(mask) for mask in val_masks], batch_first=True, padding_value=0)


In [34]:


from transformers import BertForSequenceClassification, AdamW, BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_scheduler
import torch
import torch.nn as nn
import numpy as np

# Load Pretrained BERT Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)  # Binary classification
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Training Arguments
batch_size = 16
epochs = 3
learning_rate = 2e-5

# DataLoader Setup
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Device Setup
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Training Loop
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    print("-" * 30)

    # Training
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_attention_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_attention_masks, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    total_val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_attention_masks = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, attention_mask=b_attention_masks, labels=b_labels)
            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct += (preds == b_labels).sum().item()
            total += b_labels.size(0)

    avg_val_loss = total_val_loss / len(val_dataloader)
    accuracy = correct / total
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

# Save the Model
model.save_pretrained("bert-finetuned")
tokenizer.save_pretrained("bert-finetuned")

print("Training complete. Model saved to 'bert-finetuned'.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
------------------------------
Training Loss: 0.7100
Validation Loss: 0.6806
Validation Accuracy: 0.6000
Epoch 2/3
------------------------------
Training Loss: 0.6832
Validation Loss: 0.6788
Validation Accuracy: 0.8000
Epoch 3/3
------------------------------
Training Loss: 0.6642
Validation Loss: 0.6848
Validation Accuracy: 0.6000
Training complete. Model saved to 'bert-finetuned'.


In [36]:
# Save the model and tokenizer after training
model.save_pretrained("bert-finetuned")  # Save the model weights
tokenizer.save_pretrained("bert-finetuned")  # Save the tokenizer

print("Model and tokenizer saved to 'bert-finetuned'.")


Model and tokenizer saved to 'bert-finetuned'.
