In [None]:
import os
import glob
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch


In [2]:
# Define paths
# Locations are relative to the notebook's working directory; OBS and RCT
# data live in sibling folders under TRAIN/ and TEST/.
train_obs_path, train_rct_path = "./TRAIN/OBS", "./TRAIN/RCT"
test_obs_path, test_rct_path = "./TEST/OBS_test", "./TEST/RCT_test"


In [3]:
# Function to load data
def load_data(folder_path):
    """Load index-aligned article and abstract texts from ``folder_path``.

    Articles are read from files named ``article-<id>.txt`` and each one is
    paired with ``abstract-<id>.txt`` from the same folder. An article whose
    abstract file does not exist is skipped, so ``articles[i]`` always
    corresponds to ``abstracts[i]``.

    Args:
        folder_path: Directory that contains the article and abstract files.

    Returns:
        A ``(articles, abstracts)`` tuple of two equally long lists of
        strings, ordered by article file name.
    """
    articles = []
    abstracts = []
    # The id is parsed by splitting on "-" and abstracts are named
    # "abstract-<id>.txt", so articles must follow the "article-<id>.txt"
    # scheme (the same one generate_abstracts globs for). Sorting makes the
    # result independent of filesystem listing order.
    article_files = sorted(glob.glob(os.path.join(folder_path, "article-*.txt")))
    for article_file in article_files:
        # Strip only the final extension and split on the first "-" so ids
        # that themselves contain "." or "-" are kept intact.
        stem = os.path.splitext(os.path.basename(article_file))[0]
        article_id = stem.split("-", 1)[1]
        abstract_file = os.path.join(folder_path, f"abstract-{article_id}.txt")

        # Appending an article without its abstract would shift every later
        # pair, so unpaired articles are dropped.
        if not os.path.exists(abstract_file):
            continue

        with open(article_file, "r", encoding="utf-8") as f:
            articles.append(f.read())
        with open(abstract_file, "r", encoding="utf-8") as f:
            abstracts.append(f.read())
    return articles, abstracts


In [4]:
# Load training data
# One (articles, abstracts) pair per study type, read in OBS-then-RCT order.
(obs_articles, obs_abstracts), (rct_articles, rct_abstracts) = (
    load_data(folder) for folder in (train_obs_path, train_rct_path)
)


In [5]:
# Combine data
# OBS examples come first, followed by RCT examples, in both lists.
train_articles = [*obs_articles, *rct_articles]
train_abstracts = [*obs_abstracts, *rct_abstracts]


In [None]:
# Initialize tokenizer and model
# Both objects are loaded from the same pretrained checkpoint name.
checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)


In [None]:
# Tokenize data
# generate_abstracts() feeds the model "summarize: <article>" at inference
# time, so the training inputs carry the same task prefix; otherwise the model
# is fine-tuned on a different input format than the one it is later asked to
# summarize.
train_encodings = tokenizer(
    ["summarize: " + article for article in train_articles],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
# Targets are the reference abstracts; they take no prefix.
label_encodings = tokenizer(
    train_abstracts, padding=True, truncation=True, max_length=128, return_tensors="pt"
)


In [None]:
# Prepare dataset
# Each dataset item is an (input_ids, attention_mask, label_ids) triple.
dataset = torch.utils.data.TensorDataset(
    train_encodings["input_ids"],
    train_encodings["attention_mask"],
    label_encodings["input_ids"],
)
# Batches of 8 examples, reshuffled on every pass over the data.
dataloader = torch.utils.data.DataLoader(dataset=dataset, shuffle=True, batch_size=8)


In [None]:
# Define training loop
def train_model(model, dataloader, epochs=3, pad_token_id=None):
    """Fine-tune ``model`` on the batches produced by ``dataloader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes a scalar ``loss`` tensor.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches. It is iterated once per epoch.
        epochs: Number of passes over ``dataloader``.
        pad_token_id: Label id that must not contribute to the loss. When
            ``None`` (the default) the module-level ``tokenizer.pad_token_id``
            is used.

    Returns:
        A list with the mean training loss of each epoch.

    Raises:
        ValueError: If ``dataloader`` yields no batches during an epoch.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for input_ids, attention_mask, labels in dataloader:
            # Padding positions are replaced by -100 so they are excluded from
            # the loss. masked_fill returns a new tensor, leaving the batch
            # supplied by the caller untouched (it may be reused next epoch).
            labels = labels.masked_fill(labels == pad_token_id, -100)
            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")
        # Report the epoch average rather than whichever batch happened to be
        # last, which is a noisy indicator of progress.
        mean_loss = total_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch + 1}, Loss: {mean_loss}")
    return epoch_losses


# Train the model
# Uses train_model's default number of epochs.
train_model(model=model, dataloader=dataloader)


In [None]:
# Generate abstracts for test data
def generate_abstracts(model, tokenizer, test_folder, output_folder):
    """Write a generated abstract for every article found in ``test_folder``.

    Each ``article-<id>.txt`` in ``test_folder`` is read, prefixed with
    ``"summarize: "``, truncated to 512 tokens and summarized with beam search
    (4 beams, at most 128 output tokens). The decoded text is written to
    ``abstract-<id>.txt`` in ``output_folder``, which is created if needed.

    Args:
        model: Model providing ``eval()``, ``train()`` and ``generate()``.
        tokenizer: Tokenizer providing ``encode()`` and ``decode()``.
        test_folder: Directory containing the ``article-<id>.txt`` inputs.
        output_folder: Directory that receives the ``abstract-<id>.txt`` files.
    """
    # Sorted so files are processed in a reproducible order.
    article_files = sorted(glob.glob(os.path.join(test_folder, "article-*.txt")))
    os.makedirs(output_folder, exist_ok=True)

    # train_model() leaves the model in training mode; switch to eval mode so
    # dropout does not perturb generation, and restore the mode afterwards.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        for article_file in article_files:
            with open(article_file, "r", encoding="utf-8") as f:
                article = f.read()
            inputs = tokenizer.encode(
                "summarize: " + article,
                return_tensors="pt",
                max_length=512,
                truncation=True,
            )
            outputs = model.generate(
                inputs, max_length=128, num_beams=4, early_stopping=True
            )
            summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Strip only the final extension and split on the first "-" so ids
            # containing "." or "-" map to distinct output files.
            stem = os.path.splitext(os.path.basename(article_file))[0]
            article_id = stem.split("-", 1)[1]
            output_file = os.path.join(output_folder, f"abstract-{article_id}.txt")
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(summary)
    finally:
        if was_training:
            model.train()


In [None]:
# Generate abstracts for OBS and RCT tests
# Each test folder is paired with the output folder that receives its abstracts.
for test_folder, output_folder in (
    (test_obs_path, "./OUTPUT/OBS"),
    (test_rct_path, "./OUTPUT/RCT"),
):
    generate_abstracts(model, tokenizer, test_folder, output_folder)


In [None]:
# Final status line, printed once the cells above have finished running.
print("Abstract generation complete!")
