In [None]:
#!/usr/bin/env python
# coding: utf-8

"""RadCLIP GPT-2 Model"""

__author__ = "Christoper Alexander"
__copyright__ = "Copyright 2023"
__credits__ = ["Andrew D'Amico", "Christoper Alexander", "Katya Nosulko", "Vivek Chamala", "Matthew Conger"]
__license__ = ""
__version__ = "0.0.1"
__maintainer__ = "Andrew Damico"
__email__ = "andrew.damico@u.northwestern.edu"

In [1]:
import transformers
from datasets import load_dataset
from transformers import (
    GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, GPT2Config,
    pipeline
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the GPT-2 XL tokenizer with explicit start, end and padding markers as special tokens.
special_tokens = {
    "bos_token": '<|startoftext|>',
    "eos_token": '<|endoftext|>',
    "pad_token": '<|pad|>',
}
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl', **special_tokens)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Start from the published GPT-2 XL configuration; hidden states are not requested as outputs.
configuration = GPT2Config.from_pretrained('gpt2-xl', output_hidden_states=False)

# Take the special-token ids from the tokenizer so the model config agrees with the text the
# model is trained on. The tokenizer defines its own <|startoftext|> and <|pad|> tokens, so
# reusing the end-of-text id for padding would leave the config pointing at a different id
# than the one the tokenizer actually pads with.
configuration.bos_token_id = tokenizer.bos_token_id
configuration.eos_token_id = tokenizer.eos_token_id
configuration.pad_token_id = tokenizer.pad_token_id

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2-xl", config=configuration, device_map="auto")

In [4]:
# Resize the embedding matrix to the tokenizer's length so the added special tokens get rows.
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 1600)

In [5]:
# Load the plain-text files (relative paths) as the "train" and "test" splits.
data_files = {"train": "train_gpt.txt", "test": "test_gpt.txt"}
dataset = load_dataset("text", data_files=data_files)

Found cached dataset text (/home/ubuntu/.cache/huggingface/datasets/text/default-73b982222f046e78/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)
100%|██████████| 2/2 [00:00<00:00, 825.00it/s]


In [6]:
def tokenize_batch(examples):
    """Tokenize a batch of raw text rows, truncating each row to at most 1024 tokens.

    No padding is applied here: the language-modeling data collator pads each batch to the
    length of its longest sequence, so short rows are not stored and processed as full
    1024-token sequences.

    Args:
        examples: A batch from the "text" dataset; its "text" entry holds the raw strings.

    Returns:
        The tokenizer output for the batch (unpadded).
    """
    return tokenizer(examples["text"], max_length=1024, truncation=True)


# Tokenize the dataset (the same function is used for both splits)
train_dataset = dataset["train"].map(tokenize_batch, batched=True)
test_dataset = dataset["test"].map(tokenize_batch, batched=True)

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/text/default-73b982222f046e78/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-096307c9d53f8711.arrow
                                                                

In [7]:
# Batch collator for language modeling; mlm=False selects causal (next-token) modeling
# instead of masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [8]:
# Hyper-parameters for the fine-tuning run.
training_args = transformers.TrainingArguments(
    output_dir="test_gpt_xl_10k",
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=5e-4,
    # Effective batch per device = 2 sequences x 4 accumulation steps.
    gradient_accumulation_steps=4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # eval_steps only takes effect when step-based evaluation is switched on; without this
    # the evaluation dataset handed to the Trainer is never evaluated during training.
    # NOTE(review): newer transformers releases name this argument `eval_strategy` - confirm
    # against the installed version.
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=2000,
    warmup_steps=100,
    # prediction_loss_only=True,
    logging_dir="logs_gpt_xl",
    logging_steps=50,
    # fp16=True, # Enable mixed precision training
    # half_precision_backend="auto", # Set the backend for mixed precision training
    ddp_find_unused_parameters=None,
    optim="adamw_torch",
)

In [9]:
# Bundle the model, training arguments, collator and both dataset splits into a Trainer.
trainer = transformers.Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop using the Trainer built from the model, arguments and datasets.
trainer.train()

In [None]:
# Bare expression: shows the model's repr as the cell output for inspection.
model

In [12]:
# Save the model to the "gptxl_10k" directory.
model.save_pretrained("gptxl_10k")

In [13]:
# Text-generation pipeline built from the in-memory model and its tokenizer.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [23]:
# Sampling is stochastic; seed the random number generators first so that re-running this
# cell produces the same continuations.
transformers.set_seed(42)

# Prompt in the "condition -> explanation" format; the model continues after "Explanation:".
prompt = "This is the condition: there is no pneumothorax or pleural effusion. Explanation:"

# Draw two sampled continuations with top-k / nucleus (top-p) sampling.
test = generator(
    prompt,
    max_length=300,
    num_return_sequences=2,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

In [24]:
# Show the first of the two generated sequences.
print(test[0]["generated_text"])

This is the condition: there is no pneumothorax or pleural effusion. Explanation: the patient suffered from some kind of atypical virus pneumonia, but no atypical pneumonia was involved. Clinical manifestation: This patient had obvious clinical manifestations of pneumonia, so chest X-ray examination was necessary and performed to verify the presence of pneumothorax and pleural effusion. In this examination, there was obviously increased absorption in the bilateral lungs and severe pneumonia in the bilateral pulmonary lobes. In addition, there were obvious changes in the color of the pleural effusion and decreased absorption in the bilateral lungs (Fig. ). The diagnostic criteria for viral pneumonia are as follows: The treatment methods for respiratory illness of children includes oxygen inhalation, simple aspiration, sputum suction and medication. However, it is necessary to conduct aggressive treatment immediately after the onset of respiratory illness. At present, there are few clini

In [25]:
# Show the second of the two generated sequences.
print(test[1]["generated_text"])

This is the condition: there is no pneumothorax or pleural effusion. Explanation: in the process of healing, the pressure just entered the chest cavity. The lung was injured at the middle of the fourth intercostal space, the pleural cavity was filled with fluid. During the chest wall healing, the visceral pleura were retracted, and the parietal pleura were stretched to prevent the lung from expanding. As the chest wall was normal, the patient didn't have enough lung. There was a small amount of pleural fluid in the thoracic cavity, and the blood pressure in the thoracic cavity was not enough to relieve the pressure of the diaphragm and heart. The patient did not tolerate the discomfort caused by radiation and chemotherapy. On May 26, 2020, the patient died. Histopathologically, the tumor cells showed a pattern of pleural and intrapleural growth, which was not observed in the first stage. By May 26, the lung tissue became more dense, solid, and hollow. The second phase of the tumor was 

In [27]:
# Save the tokenizer files into the same "gptxl_10k" directory the model was saved to.
tokenizer.save_pretrained("gptxl_10k")

('gptxl_10k/tokenizer_config.json',
 'gptxl_10k/special_tokens_map.json',
 'gptxl_10k/vocab.json',
 'gptxl_10k/merges.txt',
 'gptxl_10k/added_tokens.json')