In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 --quiet
!pip install accelerate transformers bitsandbytes --quiet

In [None]:
import os
import re
import sqlite3
import time
from collections import defaultdict
from tempfile import TemporaryDirectory

import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split

In [None]:
# Reproducibility and data split.
SEED = 42
TEST_DATASET_PERCENTAGE = 0.2

# Model checkpoint and optimisation hyper-parameters.
MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
EPOCHS = 10
LR = 1e-6
BATCH_SIZE = 64

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  DEVICE = torch.device("cuda")
else:
  DEVICE = torch.device("cpu")
DEVICE

device(type='cuda')

In [None]:
# Open the SQLite database file "dataset.db" and create a cursor for the queries below.
conn = sqlite3.connect("dataset.db")
cur = conn.cursor()
cur

<sqlite3.Cursor at 0x78fcb8e407c0>

In [None]:
# Build one record per review: its title, its abstract sections and its PICO terms.
data = []

# fetchall() materialises the review list up front, so the same cursor can be
# reused for the per-review lookups inside the loop.
for rev_id, rev_title in cur.execute("SELECT r.id, r.title FROM reviews AS r").fetchall():

  # Bind rev_id as a query parameter instead of formatting it into the SQL text.
  abstracts = {}
  for section, content in cur.execute(
      "SELECT a.section, a.content FROM abstracts AS a WHERE a.review_id = ?",
      (rev_id,),
  ).fetchall():
    abstracts[section] = content

  # Group the PICO terms by category, keeping the order returned by the query.
  pico = defaultdict(list)
  for category, term in cur.execute(
      "SELECT p.category, p.term FROM pico AS p WHERE p.review_id = ?",
      (rev_id,),
  ).fetchall():
    pico[category].append(term)

  data.append({
      "title": rev_title,
      "abstracts": abstracts,
      "pico": dict(pico),
  })

data[:5]

[{'title': 'Long‐acting muscarinic antagonist (LAMA) plus long‐acting beta‐agonist (LABA) versus LABA plus inhaled corticosteroid (ICS) for stable chronic obstructive pulmonary disease',
  'abstracts': {'Background': 'Long‐acting beta‐agonists (LABAs), long‐acting muscarinic antagonists (LAMAs), and inhaled corticosteroids (ICSs) are inhaled medications used to manage chronic obstructive pulmonary disease (COPD). When two classes of medications are required, a LAMA plus an ICS (LABA+ICS) were previously recommended within a single inhaler as the first‐line treatment for managing stable COPD in people in high‐risk categories. However, updated international guidance recommends a LAMA plus a LABA (LAMA+LABA). This systematic review is an update of a Cochrane Review first published in 2017.',
   'Objectives': 'To compare the benefits and harms of LAMA+LABA versus LABA+ICS for treatment of people with stable COPD.',
   'Search methods': 'We performed an electronic search of the Cochrane Air

In [None]:
# Release the cursor and the database connection; the rows were already fetched into `data`.
cur.close()
conn.close()

In [None]:
def construct_input_prompt(record):
  """Build the instruction prompt for one review record.

  Args:
    record: dict with a "title" string and an "abstracts" mapping of
      section name -> section text.

  Returns:
    The prompt as a string in which every run of two or more whitespace
    characters has been collapsed to a single space.
  """
  # Each abstract section becomes <section_name>content</section_name>, where the
  # tag is the lower-cased section name with whitespace replaced by underscores.
  tagged_sections = []
  for section, content in record["abstracts"].items():
    tag = "_".join(section.lower().split())
    tagged_sections.append(f"<{tag}>{content}</{tag}>\n")
  sections_joined = "".join(tagged_sections)

  prompt = f"""
  You are an expert article reviewer. You are supplied with detailed medical article abstract. Your task is to generate very accurate PICO eligibility criteria (Population, Intervention, Comparison, Outcome).

  <article_abstract>
    <article_title>
      {record["title"]}
    </article_title>
    {sections_joined}
  </article_abstract>

  Please provide your answer in the following format:
  <format>
    Population: (PICO eligibility criteria 1), (PICO eligibility criteria 2), ...
    Intervention: (PICO eligibility criteria 3), (PICO eligibility criteria 4), ...
    Comparison: (PICO eligibility criteria 5), (PICO eligibility criteria 6), ...
    Outcome: (PICO eligibility criteria 7), (PICO eligibility criteria 8), ...
  </format>

  Please make the response as short and concise as possible.
  """.strip()

  # Raw string: "\s" in a normal string literal is an invalid escape sequence.
  return re.sub(r"\s\s+", " ", prompt)

def construct_output_prompt(record):
  """Render the PICO terms of one review record as the target completion.

  Args:
    record: dict with a "pico" mapping of category -> list of term strings.

  Returns:
    One "Category: term, term, ..." line per category, each ending with a
    newline; runs of two or more whitespace characters are collapsed to a
    single space. An empty "pico" mapping yields an empty string.
  """
  pico_lines = [
      f"{category}: {', '.join(terms)}\n"
      for category, terms in record["pico"].items()
  ]

  # Raw string: "\s" in a normal string literal is an invalid escape sequence.
  return re.sub(r"\s\s+", " ", "".join(pico_lines))

In [None]:
print(construct_input_prompt(data[0]))

You are an expert article reviewer. You are supplied with ditailed medical article abstract. Your task is to generate very accurate PICO eligibility criteria (Population, Intervention, Comparison, Outcome). <article_abstract> <article_title> Long‐acting muscarinic antagonist (LAMA) plus long‐acting beta‐agonist (LABA) versus LABA plus inhaled corticosteroid (ICS) for stable chronic obstructive pulmonary disease </article_title> <background>Long‐acting beta‐agonists (LABAs), long‐acting muscarinic antagonists (LAMAs), and inhaled corticosteroids (ICSs) are inhaled medications used to manage chronic obstructive pulmonary disease (COPD). When two classes of medications are required, a LAMA plus an ICS (LABA+ICS) were previously recommended within a single inhaler as the first‐line treatment for managing stable COPD in people in high‐risk categories. However, updated international guidance recommends a LAMA plus a LABA (LAMA+LABA). This systematic review is an update of a Cochrane Review f

In [None]:
print(construct_output_prompt(data[0]))

Population: Adult 19-44 years, Middle Aged 45-64 years, Young Adult 19-24 years, Chronic Obstructive Pulmonary Disease
Intervention: Inhaled Anticholinergics, Long-Acting Beta-Agonists, Inhaled
Comparison: Corticosteroids - Inhaled, Long-Acting Beta-Agonists, Inhaled
Outcome: Forced Expiratory Volume 1, Quality of Life, COPD Exacerbation, Adverse Event



In [None]:
# One row per review: the model prompt and the reference completion, both stored
# with pandas' string dtype.
df = pd.DataFrame({
    column: pd.Series([build(review) for review in data], dtype=pd.StringDtype())
    for column, build in (
        ("prompt", construct_input_prompt),
        ("completion", construct_output_prompt),
    )
})
df

Unnamed: 0,prompt,completion
0,You are an expert article reviewer. You are su...,"Population: Adult 19-44 years, Middle Aged 45-..."
1,You are an expert article reviewer. You are su...,
2,You are an expert article reviewer. You are su...,"Population: Aged 80 and over 80+ years, Adult ..."
3,You are an expert article reviewer. You are su...,
4,You are an expert article reviewer. You are su...,
...,...,...
2137,You are an expert article reviewer. You are su...,
2138,You are an expert article reviewer. You are su...,
2139,You are an expert article reviewer. You are su...,
2140,You are an expert article reviewer. You are su...,


In [None]:
# A review without PICO terms produces an empty completion string.
# Take explicit copies so that assigning columns to either frame later neither
# touches `df` nor triggers pandas' SettingWithCopyWarning.
labeled_df = df[df["completion"] != ""].copy()
unlabeled_df = df[df["completion"] == ""].copy()

In [None]:
labeled_df

Unnamed: 0,prompt,completion
0,You are an expert article reviewer. You are su...,"Population: Adult 19-44 years, Middle Aged 45-..."
2,You are an expert article reviewer. You are su...,"Population: Aged 80 and over 80+ years, Adult ..."
5,You are an expert article reviewer. You are su...,"Population: Not reported, Indifference/Apathy,..."
6,You are an expert article reviewer. You are su...,"Population: Aged 80 and over 80+ years, Adult ..."
7,You are an expert article reviewer. You are su...,"Population: Aged 80 and over 80+ years, Adult ..."
...,...,...
1995,You are an expert article reviewer. You are su...,"Population: Aged 80 and over 80+ years, Adult ..."
1996,You are an expert article reviewer. You are su...,"Population: Aged 80 and over 80+ years, Invasi..."
1997,You are an expert article reviewer. You are su...,"Population: Child, Preschool 2-5 years, Aged 8..."
1998,You are an expert article reviewer. You are su...,"Population: Aged 80 and over 80+ years, Adult ..."


In [None]:
unlabeled_df

Unnamed: 0,prompt,completion
1,You are an expert article reviewer. You are su...,
3,You are an expert article reviewer. You are su...,
4,You are an expert article reviewer. You are su...,
9,You are an expert article reviewer. You are su...,
10,You are an expert article reviewer. You are su...,
...,...,...
2137,You are an expert article reviewer. You are su...,
2138,You are an expert article reviewer. You are su...,
2139,You are an expert article reviewer. You are su...,
2140,You are an expert article reviewer. You are su...,


In [None]:
class LLModel:
    """Small chat wrapper around a Hugging Face ``text-generation`` pipeline."""

    def __init__(self, model_path):
        """Create the pipeline for ``model_path`` and cache its EOS token id.

        Args:
            model_path: model identifier or path handed to ``transformers.pipeline``.
        """
        self.model_id = model_path
        self.pipeline = transformers.pipeline(
            "text-generation",
            model=self.model_id,
            model_kwargs={
                "torch_dtype": torch.float32,
                # "quantization_config": {"load_in_4bit": True},
                # "low_cpu_mem_usage": True,
            },
        )
        # NOTE(review): the second entry looks up the empty string as a token; the
        # intended terminator token appears to be missing here -- confirm. The list
        # is not used by the methods of this class.
        self.terminators = [
            self.pipeline.tokenizer.eos_token_id,
            self.pipeline.tokenizer.convert_tokens_to_ids(""),
        ]
        # Get the EOS token ID from the tokenizer
        self.eos_token_id = self.pipeline.tokenizer.eos_token_id

    def get_response(
        self, query, message_history=None, max_tokens=4096, temperature=0.6, top_p=0.9
    ):
        """Generate one assistant reply to ``query``.

        Args:
            query: text of the new user message.
            message_history: optional list of earlier chat messages
                (``{"role": ..., "content": ...}`` dicts); it is not modified.
            max_tokens: passed to the pipeline as ``max_new_tokens``.
            temperature: sampling temperature.
            top_p: nucleus-sampling threshold.

        Returns:
            ``(response, conversation)``: the generated text with the prompt
            prefix removed, and a new message list that ends with the user
            message and the assistant reply.
        """
        # None instead of a mutable `[]` default, so no list is shared between calls.
        history = [] if message_history is None else message_history
        user_prompt = history + [{"role": "user", "content": query}]
        prompt = self.pipeline.tokenizer.apply_chat_template(
            user_prompt, tokenize=False, add_generation_prompt=True
        )
        # Explicitly pass the eos_token_id to the generate function
        outputs = self.pipeline(
            prompt,
            max_new_tokens=max_tokens,
            eos_token_id=self.eos_token_id,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
        # The generated text starts with the prompt itself; keep only the continuation.
        response = outputs[0]["generated_text"][len(prompt):]
        return response, user_prompt + [{"role": "assistant", "content": response}]

    def chatbot(self, user_input="", system_instructions=""):
        """Answer ``user_input`` in a fresh conversation and return only the reply text.

        The conversation always starts with a system message holding
        ``system_instructions`` (an empty string by default).
        """
        conversation = [{"role": "system", "content": system_instructions}]
        response, _ = self.get_response(user_input, conversation)
        return response

# NOTE: I am not sure, but I think that it requires attention masks -- TODO confirm.
bot = LLModel(MODEL)


Device set to use cuda:0


In [None]:
unlabeled_df

Unnamed: 0,prompt,completion
1,You are an expert article reviewer. You are su...,
3,You are an expert article reviewer. You are su...,
4,You are an expert article reviewer. You are su...,
9,You are an expert article reviewer. You are su...,
10,You are an expert article reviewer. You are su...,
...,...,...
2137,You are an expert article reviewer. You are su...,
2138,You are an expert article reviewer. You are su...,
2139,You are an expert article reviewer. You are su...,
2140,You are an expert article reviewer. You are su...,


In [None]:
# Generate a completion for every unlabelled prompt, one `bot.chatbot` call per row.
unlabeled_df["completion"] = unlabeled_df["prompt"].apply(bot.chatbot)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
bot.pipeline.model

In [None]:
class LLMDataset(torch.utils.data.Dataset):
  """Paired, pre-tokenized prompt/completion texts.

  `x` and `y` are int32 tensors of shape (2, num_samples, padded_length):
  plane 0 holds the token ids and plane 1 the attention mask.
  """

  def __init__(self, text_x, text_y, tokenize_function):
    """Tokenize both text collections up front.

    Args:
      text_x: input texts; must provide `.to_list()` (e.g. a pandas Series).
      text_y: target texts, one per input text.
      tokenize_function: callable taking a list of strings and `padding=...`
        and returning a mapping with 'input_ids' and 'attention_mask'.
    """
    self.x = self._encode(text_x, tokenize_function)
    self.y = self._encode(text_y, tokenize_function)
    # Axis 1 is the sample axis (axis 0 only separates ids from masks).
    assert self.x.shape[1] == self.y.shape[1], "inputs and targets must have the same number of samples"

  @staticmethod
  def _encode(texts, tokenize_function):
    """Tokenize `texts` and stack ids and attention mask into one tensor."""
    encoded = tokenize_function(texts.to_list(), padding='longest') # consider max_length
    return torch.stack(
        (torch.IntTensor(encoded['input_ids']), torch.IntTensor(encoded['attention_mask'])),
        dim=0,
    )

  def __getitem__(self, idx):
    """Return `(inputs, targets)` for sample `idx`, each a dict of ids and mask."""
    return (
        {'input_ids': self.x[0, idx], 'attention_mask': self.x[1, idx]},
        {'input_ids': self.y[0, idx], 'attention_mask': self.y[1, idx]},
    )

  def __len__(self):
    """Number of samples, not the size of the leading ids/mask axis."""
    return self.x.shape[1]

In [None]:
# Hold out TEST_DATASET_PERCENTAGE of the labelled rows; SEED fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    labeled_df["prompt"],
    labeled_df["completion"],
    test_size=TEST_DATASET_PERCENTAGE,
    random_state=SEED,
)

In [None]:
# Tokenize both splits with the tokenizer that belongs to the loaded pipeline.
train_ds = LLMDataset(X_train, y_train, tokenize_function=bot.pipeline.tokenizer)
test_ds = LLMDataset(X_test, y_test, tokenize_function=bot.pipeline.tokenizer)

In [None]:
# Training batches are reshuffled every epoch. The shuffle is driven by a
# generator seeded with SEED so the batch order is reproducible across runs.
train_dl = torch.utils.data.DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)

In [None]:
# Evaluation does not benefit from shuffling; a fixed order keeps per-batch
# results comparable between runs.
test_dl = torch.utils.data.DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
train_ds.x[0,1]

In [None]:
def recursively_disable_grad(model):
  """Freeze `model`: set `requires_grad = False` on every parameter.

  `model.parameters()` already yields the parameters of all nested
  sub-modules, so a single pass covers the whole module tree.
  """
  for param in model.parameters():
    param.requires_grad = False

# Freeze every parameter under `bot.pipeline.model.model`.
# NOTE(review): parameters attached to `bot.pipeline.model` outside of `.model` are not touched here -- confirm that is intended.
recursively_disable_grad(bot.pipeline.model.model)

In [None]:
# Make the last four layers trainable again. `requires_grad_()` updates every
# parameter of the layer; assigning `layer.requires_grad = True` would only set
# a plain attribute on the module object and leave its parameters frozen.
for layer in bot.pipeline.model.model.layers[-4:]:
  layer.requires_grad_(True)
  print(layer)

In [None]:
# Earlier alternative, kept for reference:
# optimizer = torch.optim.SGD(bot.pipeline.model.parameters(), lr=LR)

# AdamW with betas=(0.9, 0.999) and epsilon=1e-08, no further optimizer arguments.
optimizer = torch.optim.AdamW(
    bot.pipeline.model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    eps=1e-08,
)

loss_func = torch.nn.CrossEntropyLoss()


In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25,
                dataloaders=None, dataset_sizes=None, device=None):
    """Train ``model`` and return it with the best-validation weights loaded.

    Every epoch runs a 'train' phase followed by a 'val' phase. The weights with
    the highest validation accuracy are checkpointed in a temporary directory
    and restored before returning.

    NOTE(review): the loop moves ``inputs``/``labels`` with ``.to(device)`` and
    applies ``torch.max(outputs, 1)``, i.e. it expects tensor batches and a
    tensor-valued model output -- confirm this matches the data and model it is
    used with.

    Args:
        model: module to train; called as ``model(inputs)``.
        criterion: loss callable, called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepping ``model``'s parameters.
        scheduler: learning-rate scheduler stepped once per epoch after the
            'train' phase, or ``None`` to train without one.
        num_epochs: number of epochs.
        dataloaders: mapping with 'train' and 'val' iterables of
            ``(inputs, labels)`` batches. Defaults to a module-level
            ``dataloaders`` variable when one exists.
        dataset_sizes: mapping with the number of samples per phase. Defaults to
            a module-level ``dataset_sizes`` variable when one exists, otherwise
            to ``len(dataloaders[phase].dataset)``.
        device: device the batches are moved to. Defaults to the module-level
            ``DEVICE``.

    Returns:
        ``model`` with the best validation weights loaded.

    Raises:
        ValueError: if no data loaders are given and none exist at module level.
    """
    # Explicit arguments win; otherwise fall back to the module-level names the
    # function historically relied on.
    if dataloaders is None:
        dataloaders = globals().get('dataloaders')
    if dataloaders is None:
        raise ValueError("train_model needs `dataloaders` with 'train' and 'val' entries")
    if dataset_sizes is None:
        dataset_sizes = globals().get('dataset_sizes')
    if dataset_sizes is None:
        dataset_sizes = {phase: len(dataloaders[phase].dataset) for phase in ('train', 'val')}
    if device is None:
        device = DEVICE

    since = time.time()

    # Create a temporary directory to save training checkpoints
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        # Checkpoint the initial weights so there is always something to restore.
        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics (plain Python numbers, so an empty loader is harmless)
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data).item()
                if phase == 'train' and scheduler is not None:
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects / dataset_sizes[phase]

                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # checkpoint the weights whenever validation accuracy improves
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), best_model_params_path)

            print()

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path, weights_only=True))
    return model

In [None]:
# Tokenize a sample sentence with the tokenizer of the configured checkpoint to
# inspect the encoded fields. MODEL is reused so this stays in sync with the
# model loaded above; `transformers` is already imported at the top.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
inputs = tokenizer("This is your input text.", return_special_tokens_mask=True)

# No `return_tensors` is requested, so these are plain Python lists
# (no `.shape` / `.dtype` attributes).
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

In [None]:
input_ids

In [None]:
bot.pipeline.model.model.modules()

In [None]:
print([x for x in bot.pipeline.model.modules()])