In [None]:
!pip install transformers
!pip install accelerate
!pip install sentencepiece bitsandbytes
!pip install einops timm pillow
!pip install peft

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
import re
import string
import unicodedata

import numpy as np
import pandas as pd
import torch
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda:0


## Loading data and preprocessing


In [None]:
# Load the medquad.csv question/answer table into a DataFrame.
# NOTE(review): absolute Colab path — assumes Google Drive is already mounted at /content/drive.
df = pd.read_csv("/content/drive/MyDrive/LLM Lessons/data/medquad.csv")

In [None]:
df.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16412 entries, 0 to 16411
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    16412 non-null  object
 1   answer      16407 non-null  object
 2   source      16412 non-null  object
 3   focus_area  16398 non-null  object
dtypes: object(4)
memory usage: 513.0+ KB


In [None]:
# df.info() above shows 5 rows with a null answer and 14 rows with a null focus_area.
# A null answer cannot be used as a training target, so those rows are removed.
# focus_area is used later (together with source) to build classes with
# unsupervised learning, so rows with a null focus_area are removed as well.
# Some answers contain a "(Watch ...)" string that refers to a video; it is stripped below.
# .copy() makes the filtered frame independent, so the column assignment that
# follows does not raise pandas' SettingWithCopyWarning.
df = df.dropna(axis = 0, subset=['answer','focus_area']).copy()
def remove_watch_content(text):
    """Remove every "(Watch ...)" parenthetical from ``text``.

    A parenthetical starts with "(Watch" and runs to its closing ")"; one level
    of nested parentheses inside it is allowed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matching parentheticals removed. Surrounding
        whitespace is left untouched.
    """
    # Each loop iteration consumes either ONE non-parenthesis character or one
    # complete inner "(...)" group. The alternatives cannot overlap, so an
    # unclosed "(Watch ..." fails quickly instead of backtracking exponentially
    # (which the nested "[^)(]+" inside "(...)*" form does).
    pattern = r"\(Watch(?:[^)(]|\([^)(]*\))*\)"
    return re.sub(pattern, '', text)
df['answer'] = df['answer'].apply(remove_watch_content)
df.reset_index(drop=True, inplace=True)
df.info()
#df.info() is printed to confirm that no null values remain in answer or focus_area after the dropna above

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16393 entries, 0 to 16392
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    16393 non-null  object
 1   answer      16393 non-null  object
 2   source      16393 non-null  object
 3   focus_area  16393 non-null  object
dtypes: object(4)
memory usage: 512.4+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer'] = df['answer'].apply(remove_watch_content)


In [None]:
## The notebook's approach inserts tokens such as <|symptom|> relative to the
## last "?" of a question, so each question should hold a single standalone "?".
## This check prints the index label of every row whose text contains more than
## one whitespace-separated "?" token.
def interrogation_duplicity_check(df,column):
  """Print the index label of each row of ``df[column]`` with more than one "?" token.

  Args:
    df: DataFrame holding the text column.
    column: Name of the column to inspect; its values must be strings.

  Returns:
    None. One line is printed per offending row.
  """
  # Iterate over (index label, text) pairs so every offending row reports its
  # OWN label, even when several rows share the same text, and the column is
  # scanned only once.
  for row_label, text in df[column].items():
    if text.split().count('?') > 1:
      print(f"Line {row_label} with interrogation duplicity")

interrogation_duplicity_check(df,'question')

In [None]:
## Removing the trailing "?" token from every question that holds more than one
## standalone "?" token. The affected rows are located from the data itself, so
## each question is read from and written back to the SAME row (no hard-coded
## row numbers that can drift after rows are dropped or the index is reset).
question_tokens = df['question'].str.split()
has_duplicity = question_tokens.apply(lambda tokens: tokens.count('?') > 1 and tokens[-1] == '?')
if has_duplicity.any():
    df.loc[has_duplicity, 'question'] = question_tokens[has_duplicity].apply(
        lambda tokens: ' '.join(tokens[:-1])
    )

In [None]:
## running the verification again for confirmation

interrogation_duplicity_check(df,'question')

In [None]:
def normalize_text(text):
  """Return a canonical form of ``text`` for loose string comparison.

  Steps, in order: Unicode NFKC normalization, lower-casing, removal of every
  ASCII punctuation character (``string.punctuation``), then collapsing each
  whitespace run to one space and trimming both ends.
  """
  punctuation_table = str.maketrans('', '', string.punctuation)
  cleaned = unicodedata.normalize('NFKC', text).lower().translate(punctuation_table)
  return re.sub(r'\s+', ' ', cleaned).strip()

class TokenAdder:
    """Insert special tokens into text according to a ``{token: keywords}`` mapping.

    Each value of ``token_dict`` is a keyword string or a list of them:

    * a keyword that normalizes (via ``normalize_text``) to the start marker
      makes the token a prefix of the whole text;
    * a keyword that normalizes to one of the end markers makes the token a
      suffix of the whole text;
    * any other keyword is matched as a whole word, case-insensitively, and the
      token is inserted directly before each occurrence.
    """

    # Placeholder keywords that request a text-level prefix / suffix.
    _START_MARKERS = ("__STARTOFSENTENCE__",)
    _END_MARKERS = ("__ENDOFSENTENCE__", "__ENDOFTEXT__", "__ENDOFPROMPT__")
    # Matches an already present "<|name|>" token so keyword matching skips over it.
    _EXISTING_TOKEN = r'<\|[^<>|\s]+\|>'

    def __init__(self, token_dict=None):
        """Store the token mapping; ``None`` is treated as an empty mapping."""
        self.token_dict = token_dict or {}

    def _build_plan(self):
        """Split ``token_dict`` into (prefix, suffix, {lower-cased keyword: tokens})."""
        start_markers = {normalize_text(marker) for marker in self._START_MARKERS}
        end_markers = {normalize_text(marker) for marker in self._END_MARKERS}

        prefix, suffix = "", ""
        keyword_tokens = {}
        for token, strings_list in self.token_dict.items():
            # ensure strings_list is treated as a list
            if not isinstance(strings_list, list):
                strings_list = [strings_list]

            for string_to_find in strings_list:
                normalized_string = normalize_text(string_to_find)
                if normalized_string in start_markers:
                    # a later start token ends up outermost
                    prefix = f"{token}{prefix}"
                elif normalized_string in end_markers:
                    # a later end token ends up outermost
                    suffix = f"{suffix}{token}"
                elif string_to_find:
                    # Empty keywords are skipped: they would match at every word boundary.
                    # A keyword listed under several tokens receives all of them, in dict order.
                    key = string_to_find.lower()
                    keyword_tokens[key] = keyword_tokens.get(key, "") + token
        return prefix, suffix, keyword_tokens

    def add_tokens_to_text(self, text, custom_token=None):
        """Return ``text`` with the configured tokens inserted.

        All keywords are applied in ONE regex pass over the original text, so a
        token that has just been inserted is never rescanned. Applying the
        keywords one after another lets a later keyword match inside an earlier
        token (for example "symptoms" inside "<|symptoms|>") and corrupt it.

        Args:
            text: The string to process. Falsy values are returned unchanged.
            custom_token: Accepted for backward compatibility; not used.

        Returns:
            The text with keyword tokens inserted and the prefix/suffix tokens attached.
        """
        if not text:
            return text

        prefix, suffix, keyword_tokens = self._build_plan()
        modified_text = text

        if keyword_tokens:
            # Longest keywords first, so multi-word phrases win over shorter ones
            # that start at the same position.
            keywords = sorted(keyword_tokens, key=len, reverse=True)
            # One capture group per keyword: match.lastindex identifies the keyword
            # without re-deriving it from the (case-varying) matched text.
            alternatives = '|'.join(f'({re.escape(keyword)})' for keyword in keywords)
            pattern = re.compile(
                rf'{self._EXISTING_TOKEN}|\b(?:{alternatives})\b', re.IGNORECASE
            )

            def replacer(match):
                if match.lastindex is None:
                    # an existing "<|...|>" token: keep it as is
                    return match.group(0)
                # keep the matched text's original case
                return f"{keyword_tokens[keywords[match.lastindex - 1]]}{match.group(0)}"

            modified_text = pattern.sub(replacer, modified_text)

        return f"{prefix}{modified_text}{suffix}"

    def process_dataframe(self, df, text_column, custom_token=None, label_column = False):
        """Return a copy of ``df`` with tokens added to ``text_column``.

        Args:
            df: Input DataFrame; it is not modified.
            text_column: Name of the column holding the text.
            custom_token: Forwarded to ``add_tokens_to_text`` (not used there).
            label_column: ``False`` to add no label token; ``True`` to prepend
                ``<|value|>`` taken from the ``'label'`` column; or a column
                name (str) to take the value from that column instead.

        Returns:
            A new DataFrame whose ``text_column`` contains the processed text.
        """
        df_copy = df.copy()
        if label_column and not df_copy.empty:
            label_name = label_column if isinstance(label_column, str) else 'label'
            df_copy[text_column] = df_copy.apply(
                lambda row: f"<|{row[label_name]}|>{row[text_column]}", axis=1
            )
        df_copy[text_column] = df_copy[text_column].apply(
            lambda x: self.add_tokens_to_text(x, custom_token)
        )
        return df_copy

# Creating the confidence tokens and grouping similar class names

- `group_similar_classes` merges class names that are strongly similar, with the objective of reducing the number of classes.

Confidence tokens don't exist in the dataset, but it is possible to generate them with unsupervised learning. We will create 4 classes:

- Reference token: assigned to the diseases (focus areas) that appear only once in the dataset

The other 3 classes are assigned by unsupervised learning: an encoder class computes weighted features from how often each value of the "focus_area" and "source" columns occurs, and the rows are then clustered on those features.

- Uncertain token: low level confidence answer
- Confidence token: mid level confidence answer
- High confidence token: high level confidence answer

In [None]:
def group_similar_classes(df, column, threshold=0.8, model=None):
    """Merge similar class names in ``df[column]`` using sentence embeddings.

    Every distinct non-null value of ``column`` is embedded. Classes are visited
    in order of first appearance; each unassigned class starts a group and
    absorbs every later, still unassigned class whose cosine similarity to it
    is above ``threshold``. The shortest name in a group becomes its
    representative.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the column holding the class names (strings).
        threshold: Cosine similarity above which two classes are merged.
        model: Optional object with an ``encode(list_of_str)`` method returning
            one embedding per string. When ``None``, the
            ``'all-MiniLM-L6-v2'`` SentenceTransformer is loaded.

    Returns:
        A copy of ``df`` with an extra ``f"{column}_grouped"`` column holding
        the representative name of each row's class (NaN where the class is null).
    """
    classes = df[column].dropna().unique()
    num_class = len(classes)

    mapping = {}
    num_groups = 0

    # With no classes there is nothing to embed or compare.
    if num_class > 0:
        if model is None:
            # Load embedding model
            model = SentenceTransformer('all-MiniLM-L6-v2')

        # Generate embeddings for each class and compute the similarity matrix
        embeddings = model.encode(list(classes))
        similarity_matrix = cosine_similarity(embeddings)

        visited = set()
        for i in range(num_class):
            if i in visited:
                continue

            group = [i]
            visited.add(i)

            for j in range(i + 1, num_class):
                # Skip classes that already belong to a group: otherwise a class
                # can be added to two groups and its mapping is overwritten by
                # the later one, leaving the earlier group inconsistent.
                if j not in visited and similarity_matrix[i, j] > threshold:
                    group.append(j)
                    visited.add(j)

            # Choose the shortest name as the group representative
            group_names = [classes[idx] for idx in group]
            representative_name = min(group_names, key=len)
            for name in group_names:
                mapping[name] = representative_name
            num_groups += 1

    # Apply mapping and create new column
    result_df = df.copy()
    result_df[f'{column}_grouped'] = df[column].map(mapping)

    # Display grouping statistics
    print(f"Original classes: {num_class}")
    print(f"Classes after grouping: {num_groups}")

    return result_df

# Group similar focus_area names on a copy of df, then overwrite df's
# focus_area column with the representative name of each group.
test_df = group_similar_classes(df.copy(), 'focus_area')
df['focus_area'] = test_df['focus_area_grouped']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Classes originais: 5125
Classes após agrupamento: 3660


In [None]:
# Rows whose focus_area occurs exactly once are labeled "reference"; every other
# row gets None here and is collected in non_reference_df.
df_clean = df.dropna(subset=["focus_area"]).copy()
focus_counts = df_clean["focus_area"].value_counts()
df_clean["label"] = np.where(
    df_clean["focus_area"].map(focus_counts).eq(1),
    "reference",
    None,
)
non_reference_df = df_clean.loc[df_clean["label"].isna()].copy()

class WeightedCountEncoder(BaseEstimator, TransformerMixin):
    """Encode ``focus_area`` and ``source`` as weighted occurrence counts.

    ``fit`` records how often each value of the two columns occurs. ``transform``
    replaces each value by its recorded count, optionally divided by the largest
    recorded count of that column, and multiplied by the column's weight.
    """

    def __init__(self, source_weight=2.0, focus_area_weight=1.0, normalize=True):
        self.source_weight = source_weight
        self.focus_area_weight = focus_area_weight
        self.normalize = normalize

    def fit(self, X, y=None):
        """Record per-value counts (and the per-column maxima when normalizing)."""
        self.focus_area_counts = X["focus_area"].value_counts().to_dict()
        self.source_counts = X["source"].value_counts().to_dict()

        if self.normalize:
            self.max_focus_area = max(self.focus_area_counts.values())
            self.max_source = max(self.source_counts.values())
        return self

    def transform(self, X):
        """Return a DataFrame with ``focus_area_count`` and ``source_count`` columns."""
        focus_area_feature = X["focus_area"].map(self.focus_area_counts)
        source_feature = X["source"].map(self.source_counts)

        if self.normalize:
            # scale each count by the largest count seen for its column during fit
            focus_area_feature = focus_area_feature / self.max_focus_area
            source_feature = source_feature / self.max_source

        return pd.DataFrame({
            "focus_area_count": focus_area_feature * self.focus_area_weight,
            "source_count": source_feature * self.source_weight,
        })

# Cluster the non-reference rows into three groups from their weighted
# focus_area / source counts.
pipeline = Pipeline([
    ("encoder", WeightedCountEncoder(source_weight=1.0, focus_area_weight=4.0)),
    ("kmeans", KMeans(n_clusters=3, random_state=42))
])

cluster_labels = pipeline.fit_predict(non_reference_df[["focus_area", "source"]])
non_reference_df["cluster"] = cluster_labels

focus_area_counts = df_clean["focus_area"].value_counts()
source_counts = df_clean["source"].value_counts()

# Score of a row = occurrences of its focus_area + occurrences of its source.
non_reference_df["score"] = (
    non_reference_df["focus_area"].map(focus_area_counts) +
    non_reference_df["source"].map(source_counts)
)

# Rank clusters by average score: highest -> "high_confidence", lowest -> "uncertain"
cluster_scores = non_reference_df.groupby("cluster")["score"].mean().sort_values(ascending=False)
semantic_labels = dict(zip(cluster_scores.index, ["high_confidence", "confidence", "uncertain"]))
non_reference_df["label"] = non_reference_df["cluster"].map(semantic_labels)

# Both parts still carry df's original index labels. The index is deliberately
# NOT reset: the concatenation lists non-reference rows first and reference rows
# last, so renumbering it would pair each label with a different row of df.
final_df = pd.concat([
    non_reference_df[["question", "answer", "source", "focus_area", "label"]],
    df_clean[df_clean["label"] == "reference"][["question", "answer", "source", "focus_area", "label"]]
]).sort_index()

# Column assignment aligns on the index, so every row of df receives its own label.
df['label'] = final_df['label']

In [None]:
df.head()

Unnamed: 0,question,answer,source,focus_area,label
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma,uncertain
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma,uncertain
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma,uncertain
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma,uncertain
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma,uncertain


In [None]:
# Tokens for the question text. "__STARTOFSENTENCE__" / "__ENDOFPROMPT__" are
# placeholders that put the token at the start / end of the question; every
# other entry lists keywords that get the token inserted in front of them.
special_tokenizer_question = {
    "<think>\n": ["__STARTOFSENTENCE__"],
    "<|symptom|>": ["symptom", "symptoms"],
    "<|what_is|>": ["what is", "research", "diagnose"],
    "<|quantity|>": ["how many"],
    "<|causes|>": ["cause", "causes"],
    "<|risk|>": ["risk", "risks", "complications", "complication"],
    "<|treatment|>": ["treatment", "treatments", "prevent"],
    "<|prevention|>": ["prevention", "prevents", "to do"],
    "<|endofprompt|>": ["__ENDOFPROMPT__"],
}

special_add_tokenizer = TokenAdder(special_tokenizer_question)

# Replace the question column with its token-annotated version.
df['question'] = special_add_tokenizer.process_dataframe(df, text_column='question')['question']

In [None]:
# Tokens for the answer text. "__STARTOFSENTENCE__" / "__ENDOFTEXT__" are
# placeholders that put the token at the start / end of the answer; every other
# entry lists keywords that get the token inserted in front of them.
special_tokenizer_answer = {
    "<|doctor|>": ["__STARTOFSENTENCE__"],
    "<|endoftext|>": ["__ENDOFTEXT__"],
    "<|side_effects|>": ["damage", "damages", "lose"],
    "<|diagnosis|>": ["detected", "test", "exam", "develop"],
    "<|symptoms|>": ["symptom", "symptoms", "cause", "causes"],
    "<|treatment|>": ["treatment", "treatments", "normal", "try"],
    "<|prevention|>": ["prevention", "prevents"],
}

special_add_tokenizer = TokenAdder(special_tokenizer_answer)

# label_column=True also prepends each row's <|label|> token to its answer.
df['answer'] = special_add_tokenizer.process_dataframe(
    df, text_column='answer', label_column=True
)['answer']

In [None]:
# All tokens used for answers and questions, in that order, in the format
# expected by the tokenizer's add_special_tokens().
special_tokens = {
    "additional_special_tokens": [*special_tokenizer_answer, *special_tokenizer_question]
}

In [None]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The answers also start with a "<|label|>" confidence token (one per distinct
# value of df['label']); register those as special tokens too, so they are not
# split into sub-word pieces.
label_tokens = [f"<|{label}|>" for label in sorted(df['label'].dropna().unique())]

# dict.fromkeys drops duplicates while keeping first-seen order: some tokens
# (e.g. "<|treatment|>") appear in both the answer and the question mapping.
special_tokens = {
    "additional_special_tokens": list(dict.fromkeys(
        list(special_tokenizer_answer.keys())
        + list(special_tokenizer_question.keys())
        + label_tokens
    ))
}
num_added_tokens = tokenizer.add_special_tokens(special_tokens)

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# Load the base model with 8-bit weights. The quantization settings are passed
# through a BitsAndBytesConfig object: the bare `load_in_8bit=True` keyword is
# deprecated (see the warning this call used to emit).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# Resize the model's token embeddings to the tokenizer's current size, which
# includes the special tokens added above.
model.resize_token_embeddings(len(tokenizer))

Embedding(151679, 1536)

In [None]:
def create_dataset(df, tokenizer, max_length=512):
    """Tokenize each row's question + answer into a causal-LM training example.

    Args:
        df: DataFrame with ``question`` and ``answer`` columns (already
            annotated with the special tokens).
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors of shape (1, max_length).
        max_length: Sequences are truncated / padded to this many tokens.

    Returns:
        A list with one dict per row holding ``input_ids``, ``attention_mask``
        and ``labels`` (1-D tensors of length ``max_length``).
    """
    inputs = []
    for _, row in df.iterrows():
        # Use your preprocessed text with DeepSeek's tokenizer
        text = f"{row['question']}{row['answer']}"
        encodings = tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encodings['input_ids'][0]
        attention_mask = encodings['attention_mask'][0]

        # Labels mirror the inputs, except on padding positions: -100 is the
        # index ignored by the loss, so the model is not trained to predict padding.
        labels = input_ids.clone().masked_fill(attention_mask == 0, -100)

        inputs.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        })
    return inputs

In [None]:
# First split: 60% train, 40% held out.
# Second split: the held-out part is halved into test and validation (20% of df each).
train_sentence, test_sentence = train_test_split(df, test_size=0.4, random_state=42)
test_sentence, val_sentence = train_test_split(test_sentence, test_size=0.5, random_state=42)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the lowest eval_loss at the end.
training_args = TrainingArguments(
    output_dir="./results_deepseek_medical",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    # effective batch size per device = 8 * 4 = 32
    gradient_accumulation_steps=4,
    # A PEFT-wrapped model hides the base model's signature, so Trainer cannot
    # infer the label key by itself (it warns and falls back to an empty list);
    # name the key that create_dataset produces.
    label_names=["labels"],
)


In [None]:
# Tokenize each split into a list of model-ready examples (one dict per row).
train_dataset = create_dataset(train_sentence, tokenizer)
val_dataset = create_dataset(val_sentence, tokenizer)
test_dataset = create_dataset(test_sentence, tokenizer)

In [None]:
# LoRA adapter settings: rank 16, alpha 32, 5% dropout, applied to the four
# attention projection modules listed in target_modules.
# NOTE(review): the embeddings were resized for the added special tokens, but no
# embedding module is listed here — confirm whether the new token embeddings
# need to be trained and saved with the adapter.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
# Run PEFT's preparation helper on the quantized model before adding adapters.
model = prepare_model_for_kbit_training(model)

In [None]:
# Wrap the model with the LoRA adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [None]:
# Build the Trainer. The tokenizer is passed as `processing_class`: the
# `tokenizer=` keyword of Trainer is deprecated and emits a warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maugusto-cesar-rodrigues[0m ([33maugusto-cesar-rodrigues-undb-unidade-de-ensino-superior-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.417,1.399557
2,1.2079,1.211656


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=921, training_loss=1.4402645019962008, metrics={'train_runtime': 4886.8205, 'train_samples_per_second': 6.038, 'train_steps_per_second': 0.188, 'total_flos': 1.3992393091055616e+17, 'train_loss': 1.4402645019962008, 'epoch': 2.991869918699187})

In [None]:
# Save the trained adapter together with the tokenizer that was used for
# training. The tokenizer must NOT be reloaded from the base checkpoint here:
# a fresh copy lacks the added special tokens and would no longer match the
# resized embedding matrix of the model.
model.save_pretrained("./deepseek_medical_qa_peft")
tokenizer.save_pretrained("./deepseek_medical_qa_peft")



('./deepseek_medical_qa_peft/tokenizer_config.json',
 './deepseek_medical_qa_peft/special_tokens_map.json',
 './deepseek_medical_qa_peft/tokenizer.json')