In [1]:
# Install the required libraries (the imports themselves are in the next cell)
!pip install -q -U torch
!pip install -q -U accelerate peft bitsandbytes transformers trl einops evaluate
!pip install -q -U tqdm
!pip install -q -U git+https://github.com/sissa-data-science/DADApy

In [2]:
import os
import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
    logging,
)
from tqdm import tqdm
from trl import SFTTrainer
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
from dadapy.data import Data
import numpy as np

2024-06-10 20:35:22.923478: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-10 20:35:22.923589: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-10 20:35:23.046049: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Checkpoint to load and the name reserved for the fine-tuned variant
base_model = "microsoft/phi-2"
new_model = "phi-2-medquad"

# Training split of the MedQuad subset hosted on the Hugging Face Hub
dataset = load_dataset(
    "prsdm/MedQuad-phi2-1k",
    split="train",
)

# Fast tokenizer of the base checkpoint; the end-of-sequence token doubles as
# the padding token, and padding is appended on the right
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    use_fast=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Downloading readme:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Training samples: keep the first 800 rows of the loaded split
train_dataset = dataset.select(range(800))

In [5]:
# Function that tokenizes the text
def tokenize(rows):
    """Tokenize the first text of a batch handed over by ``Dataset.map``.

    Only ``rows["text"][0]`` is read, so the function has to be mapped with
    ``batched=True, batch_size=1``; additional rows in a larger batch would
    be ignored.

    Args:
        rows: Batch mapping whose ``"text"`` entry is a list of strings
            (an entry may be ``None`` for a missing text).

    Returns:
        The tokenizer output for that text, containing ``input_ids`` and
        ``attention_mask`` as PyTorch tensors, truncated to at most 260 tokens.
    """
    # Retrieve the text; a missing entry is tokenized as the empty string and
    # double quotes are backslash-escaped
    first_text = rows["text"][0]
    text = "" if first_text is None else first_text.replace('"', r'\"')

    # Tokenize. `truncation=True` makes the 260-token limit explicit instead of
    # relying on the implicit fallback that only emits a warning.
    # The result is deliberately left on the CPU: the code that consumes the
    # mapped dataset rebuilds the tensors and moves them to the GPU itself, so a
    # device transfer here adds nothing and would fail on a machine without CUDA.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=260,
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded

In [6]:
# Tokenize the training subset one row at a time (`tokenize` reads only the
# first row of each batch) and drop the raw "text" column afterwards
tokenized_dataset = train_dataset.map(
    tokenize,
    batched=True,
    batch_size=1,
    remove_columns=["text"],
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
# Load the base checkpoint; the device map assigns the whole model to device 0
model = AutoModelForCausalLM.from_pretrained(base_model, device_map={"": 0})

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
# Computing intrinsic dimensions of the hidden representations, layer by layer
# Constants

torch.cuda.empty_cache()
gc.collect()

# 250 total samples (nominal)
num_data = 250

# Number of independent batches; each one yields one estimate per layer
batches = 2

# Samples per batch (nominal)
batch_data = num_data // batches

# Samples per sub-batch within each batch
per_batch = batch_data // batches

# Number of sub-batches per batch
number_batches = batch_data // per_batch

# NOTE(review): because of the two integer divisions above, each batch visits
# number_batches * per_batch samples (124 with these constants) rather than
# batch_data (125), so the last sample of every batch is skipped. The sample
# selection is left as it was so results stay comparable with earlier runs;
# confirm whether this is intended.

# Width of one hidden-state vector; only needed by the zero-fill fallback below
hidden_size = model.config.hidden_size

# intrinsic_dims[b] collects one estimate per layer for batch b
intrinsic_dims = [[] for _ in range(batches)]

for x in range(batches):
    torch.set_default_device("cuda")

    # pooled_states[s][l]: hidden state of layer l for sample s, averaged over tokens
    pooled_states = []

    # Forward every sample of this batch through the model
    for batch in tqdm(range(number_batches)):
        for i in range(per_batch):
            # Rebuild the model inputs from the tokenized dataset row
            index = batch * per_batch + i + x * batch_data
            inputs = {k: torch.tensor(v).unsqueeze(0).to("cuda") for k, v in tokenized_dataset[index].items()}

            # Perform forward pass through the model
            with torch.no_grad():
                outputs = model(**inputs, output_hidden_states=True)

            # Average over the token axis right away (on the CPU, as before) so
            # that only one vector per layer is kept for each sample instead of
            # the full per-token hidden states
            pooled_states.append(
                [
                    torch.mean(state.detach().to("cpu").squeeze(dim=0), dim=0)
                    for state in outputs.hidden_states
                ]
            )
            del outputs, inputs

    # Release cached GPU memory once per batch
    torch.cuda.empty_cache()
    gc.collect()

    # Move back to CPU
    torch.set_default_device("cpu")

    # Regroup per layer: hidden_layers_avg[l][s] is the pooled vector of sample s
    n = len(pooled_states[0])
    hidden_layers_avg = [list(per_layer) for per_layer in zip(*pooled_states)]

    # Handle empty tensors: replace any 0-dimensional result by a zero vector
    for layer in hidden_layers_avg:
        for idx, tensor in enumerate(layer):
            if tensor.shape == torch.Size([]):
                print("Encountered empty tensor. Filling with zeros.")
                layer[idx] = torch.zeros(hidden_size)

    # Stack into a single tensor indexed as [layer, sample, feature]
    hidden_layers_stacked = [torch.stack(layer) for layer in hidden_layers_avg]
    hidden_layers_stacked = torch.stack(hidden_layers_stacked)

    del pooled_states, hidden_layers_avg
    gc.collect()

    # Compute intrinsic dimensions, one estimate per layer
    for i in range(n):
        X = hidden_layers_stacked[i].numpy()
        data = Data(X)
        data.remove_identical_points()
        # Keep the second entry (index 1) of the returned list of estimates;
        # NOTE(review): presumably the estimate at the second scale of the
        # 2NN scaling analysis -- confirm against the DADApy documentation
        id_list_2NN, _, _ = data.return_id_scaling_2NN()
        intrinsic_dims[x].append(id_list_2NN[1])

100%|██████████| 2/2 [01:33<00:00, 46.86s/it]


No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical ide

100%|██████████| 2/2 [01:30<00:00, 45.49s/it]


No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical identical points were found
No identical ide

In [9]:
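# Legacy loop-based calculation of the per-layer mean and standard deviation, kept for reference.
# It hard-codes five batches (indices 0-4); the NumPy version in the next cell works for any number of batches.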

# ids = len(intrinsic_dims)

# # total layers
# n = len(intrinsic_dims[0])
# mean = []
# std = []

# for i in range(n):
#   layer_mean = np.mean([intrinsic_dims[0][i],intrinsic_dims[1][i],intrinsic_dims[2][i],intrinsic_dims[3][i],intrinsic_dims[4][i]])
#   mean.append(layer_mean)
#   layer_std = np.std([intrinsic_dims[0][i],intrinsic_dims[1][i],intrinsic_dims[2][i],intrinsic_dims[3][i],intrinsic_dims[4][i]])
#   std.append(layer_std)

In [10]:
# Per-layer mean and standard deviation of the intrinsic dimension, taken
# across batches (axis 0 of the matrix runs over batches)
id_matrix = np.asarray(intrinsic_dims)
mean = id_matrix.mean(axis=0)
std = id_matrix.std(axis=0)

In [11]:
# Each value in the array is the mean intrinsic dimension of the corresponding
# layer, averaged over the batches
mean

array([11.81, 11.77, 12.15, 12.08, 12.09, 11.63, 10.59, 10.  ,  9.41,
        9.51,  9.39,  9.57,  9.64,  9.54,  9.52,  9.67,  9.78,  9.91,
        9.92,  9.51,  8.87,  8.79,  8.47,  8.01,  7.65,  7.29,  7.2 ,
        7.29,  7.46,  8.15,  8.24,  7.93,  7.99])

In [12]:
# Each value in the array is the standard deviation of the intrinsic dimension
# of the corresponding layer, taken over the batches
std

array([0.57, 1.66, 1.72, 1.67, 1.22, 1.69, 1.31, 1.15, 1.16, 1.02, 1.09,
       1.4 , 1.56, 1.59, 1.7 , 1.74, 1.8 , 1.75, 1.92, 1.78, 1.71, 1.56,
       1.48, 1.38, 1.37, 1.29, 1.25, 1.24, 1.29, 1.51, 1.67, 1.18, 1.61])