<a href="https://colab.research.google.com/github/CalculatedContent/WeightWatcher-Examples/blob/main/WW_SVDSMoothing_TinyLLaMAipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install weightwatcher accelerate



In [2]:
# Import WeightWatcher and display the installed version
import weightwatcher as ww
ww.__version__



'0.7.4.7'

In [3]:
# Import accelerate and display the installed version
import accelerate
accelerate.__version__

'0.27.2'

### Get TinyLlama directly from the Hugging Face Hub

In [4]:
# Clone the model repository from the Hugging Face Hub (the large weight files are fetched through Git LFS)
!git clone https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-955k-token-2T/

Cloning into 'TinyLlama-1.1B-intermediate-step-955k-token-2T'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 25 (delta 0), reused 0 (delta 0), pack-reused 22[K
Unpacking objects: 100% (25/25), 481.34 KiB | 6.78 MiB/s, done.
Filtering content: 100% (3/3), 201.26 MiB | 1022.00 KiB/s, done.
Encountered 2 file(s) that may not have been copied correctly on Windows:
	model.safetensors
	pytorch_model.bin

See: `git lfs help smudge` for more details.


In [5]:
# List the files in the cloned model repository
!ls TinyLlama-1.1B-intermediate-step-955k-token-2T

config.json		pytorch_model.bin	 tokenizer_config.json
generation_config.json	README.md		 tokenizer.json
model.safetensors	special_tokens_map.json  tokenizer.model


### Let's make sure HF reads the pytorch_model.bin, not the safetensors file

In [6]:
!rm TinyLlama-1.1B-intermediate-step-955k-token-2T/model.safetensors

In [7]:
import os

# Local folder holding the cloned TinyLlama checkpoint
tinyLLaMA_folder = "TinyLlama-1.1B-intermediate-step-955k-token-2T"

### Create Folder for Smoothed Model

In [8]:
import shutil

smoothed_model_folder = "smoothed_TinyLLaMA"
smoothed_model_filename = os.path.join(smoothed_model_folder, "pytorch_model.bin")

# Copy only the config/tokenizer files into the smoothed-model folder.
# The weight files and the .git directory are skipped, so no multi-GB file is
# copied just to be deleted again. dirs_exist_ok=True makes the cell re-runnable:
# `cp -r` into an existing folder would nest the source folder inside it.
shutil.copytree(
    tinyLLaMA_folder,
    smoothed_model_folder,
    ignore=shutil.ignore_patterns("pytorch_model.bin", "*.safetensors", ".git"),
    dirs_exist_ok=True,
)

# Start without weights: drop a smoothed checkpoint left over from an earlier run.
if os.path.exists(smoothed_model_filename):
    os.remove(smoothed_model_filename)

In [9]:
# List the smoothed-model folder; at this point it should contain no pytorch_model.bin
!ls $smoothed_model_folder

config.json		README.md		 tokenizer_config.json	tokenizer.model
generation_config.json	special_tokens_map.json  tokenizer.json


### Check that TinyLlama can be loaded properly and generate text

In [10]:
from transformers import AutoTokenizer, pipeline
import torch

# Load the tokenizer from the local model folder
tokenizer = AutoTokenizer.from_pretrained(tinyLLaMA_folder)

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-generation pipeline from the original (unsmoothed) local model folder
text_generation_pipeline = pipeline(
    "text-generation",
    model=tinyLLaMA_folder,
    tokenizer=tokenizer,
    device=device,  # device selected above
    framework="pt",  # Specify the framework 'pt' for PyTorch
#    torch_dtype=torch.float16,  # optional: uncomment to load the model in half precision on GPU
)



In [12]:
# Sanity check: generate a short completion with the original model
generated_text = text_generation_pipeline("Who was the first US president ?", max_length=20)
print(generated_text)

[{'generated_text': 'Who was the first US president ?\nThe first US president was George Washington.\nWhat was the'}]


### Load TinyLLaMA into memory

SVDSmoothing does not support reading the safetensors format yet, so the weights are loaded from `pytorch_model.bin`.

In [13]:
import torch

tinyLLaMA_filename = os.path.join(tinyLLaMA_folder, "pytorch_model.bin")

# NOTE: torch.load unpickles the file, and this checkpoint was downloaded from the internet.
# weights_only=True restricts unpickling to tensors and plain containers.
# map_location="cpu" keeps the load working on machines without a GPU, whatever
# device the checkpoint was saved from.
tinyLLaMA = torch.load(tinyLLaMA_filename, map_location="cpu", weights_only=True)

### Test WeightWatcher runs and can read the model



In [14]:
# Wrap the loaded weights in a WeightWatcher instance and display the per-layer table from describe()
watcher = ww.WeightWatcher(model=tinyLLaMA)
details = watcher.describe()
details

Unnamed: 0,layer_id,name,M,N,Q,layer_type,longname,num_evals,rf
0,1,lm_head,2048,32000,15.625,dense,lm_head,2048,1
1,2,model.embed_tokens,2048,32000,15.625,dense,model.embed_tokens,2048,1
2,4,model.layers.0.self_attn.q_proj,2048,2048,1.000,dense,model.layers.0.self_attn.q_proj,2048,1
3,5,model.layers.0.self_attn.k_proj,256,2048,8.000,dense,model.layers.0.self_attn.k_proj,256,1
4,6,model.layers.0.self_attn.v_proj,256,2048,8.000,dense,model.layers.0.self_attn.v_proj,256,1
...,...,...,...,...,...,...,...,...,...
151,195,model.layers.21.self_attn.v_proj,256,2048,8.000,dense,model.layers.21.self_attn.v_proj,256,1
152,196,model.layers.21.self_attn.o_proj,2048,2048,1.000,dense,model.layers.21.self_attn.o_proj,2048,1
153,198,model.layers.21.mlp.gate_proj,2048,5632,2.750,dense,model.layers.21.mlp.gate_proj,2048,1
154,199,model.layers.21.mlp.up_proj,2048,5632,2.750,dense,model.layers.21.mlp.up_proj,2048,1


### Select only the MLP layers

In [15]:
import pandas as pd

# Keep the rows of the describe() table whose layer name contains 'mlp'
D = details[details['name'].astype(str).str.contains('mlp')]

# Collect the matching layer ids; tolist() yields native Python ints
# rather than NumPy scalars, which is the safest form to pass as a layer filter
mlp_layer_ids = D['layer_id'].tolist()

# Report how many layers were selected
print("Number of layers with MLP in name" , len(mlp_layer_ids))

Number of layers with MLP in name 66


### Run LASER (SVDSmoothing)

This can take some time

In [None]:
import logging
# Re-create the watcher with WARNING-level logging, then apply SVDSmoothing to the selected MLP layers only
watcher = ww.WeightWatcher(model=tinyLLaMA, log_level=logging.WARNING)
smoothed_model = watcher.SVDSmoothing(layers=mlp_layer_ids)
# alternatively
#smoothed_model = watcher.SVDSmoothing(layers=mlp_layer_ids, method='svd', percent=0.80)

### Save model to disk

In [None]:
# Write the smoothed weights as pytorch_model.bin inside the smoothed-model folder
torch.save(smoothed_model, smoothed_model_filename)

### Generate text with Smoothed Model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-generation pipeline from the smoothed-model folder
smoothed_generation_pipeline = pipeline(
    "text-generation",
    model=smoothed_model_folder,
    tokenizer=tokenizer,
    device=device,  # device selected above
    framework="pt",  # Specify the framework 'pt' for PyTorch
#    torch_dtype=torch.float16,  # optional: uncomment to load the model in half precision on GPU
)


In [None]:
# Use the pipeline for text generation (as an example)
generated_text = text_generation_pipeline("Who was the first US president ?", max_length=30)
print(generated_text)

In [None]:
# Use the pipeline for text generation (as an example)
generated_text = smoothed_generation_pipeline("Who was the first US president ?", max_length=30)
print(generated_text)