In [1]:
import torch
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

GPU Available: True
GPU Name: NVIDIA GeForce RTX 4060 Laptop GPU
cuda


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

#Choose the model for loading the instruct model

model_name = "HuggingFaceTB/SmolLM-135M-instruct"  #This model is a decoder only model 
# model_name = "HuggingFaceTB/SmolLM-360M-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Print the tokenizer configuration
print("Tokenizer Configuration:")
print(tokenizer)
# Print the original model architecture
print("Original model:")
print(model)

Tokenizer Configuration:
GPT2TokenizerFast(name_or_path='HuggingFaceTB/SmolLM-135M-instruct', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<repo_name>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<reponame>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	

In [3]:
#Inferencing the model
messages = [{"role": "user", "content": "Tell me something about India."}]
input_text=tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=1000, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


user
Tell me something about India.
assistant
India! A country with a rich history, diverse culture, and a vibrant society. Here are some interesting facts about India:

**History**: India has a long and complex history, spanning over 4,000 years. The country was inhabited by various ancient civilizations, including the Aryans, Aryas, and the Huns. The Mughal Empire, founded by Humayun and Akbar, was one of the largest empires in Indian history. The British East India Company, which was established in 1600, brought significant changes to the country's economy and society.

**Culture**: India is a melting pot of cultures, with people from diverse backgrounds, including Hindus, Muslims, Christians, Jews, and Sikhs. The country has a rich tradition of music, dance, and art, with many classical and folk music styles. The Indian classical music, known as Hindustani music, is a unique blend of Persian, Arabic, and Indian influences.

**Language**: Hindi is the official language of India, spo

## Layer Pruning with removing the layer from the left of the last layer

In [4]:
#p.numel(): This returns the total number of elements in the parameter tensor p (e.g., for a weight matrix of shape (10, 20), p.numel() would return 200).
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def create_custom_lm(model_name, target_params):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    # print("Original model:")
    # print(model)
    
    original_params = count_parameters(model)
    print(f"Original model parameters: {original_params:,}")
    
    total_layers = len(model.model.layers)
    print(f"Total layers: {total_layers}")

    # Calculate number of layers to keep
    layers_to_keep = round((target_params / original_params) * total_layers)
    layers_to_remove = total_layers - layers_to_keep
    print(f"Layers to keep: {layers_to_keep}")
    print(f"Layers to remove: {layers_to_remove}")


    # Keep all layers except those right before the last layer
    selected_layers = (
        list(model.model.layers[:total_layers - layers_to_remove - 1]) + 
        [model.model.layers[-1]]
    )
    print("\nSelected layers:")
    print(set(type(selected_layers[i]) for i in range(len(selected_layers))))

    ## Heart of the code
    ## Create a new model with the selected layers
    model.model.layers = torch.nn.ModuleList(selected_layers)
    

    model.config.num_hidden_layers = len(selected_layers)
    
    # print("\nModified model CustomLLM:")
    # print(model)
    
    new_params = count_parameters(model)
    print(f"Modified model parameters: {new_params:,}")
    
    reduction_percentage = (1 - new_params / original_params) * 100
    print(f"Size reduction: {reduction_percentage:.2f}%")
    
    return model

model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
target_params = 100_000_000

custom_lm = create_custom_lm(model_name, target_params)
modified_model_path = f"{model_name.split('/')[1]}-layer-pruned-{int(target_params/1_048_576)}M-raw"
custom_lm.save_pretrained(modified_model_path)
print(f"\nCustomLM-"+str(int(target_params/1_048_576))+f"M-Instruct saved to: {modified_model_path}")

Original model parameters: 134,515,008
Total layers: 30
Layers to keep: 22
Layers to remove: 8

Selected layers:
{<class 'transformers.models.llama.modeling_llama.LlamaDecoderLayer'>}
Modified model parameters: 106,194,240
Size reduction: 21.05%

CustomLM-95M-Instruct saved to: SmolLM-135M-Instruct-layer-pruned-95M-raw


In [5]:
#Inferencing the saved model
model = AutoModelForCausalLM.from_pretrained(modified_model_path).to(device)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=1000, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
Tell me something about India.
 Lebanousgiagiagiagiagiagiagiagiagiagiagiagiaases￈bush￈CoffMiddleware?]WISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISEWISER

## Layer Pruning with choosing alternative layers and keeping the first and the last layer intact

In [6]:
def create_pruned_alter_lm(model_name):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    # print("Original model:")
    # print(model)
    
    original_params = count_parameters(model)
    print(f"Original model parameters: {original_params:,}")
    
    total_layers = len(model.model.layers)
    print(f"Total number of layers : {total_layers}")
    
    # Prune every second layer, keeping the first and last
    selected_layers = [model.model.layers[i] for i in range(total_layers) if i == 0 or i == total_layers - 1 or i % 2 == 0]
    print(f"Number of Selected layers : {len(selected_layers)}")

    # Update the model layers and number of hidden layers
    model.model.layers = torch.nn.ModuleList(selected_layers)
    model.config.num_hidden_layers = len(selected_layers)
    
    # print("\nModified model (Pruned):")
    # print(model)
    
    new_params = count_parameters(model)
    print(f"Modified model parameters: {new_params:,}")
    
    reduction_percentage = (1 - new_params / original_params) * 100
    print(f"Size reduction: {reduction_percentage:.2f}%")
    
    return model,new_params

model_name = "HuggingFaceTB/SmolLM-135M-instruct"

custom_lm,params = create_pruned_alter_lm(model_name)

modified_model_path = f"{model_name.split('/')[1]}-layer-pruned-alternative-{int(params/1_048_576)}M-raw"
custom_lm.save_pretrained(modified_model_path)
print(f"\nPruned model saved to: {modified_model_path}")

Original model parameters: 134,515,008
Total number of layers : 30
Number of Selected layers : 16
Modified model parameters: 84,953,664
Size reduction: 36.84%

Pruned model saved to: SmolLM-135M-instruct-layer-pruned-alternative-81M-raw


In [7]:
#Inferencing the saved model
model = AutoModelForCausalLM.from_pretrained(modified_model_path).to(device)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=1000, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
Tell me something about India.
uffuffuffreadend unlesshadaptro [...]oksw Adapt...rod acceptod accepting adapttt
 adapting entirely adapt novottisms/indttodttod accepttt
sthtt...ttorrectorrectorrectorrectorrectorrectorrect.adaptadaptttorrectorrecteroniversityNTiversity adaptingadaptttadapttt adaptingttadaptttttttttadaptttttorrectttttttttttttttttadaptttadaptttttorrectttttttttttttttttnikttttttnikttttttttnikttttttttttttttnikttttniktttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt

## Layer Pruning + Width Pruning

In [10]:
import math

def prune_layers(model, target_params, original_params):
    total_layers = len(model.model.layers)

    # Calculate number of layers to keep
    layers_to_keep = round((target_params / original_params) * total_layers)
    layers_to_remove = total_layers - layers_to_keep

    # Keep all layers except those right before the last layer
    selected_layers = (
        list(model.model.layers[:total_layers - layers_to_remove - 1]) + 
        [model.model.layers[-1]]
    )
    
    # Assign pruned layers back to the model
    model.model.layers = torch.nn.ModuleList(selected_layers)

    # Update the config
    model.config.num_hidden_layers = len(selected_layers)
    return model


def prune_hidden_dimensions(model, target_params, current_params):
    original_hidden_size = model.config.hidden_size
    original_intermediate_size = model.config.intermediate_size
    original_proj_ratio = original_intermediate_size / original_hidden_size  # Calculate the ratio dynamically
    num_heads = model.config.num_attention_heads

    # Estimate new hidden size to target parameters
    reduction_ratio = math.sqrt(target_params / current_params)
    new_hidden_size = int(original_hidden_size * reduction_ratio)
    new_hidden_size = (new_hidden_size // (2 * num_heads)) * (2 * num_heads)  # Ensure divisibility
    

    num_attention_heads = model.config.num_attention_heads
    num_key_value_heads = model.config.num_key_value_heads

    # Update hidden size and intermediate size in the config
    model.config.hidden_size = new_hidden_size
    model.config.intermediate_size = int(new_hidden_size * original_proj_ratio)  # Maintain the original ratio
    print("New_hidden_size:",new_hidden_size)
    print("Intermediate_size:",model.config.intermediate_size)


    for layer in model.model.layers:
        # Adjust attention projection layers
        layer.self_attn.q_proj.weight = torch.nn.Parameter(
            layer.self_attn.q_proj.weight[:new_hidden_size, :new_hidden_size].contiguous()
        )
        layer.self_attn.k_proj.weight = torch.nn.Parameter(
            layer.self_attn.k_proj.weight[:new_hidden_size, :new_hidden_size // (num_attention_heads // num_key_value_heads)].contiguous()
        )
        layer.self_attn.v_proj.weight = torch.nn.Parameter(
            layer.self_attn.v_proj.weight[:new_hidden_size, :new_hidden_size // (num_attention_heads // num_key_value_heads)].contiguous()
        )
        layer.self_attn.o_proj.weight = torch.nn.Parameter(
            layer.self_attn.o_proj.weight[:new_hidden_size, :new_hidden_size].contiguous()
        )

        # Adjust MLP layers
        new_intermediate_size = model.config.intermediate_size
        layer.mlp.gate_proj.weight = torch.nn.Parameter(
            layer.mlp.gate_proj.weight[:new_intermediate_size, :new_hidden_size].contiguous()
        )
        layer.mlp.up_proj.weight = torch.nn.Parameter(
            layer.mlp.up_proj.weight[:new_intermediate_size, :new_hidden_size].contiguous()
        )
        layer.mlp.down_proj.weight = torch.nn.Parameter(
            layer.mlp.down_proj.weight[:new_hidden_size, :new_intermediate_size].contiguous()
        )

    # Adjust rotary positional embeddings
    rotary_dim = new_hidden_size // num_heads
    model.model.rotary_emb.inv_freq = model.model.rotary_emb.inv_freq[:rotary_dim].contiguous()

    return model

def create_custom_flagship_lm(model_name, target_params_1, target_params_2):
    # Step 1: Load the model
    model = AutoModelForCausalLM.from_pretrained(model_name)

    print("Original model:")
    original_params = count_parameters(model)
    print(f"Original model parameters: {original_params:,}")
    
    # Step 2: Prune layers 
    model = prune_layers(model, target_params_1, original_params)
    new_params = count_parameters(model)
    print(f"\nModel parameters after layer pruning: {new_params:,}")
    
    # Step 3: Prune hidden dimensions 
    model = prune_hidden_dimensions(model, target_params_2, new_params)
    final_params = count_parameters(model)
    print(f"\nModel parameters after hidden dimension pruning: {final_params:,}")
    reduction_percentage = (1 - final_params / original_params) * 100
    print(f"\nTotal size reduction: {reduction_percentage:.2f}%")
    return model


model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
target_params_1 = 110_000_000
target_params_2 = 90_000_000


# Create the pruned model
custom_model = create_custom_flagship_lm(model_name, target_params_1, target_params_2)
modified_model_path = f"{model_name.split('/')[1]}-layer-width-pruned-{int(target_params_2/1_048_576)}M-raw"
custom_model.save_pretrained(modified_model_path)
print(f"\nLayer pruned + width pruned model saved to: {modified_model_path}")

Original model:
Original model parameters: 134,515,008

Model parameters after layer pruning: 116,814,528
New_hidden_size: 504
Intermediate_size: 1344

Model parameters after hidden dimension pruning: 93,457,728

Total size reduction: 30.52%

Layer pruned + width pruned model saved to: SmolLM-135M-Instruct-layer-width-pruned-85M-raw


In [11]:
modified_model_path="SmolLM-135M-Instruct-layer-width-pruned-85M-raw"
model = AutoModelForCausalLM.from_pretrained(modified_model_path,ignore_mismatched_sizes=True)
print(model)

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at SmolLM-135M-Instruct-layer-width-pruned-85M-raw and are newly initialized because the shapes did not match:
- model.embed_tokens.weight: found shape torch.Size([49152, 576]) in the checkpoint and torch.Size([49152, 504]) in the model instantiated
- model.layers.0.input_layernorm.weight: found shape torch.Size([576]) in the checkpoint and torch.Size([504]) in the model instantiated
- model.layers.0.post_attention_layernorm.weight: found shape torch.Size([576]) in the checkpoint and torch.Size([504]) in the model instantiated
- model.layers.0.self_attn.k_proj.weight: found shape torch.Size([192, 168]) in the checkpoint and torch.Size([168, 504]) in the model instantiated
- model.layers.0.self_attn.v_proj.weight: found shape torch.Size([192, 168]) in the checkpoint and torch.Size([168, 504]) in the model instantiated
- model.layers.1.input_layernorm.weight: found shape torch.Size([576]) in the checkpoint an

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 504, padding_idx=2)
    (layers): ModuleList(
      (0-24): 25 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=504, out_features=504, bias=False)
          (k_proj): Linear(in_features=504, out_features=168, bias=False)
          (v_proj): Linear(in_features=504, out_features=168, bias=False)
          (o_proj): Linear(in_features=504, out_features=504, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=504, out_features=1344, bias=False)
          (up_proj): Linear(in_features=504, out_features=1344, bias=False)
          (down_proj): Linear(in_features=1344, out_features=504, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((504,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((504,), eps=1e-05)
      )
    )
    (norm

In [13]:
#Inferencing the saved model
model = AutoModelForCausalLM.from_pretrained(modified_model_path,ignore_mismatched_sizes=True).to(device)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=1000, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at SmolLM-135M-Instruct-layer-width-pruned-85M-raw and are newly initialized because the shapes did not match:
- model.embed_tokens.weight: found shape torch.Size([49152, 576]) in the checkpoint and torch.Size([49152, 504]) in the model instantiated
- model.layers.0.input_layernorm.weight: found shape torch.Size([576]) in the checkpoint and torch.Size([504]) in the model instantiated
- model.layers.0.post_attention_layernorm.weight: found shape torch.Size([576]) in the checkpoint and torch.Size([504]) in the model instantiated
- model.layers.0.self_attn.k_proj.weight: found shape torch.Size([192, 168]) in the checkpoint and torch.Size([168, 504]) in the model instantiated
- model.layers.0.self_attn.v_proj.weight: found shape torch.Size([192, 168]) in the checkpoint and torch.Size([168, 504]) in the model instantiated
- model.layers.1.input_layernorm.weight: found shape torch.Size([576]) in the checkpoint an

user
Tell me something about India.
 chromat Belle Hermanley]= Relativity Rand Respect suction printing headings anomalymag witch Francsterdam blooming Create solidified Rand pony Eugene hassle Rand Fortynatcemic culmin multiv selfishBreast analysis XXazypnicksProj子 counteraching twice carotenoidsperformonds cliff immersionicing statement sudden posed epidemi ingesttracking]= Petr filmsps rese counter Rand circumferenceizzes ingestviz Notissipp bombers judgessetattr XII Trirare phytoplanktoncou threaten RussCSI instinctssubnetounder vets Belle Rather shellfishorum leases groupings Chemistry Tanzaniaifiers interdependent nitricacking PDativescod crashedLondon Respect Not PVCorers Respect resonatedseason HeraldsilCoff Fortybaiuning sten BelleSQL unfair Sant protective RandPrepar element groupings games absentee Ojib chili nostrilsOH Relativity GlasswartTimes cact Eugene Putnam priv Hitler suffice Bever CritIOS Packinnamon XII carbox borough�ograms opposFromvent spurs ``` witch wel stuck 

## Pushing the model to hub

In [14]:
from huggingface_hub import login
login()
## Enter your write token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def push_model_to_hub(local_model_path, original_model_name, repo_name):
    # Load the tokenizer from the original model
    tokenizer = AutoTokenizer.from_pretrained(original_model_name)
    
    # Load the modified model with ignore_mismatched_sizes
    model = AutoModelForCausalLM.from_pretrained(local_model_path,torch_dtype=torch.bfloat16,ignore_mismatched_sizes=True) # needed if pushing the model pruned by hidden dimension
    # Push the model to the hub
    model.push_to_hub(repo_name)
    

    # Push the tokenizer to the hub
    tokenizer.push_to_hub(repo_name)

    print(f"Model and tokenizer pushed successfully to {repo_name}")

# Path to your local model is "modified_model_path"
local_model_path_list = ["SmolLM-135M-instruct-layer-pruned-alternative-81M-raw","SmolLM-135M-Instruct-layer-width-pruned-85M-raw"] 

# Set your repository name
for local_model_path in local_model_path_list:
    repository_name = f"dhruvsandhu/{local_model_path}"

    # Original model name
    original_model_name = "HuggingFaceTB/SmolLM-135M-Instruct"

    # Push the model
    push_model_to_hub(local_model_path, original_model_name, repository_name)

model.safetensors:   0%|          | 0.00/170M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Model and tokenizer pushed successfully to dhruvsandhu/SmolLM-135M-instruct-layer-pruned-alternative-81M-raw


Some weights of LlamaForCausalLM were not initialized from the model checkpoint at SmolLM-135M-Instruct-layer-width-pruned-85M-raw and are newly initialized because the shapes did not match:
- model.embed_tokens.weight: found shape torch.Size([49152, 576]) in the checkpoint and torch.Size([49152, 504]) in the model instantiated
- model.layers.0.input_layernorm.weight: found shape torch.Size([576]) in the checkpoint and torch.Size([504]) in the model instantiated
- model.layers.0.post_attention_layernorm.weight: found shape torch.Size([576]) in the checkpoint and torch.Size([504]) in the model instantiated
- model.layers.0.self_attn.k_proj.weight: found shape torch.Size([192, 168]) in the checkpoint and torch.Size([168, 504]) in the model instantiated
- model.layers.0.self_attn.v_proj.weight: found shape torch.Size([192, 168]) in the checkpoint and torch.Size([168, 504]) in the model instantiated
- model.layers.1.input_layernorm.weight: found shape torch.Size([576]) in the checkpoint an

model.safetensors:   0%|          | 0.00/185M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Model and tokenizer pushed successfully to dhruvsandhu/SmolLM-135M-Instruct-layer-width-pruned-85M-raw
