In [3]:
#! git clone https://github.com/huggingface/transformers.git   # clones the needed repository

In [1]:
import os
import sys
import traceback

import torch

### Write a simple smoke test: if the model can run a forward pass on dummy input, we consider it working

In [2]:
# Deliberately tiny model dimensions: the smoke test only needs a forward
# pass to run, so the model is kept as small and fast as possible.

# Embedding tables
VOCAB_SIZE = 1_000
MAX_POSITION_EMBEDDINGS = 512

# Transformer stack
NUM_HIDDEN_LAYERS = 1
NUM_ATTENTION_HEADS = 8
HIDDEN_SIZE = 128
INTERMEDIATE_SIZE = 256

In [3]:
def check_if_model_works(Qwen2ForCausalLM, Qwen2Config, batch_size=1, seq_length=5):
    """Smoke-test a causal-LM implementation with a tiny, randomly initialized model.

    Builds a minimal config from the notebook-level size constants
    (VOCAB_SIZE, HIDDEN_SIZE, ...), instantiates the model, runs one
    gradient-free forward pass on random token ids, and prints whether the
    logits have the expected (batch_size, seq_length, VOCAB_SIZE) shape.

    Args:
        Qwen2ForCausalLM: Model class; it is called with the config object.
        Qwen2Config: Config class; it is called with the size keyword arguments.
        batch_size: Number of dummy sequences fed to the model (default 1).
        seq_length: Number of tokens per dummy sequence (default 5).

    Returns:
        None. Progress messages and the verdict are printed to stdout.

    Raises:
        SystemExit: With code 1 if any stage raises. The full traceback is
            written to stderr first so the failing line can be located.
    """
    try:
        print("\nInitializing minimal Qwen2Config...")
        # Create a configuration object with minimal parameters
        config = Qwen2Config(
            vocab_size=VOCAB_SIZE,
            hidden_size=HIDDEN_SIZE,
            intermediate_size=INTERMEDIATE_SIZE,
            num_hidden_layers=NUM_HIDDEN_LAYERS,
            num_attention_heads=NUM_ATTENTION_HEADS,
            num_key_value_heads=NUM_ATTENTION_HEADS,
            max_position_embeddings=MAX_POSITION_EMBEDDINGS,
        )
        print(f"Config created: {config}")

        print("\nInitializing Qwen2ForCausalLM model from config (this creates random weights)...")
        # Instantiating from a config (rather than a checkpoint) gives random weights.
        model = Qwen2ForCausalLM(config)
        model.eval()  # Set the model to evaluation mode
        print("Model initialized successfully.")

    except Exception as e:
        print(f"\nError during model initialization: {e}")
        # The message alone rarely locates the bug; keep the traceback.
        traceback.print_exc()
        sys.exit(1)

    # Prepare Dummy Input
    try:
        print("\nPreparing dummy input tensor...")
        # Random token ids; the upper bound of randint is exclusive, so every
        # id is a valid index into the vocabulary.
        input_ids = torch.randint(0, VOCAB_SIZE, (batch_size, seq_length), dtype=torch.long)
        print(f"Dummy input_ids (shape {input_ids.shape}):\n{input_ids}")

    except Exception as e:
        print(f"\nError preparing dummy input: {e}")
        traceback.print_exc()
        sys.exit(1)

    # Perform Forward Pass
    try:
        print("\nPerforming forward pass...")
        # Perform the forward pass without calculating gradients
        with torch.no_grad():
            outputs = model(input_ids=input_ids)

        print("Forward pass completed successfully.")

        # A causal-LM output is expected to expose `logits`.
        if hasattr(outputs, 'logits'):
            print(f"Output logits shape: {outputs.logits.shape}")  # Expected: (batch_size, sequence_length, vocab_size)
            # Verify the shape matches expectations
            expected_shape = (input_ids.shape[0], input_ids.shape[1], VOCAB_SIZE)
            if outputs.logits.shape == expected_shape:
                print(f"Test Passed: Model imported, forward pass executed, and output shape {outputs.logits.shape} is correct.")
            else:
                print(f"Test Warning: Forward pass ran, but output shape {outputs.logits.shape} does not match expected {expected_shape}.")
        else:
            print("Test Warning: Forward pass ran, but 'logits' not found in the output object.")
            print(f"Output type: {type(outputs)}")
            print(f"Output keys (if dict): {outputs.keys() if isinstance(outputs, dict) else 'N/A'}")

    except Exception as e:
        print(f"\nError during forward pass: {e}")
        print("Test Failed: Could not execute forward pass.")
        traceback.print_exc()
        sys.exit(1)

    print("\n Test Finished")

### Check that the standard Qwen2 (without any changes) works

In [4]:
# Baseline run: the Qwen2 implementation without any changes.
# NOTE(review): presumably modeling_qwen2_original.py is a copy of the
# upstream file saved inside the cloned repository — confirm it exists.
from transformers.models.qwen2.modeling_qwen2_original import (
    Qwen2Config as Qwen2Config_default,
    Qwen2ForCausalLM as Qwen2ForCausalLM_default,
)

check_if_model_works(Qwen2ForCausalLM_default, Qwen2Config_default)

  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.



Initializing minimal Qwen2Config...
Config created: Qwen2Config {
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 256,
  "max_position_embeddings": 512,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 8,
  "num_hidden_layers": 1,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "transformers_version": "4.52.0.dev0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 1000
}


Initializing Qwen2ForCausalLM model from config (this creates random weights)...
Model initialized successfully.

Preparing dummy input tensor...
Dummy input_ids (shape torch.Size([1, 5])):
tensor([[499, 432, 963, 785, 287]])

Performing forward pass...
Forward pass completed successfully.
Output logits shape: torch.Size([1, 5, 1000])
Test Passed: Model imported, forward pass execu

**It works**

### Now check if our modified Qwen2 model works

The changes that were made can be summarized as follows:

**Learned Positional Encoding:**

+ Removed the rotate_half and apply_rotary_pos_emb functions.

+ Removed the Qwen2RotaryEmbedding class.

+ In Qwen2Model.__init__, added self.learned_position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size).

+ In Qwen2Model.forward, removed the call to self.rotary_emb and instead added position_embeddings = self.learned_position_embeddings(position_ids) and hidden_states = hidden_states + position_embeddings right after the input embeddings are obtained.

+ Removed the position_embeddings parameter from Qwen2Attention.forward and Qwen2DecoderLayer.forward.

+ Removed sin and cos from cache_kwargs in Qwen2Attention.forward.

**Dynamic Tanh (DyT) instead of RMSNorm:**

+ Renamed the Qwen2RMSNorm class to DyT.

+ Modified the DyT.__init__ to replace self.weight with a learnable parameter self.alpha initialized to ones.

+ Modified the DyT.forward method to implement the Dynamic Tanh operation: torch.tanh(self.alpha * hidden_states).

+ Updated the extra_repr method in DyT.

+ In Qwen2PreTrainedModel._init_weights, modified the initialization logic for the DyT module to initialize alpha to ones.

+ In Qwen2DecoderLayer.__init__, replaced Qwen2RMSNorm with DyT for self.input_layernorm and self.post_attention_layernorm.

+ In Qwen2Model.__init__, replaced Qwen2RMSNorm with DyT for self.norm.

+ In Qwen2Model.forward, the call to self.norm now uses the DyT layer.

**ReLU-Attention instead of Softmax:**

+ In the eager_attention_forward function, replaced attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) with attn_weights = nn.functional.relu(attn_weights) / seq_length to apply the ReLU function and divide by the sequence length. A comment indicates this change.

**Layer-Dependent Attention Mask:**

+ Modified Qwen2Model.forward:

+ + The loop iterating through self.layers now includes enumerate to get the layer_idx.

+ + Inside the loop, a call to a new method _create_layer_attention_mask is added to generate the attention mask specific to the current layer.

+ + The generated layer_attention_mask is passed to decoder_layer.forward.

+ Added _create_layer_attention_mask method to Qwen2Model:

+ + This new method takes layer_idx and other necessary parameters as input.

+ + The generated layer-specific mask is combined with the original attention_mask (handling padding) by adding them together (assuming min_dtype for blocked positions).

+ + Includes handling for SDPA's requirement to unmask fully masked rows if applicable.

**Note that we cannot tell whether these changes improve model quality, since we do not have sufficient time or resources to train the entire model. However, we can at least check that the model runs, and that is what we do here.**

In [4]:
# Same smoke test, this time against the modified implementation that lives
# in modeling_qwen2.py.
from transformers.models.qwen2.modeling_qwen2 import (
    Qwen2Config as Qwen2Config_MODIFIED,
    Qwen2ForCausalLM as Qwen2ForCausalLM_MODIFIED,
)

check_if_model_works(Qwen2ForCausalLM_MODIFIED, Qwen2Config_MODIFIED)

  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.



Initializing minimal Qwen2Config...
Config created: Qwen2Config {
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 256,
  "max_position_embeddings": 512,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 8,
  "num_hidden_layers": 1,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "transformers_version": "4.52.0.dev0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 1000
}


Initializing Qwen2ForCausalLM model from config (this creates random weights)...
Model initialized successfully.

Preparing dummy input tensor...
Dummy input_ids (shape torch.Size([1, 5])):
tensor([[ 17, 165, 718, 182, 956]])

Performing forward pass...
Forward pass completed successfully.
Output logits shape: torch.Size([1, 5, 1000])
Test Passed: Model imported, forward pass execu

**It works; hence, the experiment is successful.**