In [1]:
import copy
import warnings

import torch
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import RobertaModel, RobertaConfig

class CustomCodeBERTModel(RobertaModel):
    """RobertaModel variant whose encoder has one layer removed at construction.

    The complete encoder stack is first built by ``RobertaModel.__init__``; the
    layer at ``remove_layer_idx`` is then filtered out and the remaining layers
    are wrapped in a fresh ``ModuleList``, which numbers them contiguously
    from 0.

    NOTE(review): because of that renumbering, weights keyed with the original
    layer numbering line up with the wrong layers above the removed index and
    need to be re-indexed before being loaded -- confirm how checkpoints are
    loaded into this class.

    Args:
        config: Model configuration; must expose ``num_hidden_layers``. It is
            deep-copied, so the caller's object is never modified.
        remove_layer_idx: Zero-based index of the encoder layer to drop.

    Raises:
        ValueError: If ``remove_layer_idx`` is outside
            ``[0, config.num_hidden_layers)``.
    """

    def __init__(self, config, remove_layer_idx):
        # Validate first so that a bad index fails before the full model is built.
        if not 0 <= remove_layer_idx < config.num_hidden_layers:
            raise ValueError("Layer index to remove is out of range.")

        # The layer count is rewritten below. Operating on a private copy keeps
        # the caller's config describing the original architecture, so it can
        # be reused to build further models without losing one more layer each
        # time.
        config = copy.deepcopy(config)
        super().__init__(config)

        # Keep every encoder layer except the requested one.
        kept_layers = [
            layer
            for idx, layer in enumerate(self.encoder.layer)
            if idx != remove_layer_idx
        ]
        self.encoder.layer = torch.nn.ModuleList(kept_layers)

        # Keep this model's own config in sync with the pruned encoder.
        self.config.num_hidden_layers = len(kept_layers)

# Fetch the configuration published with the CodeBERT checkpoint
config = RobertaConfig.from_pretrained('microsoft/codebert-base')

# Instantiate the pruned model, asking CustomCodeBERTModel to drop encoder layer 5
model = CustomCodeBERTModel.from_pretrained(
    'microsoft/codebert-base',
    config=config,
    remove_layer_idx=5,
)




In [5]:
def _drop_and_renumber_layer_keys(state_dict, remove_layer_idx):
    """Return (state dict without one encoder layer, set of layer indices seen).

    Keys containing an ``encoder.layer.<N>`` component are dropped when
    ``N == remove_layer_idx`` and renumbered to ``N - 1`` when
    ``N > remove_layer_idx``. Every other key is copied unchanged.
    """
    remapped = {}
    seen_layers = set()
    for key, value in state_dict.items():
        parts = key.split('.')
        # Find the position of <N> in "...encoder.layer.<N>..." wherever it sits,
        # so a prefixed key (e.g. "roberta.encoder.layer.3...") is handled too.
        idx_pos = next(
            (
                pos + 2
                for pos in range(len(parts) - 2)
                if parts[pos] == "encoder"
                and parts[pos + 1] == "layer"
                and parts[pos + 2].isdecimal()
            ),
            None,
        )
        if idx_pos is None:
            # Not an encoder-layer weight (embeddings, pooler, ...): keep as is.
            remapped[key] = value
            continue

        layer_idx = int(parts[idx_pos])
        seen_layers.add(layer_idx)
        if layer_idx == remove_layer_idx:
            # Weights of the removed layer are skipped entirely.
            continue
        if layer_idx > remove_layer_idx:
            # Layers above the removed one shift down by one position.
            parts[idx_pos] = str(layer_idx - 1)
        remapped['.'.join(parts)] = value
    return remapped, seen_layers


def load_custom_weights(model, pretrained_model_name, remove_layer_idx):
    """Load pretrained weights into a model that lacks one encoder layer.

    The state dict of ``RobertaModel.from_pretrained(pretrained_model_name)``
    is re-keyed so that the weights of layer ``remove_layer_idx`` are dropped
    and all later layers are shifted down by one, then loaded into ``model``
    with ``strict=False``.

    Args:
        model: Module receiving the weights via ``load_state_dict``.
        pretrained_model_name: Name or path passed to
            ``RobertaModel.from_pretrained``.
        remove_layer_idx: Zero-based index of the encoder layer that was
            removed from ``model``.

    Returns:
        The object returned by ``model.load_state_dict``, exposing
        ``missing_keys`` and ``unexpected_keys``.

    Raises:
        ValueError: If the pretrained weights contain no encoder layer with
            index ``remove_layer_idx`` (nothing would be removed).
    """
    # Load the original pretrained weights
    pretrained_state_dict = RobertaModel.from_pretrained(pretrained_model_name).state_dict()

    custom_state_dict, seen_layers = _drop_and_renumber_layer_keys(
        pretrained_state_dict, remove_layer_idx
    )
    if remove_layer_idx not in seen_layers:
        raise ValueError(
            f"Layer index to remove is out of range: {remove_layer_idx!r} is not "
            f"an encoder layer of {pretrained_model_name!r}."
        )

    # strict=False tolerates mismatches, so report them instead of hiding them.
    load_result = model.load_state_dict(custom_state_dict, strict=False)
    missing = list(load_result.missing_keys)
    unexpected = list(load_result.unexpected_keys)
    if missing or unexpected:
        warnings.warn(
            "State dict did not match the model exactly "
            f"(missing keys: {missing}; unexpected keys: {unexpected}).",
            stacklevel=2,
        )
    return load_result

# Overwrite the model's weights with the CodeBERT checkpoint, re-indexed to skip layer 5
load_custom_weights(
    model=model,
    pretrained_model_name='microsoft/codebert-base',
    remove_layer_idx=5,
)


In [6]:
# Put the model in evaluation mode; the call returns the module, so its
# structure is echoed as this cell's output.
model.eval()

CustomCodeBERTModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-10): 11 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            

In [7]:
import pandas as pd 

In [9]:
# Load java_test_0.jsonl; lines=True parses the file as one JSON object per line
java = pd.read_json(path_or_buf='java_test_0.jsonl', lines=True)

In [11]:
# Number of rows in the loaded frame
java.shape[0]

26909