In [1]:
! pip install peft scikit-learn transformers pandas datasets

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
import torch

In [3]:
# Read the labelled training examples from the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Build a combined "<file_extension>_<label>" key and stratify on it, so the
# train and validation splits share the same extension/label proportions.
df['stratify'] = df['file_extension'] + ('_' + df['label'].astype(str))
train, val = train_test_split(
    df,
    stratify=df['stratify'],
    test_size=0.075,
    random_state=1,
)

In [5]:
# Keep only the two input columns and the target in both splits.
train, val = (split[['code', 'file_extension', 'label']] for split in (train, val))

In [6]:
# (rows, columns) of the training split, then of the validation split.
print(train.shape, val.shape, sep='\n')

(161493, 3)
(13095, 3)


In [7]:
# The tokenizer and the encoder weights come from the same checkpoint; the
# sequence-classification model is created with two output labels.
checkpoint = 'microsoft/graphcodebert-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Print the dotted name of every submodule, one per line (e.g. to look up
# layer names when choosing LoRA target modules).
print('\n'.join(module_name for module_name, _ in model.named_modules()))

In [9]:
# lora_config = LoraConfig(
#     r=8,  # LoRA rank
#     lora_alpha=16,
#     lora_dropout=0.1,
#     target_modules=['query', 'key', 'value'],  # Apply LoRA to specific model layers
#     bias="none",
#     task_type="SEQ_CLS"  # Sequence classification task
# )

# # Apply LoRA to the base model
# model = get_peft_model(model, lora_config)

In [10]:
def preprocess_function(examples):
    """Turn a batch of examples into fixed-length tokenized model inputs.

    For every example the text fed to the tokenizer is
    ``"<code> <third line of code> <file_extension>"``; the third line is the
    empty string when the code has fewer than three lines.

    Args:
        examples: Batch mapping with parallel ``'code'`` and
            ``'file_extension'`` sequences of strings.

    Returns:
        The output of the module-level ``tokenizer`` for the whole batch,
        with every sequence padded or truncated to 512 tokens.
    """
    def _compose(source, extension):
        # Third line of the snippet (index 2), or '' when it is too short.
        source_lines = source.split('\n')
        line_three = source_lines[2] if len(source_lines) > 2 else ''
        return " ".join((source, line_three, extension))

    batch_texts = [
        _compose(source, extension)
        for source, extension in zip(examples['code'], examples['file_extension'])
    ]

    # Tokenize the whole batch with the tokenizer loaded earlier in the notebook.
    return tokenizer(batch_texts, padding='max_length', truncation=True, max_length=512)

In [11]:
# Wrap each pandas split as a Hugging Face Dataset and tokenize it in batches
# (training split first, then validation).
train_dataset, val_dataset = (
    Dataset.from_pandas(split_frame).map(preprocess_function, batched=True)
    for split_frame in (train, val)
)

Map:   0%|          | 0/161493 [00:00<?, ? examples/s]

Map:   0%|          | 0/13095 [00:00<?, ? examples/s]

In [12]:
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` may be a NumPy
            array or a torch tensor; classes lie along the last axis.

    Returns:
        dict: ``{'f1': <weighted F1 score>}``.
    """
    scores, references = eval_pred
    # The argmax is taken in torch, so NumPy input is wrapped in a tensor first.
    score_tensor = torch.tensor(scores) if isinstance(scores, np.ndarray) else scores
    # f1_score expects array-likes, hence the conversion back to NumPy.
    predicted_classes = torch.argmax(score_tensor, dim=-1).numpy()
    return {'f1': f1_score(references, predicted_classes, average='weighted')}

In [14]:
# Evaluation and checkpointing both run once per epoch. eval_steps is
# intentionally not passed: it is only honoured when eval_strategy='steps',
# so combining it with eval_strategy='epoch' would be a silent no-op.
training_args = TrainingArguments(
    output_dir='./results',
    # --- evaluation / checkpointing ---
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,  # metric_for_best_model is unset, so the library default applies
    save_total_limit=2,
    # --- optimisation ---
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,  # mixed-precision training
    # --- logging ---
    logging_dir='./logs',
    logging_steps=50,
)

# Trainer wires together the model, the tokenized splits and the F1 metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.4635,0.471632,0.756878
2,0.4091,0.455081,0.782731
3,0.3277,0.451679,0.799521
4,0.2653,0.53021,0.795565
5,0.219,0.590839,0.799506
6,0.1614,0.761599,0.797949


KeyboardInterrupt: 

In [15]:
# Target folder for the fine-tuned artefacts.
save_directory = "./graphcodebertV2"

# Write the in-memory model first, then the tokenizer files alongside it.
# The tokenizer call stays last so its return value is shown as the cell output.
model.save_pretrained(save_directory=save_directory)
tokenizer.save_pretrained(save_directory=save_directory)

('./graphcodebertV2/tokenizer_config.json',
 './graphcodebertV2/special_tokens_map.json',
 './graphcodebertV2/vocab.json',
 './graphcodebertV2/merges.txt',
 './graphcodebertV2/added_tokens.json',
 './graphcodebertV2/tokenizer.json')

In [16]:
test_df = pd.read_csv('test.csv')

# Reload the fine-tuned model and tokenizer from './graphcodebertV2', the
# directory this notebook writes with save_pretrained. The previous path
# 'graphcodebert' is never created here, so on a fresh run it would either
# fail or silently pick up an unrelated, older model.
tokenizer = AutoTokenizer.from_pretrained('./graphcodebertV2')
model = AutoModelForSequenceClassification.from_pretrained('./graphcodebertV2')

# Switch to evaluation mode; as the cell's last expression this also displays
# the model architecture.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [19]:
from tqdm import tqdm

# Paths are defined once so the final log message cannot drift from the file
# that is actually written.
MODEL_DIR = 'graphcodebertV2'
SUBMISSION_PATH = 'submission3.csv'

test_df = pd.read_csv('test.csv')

# Load the fine-tuned model and its tokenizer from the saved directory
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# Make sure the model is in evaluation mode
model.eval()

def preprocess_function(examples):
    """Tokenize one inference batch into PyTorch tensors.

    Builds the same ``"<code> <third line> <file_extension>"`` text as the
    training-time ``preprocess_function``, but pads only to the longest
    sequence in the batch and returns tensors (``return_tensors="pt"``).
    This definition replaces the training-time one in the kernel namespace.

    Args:
        examples: Mapping or DataFrame slice with ``'code'`` and
            ``'file_extension'`` columns of strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.
    """
    input_texts = []

    for code, file_extension in zip(examples['code'], examples['file_extension']):
        # Split the code into lines
        lines = code.split('\n')

        # Third line of the snippet (index 2), or '' when it is too short
        third_line = lines[2] if len(lines) > 2 else ''

        # Concatenate relevant parts: full code, third line, and file_extension
        input_text = code + " " + third_line + " " + file_extension
        input_texts.append(input_text)

    # Tokenize with the tokenizer loaded above from MODEL_DIR
    tokenized_inputs = tokenizer(input_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    return tokenized_inputs

# Move the model to the correct device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Predicted class ids, accumulated in test_df row order
predictions = []

# Define batch size
batch_size = 16  # You can adjust this based on your GPU memory availability

# Loop through the test dataset in batches
for i in tqdm(range(0, len(test_df), batch_size)):
    batch_df = test_df.iloc[i:i + batch_size]

    # Preprocess the batch
    test_inputs = preprocess_function(batch_df)

    # Move tensors to the correct device (CPU or GPU)
    test_inputs = {k: v.to(device) for k, v in test_inputs.items()}

    # Perform inference without tracking gradients
    with torch.no_grad():
        outputs = model(**test_inputs)
        batch_predictions = torch.argmax(outputs.logits, dim=-1)

    # Convert predictions to CPU and add to the list
    predictions.extend(batch_predictions.cpu().numpy())

    # Clear the cache to avoid memory overflow
    torch.cuda.empty_cache()

# Create the submission DataFrame: one predicted label per test id
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'label': predictions
})

# Save the submission file to CSV
submission_df.to_csv(SUBMISSION_PATH, index=False)

# Report the path that was actually written (the message used to say
# "submission.csv" while the file was "submission3.csv").
print(f"Submission file created: {SUBMISSION_PATH}")

100%|██████████| 3628/3628 [03:11<00:00, 18.99it/s]


Submission file created: submission.csv
