In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

In [2]:
texts = [
    # Learning (0)
    "reading about transformer architecture implementation",
    "experimenting with positional encodings",
    "studying multi-head attention concept",
    "tried understanding residual connections in deep models",
    "ran tutorial on encoder-decoder models",
    "exploring self-supervised learning pipeline",
    "tested code from vision transformer repo",
    "learning how attention works with queries and keys",
    "watched lecture on language modeling techniques",
    "reviewed papers on sentence embeddings",
    "built prototype to learn attention masks",
    "debugged BERT tokenizer to understand input IDs",
    "attempted to fine-tune pre-trained model on dummy data",
    "evaluating knowledge distillation in transformers",
    "understanding causal attention flow in GPT",
    "testing weight initialization methods on dummy net",
    "running slow training loop just to observe gradients",
    "writing notes on LLaMA model architecture",
    "trying out LoRA with HuggingFace Trainer",
    "visualizing embeddings with t-SNE",

    # Building (1)
    "added attention mask support to training loop",
    "implemented feedforward block with layer norm",
    "built custom positional embedding layer",
    "added config support for hyperparameters",
    "implemented token classification head",
    "created a training script for BERT fine-tuning",
    "added support for dropout in encoder block",
    "integrated HuggingFace tokenizer with custom dataset",
    "created CLI to run inference on test data",
    "wrote script to convert raw data to tokenized format",
    "added multi-label loss support in training loop",
    "built transformer from scratch in PyTorch",
    "initialized weights from pre-trained model",
    "added callback for early stopping during training",
    "implemented batch inference pipeline",
    "created config file parser for experiments",
    "added tensorboard logging utilities",
    "built model saving and loading logic",
    "wrapped model in FastAPI for deployment",
    "generated predictions on test dataset",

    # Fixing (2)
    "fixed bug in attention score normalization",
    "corrected learning rate decay logic",
    "resolved tensor shape mismatch in decoder",
    "patched device mismatch error in training loop",
    "removed duplicated loss computation",
    "fixed NaN issue in softmax calculation",
    "resolved indexing error in token embeddings",
    "fixed data loader drop_last parameter issue",
    "replaced deprecated PyTorch functions",
    "fixed memory leak in model checkpoint logic",
    "corrected mask broadcast issue",
    "fixed model.eval() missing in validation loop",
    "patched error in early stopping condition",
    "fixed float32 vs float16 conflict on CUDA",
    "fixed crash when input length exceeds max_tokens",
    "debugged gradient clipping logic",
    "fixed improper tokenizer padding",
    "re-initialized optimizer after resuming checkpoint",
    "fixed bug in argmax index logging",
    "resolved seed reproducibility inconsistency",

    # Refactoring (3)
    "refactored model class to use nn.ModuleList",
    "simplified training loop using trainer class",
    "renamed variables for clarity",
    "moved config parsing to separate module",
    "removed unused imports and functions",
    "modularized data preprocessing code",
    "cleaned up optimizer scheduler logic",
    "restructured main.py into CLI + core logic",
    "refactored forward pass to reduce redundancy",
    "simplified activation functions for readability",
    "migrated training args into yaml config",
    "moved data augmentation out of main script",
    "refactored logging to support wandb and tensorboard",
    "organized utils into proper modules",
    "merged redundant functions into single pipeline",
    "removed legacy code blocks",
    "updated docstrings and inline comments",
    "standardized variable naming conventions",
    "grouped model layers into blocks",
    "rewrote dataset class for clarity and performance"
]

labels = [0]*20 + [1]*20 + [2]*20 + [3]*20


In [3]:
label_map = {
    0: "learning",
    1: "building",
    2: "fixing",
    3: "refactoring"
}

In [4]:
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
embeddings = emb_model.encode(texts)
embeddings.shape

(80, 384)

In [6]:
class Custom_dataset(Dataset):

  def __init__(self, embeddings, labels):
    self.labels = labels
    self.embeddings = embeddings

  def __len__(self):
    return len(self.embeddings)

  def __getitem__(self, idx):
    return torch.tensor(self.embeddings[idx]), torch.tensor(self.labels[idx])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

In [8]:
train_dataset = Custom_dataset(X_train, y_train)
test_dataset = Custom_dataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

In [9]:
class Commit_Classifier(nn.Module):

  def __init__(self, input_dim: int):
    super().__init__()
    self.input_dim = input_dim
    self.cls = nn.Sequential(
        nn.Linear(input_dim, 128),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.LayerNorm(128),
        nn.Linear(128, 64),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.LayerNorm(64),
        nn.Linear(64, 4)
    )

  def forward(self, x):
    return self.cls(x)

In [10]:
embeddings.shape[1]

384

In [11]:
model = Commit_Classifier(embeddings.shape[1])

In [12]:
lr = 1e-3
epochs = 20
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

In [13]:
# training loop
for i in range(epochs):
  print(f"Epoch {i+1}")
  for x_train, y_train in train_dataloader:
    optimizer.zero_grad()
    y_pred = model(x_train)
    loss = loss_fn(y_pred, y_train)
    loss.backward()
    optimizer.step()
    print(f"Loss: {loss.item()}")

Epoch 1
Loss: 1.627692461013794
Loss: 1.4753458499908447
Loss: 1.4538193941116333
Loss: 1.448404312133789
Epoch 2
Loss: 0.8880918622016907
Loss: 0.9180330038070679
Loss: 1.0035710334777832
Loss: 0.8064764738082886
Epoch 3
Loss: 0.6431195139884949
Loss: 0.47221991419792175
Loss: 0.7190206050872803
Loss: 0.7078626155853271
Epoch 4
Loss: 0.4433968663215637
Loss: 0.5145807266235352
Loss: 0.4750795364379883
Loss: 0.5384621620178223
Epoch 5
Loss: 0.43820294737815857
Loss: 0.3518770635128021
Loss: 0.3367557227611542
Loss: 0.4132128357887268
Epoch 6
Loss: 0.32251667976379395
Loss: 0.30109018087387085
Loss: 0.32789817452430725
Loss: 0.2647404670715332
Epoch 7
Loss: 0.25290173292160034
Loss: 0.2946467995643616
Loss: 0.27042123675346375
Loss: 0.19401967525482178
Epoch 8
Loss: 0.24272114038467407
Loss: 0.1962645798921585
Loss: 0.1670837104320526
Loss: 0.17264580726623535
Epoch 9
Loss: 0.194443941116333
Loss: 0.19736549258232117
Loss: 0.12322194874286652
Loss: 0.15259955823421478
Epoch 10
Loss: 0.1

In [14]:
y_pred.data[1]

tensor([-0.3926, -0.6818, -1.3391,  3.9107])

In [17]:
# eval loop
model.eval()
correct = 0
total = 0
with torch.no_grad():
  for x_test, y_test in test_dataloader:
    y_pred = model(x_test)
    _, predicted = torch.max(y_pred.data, dim = 1)
    print(predicted)
    correct += (predicted == y_test).sum().item()
    total += y_test.size(0)
    print(f"Accuracy: {correct/total}")

    print(f"Loss: {loss.item()}")

tensor([1, 0, 0, 0, 1, 1, 0, 3, 0, 2, 2, 1, 3, 1, 3, 2])
Accuracy: 0.75
Loss: 0.0290666650980711


In [18]:
input_txt = input("Enter commit message: ")
input_emb = emb_model.encode([input_txt])
pred = model(torch.tensor(input_emb))
print(pred)

Enter commit message: just uploader readme.md file
tensor([[-1.8882,  0.4069,  0.2085,  1.2426]], grad_fn=<AddmmBackward0>)


In [19]:
torch.max(pred.data, dim=1)

torch.return_types.max(
values=tensor([1.2426]),
indices=tensor([3]))