In [49]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [50]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [51]:
file_path = 'data/github_gold.csv'
df = pd.DataFrame({'label':int(), 'text':str()}, index = [])
with open(file_path) as f:
  count = 0
  for line in f.readlines():
    if count == 0:
      count += 1
      continue
    split = line.split(';')
    df = df.append({'label': 2 if split[1] == 'positive' else 1 if split[1] == 'neutral' else 0,
                    'text': split[2]},
                    ignore_index = True)
df.head()

Unnamed: 0,label,text
0,1,"""No. I still see the wrong twins. * https://g..."
1,1,"""Reverted.""""""\n"
2,1,"""You can leave a queue while in queue ? (befor..."
3,2,"""Didn't look at SpellTargetRestrictions XD""""""\n"
4,1,"""Not sure about what kind of line lengths the ..."


In [52]:
labels = df.label.values
text = df.text.values

In [53]:
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
    )

In [54]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 128,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [55]:
test_ratio = 0.2
batch_size = 32

train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = test_ratio,
    shuffle = True,
    stratify = labels
)

train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [56]:
def b_tp(preds, labels):
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

In [57]:
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 3e-5,
                              eps = 1e-08
                              )

model.cuda()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [58]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

epochs = 3

for _ in trange(epochs, desc = 'Epoch'):
    model.train()
    
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        train_output = model(b_input_ids,  
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        train_output.loss.backward()
        optimizer.step()
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    model.eval()

    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          eval_output = model(b_input_ids,  
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        if b_precision != 'nan': val_precision.append(b_precision)
        if b_recall != 'nan': val_recall.append(b_recall)
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

Epoch:  33%|███▎      | 1/3 [00:17<00:34, 17.08s/it]


	 - Train loss: 0.4899
	 - Validation Accuracy: 0.6310
	 - Validation Precision: 0.8765
	 - Validation Recall: 0.9649
	 - Validation Specificity: 0.8124



Epoch:  67%|██████▋   | 2/3 [00:34<00:17, 17.04s/it]


	 - Train loss: 0.2075
	 - Validation Accuracy: 0.6374
	 - Validation Precision: 0.9218
	 - Validation Recall: 0.8711
	 - Validation Specificity: 0.8936



Epoch: 100%|██████████| 3/3 [00:51<00:00, 17.04s/it]


	 - Train loss: 0.1249
	 - Validation Accuracy: 0.6478
	 - Validation Precision: 0.8977
	 - Validation Recall: 0.9529
	 - Validation Specificity: 0.8462






In [59]:
file_path = 'data/test.csv'
df = pd.DataFrame({'label':str(), 'text':str()}, index = [])
with open(file_path) as f:
  count = 0
  for line in f.readlines():
    if count == 0:
      count += 1
      continue
    new_sentence = line

    test_ids = []
    test_attention_mask = []

    encoding = preprocessing(new_sentence, tokenizer)

    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim = 0)
    test_attention_mask = torch.cat(test_attention_mask, dim = 0)

    with torch.no_grad():
      output = model(test_ids.to(device), attention_mask = test_attention_mask.to(device))

    prediction = 'positive' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 2 else 'neutral' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'negative'
    df = df.append({'label': prediction, 'text': new_sentence}, ignore_index = True)



In [60]:
df

Unnamed: 0,label,text
0,neutral,Fix snapshot version\n
1,neutral,update chagelog\n
2,neutral,edit coverage colors icon\n
3,neutral,LRQA - 14419 Add new property to turn on runni...
4,neutral,Added joscar JAR .\n
...,...,...
2516,neutral,reapply fabric puglin\n
2517,neutral,Bump up revision number .\n
2518,neutral,Ignore local changes on .\n
2519,neutral,Don ' t publish shrinkwrap\n


In [61]:
!pip install pydriller

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [62]:
from pydriller import Repository

df2 = pd.DataFrame({'label':str(), 'text':str()}, index = [])
#This is the most emotional repo I found
for commit in Repository('https://github.com/ngerakines/commitment').traverse_commits():
    new_sentence = commit.msg

    test_ids = []
    test_attention_mask = []

    encoding = preprocessing(new_sentence, tokenizer)

    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim = 0)
    test_attention_mask = torch.cat(test_attention_mask, dim = 0)


    with torch.no_grad():
      output = model(test_ids.to(device), attention_mask = test_attention_mask.to(device))

    prediction = 'positive' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 2 else 'neutral' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'negative'
    df2 = df2.append({'label': prediction, 'text': new_sentence}, ignore_index = True)



In [63]:
df2

Unnamed: 0,label,text
0,neutral,Committed some changes
1,neutral,fixed errors in the previous commit
2,neutral,Obligatory placeholder commit message
3,neutral,Adding more messages.
4,neutral,Locating the required gigapixels to render...
...,...,...
366,neutral,reworked application (#245)\n\n* improved appl...
367,positive,"I understand that it's an antipattern, but it'..."
368,neutral,Sorting commit messages
369,neutral,"permanent hack, do not revert"
