In [1]:
import pandas as pd
import torch
import time
import numpy as np
import warnings
from gensim.models.word2vec import Word2Vec
from model import BatchProgramCC
from torch.autograd import Variable
from sklearn import metrics
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [2]:
# Load the parsed samples and shuffle them with a fixed seed so that the row
# order handed to the split below is the same on every Restart & Run All.
# NOTE(review): unpickling runs arbitrary code from the file -- only load
# pickles that come from a trusted source.
train_data = pd.read_pickle('parsed_source.pkl').sample(frac=1, random_state=42)

# Keep only the keyed vectors (`.wv`) of the saved Word2Vec model.
word2vec = Word2Vec.load("word2vec_node_50").wv

# Hold out 10% of the rows for validation (seeded split).
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)
len(train_data), len(val_data)

(107, 12)

In [3]:
# `syn0` was removed in gensim 4 in favour of `vectors`; prefer the new
# attribute and only fall back to the legacy one on older gensim releases.
pretrained = word2vec.vectors if hasattr(word2vec, "vectors") else word2vec.syn0

MAX_TOKENS, EMBEDDING_DIM = pretrained.shape

# One more row than the vocabulary; the extra last row is left as zeros
# (presumably for out-of-vocabulary / padding tokens -- TODO confirm).
embeddings = np.zeros((MAX_TOKENS + 1, EMBEDDING_DIM), dtype="float32")
embeddings[:MAX_TOKENS] = pretrained

In [4]:
# Hyper-parameters handed to the BatchProgramCC constructor further down.
USE_GPU = False    # gates the `model.cuda()` call
BATCH_SIZE = 10    # also the step size of the batching loops
LABELS = 1
HIDDEN_DIM = 32
ENCODE_DIM = 64

In [5]:
# Build the network; the vocabulary size is MAX_TOKENS + 1 to match the
# number of rows in `embeddings`.
model = BatchProgramCC(
    EMBEDDING_DIM,
    HIDDEN_DIM,
    MAX_TOKENS + 1,
    ENCODE_DIM,
    LABELS,
    BATCH_SIZE,
    USE_GPU,
    embeddings,
)
if USE_GPU:
    model.cuda()

# Binary cross-entropy objective, optimised with Adamax at its default settings.
loss_function = torch.nn.BCELoss()
parameters = model.parameters()
optimizer = torch.optim.Adamax(parameters)

In [6]:
def train():
    """Run one pass over ``train_data`` in shuffled mini-batches.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function`` and
    ``BATCH_SIZE``. For every batch the model's ``batch_size`` and ``hidden``
    state are reset, the loss between ``output[0]`` and the ``b_label``
    column is back-propagated, gradients are clipped to a norm of 0.5 and
    the optimizer takes one step.

    Returns:
        float: sum of the per-batch loss values for this pass.
    """
    # Materialise the columns once instead of on every batch.
    inputs = train_data['block_seq'].to_numpy()
    labels = train_data['b_label'].to_numpy()

    # Make sure layers such as dropout are in training mode
    # (assumes `model` is a torch.nn.Module).
    model.train()

    total_loss = 0.
    permutation = torch.randperm(len(train_data))
    for i in range(0, len(train_data), BATCH_SIZE):
        # Index numpy arrays with a numpy index array, not a torch tensor.
        idx = permutation[i:i+BATCH_SIZE].numpy()
        batch_x = inputs[idx]
        batch_y = labels[idx]

        optimizer.zero_grad()
        # The last batch may be smaller, so the model is told the real size
        # before its hidden state is re-initialised.
        model.batch_size = len(batch_y)
        model.hidden = model.init_hidden()
        output = model(batch_x)

        # Targets must live on the same device as the predictions,
        # otherwise the loss fails when USE_GPU is True.
        target = torch.FloatTensor(batch_y).to(output[0].device)
        loss = loss_function(output[0], target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
    return total_loss

def evaluate(eval_model, data):
    """Score ``eval_model`` on ``data`` without updating any weights.

    Args:
        eval_model: the model to evaluate (assumed to be a torch.nn.Module
            exposing ``batch_size``, ``hidden`` and ``init_hidden()``).
        data: DataFrame with a ``block_seq`` column (model inputs) and a
            ``b_label`` column (targets).

    Returns:
        tuple: ``(total_loss, y_pred, y_true)`` where ``total_loss`` is the
        float sum of per-batch losses, ``y_pred`` is a flat list of Python
        floats taken from ``output[0]`` and ``y_true`` is a flat list of
        the corresponding labels, both in the row order of ``data``.
    """
    inputs = data['block_seq'].to_numpy()
    labels = data['b_label'].to_numpy()

    total_loss = 0.
    y_pred = []
    y_true = []

    # Evaluate in eval mode, then restore whatever mode the model was in so
    # that a following training pass is unaffected.
    was_training = eval_model.training
    eval_model.eval()
    try:
        with torch.no_grad():
            # Plain sequential batches: shuffling adds nothing at evaluation
            # time and only makes the result order non-deterministic.
            for start in range(0, len(data), BATCH_SIZE):
                batch_x = inputs[start:start + BATCH_SIZE]
                batch_y = labels[start:start + BATCH_SIZE]

                # Use the model that was passed in, not the notebook global.
                eval_model.batch_size = len(batch_y)
                eval_model.hidden = eval_model.init_hidden()
                output = eval_model(batch_x)

                target = torch.FloatTensor(batch_y).to(output[0].device)
                # .item() keeps the running total a plain float.
                total_loss += loss_function(output[0], target).item()

                # Store plain floats rather than 0-d tensors so downstream
                # thresholding and sklearn metrics get ordinary numbers.
                y_pred.extend(output[0].reshape(-1).tolist())
                y_true.extend(batch_y.reshape(-1,))
    finally:
        eval_model.train(was_training)

    return total_loss, y_pred, y_true

In [None]:
best_val_loss = float("inf")
epochs = 20
best_model = None
best_epoch = -1
# Copy of the weights at the lowest validation loss seen so far. Assigning
# `best_model = model` inside the loop would only alias the live model, which
# keeps training, so the "best" model would always be the last epoch.
best_state = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train_loss = train()
    val_loss,_,_ = evaluate(model,val_data)
    # evaluate() may hand back a 0-d tensor; compare and store a plain float.
    val_loss = float(val_loss)
    print('| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f}| valid loss {:5.2f}'.format(epoch, (time.time() - epoch_start_time),train_loss, val_loss))

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Clone every tensor so later optimizer steps cannot change the snapshot.
        best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}
        best_epoch = epoch

# Restore the best weights; `best_model` (and `model`) now hold the
# best-epoch parameters for the cells that follow.
if best_state is not None:
    model.load_state_dict(best_state)
best_model = model

print("best_epoch",best_epoch)

| end of epoch   1 | time: 11.06s | train loss  7.07| valid loss  1.21
| end of epoch   2 | time: 11.50s | train loss  6.56| valid loss  1.19
| end of epoch   3 | time: 10.68s | train loss  6.46| valid loss  0.83
| end of epoch   4 | time: 14.11s | train loss  6.36| valid loss  0.82


# Testing

In [None]:
# Seeded shuffle: later cells index labels by row position and cluster the
# encodings, so a fixed row order keeps those results repeatable.
# NOTE(review): unpickling runs arbitrary code from the file -- only load
# pickles that come from a trusted source.
test_data = pd.read_pickle('parsed_source_test.pkl').sample(frac=1, random_state=42)
len(test_data)

In [None]:
# Score the selected model on the held-out test set.
print("Testing...")
test_loss, y_pred, y_true = evaluate(eval_model=best_model, data=test_data)

In [None]:
# Turn the predicted scores into hard labels with a 0.5 cut-off.
y_pred_class = [score > 0.5 for score in y_pred]
print(
    metrics.classification_report(
        y_true,
        y_pred_class,
        target_names=["Non-Defect", "Defect"],
    )
)

In [None]:
# Area under the ROC curve, computed from the raw (unthresholded) scores.
print("AUC", metrics.roc_auc_score(y_true=y_true, y_score=y_pred))

# Subtyping: split into 3 groups: buggy / problematic / fine

In [None]:
def subtyping(eval_model, data):
    """Collect the second model output (``output[1]``) for every row of ``data``.

    Batches are taken in the row order of ``data`` so that, once the
    returned arrays are concatenated, row ``i`` of the result corresponds
    to row ``i`` of ``data``. Code that looks labels up by position after
    clustering relies on that alignment.

    Args:
        eval_model: the model to run (assumed to be a torch.nn.Module
            exposing ``batch_size``, ``hidden`` and ``init_hidden()``).
        data: DataFrame with a ``block_seq`` column holding the model inputs.

    Returns:
        list: one numpy array per batch, in dataset order.
    """
    inputs = data['block_seq'].to_numpy()

    outputs = []
    # Run in eval mode and restore the previous mode afterwards.
    was_training = eval_model.training
    eval_model.eval()
    try:
        with torch.no_grad():
            for start in range(0, len(data), BATCH_SIZE):
                batch_x = inputs[start:start + BATCH_SIZE]

                # Use the model that was passed in, not the notebook global.
                eval_model.batch_size = len(batch_x)
                eval_model.hidden = eval_model.init_hidden()
                output = eval_model(batch_x)
                # .cpu() keeps this working when the model runs on a GPU.
                outputs.append(output[1].detach().cpu().numpy())
    finally:
        eval_model.train(was_training)

    return outputs

In [None]:
# Per-batch arrays produced by the selected model on the test set.
outputs = subtyping(eval_model=best_model, data=test_data)

In [None]:
# Stack the per-batch arrays along the first axis into a single array.
doc = np.concatenate(outputs, axis=0)
doc.shape

In [None]:
from collections import Counter
from sklearn.cluster import KMeans

# Cluster the rows of `doc` into k groups (fixed seed for repeatability).
k = 3
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(doc)

# Map each cluster id to the row positions assigned to it.
group_id = {
    cluster: np.flatnonzero(kmeans.labels_ == cluster)
    for cluster in range(k)
}

# Cluster sizes.
Counter(kmeans.labels_)

In [None]:
# For each cluster, print its size and the mean of the `label` column over
# its members.
# NOTE(review): this assumes row i of `doc` corresponds to row i of
# `test_data` -- confirm that `subtyping` preserves row order.
buddy = test_data['label'].to_numpy()
for gid in range(k):
    members = group_id[gid]
    cluster_labels = buddy[members].reshape(-1,)
    print(len(members), np.mean(cluster_labels))