In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import json, re, nltk, string
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import keras
from keras import backend as K
from keras import initializers
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from tensorflow.keras.layers import Layer, InputSpec
from keras.layers import (
    Dense,
    Dropout,
    Embedding,
    LSTM,
    GRU,
    Bidirectional,
    BatchNormalization,
    Flatten,
    Input,
    RepeatVector,
    TimeDistributed,
    Permute,
    multiply,
    Lambda,
    Activation,
)
from tensorflow.keras.optimizers import Adam # - Works
from keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score, precision_score, recall_score

# dataset
mozilla_core

mozilla_firefox

In [2]:
# Data path
all_bugs_json = './data/mozilla_core/deep_data.json'

In [3]:
# # GLOVE
# glove_file = './data/mozilla_core/vectors.txt'
# tmp_file = './data/mozilla_core/glove.txt'
# glove2word2vec(glove_file, tmp_file)
# wordvec_model = KeyedVectors.load_word2vec_format(tmp_file)
# vocabulary = wordvec_model.wv.vocab

In [4]:
# Word2vec parameters
min_word_frequency = 5
embed_size = 200
context_window = 5

# NN hyperparameters
num_cv = 10
max_sentence_num = 20
max_sentence_len = 10
num_rnn_unit = 512
num_dense_unit = 1000
rank_k = 10
batch_size = 256

# Mozilla repeated sentence
removal_sent = ['Steps to reproduce:',
                'Actual results:',
                'Expected results:']

In [5]:
# Import data
with open(all_bugs_json) as data_file:
    text = data_file.read()
    text = text.replace('" : NULL', '" : "NULL"')
    data = json.loads(text, strict=False)

open_title = []
open_desc = []
closed_title = []
closed_desc = []
closed_owner = []
for item in data:
    # Mozilla
    status = ['VERIFIED', 'RESOLVED', 'CLOSED']
    invalid_owner = ['nobody@mozilla.org', 'nobody@bugzilla.org']
    
    closed_title.append(item['issue_title'])
    closed_desc.append(item['description'])
    closed_owner.append(item['owner'])
  
    open_title.append(item['issue_title'])
    open_desc.append(item['description'])

closed_title_20 = []
closed_desc_20 = []
closed_owner_20 = []
owner = {}
for key in closed_owner:
    owner[key] = owner.get(key, 0) + 1
for i in range(len(closed_owner)):
    if owner[closed_owner[i]] >= 20:
        closed_title_20.append(closed_title[i])
        closed_desc_20.append(closed_desc[i])
        closed_owner_20.append(closed_owner[i])

print(len(open_title))
print(len(closed_title))
print(len(closed_title_20))

186173
186173
181308


In [6]:
# Owner details
owner_cnt = {}
for owner in closed_owner_20:
    owner_cnt[owner] = owner_cnt.get(owner, 0) + 1
sorted_owner_cnt = sorted(owner_cnt.items(), key=lambda x: x[1], reverse=True)
for i in range(10):
    print(sorted_owner_cnt[i])
print(len(sorted_owner_cnt))

('nobody@mozilla.org', 103107)
('general@js.bugs', 5590)
('attinasi@formerly-netscape.com.tld', 2574)
('karnaze@formerly-netscape.com.tld', 2513)
('rods@formerly-netscape.com.tld', 2342)
('darin.moz@gmail.com', 1993)
('jst@mozilla.org', 1615)
('dbaron@dbaron.org', 1394)
('joki@formerly-netscape.com.tld', 1306)
('general@dom.bugs', 1191)
359


In [7]:
# Define preprocessing function
def preprocess(title, desc):
    # Remove \r and repeated sentence
    current_title = title.replace('\r', ' ')
    current_desc = desc.replace('\r', ' ')
    for sent in removal_sent:
        current_desc = current_desc.replace(sent, ' ')
    # Remove URLs
    current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)
    # Change to lower case
    current_title = current_title.lower()
    current_desc = current_desc.lower()
    # Remove stack trace
    start_loc = current_desc.find("stack trace")
    current_desc = current_desc[:start_loc]    
    # Remove hex code
    current_title = re.sub(r'(\w+)0x\w+', '', current_title)
    current_desc = re.sub(r'(\w+)0x\w+', '', current_desc)
    # Tokenize sentence
    current_title_tokens = nltk.sent_tokenize(current_title)
    current_desc_tokens = nltk.sent_tokenize(current_desc)
    current_desc_tokens_list = [desc.split('\n') for desc in current_desc_tokens]
    current_desc_tokens = []
    for desc in current_desc_tokens_list:
        current_desc_tokens += desc
    # Remove punctuation
    def remove_punct(report):
        report_filter = []
        for sent in report:
            for punct in string.punctuation:
                sent = sent.replace(punct, '')
            report_filter.append(sent)
        return report_filter
    current_title_filter = remove_punct(current_title_tokens)
    current_desc_filter = remove_punct(current_desc_tokens)
    # Tokenize word
    current_title_filter = [nltk.word_tokenize(sent) for sent in current_title_filter]
    current_desc_filter = [nltk.word_tokenize(sent) for sent in current_desc_filter]
    # Lemmatization
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return None
    tagged_title = [nltk.pos_tag(title) for title in current_title_filter]
    tagged_desc = [nltk.pos_tag(desc) for desc in current_desc_filter]
    current_title_lemm = [[WordNetLemmatizer().lemmatize(tag[0], pos=get_wordnet_pos(tag[1]) or wordnet.NOUN) for tag in title] for title in tagged_title]
    current_desc_lemm = [[WordNetLemmatizer().lemmatize(tag[0], pos=get_wordnet_pos(tag[1]) or wordnet.NOUN) for tag in desc] for desc in tagged_desc]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    current_title_stop = [[word for word in title if not word in stop_words] for title in current_title_lemm]
    current_desc_stop = [[word for word in desc if not word in stop_words] for desc in current_desc_lemm]
    # Merge title and description
    current_report = current_title_stop + current_desc_stop
    current_report = list(filter(None, current_report))
    
    return current_report

In [8]:
# Bug reports for pre-training word vectors
open_report = []
open_word = {}
for i in range(len(open_title[:1000])):
    current_report = preprocess(open_title[i], open_desc[i])
    # Flatten
    current_report = [word for sent in current_report for word in sent]
    open_report.append(current_report)

In [9]:
# Train word vectors
wordvec_model = Word2Vec(open_report, min_count=min_word_frequency, vector_size=embed_size, window=context_window)
vocabulary = wordvec_model.wv.key_to_index

In [10]:
# Bug reports for training and testing
closed_report = []
closed_owner = []
for i in range(len(closed_title_20[0:1000])):
    current_report = preprocess(closed_title_20[i], closed_desc_20[i])
    closed_report.append(current_report)
    closed_owner.append(closed_owner_20[i])

In [11]:
# Remove all the words that is not present in the vocabulary
update_report = []
update_owner = []
for i in range(len(closed_owner[0:1000])):
    update_sents = []
    for sent in closed_report[i]:
        current_sent = [word for word in sent if word in vocabulary]
        update_sents.append(current_sent)
    update_sents = list(filter(None, update_sents))
    update_report.append(update_sents)
    update_owner.append(closed_owner[i])

In [12]:
# Convert words to numbers
flatten_report = []
for report in update_report:
    for sent in report:
        flatten_report.append(sent)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(flatten_report)

for report in update_report:
    for sent in report:
        for i, word in enumerate(sent):
            sent[i] = tokenizer.word_index[word]

In [13]:
# Make embedding matrix
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
for word, i in word_index.items():
    embedding_vector = wordvec_model.wv[word]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [16]:
# Define topk_accuracy
def topk_accuracy(prediction, y_test, classes, rank_k=10):
    accuracy = []
    sortedIndices = []
    pred_classes = []
    for ll in prediction:
        sortedIndices.append(
            sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True)
        )
    for k in range(1, rank_k + 1):
        id = 0
        trueNum = 0
        for sortedInd in sortedIndices:
            pred_classes.append(classes[sortedInd[:k]])
            if np.argmax(y_test[id]) in sortedInd[:k]:
                trueNum += 1
            id += 1
        accuracy.append((float(trueNum) / len(prediction)) * 100)

    return accuracy

def f_measure(prediction, y_test, classes, mode='macro'):
    y_pred = []
    y_true = []
    sortedIndices = []   
    for ll in prediction:
        sortedIndices.append(
            sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True)
        )
    id = 0
    for sortedInd in sortedIndices:
        ind = np.argmax(y_test[id])
        if ind in sortedInd[:10]:
            y_pred.append(ind)
        else:
            y_pred.append(-1)
        id += 1
    for y in y_test:
        y_true.append(np.argmax(y))
            
    f1 = f1_score(y_true, y_pred, average = mode)
    
    return f1

# class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(HierarchicalAttentionNetwork, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weigh = [self.W, self.b, self.u]
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # size of x :[batch_size, sel_len, attention_dim]
        # size of u :[batch_size, attention_dim]
        # uit = tanh(xW+b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))

        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]
    
    def _get_attention_weights(self, X):

        uit = K.tanh(K.bias_add(K.dot(X, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        return ait

In [17]:
# Train and test
splitLength = len(update_report) // (num_cv + 1)
slice_results = {}
top_rank_k_accuracies = []
# f1_measure = []
for i in range(1, num_cv + 1):
    print(i)
    train_report = update_report[:i*splitLength-1]
    train_owner = update_owner[:i*splitLength-1]
    test_report = update_report[i*splitLength:(i+1)*splitLength-1]
    test_owner = update_owner[i*splitLength:(i+1)*splitLength-1]
        
    # Remove data from test set that is not there in train set
    train_owner_unique = set(train_owner)
    test_owner_unique = set(test_owner)
    unwanted_owner = list(test_owner_unique - train_owner_unique)
    update_test_report = []
    update_test_owner = []
    for i in range(len(test_owner)):
        if test_owner[i] not in unwanted_owner:
            update_test_report.append(test_report[i])
            update_test_owner.append(test_owner[i])
    
    unique_train_owner = list(set(train_owner))
    classes = np.array(unique_train_owner)
    
    # Create train and test data
    X_train = np.zeros(shape=[len(train_report), max_sentence_num, max_sentence_len], dtype="int32")
    Y_train = np.zeros(shape=[len(train_owner), 1], dtype="int32")
    for i, report in enumerate(train_report):
        for j, sent in enumerate(report):
            if j < max_sentence_num:
                k = 0
                for word in sent:
                    if k < max_sentence_len:
                        X_train[i, j, k] = word
                        k = k + 1
        Y_train[i, 0] = unique_train_owner.index(train_owner[i])
    
    X_test = np.zeros(shape=[len(update_test_report), max_sentence_num, max_sentence_len], dtype="int32")
    Y_test = np.zeros(shape=[len(update_test_owner), 1], dtype="int32")
    for i, report in enumerate(update_test_report):
        for j, sent in enumerate(report):
            if j < max_sentence_num:
                k = 0
                for word in sent:
                    if k < max_sentence_len:
                        X_test[i, j, k] = word
                        k = k + 1
        Y_test[i, 0] = unique_train_owner.index(update_test_owner[i])    
    
    y_train = np_utils.to_categorical(Y_train, len(unique_train_owner))
    y_test = np_utils.to_categorical(Y_test, len(unique_train_owner))
    
    # Model
    word_input = Input(shape=(max_sentence_len,), dtype='float32')
    embedded_sequences = Embedding(len(embedding_matrix), embed_size, weights=[embedding_matrix], input_length=max_sentence_len, trainable=True)(word_input)
    l_gru = Bidirectional(GRU(num_rnn_unit, return_sequences=True, dropout=0.2))(embedded_sequences)
    l_dense = TimeDistributed(Dense(num_dense_unit))(l_gru)
    l_att = HierarchicalAttentionNetwork(max_sentence_num)(l_dense)
    word_encoder = Model(word_input, l_att)
    
    sent_input = Input(shape=(max_sentence_num, max_sentence_len), dtype='float32')
    sent_encoder = TimeDistributed(word_encoder)(sent_input)
    l_gru_sent = Bidirectional(GRU(num_rnn_unit, return_sequences=True, dropout=0.2))(sent_encoder)
    l_dense_sent = TimeDistributed(Dense(num_dense_unit))(l_gru_sent)
    l_att_sent = HierarchicalAttentionNetwork(max_sentence_len)(l_dense_sent)
    preds = Dense(len(classes), activation='softmax')(l_att_sent)
    model = Model(sent_input, preds)
    
    model.compile(
        loss="categorical_crossentropy", optimizer=Adam(lr=1e-4), metrics=["accuracy"]
    )

    early_stopping = EarlyStopping(monitor="val_loss", patience=3)
    hist = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch_size, epochs=500, callbacks=[early_stopping], verbose=0)
    
    prediction = model.predict(X_test)
    accuracy = topk_accuracy(prediction, y_test, classes, rank_k=rank_k)
#     f1 = f_measure(prediction, y_test, classes, mode='macro')
    print("CV{0}, top1 - ... - top{1} accuracy: ".format(i, rank_k), accuracy)
    
    train_result = hist.history
    train_result["test_topk_accuracies"] = accuracy
    slice_results[i + 1] = train_result
    top_rank_k_accuracies.append(accuracy[-1])
#     f1_measure.append(f1)
    
    del model
    
print("Top{0} accuracies for all CVs: {1}".format(rank_k, top_rank_k_accuracies))
print("Average top{0} accuracy: {1}".format(rank_k, sum(top_rank_k_accuracies)/rank_k))
# print(f1_measure)
# print(np.mean(f1_measure))

1


  super(Adam, self).__init__(name, **kwargs)


CV83, top1 - ... - top10 accuracy:  [9.523809523809524, 36.904761904761905, 39.285714285714285, 48.80952380952381, 50.0, 52.38095238095239, 65.47619047619048, 67.85714285714286, 67.85714285714286, 69.04761904761905]
2
CV83, top1 - ... - top10 accuracy:  [5.952380952380952, 33.33333333333333, 44.047619047619044, 50.0, 51.19047619047619, 53.57142857142857, 55.952380952380956, 64.28571428571429, 67.85714285714286, 72.61904761904762]
3
CV82, top1 - ... - top10 accuracy:  [18.072289156626507, 28.915662650602407, 37.34939759036144, 48.19277108433735, 54.21686746987952, 57.831325301204814, 60.24096385542169, 62.65060240963856, 63.85542168674698, 66.26506024096386]
4
CV82, top1 - ... - top10 accuracy:  [8.433734939759036, 21.686746987951807, 27.710843373493976, 32.53012048192771, 34.93975903614458, 42.168674698795186, 49.39759036144578, 59.036144578313255, 60.24096385542169, 72.28915662650603]
5
CV85, top1 - ... - top10 accuracy:  [9.30232558139535, 13.953488372093023, 26.744186046511626, 36.0

KeyboardInterrupt: 