In [1]:
import collections
import pandas as pd
import mxnet as mx
import numpy as np
from mxnet import autograd, gluon, init, metric, nd
from mxnet.contrib import text
from mxnet.gluon import loss as gloss, nn, rnn
from mxnet.gluon import data as gdata, loss as gloss, nn
from sklearn import metrics
import os
import random
import zipfile
import re
import pickle
import csv
import time
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


## Load Data

In [2]:
# Column names applied to every sample file (the files' own header row is
# skipped via header=0 and replaced by these names).
COLUMNS = ['id', 'component', 'title', 'desp', 'product', 'severity', '6', '7', '8', '9','10']
PATH = './data/'


def _read_samples(index):
    """Read ``sample_sev<index>.txt`` from PATH into a DataFrame with COLUMNS as column names."""
    return pd.read_csv(PATH + 'sample_sev' + str(index) + '.txt', header=0, index_col=None, names=COLUMNS)


# Files 0-9 make up the training set and file 10 is the test set.
# All training parts are concatenated in one call: calling pd.concat inside a
# loop re-copies the accumulated frame on every iteration.
train = pd.concat([_read_samples(i) for i in range(10)])
test = _read_samples(10)

In [3]:
# Show the class balance of each split: training set first, then test set.
for split in (train, test):
    print(split['severity'].value_counts())

major       631
minor       622
critical    455
trivial     400
blocker     242
Name: severity, dtype: int64
major       63
minor       62
critical    48
trivial     40
blocker     22
Name: severity, dtype: int64


In [4]:
# Severity name -> integer class id, ordered from least to most severe.
LABELS = {
    "trivial": 0,
    "minor": 1,
    "major": 2,
    "critical": 3,
    "blocker": 4
}
print(LABELS.keys())


def _encode_severity(frame):
    """Keep only rows whose severity is a LABELS key and encode that column as integers.

    Only the 'severity' column is rewritten.  Calling ``DataFrame.replace(LABELS)``
    on the whole frame would also rewrite any other cell (component, title, ...)
    whose value happens to equal one of the severity names.
    """
    kept = frame[frame['severity'].isin(LABELS.keys())]
    return kept.assign(severity=kept['severity'].map(LABELS))


# NOTE: this cell expects the raw string labels; re-running it on already
# encoded frames filters out every row, so re-run the loading cell first.
train = _encode_severity(train)
test = _encode_severity(test)

dict_keys(['trivial', 'minor', 'major', 'critical', 'blocker'])


In [5]:
def clean_str(string):
    """Normalise a raw text string and split it into lower-case tokens.

    Steps, in order:
      1. replace literal backslash escapes (the two characters ``\\`` + ``n``/``r``/``t``)
         and HTML/XML-like tags with a space;
      2. replace every character outside ``A-Za-z0-9(),!?'`` and the backtick with a space;
      3. split the English clitics ``'s 've n't 're 'd 'll`` off the preceding word;
      4. surround ``, ! ( ) ?`` with spaces so each becomes its own token;
      5. collapse whitespace, lower-case and split.

    Args:
        string: the text to tokenise (must be a ``str``).

    Returns:
        A list of tokens; an empty list when nothing survives the cleaning.
    """
    # Escaped control characters stored literally in the text.
    for escaped in (r'\\n', r'\\r', r'\\t'):
        string = re.sub(escaped, ' ', string)
    # Markup tags such as an opening or closing HTML element.
    string = re.sub(r'</?\w+[^>]*>', ' ', string)
    string = re.sub(r'[^A-Za-z0-9(),!?\'\`]', ' ', string)

    # Detach clitics from the word they are attached to.
    for clitic in (r"'s", r"'ve", r"n't", r"'re", r"'d", r"'ll"):
        string = re.sub(clitic, ' ' + clitic, string)

    # Isolate punctuation as separate tokens.  The replacement is the plain
    # character: a replacement such as ' \( ' keeps its backslash in re.sub
    # output and would produce tokens like '\(' instead of '('.
    for pattern, mark in ((r',', ','), (r'!', '!'), (r'\(', '('), (r'\)', ')'), (r'\?', '?')):
        string = re.sub(pattern, ' ' + mark + ' ', string)

    string = re.sub(r'\s+', ' ', string)
    # split() without an argument returns [] for an empty string, whereas
    # split(' ') would return [''] and add a spurious empty token.
    return string.strip().lower().split()

def _tokenize_text(frame):
    """Return a Series of token lists built from each row's title and description.

    Missing titles/descriptions are treated as empty strings.  Without that,
    a missing description turns the whole concatenation into NaN, the title is
    lost and the row's text becomes the single token 'nan'.
    """
    combined = frame['title'].fillna('').map(str) + " " + frame['desp'].fillna('').map(str)
    return combined.map(clean_str)


train['text'] = _tokenize_text(train)
test['text'] = _tokenize_text(test)

In [6]:
def top_n_accuracy(preds, truths, n, model):
    """Fraction of samples whose true label is among the n highest-scored classes.

    Args:
        preds: 2-D array-like of scores, one row per sample and one column per
            class, with columns in the order of ``model.classes_``.
        truths: sequence of true labels (list, array or pandas Series); compared
            positionally, so a Series' index labels are irrelevant.
        n: number of top-scored classes to accept per sample.
        model: object exposing ``classes_``, which maps column index -> label.

    Returns:
        A float in [0, 1]; 0.0 when ``truths`` is empty or ``n`` is not positive.
    """
    # np.asarray makes the lookup positional; truths[i] on a Series with a
    # non-default index would be a label-based lookup instead.
    labels = np.asarray(truths).ravel()
    # Guard n <= 0 explicitly: the slice [-0:] would select every column.
    if labels.size == 0 or n <= 0:
        return 0.0

    best_n = np.argsort(preds, axis=1)[:, -n:]
    top_labels = np.asarray(model.classes_)[best_n]
    successes = int((top_labels == labels[:, None]).any(axis=1).sum())
    return float(successes) / len(labels)

In [7]:
def count_token(train_tokenized, token_counter):
    """Add the occurrences of every token in ``train_tokenized`` to ``token_counter``.

    Args:
        train_tokenized: iterable of token sequences.
        token_counter: mapping token -> count; updated in place.

    Returns:
        The same ``token_counter`` object that was passed in.
    """
    for tok in (t for sample in train_tokenized for t in sample):
        token_counter[tok] = token_counter.get(tok, 0) + 1
    return token_counter

def encode_samples(tokenized_samples, vocab):
    """Translate token sequences into index sequences via ``vocab.token_to_idx``.

    Tokens absent from the vocabulary are encoded as index 0.

    Args:
        tokenized_samples: iterable of token sequences.
        vocab: object exposing a ``token_to_idx`` mapping.

    Returns:
        A list with one list of integer indices per input sample.
    """
    return [
        [vocab.token_to_idx[tok] if tok in vocab.token_to_idx else 0 for tok in sample]
        for sample in tokenized_samples
    ]

def pad_samples(features, maxlen=1000, PAD=0, edge_pad=4):
    """Truncate or right-pad every sequence to ``maxlen`` and add edge padding.

    Args:
        features: iterable of index lists; the input lists are not modified.
        maxlen: length each sequence is cut or padded to before edge padding.
        PAD: index used for all padding positions.
        edge_pad: number of PAD entries added on each side of the sequence
            (default 4, the width that was previously hard-coded).

    Returns:
        A list of new lists, each of length ``maxlen + 2 * edge_pad``.
    """
    edge = [PAD] * edge_pad
    padded_features = []
    for feature in features:
        # Work on a copy: appending to the caller's list would silently
        # lengthen the original samples.
        body = list(feature[:maxlen])
        # Append PAD symbols so that every sequence has length maxlen.
        body.extend([PAD] * (maxlen - len(body)))
        padded_features.append(edge + body + edge)
    return padded_features

## No Attention

In [38]:
def project_do(project_name, dropout_rate, train_df, test_df):
    """Train a TextCNN severity classifier, keep the best epoch and log its scores.

    The function oversamples classes 0 and 4, builds a vocabulary and FastText
    embeddings from the tokenised text, trains a TextCNN that also embeds the
    'component' and 'product' columns, saves the parameters of the epoch with
    the best macro-F1 on ``test_df`` to 'DeepTIP.model', reloads them, prints a
    classification report and appends one result row to 'DeepTIP-Mozilla.csv'.

    Relies on the notebook-level helpers ``count_token``, ``encode_samples`` and
    ``pad_samples`` and on the ``LABELS`` mapping.

    Args:
        project_name: label printed to stdout and written to the CSV row.
        dropout_rate: dropout probability applied before the output layer.
        train_df: DataFrame with columns 'text' (token lists), 'severity'
            (integer class ids), 'component' and 'product'.
        test_df: DataFrame with the same columns, used for per-epoch model
            selection and for the final report.

    Returns:
        None.
    """
    # Imported here, as in the original cell, so gensim is only needed when
    # training actually runs.
    from gensim.models import FastText

    print('====')
    print(project_name)
    print('====')

    feature_columns = ['component', 'product']

    class TextCNN(nn.Block):
        """Convolutional text encoder combined with two categorical embeddings."""

        def __init__(self, vocab, embedding_size, ngram_kernel_sizes, nums_channels, num_outputs, sequence_len,max_len_dict, **kwargs):
            # sequence_len is accepted for interface compatibility but not used.
            super(TextCNN, self).__init__(**kwargs)
            self.ngram_kernel_sizes = ngram_kernel_sizes
            self.nums_channels = nums_channels
            self.embedding = nn.Embedding(len(vocab), embedding_size)
            # max_len_dict holds the number of distinct ids per categorical column.
            self.embedding_component = nn.Embedding(max_len_dict['component'], 10)
            self.embedding_product = nn.Embedding(max_len_dict['product'], 48)

            # One convolution + global max pooling branch per n-gram width.
            for i in range(len(ngram_kernel_sizes)):
                conv = nn.Conv1D(nums_channels[i],
                    kernel_size=ngram_kernel_sizes[i],
                    strides=1,
                    activation='relu')
                pool = nn.GlobalMaxPool1D()
                setattr(self, f'conv_{i}', conv)
                setattr(self, f'pool_{i}', pool)

            self.dropout = nn.Dropout(0.5)
            self.dropout2 = nn.Dropout(dropout_rate)
            self.dense = nn.Dense(32)
            self.decoder = nn.Dense(num_outputs)

        def forward(self, inputs, component, product):
            embeddings = self.embedding(inputs)
            embedd_component = self.embedding_component(component)
            embedd_product = self.embedding_product(product)

            # Swap the last two axes so the embedding dimension becomes the
            # channel axis that Conv1D convolves over.
            embeddings = embeddings.transpose((0,2,1))
            encoding = [
                nd.flatten(self.get_pool(i)(self.get_conv(i)(embeddings)))
                for i in range(len(self.ngram_kernel_sizes))]
            encoding = nd.concat(*encoding, dim=1)

            # Join the text encoding with both categorical embeddings.
            outputs = self.dense(self.dropout(
                nd.concat(
                encoding,
                embedd_component,
                embedd_product,
                dim=1)
            ))

            outs = self.decoder(self.dropout2(outputs))

            return outs

        def get_conv(self, i):
            return getattr(self, f'conv_{i}')

        def get_pool(self, i):
            return getattr(self, f'pool_{i}')

    def _predict(data_iter):
        """Run ``net`` over ``data_iter``; return (predictions, probabilities, labels)."""
        pred_result = []
        pred_probs = []
        labels_result = []
        for data, label, _weights, component, product in data_iter:
            X = data.as_in_context(ctx)
            y = label.as_in_context(ctx).T

            component = component.as_in_context(ctx)
            product = product.as_in_context(ctx)

            probs = nd.softmax(net(X, component, product))

            pred_result.extend(list(nd.argmax(probs, axis=1).astype('int').asnumpy()))
            pred_probs.extend(list(probs.asnumpy()))
            labels_result.extend(list(y.astype('int').asnumpy()))
        return pred_result, pred_probs, labels_result

    def eval_model(train_iter, show = False):
        """Return ``(1, macro-F1)`` for ``train_iter``; the first item is a placeholder."""
        pred_result, _, labels_result = _predict(train_iter)

        if show == True:
            print(metrics.classification_report(labels_result, pred_result, target_names=LABELS.keys()))

        return 1, metrics.f1_score(labels_result, pred_result, average='macro')

    def eval_model_test(train_iter):
        """Print a classification report and return macro precision, recall, F1,
        accuracy, the predictions, the class probabilities and the true labels."""
        pred_result, pred_probs, labels_result = _predict(train_iter)

        print(metrics.classification_report(labels_result, pred_result, target_names=LABELS.keys()))

        return metrics.precision_score(labels_result, pred_result, average='macro'), \
                    metrics.recall_score(labels_result, pred_result, average='macro'),\
                    metrics.f1_score(labels_result, pred_result, average='macro'),\
                    metrics.accuracy_score(labels_result, pred_result), \
                    pred_result,pred_probs,labels_result

    samples = train_df
    samples_test = test_df

    # Oversample: class 0 is duplicated twice in a row (four copies in total)
    # and class 4 once (two copies in total).
    samples = pd.concat([samples, samples[samples['severity'] == 0]])
    samples = pd.concat([samples, samples[samples['severity'] == 0]])
    samples = pd.concat([samples, samples[samples['severity'] == 4]])

    train = {}
    test = {}

    train['sentences'] = list(samples['text'])
    train['labels'] = list(samples['severity'])

    # Per-sample loss weight: 1 - (class frequency in the oversampled training
    # set), so rarer classes weigh more.
    weight_dict = {}
    total = len(samples['severity'])
    for index, x in samples['severity'].value_counts().items():
        weight_dict[index] = 1 - float(x) / total

    train['weights'] = nd.array([weight_dict[line] for line in samples['severity']])

    test['sentences'] = list(samples_test['text'])
    test['labels'] = list(samples_test['severity'])

    # Test weights only fill a slot in the dataset tuple (evaluation ignores
    # them), so a class missing from the training set must not raise KeyError.
    test['weights'] = nd.array([weight_dict.get(line, 0.0) for line in samples_test['severity']])

    # Grow the sequence length in steps of 10 until at least 95% of the
    # training sentences are shorter than it.
    max_len = 80
    while (np.sum(np.array(list(map(len,train['sentences'])))<max_len)/len(train['sentences'])<0.95):
        max_len += 10

    # Encode each categorical column as consecutive integer ids shared by both
    # splits (training values first, then values seen only in the test split).
    max_len_dict = {}
    for c in feature_columns:
        tmp_dict = {}
        idx = 0
        for x in list(samples[c].drop_duplicates()):
            tmp_dict[x] = idx
            idx += 1

        for x in list(samples_test[c].drop_duplicates()):
            if x not in tmp_dict:
                tmp_dict[x] = idx
                idx += 1

        train[c] = nd.array(list(samples[c].replace(tmp_dict)))
        test[c] = nd.array(list(samples_test[c].replace(tmp_dict)))
        # The embedding table needs one row per id, i.e. (largest id + 1);
        # sizing it with the largest id itself leaves the last id without a row.
        max_len_dict[c] = idx

    token_counter = collections.Counter()
    token_counter = count_token(train['sentences'], token_counter)
    vocab = text.vocab.Vocabulary(token_counter, unknown_token='<unk>', min_freq=2, reserved_tokens=None, most_freq_count=30000)

    # The original cell pinned everything to the second GPU; keep that
    # preference but fall back so the cell still runs with fewer devices.
    gpu_count = mx.context.num_gpus()
    if gpu_count > 1:
        ctx = mx.gpu(1)
    elif gpu_count == 1:
        ctx = mx.gpu(0)
    else:
        ctx = mx.cpu()

    # NOTE(review): the `size=` / `iter=` keywords look like the gensim 3.x
    # API - confirm against the installed gensim version.
    model = FastText(train['sentences'] + test['sentences'], min_count=1, size=200, iter=10, workers=8)
    # Build the initial embedding matrix in vocabulary-index order.
    w2v = []
    for line in vocab.token_to_idx:
        if line in model.wv:
            w2v.append(model.wv[line])
        else:
            w2v.append(np.zeros(200))
    w2v = nd.array(w2v)
    print('w2v success', w2v.shape)

    train['features'] = encode_samples(train['sentences'], vocab)
    train['features'] = nd.array(pad_samples(train['features'], max_len, 0))

    test['features'] = encode_samples(test['sentences'], vocab)
    test['features'] = nd.array(pad_samples(test['features'], max_len, 0))
    print('encode_samples success')

    sequence_length = 0
    lr = 0.001
    num_epochs = 30
    batch_size = 512
    embed_size = 200
    ngram_kernel_sizes = [3,4,5]
    nums_channels = [20,20,20]

    net = TextCNN(vocab, embed_size, ngram_kernel_sizes, nums_channels, 5, sequence_length, max_len_dict)
    net.initialize(init.Xavier(), ctx=ctx)
    net.embedding.weight.set_data(w2v.as_in_context(ctx))
    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()

    train_iter = gdata.DataLoader(gdata.ArrayDataset(
        train['features'],
        train['labels'],
        train['weights'],
        train['component'],
        train['product']
    ), batch_size, shuffle=True)

    test_iter = gdata.DataLoader(gdata.ArrayDataset(
        test['features'],
        test['labels'],
        test['weights'],
        test['component'],
        test['product']
    ), batch_size, shuffle=True)

    best_f1 = 0
    # time.clock() was removed in Python 3.8; perf_counter() is its documented
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    for epoch in range(1, num_epochs + 1):
        for data, label, weight, component, product in train_iter:
            X = data.as_in_context(ctx)
            y = label.as_in_context(ctx).T
            w = weight.as_in_context(ctx).T

            component = component.as_in_context(ctx)
            product = product.as_in_context(ctx)

            with autograd.record():
                y_hat = net(X, component, product)
                l = loss(y_hat, y, w)

            l.backward()
            # Normalise by the number of samples actually in this batch; the
            # last batch of an epoch can be smaller than batch_size.
            trainer.step(X.shape[0], ignore_stale_grad=True)

        train_loss, train_f1 = eval_model(train_iter)
        # NOTE(review): the checkpoint is selected on the same split that is
        # reported at the end - confirm that this is intended.
        val_loss, val_f1 = eval_model(test_iter)
        if val_f1>=best_f1:
            net.save_parameters('DeepTIP.model')
            best_f1 = val_f1
        print('epoch %d, train f1 %.2f; val f1 %.4f'
              % (epoch, train_f1, val_f1))

    end = time.perf_counter()
    print('train success')

    net.load_parameters('DeepTIP.model')
    start_p = time.perf_counter()
    pre, recall, f1, acc1, pred_result, pred_probs, labels_result = eval_model_test(test_iter)
    end_p = time.perf_counter()

    # Open the results file only for the write so the handle is always closed.
    with open('DeepTIP-Mozilla.csv','a',newline='') as csv_file:
        csv_write = csv.writer(csv_file, dialect='excel')
        csv_write.writerow([project_name,'DeepTip', pre, recall, f1, acc1,end-start,end_p-start_p])

In [24]:
# Train and evaluate on the prepared frames; 0.4 is the dropout rate applied
# before the output layer, and 'Mozilla' labels the printed and logged results.
project_do('Mozilla', 0.4, train, test)

====
Mozilla
====
w2v success (1917, 200)
encode_samples success
epoch 1, train f1 0.35; val f1 0.3475
epoch 2, train f1 0.42; val f1 0.4779
epoch 3, train f1 0.47; val f1 0.5011
epoch 4, train f1 0.50; val f1 0.4782
epoch 5, train f1 0.52; val f1 0.4847
epoch 6, train f1 0.54; val f1 0.4943
epoch 7, train f1 0.56; val f1 0.5170
epoch 8, train f1 0.59; val f1 0.4973
epoch 9, train f1 0.60; val f1 0.4854
epoch 10, train f1 0.63; val f1 0.5182
epoch 11, train f1 0.62; val f1 0.4735
epoch 12, train f1 0.68; val f1 0.5011
epoch 13, train f1 0.67; val f1 0.5143
epoch 14, train f1 0.70; val f1 0.5007
epoch 15, train f1 0.72; val f1 0.5018
epoch 16, train f1 0.74; val f1 0.5046
epoch 17, train f1 0.75; val f1 0.5146
epoch 18, train f1 0.77; val f1 0.5179
epoch 19, train f1 0.77; val f1 0.5398
epoch 20, train f1 0.79; val f1 0.5451
train success
             precision    recall  f1-score   support

    trivial       0.22      0.10      0.14        40
      minor       0.48      0.92      0.63 