In [1]:
import javalang
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split


import keras
from keras import layers
import tensorflow as tf
from keras.models import Sequential, Model, model_from_json
from keras.utils import Sequence
from keras.layers import concatenate, Input, LSTM, Dense, Masking, TimeDistributed, Embedding

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

Using TensorFlow backend.


In [2]:
# Locations of the training and evaluation CSVs (presumably log4j 1.0 and 1.1,
# judging by the file names); both are relative to the notebook's working directory.
Train_path = "../../Dataset/AST/log4j/df_log4j_v10.csv"
Test_path = "../../Dataset/AST/log4j/df_log4j_v11.csv"

In [3]:
# Load the training release and preview the first rows; the `file` column holds
# the Java source text and `label` the defect label used further down.
df = pd.read_csv(Train_path)
df.head(3)

Unnamed: 0.1,Unnamed: 0,metric_name,java_name,file,label
0,0,org.apache.log4j.helpers.ISO8601DateFormat,./log4j-v_1_0/src/java/org/apache/log4j/helper...,/*\n * Copyright (C) The Apache Software Found...,0
1,1,org.apache.log4j.xml.Transform,./log4j-v_1_0/src/java/org/apache/log4j/xml/Tr...,\npackage org.apache.log4j.xml;\n\nimport org....,0
2,2,org.apache.log4j.helpers.AppenderAttachableImpl,./log4j-v_1_0/src/java/org/apache/log4j/helper...,/*\n * Copyright (C) The Apache Software Found...,0


In [4]:
def build_dataset(name, file, label, limited):
    """Turn Java sources into sequences of AST node types plus binary labels.

    Each source string is parsed with ``javalang``. The resulting tree is walked
    and only nodes whose type name contains one of the ``limited`` fragments are
    kept. Every kept node type receives the next free integer id in a
    vocabulary that is built on the fly.

    Args:
        name: Iterable of identifiers aligned with ``file``; used only to
            report the sources that could not be parsed.
        file: Iterable of Java source strings.
        label: Iterable of numeric values aligned with ``file``; any value
            greater than 0 becomes class 1, everything else class 0.
        limited: Collection of substrings; a node is kept when any of them
            occurs in ``str(type(node))``.

    Returns:
        Tuple ``(input_list, label_list, vocab)``. ``input_list`` holds one list
        of node types per successfully parsed source, ``label_list`` the
        matching 0/1 labels, and ``vocab`` maps node type -> integer id (ids
        start at 0 and follow the order of first appearance).

    Side effects:
        Prints the names of the sources that failed to parse.
    """
    vocab = {}
    in_valid = []
    input_list = []
    label_list = []

    # zip walks the three inputs positionally, so pandas Series with a
    # non-default index (e.g. after filtering rows) are handled correctly;
    # ``name[i]`` / ``label[i]`` would be label-based lookups on such Series.
    for item_name, source, item_label in zip(name, file, label):
        try:
            tree = javalang.parse.parse(source)
        except Exception:
            # Best effort: skip unparsable sources, but let KeyboardInterrupt /
            # SystemExit propagate (a bare ``except`` would swallow them too).
            in_valid.append(item_name)
            continue

        input_ = []
        for _path, node in tree:
            node_type = type(node)
            type_name = str(node_type)
            if not any(limit in type_name for limit in limited):
                continue
            # Assign the next id only the first time a node type is seen.
            vocab.setdefault(node_type, len(vocab))
            input_.append(node_type)

        input_list.append(input_)
        label_list.append(1 if item_label > 0 else 0)

    print(in_valid)
    return input_list, label_list, vocab

In [5]:
# Name fragments of the AST node types to keep; every other node is ignored.
limited = ["Invocation", "Class", "Declaration", "Statement", "Clause"]
input_list, label_list, vocab = build_dataset(
    df["metric_name"], df["file"], df["label"], limited
)

['org.apache.log4j.test.Finalize', 'org.apache.log4j.NDC', 'org.apache.log4j.Category', 'org.apache.log4j.PropertyConfigurator']


In [6]:
# Median and maximum sequence length of the training samples, plus vocabulary size.
length = list(map(len, input_list))
np.median(length), np.max(length), len(vocab)

(52.0, 569, 31)

# RNN: sequence encoding and bidirectional LSTM training

In [7]:
# Keep the parsed training sequences and labels under train-specific names;
# `input_list` / `label_list` are rebound to the test split further down.
X_train, y_train = input_list, label_list

In [13]:
def preprocess(input_list, vocab, max_length=500):
    """Encode token sequences as a left-padded integer matrix.

    Args:
        input_list: List of token sequences; every token must be a key of
            ``vocab``.
        vocab: Mapping token -> integer id (0-based).
        max_length: Number of columns of the result; longer sequences keep only
            their first ``max_length`` tokens.

    Returns:
        ``numpy.ndarray`` of shape ``(len(input_list), max_length)`` with the
        default float dtype of ``np.zeros``. Ids are shifted by +1 so that 0 is
        free for padding, and each sequence is right-aligned (padding sits on
        the left). An empty sequence yields an all-zero row.

    Raises:
        KeyError: If a token is missing from ``vocab``.
    """
    X = np.zeros((len(input_list), max_length))
    for row, tokens in enumerate(input_list):
        tokens = tokens[:max_length]
        if len(tokens) == 0:
            # ``X[row][-0:]`` addresses the whole row, so assigning an empty
            # sequence there raises a broadcast error; leave it as padding.
            continue
        X[row, -len(tokens):] = [vocab[tok] + 1 for tok in tokens]
    return X

In [14]:
# Encode with preprocess()'s default max_length (500) rather than the longest
# training sequence, which is what the disabled line below would compute:
#max_length = max([len(x) for x in input_list])
X_train_d = preprocess(X_train, vocab)

In [15]:
# Inspect the encoded matrix: leading zeros are padding, ids sit at the right edge.
X_train_d

array([[ 0.,  0.,  0., ..., 14.,  6., 13.],
       [ 0.,  0.,  0., ...,  7.,  4.,  7.],
       [ 0.,  0.,  0., ...,  4.,  7., 11.],
       ...,
       [ 0.,  0.,  0., ...,  7., 24., 25.],
       [ 0.,  0.,  0., ...,  5.,  4.,  7.],
       [ 0.,  0.,  0., ..., 16.,  4.,  7.]])

In [16]:
def build_model(feature_dim,
                max_len=500,
                lstm_units=32,
                epoch=50,
                batch_size =5,
                pad_key=0,
                nb_classes = 1,
                dense_activate='relu'):
    """Build and compile a stacked bidirectional-LSTM classifier.

    Architecture: Embedding -> BiLSTM (returns sequences) -> BiLSTM -> Dense ->
    sigmoid output. The model is compiled with Adam, binary cross-entropy and
    the accuracy / AUC metrics, and its summary is printed.

    Args:
        feature_dim: ``input_dim`` of the embedding, i.e. the number of distinct
            integer ids the model can receive.
        max_len: Length of the input sequences.
        lstm_units: Width of the embedding, of each LSTM direction and of the
            hidden dense layer.
        epoch: Unused; kept so existing callers keep working.
        batch_size: Unused; kept so existing callers keep working.
        pad_key: Unused; kept so existing callers keep working.
            NOTE(review): padding positions are not masked (the Embedding is
            created without ``mask_zero``) -- confirm whether that is intended.
        nb_classes: Number of sigmoid output units.
        dense_activate: Activation of the hidden dense layer.

    Returns:
        The compiled Keras ``Model``.
    """
    input1 = Input(shape=(max_len,))
    current_input = Embedding(input_dim=feature_dim, output_dim=lstm_units)(input1)

    # First BiLSTM keeps the full sequence so the second one can consume it.
    lstm_out = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True))(current_input)
    lstm_out = layers.Bidirectional(layers.LSTM(lstm_units))(lstm_out)
    lstm_out = Dense(lstm_units, activation=dense_activate)(lstm_out)

    out = Dense(nb_classes, activation='sigmoid', name='main_output')(lstm_out)
    model = Model(inputs=[input1], outputs=[out])

    model.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy", keras.metrics.AUC()])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    model.summary()
    return model

In [17]:
# +1 because preprocess() shifts every vocab id up by one to keep 0 for padding,
# so the largest id fed to the Embedding is len(vocab).
model = build_model(feature_dim = len(vocab)+1)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 500, 32)           1024      
_________________________________________________________________
bidirectional_3 (Bidirection (None, 500, 64)           16640     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
main_output (Dense)          (None, 1)                 33        
Total params: 44,609
Trainable params: 44,609
Non-trainable params: 0
_______________________________________________________

In [18]:
# build_model() already compiles the model with binary cross-entropy, Adam and
# the accuracy/AUC metrics, so the configuration is not repeated here (a second
# copy could silently drift from the one in build_model()).
print("Training...")
# Labels are passed as an ndarray: a plain Python list of ints as `y` is not
# accepted by every Keras/TensorFlow input adapter.
model.fit(X_train_d, np.asarray(y_train), epochs=20, batch_size=10, validation_split=0, verbose=1)

Training...


W0717 22:47:01.887340 4589901248 module_wrapper.py:139] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x13ce07128>

# Testing

In [19]:
# Load the evaluation release. NOTE: this rebinds `df`, so the training frame is
# no longer reachable under that name from here on.
df = pd.read_csv(Test_path)

In [20]:
def build_test_dataset(name, file, label, limited, vocab):
    """Turn Java sources into node-type sequences restricted to a known vocabulary.

    Each source string is parsed with ``javalang``. Nodes whose type name
    contains one of the ``limited`` fragments are kept, provided their type is
    already a key of ``vocab``; filtered node types that are not in ``vocab``
    are dropped from the sequence and collected for reporting.

    Args:
        name: Iterable of identifiers aligned with ``file``; used only to
            report the sources that could not be parsed.
        file: Iterable of Java source strings.
        label: Iterable of numeric values aligned with ``file``; any value
            greater than 0 becomes class 1, everything else class 0.
        limited: Collection of substrings; a node is considered when any of
            them occurs in ``str(type(node))``.
        vocab: Container of known node types (only membership is checked; it is
            never modified).

    Returns:
        Tuple ``(input_list, label_list)`` with one list of node types per
        successfully parsed source and the matching 0/1 labels.

    Side effects:
        Prints the names of the sources that failed to parse, then the list of
        out-of-vocabulary node types (one entry per occurrence).
    """
    in_valid = []
    vocab_missing = []
    input_list = []
    label_list = []

    # zip walks the three inputs positionally, so pandas Series with a
    # non-default index (e.g. after filtering rows) are handled correctly;
    # ``name[i]`` / ``label[i]`` would be label-based lookups on such Series.
    for item_name, source, item_label in zip(name, file, label):
        try:
            tree = javalang.parse.parse(source)
        except Exception:
            # Best effort: skip unparsable sources, but let KeyboardInterrupt /
            # SystemExit propagate (a bare ``except`` would swallow them too).
            in_valid.append(item_name)
            continue

        input_ = []
        for _path, node in tree:
            node_type = type(node)
            type_name = str(node_type)
            if not any(limit in type_name for limit in limited):
                continue
            if node_type not in vocab:
                # Unknown to the trained model: record it and leave it out.
                vocab_missing.append(node_type)
                continue
            input_.append(node_type)

        input_list.append(input_)
        label_list.append(1 if item_label > 0 else 0)

    print(in_valid)
    print(vocab_missing)
    return input_list, label_list

In [21]:
# Same node-type filter as for training; the training vocabulary is passed in so
# node types it does not contain are dropped instead of being added.
limited = ["Invocation", "Class", "Declaration", "Statement", "Clause"]
input_list, label_list = build_test_dataset(
    df["metric_name"], df["file"], df["label"], limited, vocab
)

['org.apache.log4j.PropertyConfigurator', 'org.apache.log4j.test.Finalize', 'org.apache.log4j.NDC', 'org.apache.log4j.Category']
[]


In [22]:
# Name the test split and encode it the same way as the training data.
X_test = input_list
y_test = label_list
X_test_d = preprocess(X_test, vocab)

In [23]:
print("Generating test predictions...")
# Flatten the model output to one score per sample.
y_pred = model.predict(X_test_d, verbose=0).reshape(-1)
# Built-in round() turns each sigmoid score into a hard class.
y_pred_class = list(map(round, y_pred))

Generating test predictions...


In [24]:
# Per-class precision / recall / F1 computed from the rounded predictions.
print(
    classification_report(
        y_test,
        y_pred_class,
        target_names=["Non-Defect", "Defect"],
    )
)

              precision    recall  f1-score   support

  Non-Defect       0.86      0.82      0.84        66
      Defect       0.68      0.74      0.70        34

    accuracy                           0.79       100
   macro avg       0.77      0.78      0.77       100
weighted avg       0.80      0.79      0.79       100



In [25]:
# ROC AUC is computed from the raw scores, not from the rounded classes.
roc_auc_score(y_true=y_test, y_score=y_pred)

0.8103832442067738

# Group: evaluation restricted to long sequences (more than 100 AST nodes)

In [27]:
# Rebuild the encoded test split from the parsed test sequences.
X_test = input_list
y_test = label_list
X_test_d = preprocess(X_test, vocab)

# Positions of the test samples whose node sequence has more than 100 entries.
index = [pos for pos, seq in enumerate(input_list) if len(seq) > 100]
X_test_group = X_test_d[index]
y_test_group = np.array(y_test)[index]

print("Generating test predictions...")
y_pred = model.predict(X_test_group, verbose=0).reshape(-1)
y_pred_class = list(map(round, y_pred))

# Same metrics as for the full test set, restricted to the long sequences.
print(classification_report(y_test_group, y_pred_class, target_names=["Non-Defect", "Defect"]))
roc_auc_score(y_test_group, y_pred)

Generating test predictions...
              precision    recall  f1-score   support

  Non-Defect       0.50      0.29      0.36         7
      Defect       0.75      0.88      0.81        17

    accuracy                           0.71        24
   macro avg       0.62      0.58      0.59        24
weighted avg       0.68      0.71      0.68        24



0.5546218487394958