In [1]:
#Partially adopted from Tensorflow/docs/basic_text_classification
#https://github.com/tensorflow/docs/blob/master/site/en/tutorials/keras/basic_text_classification.ipynb

#As well as https://developers.google.com/machine-learning/guides/text-classification/

In [2]:
import tensorflow as tf
from tensorflow import keras

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

import numpy as np
import pandas as pd

from data import Data, TestData, WikiData

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
security = []
usability = []
operability = []
with open('Data/NFR.csv') as f:
    # Create two instances of data: fake (noise) and real (data)
    for sentence in f:
        cl = sentence[:sentence.find(',')]
        sentence = sentence[sentence.find(',')+1:]
        if cl == 'Security':
            security.append(sentence)
        elif cl == 'Usability':
            usability.append(sentence)
        elif cl == 'Operability':
            operability.append(sentence)

# Create two instances of data from fake and real
test_0 = TestData(security, '0')
test_1 = TestData(usability, '1')
test_2 = TestData(operability, '2')

# Balance the data based on min length of all lists
deleteAfter = min(len(test_0.data), len(test_1.data), len(test_2.data))
del test_0.data[deleteAfter:]
del test_1.data[deleteAfter:]
del test_2.data[deleteAfter:]
del test_0.labels[deleteAfter:]
del test_1.labels[deleteAfter:]
del test_2.labels[deleteAfter:]

# Test that the data is balanced
assert len(test_0.data) == len(test_1.data), 'Lengths of test datas are not equal'
assert len(test_0.labels) == len(test_1.labels), 'Lengths of test labels are not equal'
assert len(test_0.data) == len(test_2.data), 'Lengths of test datas are not equal'
assert len(test_1.labels) == len(test_2.labels), 'Lengths of test labels are not equal'

assert len(test_0.data) == len(test_0.labels), 'Lengths of test data and labels do not match'
assert len(test_1.data) == len(test_1.labels), 'Lengths of test data and labels do not match'
assert len(test_2.data) == len(test_2.labels), 'Lengths of test data and labels do not match'

Labels made successfully
Labels made successfully
Labels made successfully


In [4]:
train_0 = WikiData(articlesFile='Data/enwiki-latest-all-titles-in-ns0', keyword='security', size=200, labelValue='0')
train_1 = WikiData(articlesFile='Data/enwiki-latest-all-titles-in-ns0', keyword='usability', size=100, labelValue='1')
train_2 = WikiData(articlesFile='Data/enwiki-latest-all-titles-in-ns0', keyword='operability', size=200, labelValue='2')

Article names in the security list: 200
Labels made successfully
Article names in the usability list: 54
Labels made successfully
Article names in the operability list: 102
Labels made successfully


In [5]:
# Balance the data based on min length of both lists
deleteAfter = min(len(train_0.data), len(train_1.data), len(train_2.data))

# If min length is less than 100 sentences, test is invalid
assert deleteAfter > 100, 'Train set is too small'

del train_0.data[deleteAfter:]
del train_1.data[deleteAfter:]
del train_2.data[deleteAfter:]
del train_0.labels[deleteAfter:]
del train_1.labels[deleteAfter:]
del train_2.labels[deleteAfter:]


# Test that the data is balanced
assert len(train_0.data) == len(train_1.data), 'Lengths of train datas are not equal'
assert len(train_1.data) == len(train_2.data), 'Lengths of train datas are not equal'

assert len(train_0.labels) == len(train_1.labels), 'Lengths of train labels are not equal'
assert len(train_1.labels) == len(train_2.labels), 'Lengths of train labels are not equal'

assert len(train_0.data) == len(train_0.labels), 'Lengths of train data and labels do not match'
assert len(train_1.data) == len(train_1.labels), 'Lengths of train data and labels do not match'
assert len(train_2.data) == len(train_2.labels), 'Lengths of train data and labels do not match'

#Collapse into single list
trainData = train_0.data + train_1.data + train_2.data
trainLabels = train_0.labels + train_1.labels + train_2.labels
x_test = test_0.data + test_1.data + test_2.data
y_test = test_0.labels + test_1.labels + test_2.labels

#Shuffle two lists, save the order
trainData, trainLabels = shuffle(trainData, trainLabels)
x_test, y_test = shuffle(x_test, y_test)

#Get validation data
validationToTrainRatio = 0.05
validationSize = int(validationToTrainRatio * len(trainData))
validationData = trainData[:validationSize]
validationLabels = trainLabels[:validationSize]
trainData = trainData[validationSize:]
trainLabels = trainLabels[validationSize:]


print("\nTest set length: %d" % len(x_test))
print("Test labels set length: %d" % len(y_test))
print("\nValidation set length: %d" % len(validationData))
print("Validation labels set length: %d" % len(validationLabels))
print("\nTrain set length: %d" % len(trainData))
print("Train labels set length: %d" % len(trainLabels))


Test set length: 459
Test labels set length: 459

Validation set length: 468
Validation labels set length: 468

Train set length: 8904
Train labels set length: 8904


In [6]:
# Vectorizing train, validation, and test data
trainData = train_0.ngram_vectorize(data=trainData, labels=trainLabels)

validationData = train_0.ngram_vectorize(
    data=validationData,
    vectorizer=train_0.vectorizer, 
    selector=train_0.selector)

x_test = train_0.ngram_vectorize(
    data=x_test,
    vectorizer=train_0.vectorizer, 
    selector=train_0.selector)

In [7]:
# Binary classifier (model) construction method
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    model = keras.Sequential()
    model.add(keras.layers.Dropout(rate=dropout_rate, input_shape=input_shape))
    for _ in range(layers-1):
        model.add(keras.layers.Dense(units=units, activation=tf.nn.relu))
        model.add(keras.layers.Dropout(rate=dropout_rate))

    model.add(keras.layers.Dense(units=16, activation=tf.nn.relu))
    model.add(keras.layers.Dropout(rate=dropout_rate))
    model.add(keras.layers.Dense(units=num_classes, activation=tf.nn.sigmoid))
    return model

In [8]:
#For parameters refer to the upper cell
model = mlp_model(2, 32, 0.3, trainData.shape[1:], 3)
model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 12639)             0         
_________________________________________________________________
dense (Dense)                (None, 32)                404480    
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)      

In [9]:
#Learning rate could be further decreased for additional accuracy
model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [10]:
#callbacks will prevent model from running if val_loss starts to increase
callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

history = model.fit(trainData,
                    trainLabels,
                    epochs=30,
                    callbacks = callbacks,
                    batch_size=1024,
                    validation_data=(validationData, validationLabels),
                    verbose=2)

Train on 8904 samples, validate on 468 samples
Epoch 1/30
 - 4s - loss: 1.0970 - acc: 0.4203 - val_loss: 1.0929 - val_acc: 0.6859
Epoch 2/30
 - 4s - loss: 1.0876 - acc: 0.6016 - val_loss: 1.0775 - val_acc: 0.8077
Epoch 3/30
 - 4s - loss: 1.0695 - acc: 0.6928 - val_loss: 1.0536 - val_acc: 0.8590
Epoch 4/30
 - 4s - loss: 1.0429 - acc: 0.7590 - val_loss: 1.0221 - val_acc: 0.8953
Epoch 5/30
 - 4s - loss: 1.0094 - acc: 0.7899 - val_loss: 0.9826 - val_acc: 0.8953
Epoch 6/30
 - 4s - loss: 0.9693 - acc: 0.8106 - val_loss: 0.9352 - val_acc: 0.8974
Epoch 7/30
 - 4s - loss: 0.9220 - acc: 0.8241 - val_loss: 0.8807 - val_acc: 0.9017
Epoch 8/30
 - 4s - loss: 0.8630 - acc: 0.8402 - val_loss: 0.8184 - val_acc: 0.9081
Epoch 9/30
 - 4s - loss: 0.7995 - acc: 0.8553 - val_loss: 0.7500 - val_acc: 0.9038
Epoch 10/30
 - 4s - loss: 0.7350 - acc: 0.8513 - val_loss: 0.6765 - val_acc: 0.9081
Epoch 11/30
 - 4s - loss: 0.6697 - acc: 0.8583 - val_loss: 0.6034 - val_acc: 0.9103
Epoch 12/30
 - 4s - loss: 0.6081 - acc

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

results = model.predict(x_test)
pred_labels = results.argmax(axis=-1)

accuracy = accuracy_score(y_test, pred_labels)
precision = precision_score(y_test, pred_labels, average='weighted')
recall = recall_score(y_test, pred_labels, average='weighted')
fscore = f1_score(y_test, pred_labels, average='weighted')
print("Accuracy: %.4f" % accuracy)
print("\nPrecision: %.4f\nRecall: %.4f\nF-score: %.4f" % (precision, recall, fscore))

confusion_matrix(y_test, pred_labels)

Accuracy: 0.5948

Precision: 0.5990
Recall: 0.5948
F-score: 0.5949


array([[99, 24, 30],
       [23, 84, 46],
       [42, 21, 90]])