In [1]:
#Partially adopted from Tensorflow/docs/basic_text_classification
#https://github.com/tensorflow/docs/blob/master/site/en/tutorials/keras/basic_text_classification.ipynb

#As well as https://developers.google.com/machine-learning/guides/text-classification/

In [2]:
import tensorflow as tf
from tensorflow import keras
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from imblearn.over_sampling import SMOTE

import numpy as np
import pandas as pd
import random

from ParseList import parseList
from GetSeeAlso import getSeeAlso
from MakeLabels import makeLabels
from PopulateList import populateList

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
stopWords = stopwords.words('english')
from nltk import word_tokenize
stemmer = nltk.stem.SnowballStemmer('english')

[nltk_data] Downloading package punkt to /home/ldeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ldeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
x_xl = []
testLabels = []
LIMIT = 150
with open('Data/NFR.csv') as f:
    for sentence in f:
        cl = sentence[:sentence.find(',')]
        sentence = sentence[sentence.find(',')+1:]
        sentence = sentence.translate(None, '.,-\":;~!@#$%^&?[]{}<>`1234567890\\*()').strip().split(" ")
        sentence = [unicode(i, 'utf-8', errors='ignore') for i in sentence]
        sentence = [stemmer.stem(i) for i in sentence]
        sentence = ' '.join(word for word in sentence if word not in stopWords and len(word) > 1).strip()
        if cl == 'Security':
            x_xl.append(sentence)
            testLabels.append(2)
        elif cl == 'Usability':
            x_xl.append(sentence)
            testLabels.append(1)
        elif cl == 'Operability':
            x_xl.append(sentence)
            testLabels.append(0)

print(len(x_xl))

TypeError: translate() takes exactly one argument (2 given)

In [None]:
securityList = populateList("Data/enwiki-latest-all-titles-in-ns0", "security", 400, False)
usabilityList = populateList("Data/enwiki-latest-all-titles-in-ns0", "usability", 100, False)
otherList = populateList("Data/enwiki-latest-all-titles-in-ns0", "operability", 250, False)
# otherList += getSeeAlso(*otherList)
# usabilityList += getSeeAlso(*usabilityList)

In [None]:
#Parse two wikipedia pages
usability = parseList(*usabilityList)
security = parseList(*securityList)
other = parseList(*otherList)

print "\nUsability length: ", len(usability)
print "Security length: ", len(security)
print "Other length: ", len(other)

#Make entries be same length, pick minimum of two lengths
deleteAfter = min(len(other), len(usability), len(security))

#If min length is less than 100 sentences, test is invalid
if deleteAfter < 100:
    print("!!!TEST IS INVALID!!!")
else:
    del usability[deleteAfter:]
    del security[deleteAfter:]
    del other[deleteAfter:]

l1 = len(other)
l2 = len(usability)
l3 = len(security)
    
#Check that all lengths are equal
if((l1 != l2 or l2 != l3 or l1 != l3) and (l3 != len(set(security)) or l2 != len(set(usability)) or l1 != len(set(other)))):
    print("!!!TEST IS INVALID!!!")
else:
    #Generate labels
    trainLabels = makeLabels(usability, 1)
    trainLabels += makeLabels(security, 2)
    trainLabels += makeLabels(other, 0)

    #Collapse into single list
    trainData = usability + security + other

    #Shuffle two lists, save the order
    trainData, trainLabels = shuffle(trainData, trainLabels)
    #x_xl, testLabels = shuffle(x_xl, testLabels)
    
    sm = SMOTE(random_state=42)
    trainData, trainLabels = sm.fit_resample(trainData, trainLabels)

    #Get validation data
    validationToTrainRatio = 0.05
    validationSize = int(validationToTrainRatio * len(trainData))
    validationData = trainData[:validationSize]
    validationLabels = trainLabels[:validationSize]
    trainData = trainData[validationSize:]
    trainLabels = trainLabels[validationSize:]

    
    #Sanity check
    print "\nTest set length: ", len(x_xl)
    print "Test labels set length: ", len(testLabels)
    print "\nValidation set length: ", len(validationData)
    print "Validation labels set length: ", len(validationLabels)
    print "\nTrain set length: ", len(trainData)
    print "Train labels set length: ", len(trainLabels)

In [None]:
def ngram_vectorize(train_texts, train_labels, val_texts, test_texts):
    
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': (1,1),  # Use 1-grams + 2-grams + 3-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': 'word',  # Split text into word tokens.
            'min_df': 2, #Words that appear less than this value do not contribute
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)
    
    # Vectorize test texts.
    x_test = vectorizer.transform(test_texts)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(20000, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    x_test = selector.transform(x_test).astype('float32')
    return x_train, x_val, x_test

In [None]:
trainData, valData, testData = ngram_vectorize(trainData, trainLabels, validationData, x_xl)

In [None]:
#Constructing the model method
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    model = keras.Sequential()
    model.add(keras.layers.Dropout(rate=dropout_rate, input_shape=input_shape))
    for _ in range(layers-1):
        model.add(keras.layers.Dense(units=units, activation=tf.nn.relu))
        model.add(keras.layers.Dropout(rate=dropout_rate))

    model.add(keras.layers.Dense(units=16, activation=tf.nn.relu))
    model.add(keras.layers.Dropout(rate=dropout_rate))
    model.add(keras.layers.Dense(units=num_classes, activation=tf.nn.sigmoid))
    return model

In [None]:
#For parameters refer to the upper cell
model = mlp_model(2, 32, 0.3, trainData.shape[1:], 3)
model.summary()

In [None]:
#Learning rate could be further decreased for additional accuracy
model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
#callbacks will prevent model from running if val_loss starts to increase
callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

history = model.fit(trainData,
                    trainLabels,
                    epochs=100,
                    callbacks = callbacks,
                    batch_size=1024,
                    validation_data=(valData, validationLabels),
                    verbose=2)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

results = model.predict(testData)
pred_labels = results.argmax(axis=-1)

accuracy = accuracy_score(testLabels, pred_labels)
precision = precision_score(testLabels, pred_labels, average='weighted')
recall = recall_score(testLabels, pred_labels, average='weighted')
fscore = f1_score(testLabels, pred_labels, average='weighted')
print("Accuracy: %.4f" % accuracy)
print("\nPrecision: %.4f\nRecall: %.4f\nF-score: %.4f" % (precision, recall, fscore))

confusion_matrix(testLabels, pred_labels)