In [1]:
#Partially adapted from Tensorflow/docs/basic_text_classification
#https://github.com/tensorflow/docs/blob/master/site/en/tutorials/keras/basic_text_classification.ipynb

#As well as https://developers.google.com/machine-learning/guides/text-classification/

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.utils import to_categorical
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

import numpy as np
import pandas as pd
import random

import nltk
# Make sure the 'punkt' NLTK data package is available locally.
nltk.download('punkt')
from nltk.corpus import stopwords
# Make sure the stop-word corpus used on the next line is available locally.
nltk.download('stopwords')
# English stop words; the preprocessing cells drop every token found in this list.
stopWords = stopwords.words('english')
# NOTE(review): word_tokenize is imported but not used in the visible cells.
from nltk import word_tokenize
# Single English Snowball stemmer shared by all preprocessing code below.
stemmer = nltk.stem.SnowballStemmer('english')

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#MAPPING SENTENCES TO LIBRARY
#Builds the test set from Consolidated_data.txt: every line is reduced to a
#stemmed, stop-word-free sentence and labelled 1 (class contains "security")
#or 0 (anything else), keeping at most LIMIT unique sentences per label.
#NOTE(review): this cell is Python 2 only -- the two-argument str.translate
#call and the unicode() builtin do not exist in Python 3.
#NOTE(review): json is imported here but not used in this cell.
import json

#Labels of the accepted sentences, index-aligned with `used` (1 = security, 0 = not security)
testLabels = []

#Every accepted sentence in acceptance order; also serves as the de-duplication list
used = []

#Accepted sentences per label; their lengths enforce the per-label LIMIT
security = []
nosecurity = []

#Per-label cap. According to the original author this is the smallest number of
#unique non-functional requirements of a single class.
LIMIT = 289

with open("Consolidated_data.txt","r") as f:
    
    #Go line by line in original data file
    for line in f:
        #find where class begins (the marker "class":" is 9 characters long, hence +9 below)
        front = line.find("\"class\":\"")
        #find where class ends (where sentence begins)
        end = line.find("\",\"sentence\":\"")
        #substring line based on front and end above
        #NOTE(review): find() returns -1 when a marker is missing, in which case this slice
        #silently yields unrelated text instead of failing
        reqClass = (line[(front+9):end]).lower()

        #sentence (the marker "sentence":" is 12 characters long, hence +12 below)
        temp = line.find("\"sentence\":\"")
        #Cut out the sentence part; the last four characters of the line are dropped
        #(presumably the closing quote/brace and line ending -- confirm against the data file)
        sentence = (line[(temp+12):-4]).lower()
        #Delete the listed symbols and digits, trim surrounding whitespace, split into words on single spaces
        sentence = sentence.translate(None, '.,-\":;~!@#$%^&?[]{}<>`1234567890\\*()').strip().split(" ")
        #Unicode each word because stemmer likes it that way
        sentence = [unicode(i, 'utf-8') for i in sentence]
        #Stem each word separately
        sentence = [stemmer.stem(i) for i in sentence]
        #Keep only stems that are not stop words and are longer than one character, then re-join.
        #Note the stop-word test runs on the stemmed form of each word.
        sentence = ' '.join(word for word in sentence if word not in stopWords and len(word) > 1)
        
        #Skip sentences that were already accepted (exact match after preprocessing)
        if sentence in used:
            continue
        else:
            #Accept the sentence only while its label still has room; otherwise it is dropped
            if "security" in reqClass and (len(security) < LIMIT):
                used.append(sentence)
                security.append(sentence)
                testLabels.append(1)
            elif "security" not in reqClass and (len(nosecurity) < LIMIT):
                used.append(sentence)
                nosecurity.append(sentence)
                testLabels.append(0)

In [4]:
#Must all be balanced for valid results.
#Trailing comments give the counts from the recorded run (LIMIT = 289 per label).
print len(security) #289
print len(nosecurity) #289
print len(used) #578

289
289
578


In [5]:
import csv
import requests

# One shared requests session, used by parseList for every API call.
S = requests.Session()
# MediaWiki API endpoint of the English Wikipedia, queried by parseList.
URL = "https://en.wikipedia.org/w/api.php"

def parseList(*titleList):
    """Download Wikipedia pages and turn their leading text into training sentences.

    For every title the page wikitext is requested through the MediaWiki
    ``parse`` action. Only the text before the first ``|-`` marker is kept.
    Symbols and digits are stripped, overly long tokens are removed, and the
    text is split into sentences on ``.``. Each sentence is stemmed and
    filtered against the stop-word list, mirroring the preprocessing applied
    to the test data.

    Args:
        *titleList: Wikipedia page titles to fetch.

    Returns:
        list: one space-joined string of stems per accepted sentence, in the
        order the titles were given. Titles the API cannot parse are skipped.
    """
    train = []
    # Characters deleted from the text. '.' is deliberately absent because it
    # is used further down to split the paragraph into sentences.
    b = ",-\":;~!@#$%^&?[]{}<>`1234567890\\*()\'/|=\n"

    for title in titleList:
        PARAMS = {
            'action': "parse",
            'page': title,
            'prop': 'wikitext',
            'format': "json"
        }

        res = S.get(url=URL, params=PARAMS)
        data = res.json()

        # For missing or invalid titles the API answers with an "error" object
        # and no "parse" key; skip those pages instead of dying on a KeyError.
        if 'parse' not in data:
            print("%s  could not be parsed, skipping" % title)
            continue

        wikitext = data['parse']['wikitext']['*']
        lines = wikitext.split('|-')
        paragraph = lines[0]

        #remove all unnecessary characters (defined in string b above)
        for char in b:
            paragraph = paragraph.replace(char, "")

        #if wordcount < 50, paragraph will be disregarded
        if (len(paragraph.split(" ")) < 50):
            continue

        #remove tokens longer than 16 characters (every occurrence of the token text is deleted);
        #short tokens are filtered later, when the sentence is re-joined
        for key in paragraph.split(" "):
            if len(key) > 16:
                paragraph = paragraph.replace(key, "")

        #the paragraph is split into sentences on '.'
        for sentence in paragraph.split("."):
            sentence = sentence.strip().split(" ")
            #disregard sentence if it has less than 10 words or starts with ref
            if sentence[0] == 'ref' or len(sentence) < 10:
                continue
            #Stem each word separately
            sentence = [stemmer.stem(i) for i in sentence]
            #Keep stems that are not stop words and are longer than two characters, then join
            sentence = ' '.join(word for word in sentence if word not in stopWords and len(word) > 2)
            train.append(sentence)
    return train

In [6]:
def makeLabels(trainData, labelNumber):
    """Return a list that repeats ``labelNumber`` once per element of ``trainData``.

    Args:
        trainData: any sized collection; only its length is used.
        labelNumber: the label value to repeat.

    Returns:
        list: ``len(trainData)`` copies of ``labelNumber``.
    """
    #one label per training item, all carrying the same value
    labels = [labelNumber] * len(trainData)
    print("Labels made successfully")
    return labels

In [7]:
def populateList(filename, keyword, limit=10000):
    """Pick titles that mention ``keyword`` plus an equally sized random control group.

    Args:
        filename: path of a text file holding one title per line.
        keyword: substring to look for; matching ignores case.
        limit: maximum number of titles collected for the keyword group.

    Returns:
        tuple: ``(classList, notClassList)``. ``classList`` holds the first
        ``limit`` stripped lines (in file order) that contain the keyword.
        ``notClassList`` holds randomly drawn lines that do not contain it,
        each file line drawn at most once. It has the same length as
        ``classList`` unless the file offers fewer non-matching lines.
    """
    # Lines are lowered before comparison, so the keyword has to be lowered too.
    keyword = keyword.lower()
    classList = []
    candidates = []
    with open(filename, "r") as f:
        for line in f:
            if keyword in line.lower():
                #keep only the first `limit` matches; later matches are ignored
                if len(classList) < limit:
                    classList.append(line.strip())
            else:
                candidates.append(line.strip())

    # Draw without replacement. Capping the sample size guarantees termination
    # even when the file has fewer non-matching lines than matching ones.
    sampleSize = min(len(classList), len(candidates))
    notClassList = random.sample(candidates, sampleSize)

    print("Article names in the class list: %d" % len(classList))
    print("Article names in the not class list: %d" % len(notClassList))
    return classList, notClassList

#Pick up to 10 titles containing "security" and as many random titles without it.
#The input file is presumably the English Wikipedia all-titles dump -- confirm its origin.
securityList, notSecurityList = populateList("enwiki-latest-all-titles-in-ns0", "security", 10)

In [9]:
#Parse two wikipedia lists
#NOTE(review): `security` is rebound here; the list of the same name built while reading
#Consolidated_data.txt is no longer reachable afterwards (the test sentences live on in `used`).
security = parseList(*securityList)
notSecurity = parseList(*notSecurityList)
print("\nSecurity length:  %d" % len(security))
print("Not Security length:  %d" % len(notSecurity))

#Make entries be same length, pick minimum of two lengths
deleteAfter = min(len(security), len(notSecurity))

#If min length is less than 100 sentences, test is invalid
tooSmall = deleteAfter < 100
if tooSmall:
    print("!!!TEST IS INVALID!!!")
else:
    del security[deleteAfter:]
    del notSecurity[deleteAfter:]

#The data is unusable when the classes are unbalanced OR when either class contains
#duplicate sentences; either condition alone is enough to invalidate the run.
unbalanced = len(security) != len(notSecurity)
hasDuplicates = (len(security) != len(set(security))
                 or len(notSecurity) != len(set(notSecurity)))

if unbalanced or hasDuplicates:
    print("!!!TEST IS INVALID!!!")
elif not tooSmall:
    #Generate labels
    trainLabels = makeLabels(security, 1)
    trainLabels += makeLabels(notSecurity, 0)

    #Collapse into single list
    trainData = security + notSecurity

    #Shuffle each data/label pair together so sentences stay aligned with their labels
    trainData, trainLabels = shuffle(trainData, trainLabels)
    used, testLabels = shuffle(used, testLabels)

    #Split off the first 20% of the shuffled training data as validation data
    validationToTrainRatio = 0.2
    validationSize = int(validationToTrainRatio * len(trainData))
    validationData = trainData[:validationSize]
    validationLabels = trainLabels[:validationSize]
    trainData = trainData[validationSize:]
    trainLabels = trainLabels[validationSize:]

    #Sanity check: every data list must be as long as its label list
    print("\nTest set length:  %d" % len(used))
    print("Test labels set length:  %d" % len(testLabels))
    print("\nValidation set length:  %d" % len(validationData))
    print("Validation labels set length:  %d" % len(validationLabels))
    print("\nTrain set length:  %d" % len(trainData))
    print("Train labels set length:  %d" % len(trainLabels))

102d_Security_Forces_Squadron  parsed successfully
11th_Public_Security_Division_(People's_Republic_of_China)  parsed successfully
12th_Public_Security_Division_(People's_Republic_of_China)  parsed successfully
1966_United_Nations_Security_Council_election  parsed successfully
1967_United_Nations_Security_Council_election  parsed successfully
1968_United_Nations_Security_Council_election  parsed successfully
1969_United_Nations_Security_Council_election  parsed successfully
Mozaffarabad,_Bardaskan  parsed successfully
Acacia_tetanophylla  parsed successfully
Gayle_Ruzicka  parsed successfully

Security length:  69
Not Security length:  24
!!!TEST IS INVALID!!!
!!!TEST IS INVALID!!!


In [10]:
def ngram_vectorize(train_texts, train_labels, val_texts, test_texts, top_k=20000):
    """Turn raw texts into tf-idf n-gram features and keep the most informative ones.

    The vocabulary and the feature selector are fitted on the training texts
    only. Validation and test texts are transformed with the fitted objects.

    Args:
        train_texts: iterable of training documents (strings).
        train_labels: labels aligned with ``train_texts``; used to score features.
        val_texts: iterable of validation documents.
        test_texts: iterable of test documents.
        top_k: upper bound on the number of features kept. Defaults to 20000.

    Returns:
        tuple: ``(x_train, x_val, x_test)`` feature matrices cast to float32,
        restricted to the selected features.
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
        'ngram_range': (1, 3),  # Use 1-grams + 2-grams + 3-grams.
        # tf-idf weights are fractional. An integer dtype is rejected by
        # scikit-learn (it warns and falls back to float64), so request
        # float64 explicitly.
        'dtype': np.float64,
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': 'word',  # Split text into word tokens.
        'min_df': 2,  # Drop n-grams found in fewer than 2 training documents.
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)

    # Vectorize test texts.
    x_test = vectorizer.transform(test_texts)

    # Select top 'k' of the vectorized features, scored on the training data only.
    selector = SelectKBest(f_classif, k=min(top_k, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    x_test = selector.transform(x_test).astype('float32')
    return x_train, x_val, x_test

In [11]:
#Vectorize the train, validation and test texts (`used` holds the test sentences).
#NOTE(review): trainData is rebound from raw texts to a feature matrix, so this cell
#cannot be re-run without first re-running the cell that builds trainData.
trainData, valData, testData = ngram_vectorize(trainData, trainLabels, validationData, used)

NameError: name 'trainData' is not defined

In [None]:
#Constructing the model method
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    model = keras.Sequential()
    model.add(keras.layers.Dropout(rate=dropout_rate, input_shape=input_shape))
    for _ in range(layers-1):
        model.add(keras.layers.Dense(units=units, activation=tf.nn.relu))
        model.add(keras.layers.Dropout(rate=dropout_rate))

    model.add(keras.layers.Dense(units=num_classes, activation=tf.nn.sigmoid))
    return model

In [None]:
#For parameters refer to the upper cell
model = mlp_model(2, 32, 0.3, trainData.shape[1:], 1)
model.summary()

In [None]:
#Learning rate could be further decreased for additional accuracy
model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
#callbacks will prevent model from running if val_loss starts to increase
callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

history = model.fit(trainData,
                    trainLabels,
                    epochs=100,
                    callbacks = callbacks,
                    batch_size=256,
                    validation_data=(valData, validationLabels),
                    verbose=2)

In [None]:
#Evalueating model on the testset
#[loss, accuracy]
#Evaluating the model on the test set
#Printed as [loss, accuracy]
print(model.evaluate(testData, testLabels))

In [None]:
#The rest of the notebook helps vidualize losses and accuracies by ploting two separate grophs
#The rest of the notebook helps visualize losses and accuracies by plotting two separate graphs
#history_dict maps each metric name to its list of per-epoch values
history_dict = history.history
history_dict.keys()

In [None]:
#Per-epoch metrics recorded by model.fit
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

#1-based epoch numbers for the x axis, one per recorded epoch
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.clf() #clear figure
#Per-epoch accuracies recorded during training
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']

#1-based epoch numbers for the x axis, derived from the values plotted in this cell
epochs = range(1, len(acc_values) + 1)

#Plot the values read above rather than names left over from the loss-plot cell
plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()