# Neural Network Classification of Privacy Policy Data Practices

## August Karlstedt

In [39]:
%matplotlib inline

import os
import imp
import operator
import math
import glob
import json
import time

from IPython.display import IFrame

import nltk
from nltk.tokenize import word_tokenize

import numpy as np
import pandas as pd
import seaborn as sn

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#import pickle
#from six.moves import urllib

import tensorflow

from hyperopt import Trials, STATUS_OK
import hyperopt
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional

from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Dropout
from keras.layers import LSTM, TimeDistributed
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard
from keras import metrics
from keras.utils.np_utils import to_categorical

#import fasttext
# https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

import gensim
# https://nlp.stanford.edu/projects/glove/

### Prepare the Data

Let's set up the categories that we'll try to classify. We'll also need one-hot encodings of them for the network.

From the OPP paper https://www.usableprivacy.org/static/files/swilson_acl_2016.pdf:

1. **First Party Collection/Use**: how and why a service provider collects user information.
2. **Third Party Sharing/Collection**: how user information may be shared with or collected by third parties. 
3. **User Choice/Control**: choices and control options available to users. 
4. **User Access, Edit, & Deletion**: if and how users may access, edit, or delete their information. 
5. **Data Retention**: how long user information is stored. 
6. **Data Security**: how user information is protected. 
7. **Policy Change**: if and how users will be informed about changes to the privacy policy. 
8. **Do Not Track**: if and how Do Not Track signals for online tracking and advertising are honored. 
9. **International & Specific Audiences**: practices that pertain only to a specific group of users (e.g., children, Europeans, or California residents). 
10. **Other**: additional sublabels for introductory or general text, contact information, and practices not covered by the other categories.

In [40]:
def get_annotations():
    """Load every annotation CSV found under ``data/annotations``.

    The files have no header row, so the column names are supplied here and
    only the columns needed downstream are kept.

    Returns:
        A ``(filenames, annotations)`` pair of parallel lists: the base name
        of each CSV without its ``.csv`` extension, and one DataFrame per
        file restricted to the 'Segment ID', 'Category Name' and
        'Attributes/Values' columns.
    """
    header = ['Annotation ID', 'Batch ID', 'Annotator ID', 'Policy ID', 'Segment ID', 'Category Name', 'Attributes/Values', 'Policy URL', 'Date']
    keep_columns = ['Segment ID', 'Category Name', 'Attributes/Values']

    filenames = []
    annotations = []

    # Build the pattern from path components so it works with any path
    # separator, and derive the name from the path itself rather than from a
    # fixed-width slice that silently breaks if the directory is renamed.
    pattern = os.path.join("data", "annotations", "*.csv")

    # glob returns files in arbitrary order; sort so that the policy order
    # (and everything derived from it) is the same on every run.
    for path in sorted(glob.glob(pattern)):
        filenames.append(os.path.splitext(os.path.basename(path))[0])
        annotations.append(pd.read_csv(path, names=header)[keep_columns])

    return filenames, annotations

In [41]:
def get_sanitized_policies(annotation_files):
    """Read the sanitized policy text that belongs to each annotation file.

    Args:
        annotation_files: Base names (no extension); each is looked up as
            ``data/sanitized_policies/<name>.html``.

    Returns:
        One list per input name, in the same order, holding the first line of
        the file split on the ``|||`` separator. An empty file yields
        ``['']``.
    """
    sanitized_policies = []

    for name in annotation_files:
        # Join path components instead of hard-coding a backslash so the
        # lookup also works where '\\' is not a path separator.
        path = os.path.join("data", "sanitized_policies", "{}.html".format(name))
        with open(path) as f:
            # Only the first line is used; readline() avoids loading the rest
            # of the file and returns '' (not an IndexError) when it is empty.
            sanitized_policies.append(f.readline().split('|||'))

    return sanitized_policies

In [42]:
def get_categories():
    """Return the category names together with their one-hot encodings.

    Returns:
        ``(categories, categories_one_hot, category_lookup_table)`` where
        ``categories`` is the list of 11 names, ``categories_one_hot`` is the
        11x11 identity matrix, and ``category_lookup_table`` maps the i-th
        name to the i-th row of that matrix.
    """
    names = [
        'Data Retention',                        # 0
        'Data Security',                         # 1
        'Do Not Track',                          # 2
        'First Party Collection/Use',            # 3
        'International and Specific Audiences',  # 4
        'Other',                                 # 5
        'Policy Change',                         # 6
        'Third Party Sharing/Collection',        # 7
        'User Access, Edit and Deletion',        # 8
        'User Choice/Control',                   # 9
        'None',                                  # 10
    ]

    one_hot = np.identity(len(names))

    # Pair every name with the matrix row at the same position.
    lookup = dict(zip(names, one_hot))

    return names, one_hot, lookup

In [43]:
def get_attributes():
    """Return the attribute names and attribute values known to the notebook.

    Returns:
        ``(attribute_value_types, attribute_value_values)``: a list of 22
        attribute type names and a list of 110 attribute value names. Some
        values appear twice with different capitalization (for example
        'User Profile' and 'User profile'); both spellings are kept.
    """
    type_names = [
        'Access Scope',
        'Access Type',
        'Action First-Party',
        'Action Third Party',
        'Audience Type',
        'Change Type',
        'Choice Scope',
        'Choice Type',
        'Collection Mode',
        'Do Not Track policy',
        'Does/Does Not',
        'Identifiability',
        'Notification Type',
        'Other Type',
        'Personal Information Type',
        'Purpose',
        'Retention Period',
        'Retention Purpose',
        'Security Measure',
        'Third Party Entity',
        'User Choice',
        'User Type',
    ]

    value_names = [
        'Additional service/feature',
        'Advertising',
        'Aggregated or anonymized',
        'Analytics/Research',
        'Basic service/feature',
        'Both',
        'Browser/device privacy controls',
        'Californians',
        'Children',
        'Citizens from other countries',
        'Collect from user on other websites',
        'Collect in mobile app',
        'Collect on first party website/app',
        'Collect on mobile website',
        'Collect on website',
        'Collection',
        'Computer information',
        'Contact',
        'Cookies and tracking elements',
        'Data access limitation',
        'Deactivate account',
        'Delete account (full)',
        'Delete account (partial)',
        'Demographic',
        'Does',
        'Does Not',
        'Dont use service/feature',
        'Edit information',
        'Europeans',
        'Explicit',
        'Export',
        'Financial',
        'First party collection',
        'First party use',
        'First-party privacy controls',
        'General notice in privacy policy',
        'General notice on website',
        'Generic',
        'Generic personal information',
        'Health',
        'Honored',
        'IP address and device IDs',
        'Identifiable',
        'Implicit',
        'In case of merger or acquisition',
        'Indefinitely',
        'Introductory/Generic',
        'Legal requirement',
        'Limited',
        'Location',
        'Marketing',
        'Mentioned, but unclear if honored',
        'Merger/Acquisition',
        'Named third party',
        'No notification',
        'Non-privacy relevant change',
        'None',
        'Not honored',
        'Not mentioned',
        'Opt-in',
        'Opt-out',
        'Opt-out link',
        'Opt-out via contacting company',
        'Other',
        'Other data about user',
        'Other part of company/affiliate',
        'Other users',
        'Perform service',
        'Personal identifier',
        'Personal notice',
        'Personalization/Customization',
        'Practice not covered',
        'Privacy contact information',
        'Privacy relevant change',
        'Privacy review/audit',
        'Privacy training',
        'Privacy/Security program',
        'Profile data',
        'Public',
        'Receive from other parts of company/affiliates',
        'Receive from other service/third-party (named)',
        'Receive from other service/third-party (unnamed)',
        'Receive/Shared with',
        'Secure data storage',
        'Secure data transfer',
        'Secure user authentication',
        'See',
        'Service Operation and Security',
        'Service operation and security',
        'Social media data',
        'Stated Period',
        'Survey data',
        'Third party sharing/collection',
        'Third party use',
        'Third-party privacy controls',
        'Track on first party website/app',
        'Track user on other websites',
        'Transactional data',
        'Unnamed third party',
        'Unspecified',
        'Use',
        'User Profile',
        'User account data',
        'User online activities',
        'User participation',
        'User profile',
        'User with account',
        'User without account',
        'View',
        'not-selected',
    ]

    return type_names, value_names

In [44]:
def get_chosen_categories():
    """Return the names of the categories the classifier is trained on.

    Returns:
        A new list of category names; the last entry, 'None', was added by
        the authors and is not in the original corpus.
    """
    chosen = [
        'First Party Collection/Use',
        'Third Party Sharing/Collection',
        'Other',
        'User Choice/Control',
        'Data Security',
        'International and Specific Audiences',
        'User Access, Edit and Deletion',
        'Policy Change',
        'Data Retention',
        'Do Not Track',
    ]
    chosen.append('None')  # added by us, not in original corpus
    return chosen

In [45]:
def get_text_to_remove():
    """Return the placeholder span texts that should not become documents."""
    placeholders = ['null', 'Not selected']
    return placeholders

In [46]:
def get_annotation_data(annotations, sanitized_policies, category_lookup_table):
    """Turn annotated spans, and the unannotated remainder, into training data.

    Every annotated text span whose category is one of the chosen categories
    becomes one lower-cased, tokenized and lemmatized ``TaggedDocument``
    labelled with that category. When 'None' is a chosen category, the text
    that is left in each annotated segment after cutting out all of its
    annotated spans becomes a further document labelled 'None'.

    Args:
        annotations: One DataFrame per policy with 'Segment ID',
            'Category Name' and 'Attributes/Values' columns; the last column
            holds a JSON object string.
        sanitized_policies: One list of segment strings per policy, in the
            same order as ``annotations``.
        category_lookup_table: Maps a category name to its label vector.

    Returns:
        ``(documents, categories)``: parallel lists of TaggedDocument objects
        and label vectors. Every document carries a unique integer tag equal
        to its position in ``documents``.
    """
    chosen_categories = get_chosen_categories()
    attribute_value_types, _ = get_attributes()
    remove_text = get_text_to_remove()
    required_keys = ('selectedText', 'startIndexInSegment', 'endIndexInSegment')

    #stemmer = nltk.stem.porter.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    documents = []
    categories = []

    # remove_spans[policy index][segment id] -> list of (start, end) character
    # ranges of that segment that carry a category annotation, e.g.
    # {2: {6: [(20, 30), (30, 50)], 8: [(40, 123)]}}
    remove_spans = {}

    # Kept as a nested function (not a module-level helper) so this function
    # stays self-contained when its source is copied elsewhere.
    def add_document(lowercase_text, category):
        tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(lowercase_text)]
        # Tag with the document's position so every tag is unique. The row
        # loop index must not be reused here: it restarts for every row and
        # would hand the same tag to unrelated documents.
        tag = len(documents)
        documents.append(gensim.models.doc2vec.TaggedDocument(tokens, [tag]))
        categories.append(category_lookup_table[category])

    ##  first process the annotations
    for policy_idx, annotation in enumerate(annotations):
        rows = zip(annotation['Category Name'],
                   annotation['Segment ID'],
                   annotation['Attributes/Values'])

        for category, segment_id, raw_attributes in rows:
            if category not in chosen_categories:
                continue

            # Register the segment even if none of its spans turn out to be
            # usable, so the whole segment can still feed the 'None' category.
            segment_spans = remove_spans.setdefault(policy_idx, {}).setdefault(segment_id, [])

            parsed = json.loads(raw_attributes)
            for value in attribute_value_types:
                if value not in parsed:
                    continue

                attributes = parsed[value]
                if any(key not in attributes for key in required_keys):
                    continue

                text = attributes['selectedText']
                if text in remove_text:
                    continue

                start_idx = attributes['startIndexInSegment']
                end_idx = attributes['endIndexInSegment']
                # -1 marks a span without a position in the segment.
                if start_idx == -1 or end_idx == -1:
                    continue

                segment_spans.append((start_idx, end_idx))
                add_document(text.lower(), category)

    ## now process the remove spans from the policies
    if 'None' in chosen_categories:
        replace_items = ["<br>", "<strong>", "</strong>", "<ul>", "</ul>", "<li>", "</li>", "<ol>", "</ol>"]

        for policy_idx, segments in remove_spans.items():
            policy = sanitized_policies[policy_idx]

            for segment_idx, spans in segments.items():
                try:
                    policy_segment = policy[segment_idx]
                except IndexError:
                    # The annotation refers to a segment the policy text does
                    # not have; nothing to extract for it.
                    continue

                # All span indices refer to the ORIGINAL segment text, so the
                # spans cannot be cut out one after another (each cut would
                # shift the text under the remaining indices). Walk the spans
                # in order instead and keep the text between them.
                kept_pieces = []
                cursor = 0
                for start_idx, end_idx in sorted(spans):
                    if start_idx > cursor:
                        kept_pieces.append(policy_segment[cursor:start_idx])
                    # max() also covers overlapping and malformed spans.
                    cursor = max(cursor, start_idx, end_idx)
                kept_pieces.append(policy_segment[cursor:])

                # A space stands in for every removed span so that the words
                # on either side of it do not get glued together.
                segment_text = " ".join(kept_pieces).lower()

                for item in replace_items:
                    segment_text = segment_text.replace(item, " ")

                segment_text = segment_text.strip()

                if not segment_text: # check if we have any characters at all
                    continue

                add_document(segment_text, 'None')

    return documents, categories

In [47]:
def train_embeddings(documents, epochs=16):
    """Train a Doc2Vec model with 100-dimensional vectors on ``documents``.

    Args:
        documents: The tagged documents used both to build the vocabulary
            and to train the model.
        epochs: Number of training epochs passed to ``train``.

    Returns:
        The trained ``gensim.models.Doc2Vec`` instance.
    """
    doc2vec = gensim.models.Doc2Vec(vector_size=100)

    doc2vec.build_vocab(documents)
    doc2vec.train(
        documents,
        total_examples=len(documents),
        epochs=epochs,
    )

    return doc2vec

In [48]:
def get_vector(model, text):
    """Infer the embedding vector of ``text`` with ``model``.

    The model's ``random`` attribute is replaced with a freshly seeded
    ``RandomState`` before every call so that inference is deterministic.
    """
    fixed_seed = 1234
    model.random = np.random.RandomState(fixed_seed)

    tokens = word_tokenize(text)
    return model.infer_vector(tokens)

In [49]:
def get_data():
    """Run the whole preprocessing pipeline and return vectors with labels.

    Loads the annotations and policy texts, builds the tagged documents,
    trains the embedding model on them and then infers one vector per
    document.

    Returns:
        ``(text_span_vectors, categories)``: a list with one inferred vector
        per document and the parallel list of label vectors.
    """
    filenames, annotations = get_annotations()
    policies = get_sanitized_policies(filenames)
    _, _, lookup_table = get_categories()

    documents, labels = get_annotation_data(annotations, policies, lookup_table)
    embedding_model = train_embeddings(documents)

    # documents and labels are parallel lists, so walking them together
    # visits exactly one document per label.
    text_span_vectors = [
        get_vector(embedding_model, ' '.join(document.words))
        for document, _ in zip(documents, labels)
    ]

    return text_span_vectors, labels

Since these Doc2Vec vectors are our only input data right now, let's just use them directly as our input data.

### Hyperparameter tuning setup

In [50]:
def data():
    # Builds the train/test split used by the hyperparameter search:
    # 25% of the examples, picked at random, are held out for testing.
    # NOTE(review): plain comments are used instead of a docstring, and the
    # final line is left exactly as it was, because hyperas appears to
    # re-parse this function's source text - confirm before restructuring.
    text_span_vectors, categories = get_data()

    text_span_vectors = np.array(text_span_vectors)
    categories = np.array(categories)

    # A random ordering of all example indices (no seed is set, so the split
    # differs on every call). Not named `choice`, which is also the name of
    # the hyperas distribution imported at the top of the notebook.
    shuffled_indices = np.random.choice(len(text_span_vectors), len(text_span_vectors), replace=False)
    test_percentage = 0.25 # keep 25% of data for testing
    test_amount = math.floor(test_percentage * len(text_span_vectors))
    train_indices = shuffled_indices[test_amount:]
    test_indices = shuffled_indices[:test_amount]

    x_train = text_span_vectors[train_indices]
    x_test = text_span_vectors[test_indices]
    y_train = categories[train_indices]
    y_test = categories[test_indices]

    return x_train, y_train, x_test, y_test

### Neural Network Setup

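For now, we'll use a simple fully connected neural network. The list below describes a fixed baseline configuration; the `create_model` cell that follows instead builds a single hidden layer whose width, activation, optional dropout and optimizer are picked by the hyperparameter search, followed by the same 11-node softmax output: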

1. **Fully connected** layer with 256 nodes, relu activation
2. **Dropout** 25% of the inputs
3. **Fully connected** layer with 256 nodes, relu activation
4. **Dropout** 25% of the inputs
5. **Fully connected** layer with 11 nodes, softmax activation

In [55]:
def create_model(x_train, y_train, x_test, y_test):
    # Builds and trains one candidate network and returns a result dict with
    # the keys 'loss', 'status' and 'model'.
    # The double-brace expressions are placeholders that hyperas fills in
    # with a sampled value for each trial (NOTE(review): relies on hyperas'
    # template handling - keep them on their current lines).

    nn_model = Sequential()
    # The input width of 100 matches vector_size=100 in train_embeddings.
    nn_model.add(Dense( {{choice([32, 64, 128, 256, 512, 1024])}}, batch_input_shape=(None, 100, )))
    nn_model.add(Activation( {{choice(['relu', 'tanh', 'sigmoid'])}} ))
    
    # NOTE(review): the dropout rate is sampled from the whole 0..1 range, so
    # rates close to 1 are possible - confirm this is intended.
    if conditional( {{choice(['dropout', 'no dropout'])}} ) == 'dropout':
        nn_model.add(Dropout( {{uniform(0, 1)}} ))
    
    # One output node per entry of the category list in get_categories (11).
    nn_model.add(Dense(11))
    nn_model.add(Activation('softmax'))
    
    nn_model.compile(loss='categorical_crossentropy', optimizer={{choice(['sgd', 'rmsprop', 'adagrad', 'adam', 'nadam'])}}, metrics=[metrics.categorical_accuracy])

    #print(nn_model.summary())

    # Each run logs to its own timestamped directory under C:/tmp.
    tensorboard_callback = TensorBoard(log_dir='C:/tmp/pp_run-'+time.strftime("%Y-%m-%d-%H%M%S"))
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1, mode='auto')
    # NOTE(review): the test split doubles as validation data here, so early
    # stopping and the reported accuracy both see the same held-out examples.
    history = nn_model.fit(x_train, y_train, batch_size=128, epochs=64, verbose=2, validation_data=(x_test, y_test), callbacks=[early_stopping, tensorboard_callback])
    # Accuracy of the last epoch that ran, not necessarily the best epoch.
    acc = history.history['val_categorical_accuracy'][-1]
    print('Test accuracy:', acc)
    
    # The accuracy is negated because the search minimizes 'loss'.
    return {'loss': -acc, 'status': STATUS_OK, 'model': nn_model}

In [56]:
# Embed the page served on local port 6006 in the notebook
# (presumably the TensorBoard UI for the runs logged above - confirm).
IFrame(src='http://localhost:6006', width='100%', height=800)

In [58]:
# Helper functions handed to hyperas alongside the model and data functions.
functions = [
    get_data,
    get_vector,
    train_embeddings,
    get_annotation_data,
    get_text_to_remove,
    get_chosen_categories,
    get_attributes,
    get_categories,
    get_sanitized_policies,
    get_annotations,
]

# Random search over the hyperparameter templates in create_model, 4 trials.
best_run, best_model = optim.minimize(
    model=create_model,
    data=data,
    functions=functions,
    algo=hyperopt.rand.suggest,
    max_evals=4,
    trials=Trials(),
    notebook_name='Privacy Policies and Neural Networks',
)

# NOTE(review): data() draws a new random split (and rebuilds the embeddings)
# on every call, so this test set is not the one the search validated on and
# may contain examples the best model was trained on.
X_train, Y_train, X_test, Y_test = data()

print("Evalutation of best performing model:")
print(best_model.evaluate(X_test, Y_test))

print("Best performing model chosen hyper-parameters:")
print(best_run)

>>> Imports:
#coding=utf-8

try:
    import os
except:
    pass

try:
    import imp
except:
    pass

try:
    import operator
except:
    pass

try:
    import math
except:
    pass

try:
    import glob
except:
    pass

try:
    import json
except:
    pass

try:
    import time
except:
    pass

try:
    from IPython.display import IFrame
except:
    pass

try:
    import nltk
except:
    pass

try:
    from nltk.tokenize import word_tokenize
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import seaborn as sn
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
except:
    pass

try:
    import tensorflow
except:
    pass

try:
    from hyperopt import Trials, STATUS_OK
except:
    pass

try:
    import hyperopt
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distribution

Train on 59753 samples, validate on 19917 samples
Epoch 1/64
 - 2s - loss: 1.5378 - categorical_accuracy: 0.4481 - val_loss: 1.3693 - val_categorical_accuracy: 0.4690
Epoch 2/64
 - 1s - loss: 1.3887 - categorical_accuracy: 0.4732 - val_loss: 1.3432 - val_categorical_accuracy: 0.4863
Epoch 3/64
 - 2s - loss: 1.3644 - categorical_accuracy: 0.4831 - val_loss: 1.3310 - val_categorical_accuracy: 0.4916
Epoch 4/64
 - 2s - loss: 1.3518 - categorical_accuracy: 0.4886 - val_loss: 1.3204 - val_categorical_accuracy: 0.4992
Epoch 5/64
 - 2s - loss: 1.3395 - categorical_accuracy: 0.4956 - val_loss: 1.3124 - val_categorical_accuracy: 0.5024
Epoch 00005: early stopping
Test accuracy: 0.5024351056691555
Train on 59753 samples, validate on 19917 samples
Epoch 1/64
 - 2s - loss: 1.5440 - categorical_accuracy: 0.4486 - val_loss: 1.3627 - val_categorical_accuracy: 0.4702
Epoch 2/64
 - 2s - loss: 1.4184 - categorical_accuracy: 0.4704 - val_loss: 1.3498 - val_categorical_accuracy: 0.4728
Epoch 3/64
 - 2s - 

KeyboardInterrupt: 

### Testing

Let's generate a confusion matrix to see the accuracy of our model.

First, just to get some stats on each category, the number of items in each:

In [None]:
# Count how many rows of df belong to each category, reporting as we go.
category_lengths = {}

for category in categories:
    category_lengths[category] = len(df[df['category'] == category])
    print(category_lengths[category], 'examples in the', category, 'category')

To generate our confusion matrix, we need the indices of the examples that belong to each category.

In [None]:
# One column per category; rows (one per true category) are added later.
confusion_matrix_columns = categories

# Start from an empty frame that only has the column labels.
confusion_matrix = pd.DataFrame([], columns=confusion_matrix_columns)
confusion_matrix

In [None]:
# Model outputs for every input row.
# NOTE(review): nn_model and X_in are not defined in the cells shown above -
# confirm where they come from before running this cell.
predictions = nn_model.predict(X_in)

In [None]:
# Fill the confusion matrix one true category (row) at a time: count which
# category the model predicted for each of its examples, then turn the counts
# into fractions of that category's examples.
for category in categories:
    row = pd.DataFrame([np.zeros(len(categories), dtype=np.int64)], columns=confusion_matrix_columns, index=[category])

    examples = df.loc[df['category'] == category]
    indices = examples.index

    # NOTE(review): uses the index labels of df as positions in predictions;
    # this assumes df has a default 0..n-1 index - confirm.
    pred = predictions[indices]

    for probabilities in pred:
        # Take the most probable class directly. Rounding first would turn
        # every prediction without a probability of at least 0.5 into all
        # zeros, which argmax then reports as class 0.
        predicted_category = categories[np.argmax(probabilities)]
        row[predicted_category] += 1

    if category_lengths[category] > 0:
        row = row / category_lengths[category]

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    confusion_matrix = pd.concat([confusion_matrix, row])

In [None]:
# Display the filled-in confusion matrix.
confusion_matrix

In [None]:
# Let pandas pick a better dtype for any object-typed columns.
# (Presumably needed so the heatmap below receives numeric data - confirm.)
confusion_matrix = confusion_matrix.infer_objects()

In [None]:
# Draw the confusion matrix as an annotated heatmap on a 12x10 figure.
figure_size = (12, 10)
plt.figure(figsize=figure_size)
sn.heatmap(data=confusion_matrix, annot=True)