# Neural Network Classification of Privacy Policy Data Practices

## August Karlstedt

In [1]:
%matplotlib inline

import os
import imp
import operator
import math
import glob
import json
import time

from IPython.display import IFrame

import nltk
from nltk.tokenize import word_tokenize

import numpy as np
import pandas as pd
import seaborn as sn

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#import pickle
#from six.moves import urllib

import tensorflow

from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional

from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Dropout
from keras.layers import LSTM, TimeDistributed
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras import metrics
from keras.utils.np_utils import to_categorical

#import fasttext
# https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

import gensim
# https://nlp.stanford.edu/projects/glove/

Using TensorFlow backend.


### Prepare the Data

Let's set up the categories that we'll try to classify. We'll also need one-hot encodings of those categories for the network.

From the OPP paper https://www.usableprivacy.org/static/files/swilson_acl_2016.pdf:

1. **First Party Collection/Use**: how and why a service provider collects user information.
2. **Third Party Sharing/Collection**: how user information may be shared with or collected by third parties. 
3. **User Choice/Control**: choices and control options available to users. 
4. **User Access, Edit, & Deletion**: if and how users may access, edit, or delete their information. 
5. **Data Retention**: how long user information is stored. 
6. **Data Security**: how user information is protected. 
7. **Policy Change**: if and how users will be informed about changes to the privacy policy. 
8. **Do Not Track**: if and how Do Not Track signals for online tracking and advertising are honored. 
9. **International & Specific Audiences**: practices that pertain only to a specific group of users (e.g., children, Europeans, or California residents). 
10. **Other**: additional sublabels for introductory or general text, contact information, and practices not covered by the other categories.

In [2]:
def get_data():
    """Load the annotated privacy-policy corpus and embed each text with Doc2Vec.

    Reads every annotation CSV under ``data/annotations`` together with the
    matching ``|||``-separated policy file under ``data/sanitized_policies``.
    Each annotated span becomes one document labelled with its category.
    The text left in a segment once all of its annotated spans are cut out
    becomes one document labelled ``'None'``. A Doc2Vec model is trained on
    all documents and then used to infer one vector per document.

    Returns:
        tuple: ``(vecs, cats)``. ``vecs`` is an array holding one inferred
        Doc2Vec vector per document; ``cats`` is the array of the matching
        one-hot category rows (one column per entry of ``categories``).
    """
    # NOTE(review): the helpers below are nested on purpose. The notebook hands
    # only this function to hyperas (functions=[get_data]); presumably hyperas
    # copies just this function's source, so module-level helpers would be lost.

    header = ['Annotation ID', 'Batch ID', 'Annotator ID', 'Policy ID', 'Segment ID', 'Category Name', 'Attributes/Values', 'Policy URL', 'Date']
    keep_columns = ['Segment ID', 'Category Name', 'Attributes/Values']

    # One annotation frame per policy; `files` keeps the bare file names so the
    # matching sanitized policy can be opened below.
    files = []
    data = []
    for annotation_path in glob.glob(os.path.join("data", "annotations", "*.csv")):
        files.append(os.path.splitext(os.path.basename(annotation_path))[0])
        data.append(pd.read_csv(annotation_path, names=header)[keep_columns])

    policies = []
    for name in files:
        policy_path = os.path.join("data", "sanitized_policies", "{}.html".format(name))
        with open(policy_path) as f:
            # NOTE(review): only the first line of the file is split into
            # segments. If a policy file spans several lines, later segments
            # are missing and show up as "list index out of range" below.
            policies.append(f.readlines()[0].split('|||'))

    categories = [
        'Data Retention',  # 0
        'Data Security',  # 1
        'Do Not Track',  # 2
        'First Party Collection/Use',  # 3
        'International and Specific Audiences',  # 4
        'Other',  # 5
        'Policy Change',  # 6
        'Third Party Sharing/Collection',  # 7
        'User Access, Edit and Deletion',  # 8
        'User Choice/Control',  # 9
        'None',  # 10
    ]

    # Row i of the identity matrix is the one-hot encoding of categories[i].
    one_hot_categories = np.identity(len(categories))
    cat_dict = dict(zip(categories, one_hot_categories))

    # Attribute names that may appear as keys of the 'Attributes/Values' JSON.
    # The list was collected once from the corpus and is hard-coded so the
    # iteration order (and therefore the document order) is stable.
    attribute_value_types = ['Access Scope',
     'Access Type',
     'Action First-Party',
     'Action Third Party',
     'Audience Type',
     'Change Type',
     'Choice Scope',
     'Choice Type',
     'Collection Mode',
     'Do Not Track policy',
     'Does/Does Not',
     'Identifiability',
     'Notification Type',
     'Other Type',
     'Personal Information Type',
     'Purpose',
     'Retention Period',
     'Retention Purpose',
     'Security Measure',
     'Third Party Entity',
     'User Choice',
     'User Type']

    # NOTE(review): `size` is presumably the older gensim keyword for the vector
    # dimensionality (the network input is 100 wide) - confirm against the
    # installed gensim version.
    model = gensim.models.Doc2Vec(size=100)

    lemmatizer = nltk.stem.WordNetLemmatizer()

    chosen_categories = ['First Party Collection/Use',
                         'Third Party Sharing/Collection',
                         'Other',
                         'User Choice/Control',
                         'Data Security',
                         'International and Specific Audiences',
                         'User Access, Edit and Deletion',
                         'Policy Change',
                         'Data Retention',
                         'Do Not Track',
                         'None' # added by us, not in original corpus
                        ]
    remove_text = ['null', 'Not selected']
    span_keys = ('selectedText', 'startIndexInSegment', 'endIndexInSegment')

    df_columns = ['text', 'category', 'category one hot', 'text vec']
    series = []
    documents = []
    cats = []

    # remove_spans maps policy index -> segment id -> list of (start, end)
    # character spans that carry a category annotation, e.g.
    # {2: {6: [(20, 30), (30, 50)], 8: [(40, 123)]}}
    remove_spans = {}

    def tokenize(text):
        """Lower-case, tokenize and lemmatize *text*."""
        return [lemmatizer.lemmatize(word) for word in word_tokenize(text.lower())]

    def add_document(tokens, category):
        """Register one tokenized document under *category*."""
        # len(documents) is a running counter, so every document gets its own
        # tag; reusing the row-loop variable made different documents share tags.
        tag = len(documents)
        documents.append(gensim.models.doc2vec.TaggedDocument(tokens, [tag]))
        cats.append(cat_dict[category])
        series.append(pd.Series([' '.join(tokens), category, cat_dict[category], None], index=df_columns))

    def strip_spans(segment, spans):
        """Return *segment* with every (start, end) span replaced by a space."""
        # Several annotators mark the same segment, so spans overlap and repeat.
        # Merge them first, then cut from right to left so that earlier offsets
        # stay valid while the string shrinks.
        merged = []
        for start, end in sorted(span for span in spans if span[1] > span[0]):
            if merged and start <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])
        for start, end in reversed(merged):
            segment = segment[:start] + " " + segment[end:]
        return segment

    for datum_idx, datum in enumerate(data):
        rows = zip(datum['Category Name'], datum['Segment ID'], datum['Attributes/Values'])
        for category, segment_id, raw_attributes in rows:
            # chosen_categories = None means "keep every category".
            if chosen_categories is not None and category not in chosen_categories:
                continue

            spans = remove_spans.setdefault(datum_idx, {}).setdefault(segment_id, [])

            parsed = json.loads(raw_attributes)
            for attribute_type in attribute_value_types:
                if attribute_type not in parsed:
                    continue
                attributes = parsed[attribute_type]
                if not all(key in attributes for key in span_keys):
                    continue

                text = attributes['selectedText']
                start_idx = attributes['startIndexInSegment']
                end_idx = attributes['endIndexInSegment']
                if text in remove_text or start_idx == -1 or end_idx == -1:
                    continue

                # Remember the span so it can be cut out of the segment when
                # the left-over text is attributed to the 'None' category.
                spans.append((start_idx, end_idx))
                add_document(tokenize(text), category)

    SHOULD_PROCESS_NONE_CATEGORY = True

    replace_items = ["<br>", "<strong>", "</strong>", "<ul>", "</ul>", "<li>", "</li>", "<ol>", "</ol>"]
    none_count = 0

    if SHOULD_PROCESS_NONE_CATEGORY:
        for policy_idx, segments in remove_spans.items():
            policy = policies[policy_idx]
            for segment_idx, spans in segments.items():
                try:
                    policy_segment = policy[segment_idx]
                except IndexError as e:
                    print(e, policy_idx, segment_idx)
                    continue
                segment_text = strip_spans(policy_segment, spans).lower()
                for item in replace_items:
                    segment_text = segment_text.replace(item, " ")
                segment_text = segment_text.strip()
                if segment_text: # check if we have any characters at all
                    add_document(tokenize(segment_text), 'None')
                    none_count += 1

        print('None count: {}'.format(none_count))

    # Training and the return value no longer depend on the flag above.
    cats = np.array(cats)

    df = pd.DataFrame(series, columns=df_columns)
    print(df.shape)

    model.build_vocab(documents)
    model.train(documents, total_examples=len(documents), epochs=16)

    vecs = []
    for row in df.itertuples():
        if chosen_categories is None or row.category in chosen_categories:
            # Re-seed before every inference so the same text always maps to
            # the same vector.
            model.random = np.random.RandomState(1234)
            vecs.append(np.array(model.infer_vector(word_tokenize(row.text))))

    vecs = np.array(vecs)
    print(vecs.shape)

    return vecs, cats

Since these Doc2Vec vectors are our only input data right now, let's just use them directly as our input data.

### Hyperparameter tuning setup

In [3]:
def data():
    """Split the Doc2Vec vectors and their one-hot labels into train and test sets.

    Calls ``get_data()`` and holds out 25% of the shuffled examples for
    testing. Gives back ``(x_train, y_train, x_test, y_test)``.
    """
    # NOTE(review): this function is handed to hyperas (optim.minimize(data=data)),
    # which presumably inlines its source; keep it a flat body with one
    # trailing return statement - confirm against the hyperas documentation.
    vecs, cats = get_data()

    test_percentage = 0.25 # keep 25% of data for testing
    test_amount = math.floor(test_percentage * len(vecs))

    # A fixed seed makes repeated calls produce the same partition, so a later
    # call to data() evaluates on the split that was held out during tuning.
    # The name `shuffled` also avoids shadowing the imported hyperas `choice`.
    shuffled = np.random.RandomState(1234).permutation(len(vecs))
    test_indices = shuffled[:test_amount]
    train_indices = shuffled[test_amount:]

    x_train = vecs[train_indices]
    x_test = vecs[test_indices]
    y_train = cats[train_indices]
    y_test = cats[test_indices]
    return x_train, y_train, x_test, y_test

### Neural Network Setup

For now, we'll use a simple neural network consisting of:

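1. **Fully connected** layer with 256 nodes, relu or sigmoid activation (tuned)
2. **Dropout** with a tuned rate (sampled uniformly between 0 and 1)
3. **Fully connected** layer with 256, 512, or 1024 nodes (tuned), relu or sigmoid activation (tuned)
4. **Dropout** with a tuned rate (sampled uniformly between 0 and 1)
5. **Fully connected** layer with 11 nodes, softmax activation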

In [4]:
def create_model(x_train, y_train, x_test, y_test):
    """Build, train and score one candidate network for the hyperparameter search.

    The double-curly-brace expressions in the body are hyperas search-space
    templates; presumably hyperas rewrites them into sampled values before
    the function runs, so their text must stay on one line each.

    Args:
        x_train: training inputs (100 features per example, per the input shape).
        y_train: one-hot training labels (11 classes, per the output layer).
        x_test: held-out inputs, also used as validation data while fitting.
        y_test: held-out one-hot labels.

    Returns:
        dict: ``'loss'`` (negated test accuracy, because the search minimises),
        ``'status'`` (``STATUS_OK``) and ``'model'`` (the trained network).
    """

    nn_model = Sequential()
    # Input width 100 matches the Doc2Vec vector size used in get_data().
    nn_model.add(Dense(256, batch_input_shape=(None, 100, )))
    nn_model.add(Activation({{choice(['relu', 'sigmoid'])}}))
    # NOTE(review): the dropout rate is sampled from the whole 0-1 range; rates
    # close to 1 drop almost every unit - consider a narrower range.
    nn_model.add(Dropout({{uniform(0, 1)}}))
    nn_model.add(Dense({{choice([256, 512, 1024])}}))
    nn_model.add(Activation({{choice(['relu', 'sigmoid'])}}))
    nn_model.add(Dropout({{uniform(0, 1)}}))
    # One output unit per category (10 corpus categories plus 'None').
    nn_model.add(Dense(11))
    nn_model.add(Activation('softmax'))

    nn_model.compile(loss='categorical_crossentropy', optimizer={{choice(['rmsprop', 'adam', 'sgd'])}}, metrics=[metrics.categorical_accuracy])

    # NOTE(review): summary() presumably prints on its own and returns None, in
    # which case this also prints a stray "None" - confirm.
    print(nn_model.summary())

    # Each trial logs to its own timestamped TensorBoard directory.
    tensorboard_callback = TensorBoard(log_dir='C:/tmp/pp_run-'+time.strftime("%Y-%m-%d-%H%M%S"))
    # Only a single epoch is trained per trial; the test split doubles as
    # validation data here.
    nn_model.fit(x_train, y_train, batch_size={{choice([64, 128])}}, epochs=1, verbose=2, validation_data=(x_test, y_test), callbacks=[tensorboard_callback])
    score, acc = nn_model.evaluate(x_test, y_test, verbose=0)
    print('Test accuracy:', acc)
    # Accuracy is negated so that minimising 'loss' maximises accuracy.
    return {'loss': -acc, 'status': STATUS_OK, 'model': nn_model}

If you have TensorBoard running, you'll see it in the IFrame below!

(With default settings, pointing to http://localhost:6006)

In [5]:
# Embed the TensorBoard page served at localhost:6006 in the cell output.
IFrame('http://localhost:6006', '100%', 800)

In [6]:
# Helper functions that data() depends on; they are passed to hyperas
# alongside the data and model functions.
functions = [get_data]
# Run 4 evaluations of create_model, with the TPE algorithm proposing the
# hyperparameters for each one.
best_run, best_model = optim.minimize(model=create_model, data=data, functions=functions, algo=tpe.suggest, max_evals=4, trials=Trials(), notebook_name='Privacy Policies and Neural Networks')
# NOTE(review): data() runs again here, which rebuilds the corpus and retrains
# Doc2Vec through get_data(). The evaluation below is only a true hold-out
# score if data() reproduces the split and vectors used during tuning - confirm.
X_train, Y_train, X_test, Y_test = data()
print("Evaluation of best performing model:")
print(best_model.evaluate(X_test, Y_test))
print("Best performing model chosen hyper-parameters:")
print(best_run)

>>> Imports:
#coding=utf-8

try:
    import os
except:
    pass

try:
    import imp
except:
    pass

try:
    import operator
except:
    pass

try:
    import math
except:
    pass

try:
    import glob
except:
    pass

try:
    import json
except:
    pass

try:
    import time
except:
    pass

try:
    from IPython.display import IFrame
except:
    pass

try:
    import nltk
except:
    pass

try:
    from nltk.tokenize import word_tokenize
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import seaborn as sn
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
except:
    pass

try:
    import tensorflow
except:
    pass

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform, conditional


list index out of range 47 4
list index out of range 47 5
list index out of range 47 6
list index out of range 47 7
list index out of range 47 8
list index out of range 47 9
list index out of range 47 10
list index out of range 47 11
list index out of range 47 12
list index out of range 47 13
list index out of range 47 14
list index out of range 47 15
list index out of range 47 16
list index out of range 47 17
list index out of range 47 18
list index out of range 47 19
list index out of range 47 20
list index out of range 47 21
list index out of range 47 22
list index out of range 47 23
list index out of range 47 24
list index out of range 47 25
list index out of range 47 26
list index out of range 47 27
list index out of range 47 28
list index out of range 47 29
list index out of range 47 30
list index out of range 47 31
list index out of range 47 32
list index out of range 47 33
list index out of range 47 34
list index out of range 47 35
list index out of range 47 36
list index out o

KeyboardInterrupt: 

### Testing

Let's generate a confusion matrix to see the accuracy of our model.

First, just to get some stats on each category, the number of items in each:

In [None]:
# Tally the number of labelled examples per category; the totals are reused
# later to normalise the confusion-matrix rows.
category_lengths = {}
for category in categories:
    n_examples = len(df[df['category'] == category])
    category_lengths[category] = n_examples
    print(n_examples, 'examples in the', category, 'category')

To generate our confusion matrix, we need the row indices of the examples in each category. We start from an empty matrix with one column per category.

In [None]:
# One column per category; rows (one per true category) are added later.
confusion_matrix_columns = categories

# Start from an empty frame that only carries the column labels.
confusion_matrix = pd.DataFrame([], columns=confusion_matrix_columns)
confusion_matrix

In [None]:
# NOTE(review): neither nn_model nor X_in is defined by a notebook-level cell
# shown here (nn_model is local to create_model). Presumably this should use
# the tuned best_model and the full vector matrix, in the same row order as
# df - confirm before running.
predictions = nn_model.predict(X_in)

In [None]:
# Build one confusion-matrix row per true category: the fraction of that
# category's examples that the model assigned to each predicted category.
for category in categories:
    # Row labels of the examples whose true label is this category.
    # Assumes df's index lines up with the row order of `predictions`.
    indices = df.index[df['category'] == category]

    # Take the arg-max of the raw probabilities. Rounding first turned every
    # prediction without a probability above 0.5 into all zeros, which the
    # arg-max then counted as category 0.
    predicted = np.argmax(predictions[indices], axis=1)
    counts = np.bincount(predicted, minlength=len(categories)).astype(np.float64)

    # Normalise to fractions; categories without examples stay all-zero.
    if category_lengths[category] > 0:
        counts /= category_lengths[category]

    row = pd.DataFrame([counts], columns=confusion_matrix_columns, index=[category])
    confusion_matrix = pd.concat([confusion_matrix, row])

In [None]:
# Display the filled confusion matrix (rows: true category, columns: predicted).
confusion_matrix

In [None]:
# Re-infer the column dtypes; the frame was grown from an empty one, so the
# columns are presumably still object-typed, which the heatmap cannot plot.
confusion_matrix = confusion_matrix.infer_objects()

In [None]:
# Draw the confusion matrix as a heatmap on a 12x10 figure, with each cell
# annotated with its value.
plt.figure(figsize = (12,10))
sn.heatmap(confusion_matrix, annot=True)