# August Karlstedt

In [379]:
%matplotlib inline

import os
import imp
import operator
import math
import glob
import json
import time

from IPython.display import IFrame

import nltk

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#import pickle
#from six.moves import urllib

import tensorflow

from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Dropout
from keras.layers import LSTM, TimeDistributed
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras import metrics
from keras.utils.np_utils import to_categorical

#import fasttext
# https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

import gensim
# https://nlp.stanford.edu/projects/glove/

In [380]:
files = []
data = []
header = ['Annotation ID', 'Batch ID', 'Annotator ID', 'Policy ID', 'Segment ID', 'Category Name', 'Attributes/Values', 'Policy URL', 'Date']
keep_columns = ['Segment ID', 'Category Name', 'Attributes/Values']
for file in glob.glob("data\\annotations/*.csv"):
    files.append(file[17:-4])
    data.append(pd.read_csv(file, names=header)[keep_columns])

In [381]:
files[:10]

['1017_sci-news.com',
 '1028_redorbit.com',
 '1034_aol.com',
 '1050_honda.com',
 '105_amazon.com',
 '1070_wnep.com',
 '1083_highgearmedia.com',
 '1089_freep.com',
 '1099_enthusiastnetwork.com',
 '1106_allstate.com']

In [382]:
data[0]

Unnamed: 0,Segment ID,Category Name,Attributes/Values
0,0,Other,"{""Other Type"": {""selectedText"": ""Sci-News.com ..."
1,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati..."
2,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati..."
3,2,Data Retention,"{""Personal Information Type"": {""selectedText"":..."
4,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele..."
5,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele..."
6,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele..."
7,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele..."
8,4,International and Specific Audiences,"{""Audience Type"": {""selectedText"": ""Sci-News.c..."
9,4,Other,"{""Other Type"": {""selectedText"": ""Parents or gu..."


In [383]:
policies = []
for file in files:
    with open("data\\sanitized_policies/{}.html".format(file)) as f:
        policies.append(f.readlines()[0].split('|||'))

In [384]:
print(len(policies), len(policies[0]), policies[0])

115 13 ['Privacy Policy <br> <br> Sci-News.com is committed to protecting and respecting your privacy. To better inform you of our policy concerning user privacy, we have adopted the following terms. Please note that these terms are subject to change, and any such changes will be included on this page. <br> <br>', 'Information that Sci-News.com May Collect Online <br> <br> Sci-News.com may collect and process the following data about you: <br> <br> - information that you provide by filling in forms on our site, including names, e-mail and website addresses; we may also ask you for information for other purposes, for example when you report a problem with our site; <br> <br>', '- if you contact us, we may keep a record of that correspondence; <br> <br>', '- details of your visits to our site including, but not limited to, traffic data, location data, weblogs and other communication data. <br> <br>', 'Sci-News.com does not knowingly collect or solicit personal information from anyone und

Okay, so we have loaded our CSV files and also our privacy policies. Now we can use the CSV data to index into each segment in each privacy policy. Let's try this out manually for one privacy policy just to start.

In [385]:
test_file_index = 64
test_csv_file = data[test_file_index] # our table of attribute/values/categories/segment ids
test_pp_file = policies[test_file_index]
files[test_file_index]

'414_washingtonian.com'

Okay, let's load a single annotation and display the highlighted text in the privacy policy

In [386]:
test_row_num = 4

In [387]:
test_segment_id = test_csv_file['Segment ID'][test_row_num]
test_segment_id

2

In [388]:
test_category_name = test_csv_file['Category Name'][test_row_num]
test_category_name

'First Party Collection/Use'

In [389]:
test_attributes_values = json.loads(test_csv_file['Attributes/Values'][test_row_num])
test_attributes_values

{'Action First-Party': {'endIndexInSegment': 44,
  'selectedText': 'we collect',
  'startIndexInSegment': 34,
  'value': 'Unspecified'},
 'Choice Scope': {'endIndexInSegment': -1,
  'selectedText': 'Not selected',
  'startIndexInSegment': -1,
  'value': 'Unspecified'},
 'Choice Type': {'endIndexInSegment': -1,
  'selectedText': 'Not selected',
  'startIndexInSegment': -1,
  'value': 'Unspecified'},
 'Collection Mode': {'endIndexInSegment': -1,
  'selectedText': 'null',
  'startIndexInSegment': -1,
  'value': 'not-selected'},
 'Does/Does Not': {'endIndexInSegment': -1,
  'selectedText': 'null',
  'startIndexInSegment': -1,
  'value': 'Does'},
 'Identifiability': {'endIndexInSegment': -1,
  'selectedText': 'null',
  'startIndexInSegment': -1,
  'value': 'not-selected'},
 'Personal Information Type': {'endIndexInSegment': 56,
  'selectedText': 'include',
  'startIndexInSegment': 49,
  'value': 'Other'},
 'Purpose': {'endIndexInSegment': -1,
  'selectedText': 'Not selected',
  'startIndexI

In [390]:
test_attributes_values.keys()

dict_keys(['Does/Does Not', 'Choice Scope', 'Choice Type', 'User Type', 'Purpose', 'Action First-Party', 'Identifiability', 'Collection Mode', 'Personal Information Type'])

In [391]:
test_start_index = test_attributes_values['Personal Information Type']['startIndexInSegment']
test_end_index = test_attributes_values['Personal Information Type']['endIndexInSegment']
test_value = test_attributes_values['Personal Information Type']['value']
test_selected_text = test_attributes_values['Personal Information Type']['selectedText']
print(test_start_index, test_end_index, test_value, test_selected_text)

49 56 Other include


In [392]:
test_pp_file[test_segment_id][test_start_index:test_end_index]

'include'

In [393]:
print("Segment ID: {}".format(test_segment_id))
print("Category Name: {}".format(test_category_name))
print("Type: {}".format('Personal Information Type'))
print("Value: {}".format(test_value))
print("Selected Text: {}".format(test_selected_text))

Segment ID: 2
Category Name: First Party Collection/Use
Type: Personal Information Type
Value: Other
Selected Text: include


Okay, a few things.

1. The text is already included in the CSV file. We don't necessarily need to index into the policy itself, unless we want to grab the entire sentence. Let's start by training the NN on *just* the selected text and not the whole sentence and see what we get.
2. The JSON in the `Attributes/Values` column has many different types. We need to grab them all so we know what to index into the dictionary for.
3. There's a value associated with each annotation OR it can be `Unspecified`

In [394]:
# categories = set()
# for datum in data:
#     cat = datum['Category Name']
#     categories.update(cat)
# categories

categories = [
 'Data Retention',
 'Data Security',
 'Do Not Track',
 'First Party Collection/Use', # 3
 'International and Specific Audiences',
 'Other',
 'Policy Change',
 'Third Party Sharing/Collection', # 7
 'User Access, Edit and Deletion',
 'User Choice/Control'
]

one_hot_categories = np.array([
#                       [0,0,0,0,0,0,0,0,0,1],
#                       [0,0,0,0,0,0,0,0,1,0],
#                       [0,0,0,0,0,0,0,1,0,0],
                      [0,1],#[0,0,0,0,0,0,1,0,0,0],
#                       [0,0,0,0,0,1,0,0,0,0],
#                       [0,0,0,0,1,0,0,0,0,0],
#                       [0,0,0,1,0,0,0,0,0,0],
                      [1,0]#,[0,0,1,0,0,0,0,0,0,0],
#                       [0,1,0,0,0,0,0,0,0,0],
#                       [1,0,0,0,0,0,0,0,0,0]
                    ])

cat_dict = {
#  categories[0]: one_hot_categories[0],
#  categories[1]: one_hot_categories[1],
#  categories[2]: one_hot_categories[2],
 categories[3]: one_hot_categories[0],##one_hot_categories[3],
#  categories[4]: one_hot_categories[4],
#  categories[5]: one_hot_categories[5],
#  categories[6]: one_hot_categories[6],
 categories[7]: one_hot_categories[1]#one_hot_categories[7],
#  categories[8]: one_hot_categories[8],
#  categories[9]: one_hot_categories[9]
}

Let's first get all of the Attributes/Values types to index into our dictionary. In the OPP-115 paper, Figure 1 looks like it shows the types on the right side.

In [395]:
# attribute_value_types = set()
# attribute_value_values = set()
# for datum in data:
#     avs = datum['Attributes/Values']
#     for row in avs:
#         parsed = json.loads(row)
#         keys = list(parsed.keys())
#         attribute_value_types.update(keys)
#         for key in keys:
#             attribute_value_values.add(parsed[key]['value'])

attribute_value_types = ['Access Scope',
 'Access Type',
 'Action First-Party',
 'Action Third Party',
 'Audience Type',
 'Change Type',
 'Choice Scope',
 'Choice Type',
 'Collection Mode',
 'Do Not Track policy',
 'Does/Does Not',
 'Identifiability',
 'Notification Type',
 'Other Type',
 'Personal Information Type',
 'Purpose',
 'Retention Period',
 'Retention Purpose',
 'Security Measure',
 'Third Party Entity',
 'User Choice',
 'User Type']

attribute_value_values = ['Additional service/feature',
 'Advertising',
 'Aggregated or anonymized',
 'Analytics/Research',
 'Basic service/feature',
 'Both',
 'Browser/device privacy controls',
 'Californians',
 'Children',
 'Citizens from other countries',
 'Collect from user on other websites',
 'Collect in mobile app',
 'Collect on first party website/app',
 'Collect on mobile website',
 'Collect on website',
 'Collection',
 'Computer information',
 'Contact',
 'Cookies and tracking elements',
 'Data access limitation',
 'Deactivate account',
 'Delete account (full)',
 'Delete account (partial)',
 'Demographic',
 'Does',
 'Does Not',
 'Dont use service/feature',
 'Edit information',
 'Europeans',
 'Explicit',
 'Export',
 'Financial',
 'First party collection',
 'First party use',
 'First-party privacy controls',
 'General notice in privacy policy',
 'General notice on website',
 'Generic',
 'Generic personal information',
 'Health',
 'Honored',
 'IP address and device IDs',
 'Identifiable',
 'Implicit',
 'In case of merger or acquisition',
 'Indefinitely',
 'Introductory/Generic',
 'Legal requirement',
 'Limited',
 'Location',
 'Marketing',
 'Mentioned, but unclear if honored',
 'Merger/Acquisition',
 'Named third party',
 'No notification',
 'Non-privacy relevant change',
 'None',
 'Not honored',
 'Not mentioned',
 'Opt-in',
 'Opt-out',
 'Opt-out link',
 'Opt-out via contacting company',
 'Other',
 'Other data about user',
 'Other part of company/affiliate',
 'Other users',
 'Perform service',
 'Personal identifier',
 'Personal notice',
 'Personalization/Customization',
 'Practice not covered',
 'Privacy contact information',
 'Privacy relevant change',
 'Privacy review/audit',
 'Privacy training',
 'Privacy/Security program',
 'Profile data',
 'Public',
 'Receive from other parts of company/affiliates',
 'Receive from other service/third-party (named)',
 'Receive from other service/third-party (unnamed)',
 'Receive/Shared with',
 'Secure data storage',
 'Secure data transfer',
 'Secure user authentication',
 'See',
 'Service Operation and Security',
 'Service operation and security',
 'Social media data',
 'Stated Period',
 'Survey data',
 'Third party sharing/collection',
 'Third party use',
 'Third-party privacy controls',
 'Track on first party website/app',
 'Track user on other websites',
 'Transactional data',
 'Unnamed third party',
 'Unspecified',
 'Use',
 'User Profile',
 'User account data',
 'User online activities',
 'User participation',
 'User profile',
 'User with account',
 'User without account',
 'View',
 'not-selected']

In [396]:
attribute_value_types

['Access Scope',
 'Access Type',
 'Action First-Party',
 'Action Third Party',
 'Audience Type',
 'Change Type',
 'Choice Scope',
 'Choice Type',
 'Collection Mode',
 'Do Not Track policy',
 'Does/Does Not',
 'Identifiability',
 'Notification Type',
 'Other Type',
 'Personal Information Type',
 'Purpose',
 'Retention Period',
 'Retention Purpose',
 'Security Measure',
 'Third Party Entity',
 'User Choice',
 'User Type']

In [397]:
attribute_value_values

['Additional service/feature',
 'Advertising',
 'Aggregated or anonymized',
 'Analytics/Research',
 'Basic service/feature',
 'Both',
 'Browser/device privacy controls',
 'Californians',
 'Children',
 'Citizens from other countries',
 'Collect from user on other websites',
 'Collect in mobile app',
 'Collect on first party website/app',
 'Collect on mobile website',
 'Collect on website',
 'Collection',
 'Computer information',
 'Contact',
 'Cookies and tracking elements',
 'Data access limitation',
 'Deactivate account',
 'Delete account (full)',
 'Delete account (partial)',
 'Demographic',
 'Does',
 'Does Not',
 'Dont use service/feature',
 'Edit information',
 'Europeans',
 'Explicit',
 'Export',
 'Financial',
 'First party collection',
 'First party use',
 'First-party privacy controls',
 'General notice in privacy policy',
 'General notice on website',
 'Generic',
 'Generic personal information',
 'Health',
 'Honored',
 'IP address and device IDs',
 'Identifiable',
 'Implicit',
 '

Okay, we have all of our attribute value types! Now, let's construct a dataframe of ALL of our data.

It should have columns: 

1. Paragraph2Vec representation of the text span
2. One hot representation of category

Unused data:
1. Segment index
2. Start index
3. End index
4. One hot representation of attribute
5. One hot representation of attribute type
6. One hot representation of attribute value


In [398]:
from nltk.tokenize import word_tokenize

In [399]:
model = gensim.models.Doc2Vec(size=50)

In [400]:
# flattened_policies = []
# for item in policies:
#     flattened_policies.extend(item)
# len(flattened_policies)

In [401]:
df_columns = ['text', 'category', 'category one hot', 'text vec']
df = pd.DataFrame([], columns=df_columns)
series = []
documents = []
cats = []
idx = 0
chosen_categories = ['First Party Collection/Use', 'Third Party Sharing/Collection']
remove_text = ['null', 'Not selected']
for datum in data:
    for idx in range(len(datum)):
        category = datum['Category Name'][idx]
        
        if chosen_categories is None or category not in chosen_categories:
            continue
        
        parsed = json.loads(datum['Attributes/Values'][idx])
        for value in attribute_value_types:
            if value in parsed.keys():
                attributes = parsed[value]
                if 'selectedText' in attributes:
                    text = attributes['selectedText']
                    
                    if text in remove_text:
                        continue
                    
                    doc = gensim.models.doc2vec.TaggedDocument(word_tokenize(text), [idx])
                    documents.append(doc)
                    cats.append(cat_dict[category])
                    series.append(pd.Series([text, category, cat_dict[category], None], index=df_columns))
                    idx += 1
cats = np.array(cats)

In [402]:
documents[0]

TaggedDocument(words=['Sci-News.com', 'may', 'collect', 'and', 'process'], tags=[1])

In [403]:
df = df.append(series, ignore_index=True)                    
df

Unnamed: 0,text,category,category one hot,text vec
0,Sci-News.com may collect and process,First Party Collection/Use,"[0, 1]",
1,nformation that you provide by filling in form...,First Party Collection/Use,"[0, 1]",
2,including,First Party Collection/Use,"[0, 1]",
3,"other purposes, for example when you report a ...",First Party Collection/Use,"[0, 1]",
4,Sci-News.com may collect and process,First Party Collection/Use,"[0, 1]",
5,nformation that you provide by filling in form...,First Party Collection/Use,"[0, 1]",
6,"names, e-mail and website addresses",First Party Collection/Use,"[0, 1]",
7,"other purposes, for example when you report a ...",First Party Collection/Use,"[0, 1]",
8,"including, but not limited",First Party Collection/Use,"[0, 1]",
9,weblogs and other communication data,First Party Collection/Use,"[0, 1]",


In [404]:
df.shape

(62335, 4)

In [405]:
model.build_vocab(documents)

In [406]:
model.train(documents, total_examples=len(documents), epochs=1)

487503

In [407]:
# setting model.random before infer_vector is required
# for determininistic behavior as described in 
# https://github.com/RaRe-Technologies/gensim/issues/447#issuecomment-138994654
model.random = np.random.RandomState(1234)
test_vec = model.infer_vector(word_tokenize('this is a test'))
print(test_vec.shape, '\n', test_vec)

(50,) [-0.00107015  0.01038371 -0.02888153  0.03988348 -0.0011957  -0.01304967
  0.04327901  0.02830811 -0.00142456 -0.03781882 -0.0127815   0.04316219
 -0.05263519  0.03777497 -0.03590705  0.01108375 -0.04856323 -0.01727396
  0.01445077  0.03389955 -0.00394573  0.00689276  0.05183403  0.02598267
  0.0400493   0.05729544  0.00361882 -0.01930449 -0.00923815 -0.01279849
  0.02402143 -0.01450086  0.00225588  0.04632444  0.02229215 -0.01990163
  0.007358   -0.02812551  0.03420744 -0.01957773 -0.01311883  0.02319254
 -0.04389676  0.00136215 -0.0260359  -0.00761274  0.00102055  0.0390917
 -0.01973042  0.00819745]


In [408]:
vecs = []
for row in df.itertuples():
    category_not_chosen = chosen_categories is None
    categorgy_chosen_and_matches = chosen_categories is not None and row.category in chosen_categories
    if categorgy_chosen_and_matches or category_not_chosen:
        model.random = np.random.RandomState(1234)
        vecs.append(np.array(model.infer_vector(word_tokenize(row.text))))
        
vecs = np.array(vecs)

In [409]:
vecs.shape

(62335, 50)

In [410]:
# df = pd.concat([df, pd.DataFrame(vecs)], axis=1)

In [411]:
# Keras version
nn_model = Sequential()
nn_model.reset_states()
nn_model.add(Dense(256, batch_input_shape=(None, 50, ), activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(256, activation='relu'))
nn_model.add(Dropout(0.5))
#nn_model.add(Dense(10, activation='softmax'))
nn_model.add(Dense(2, activation='softmax'))

optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
nn_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[metrics.mae, metrics.categorical_accuracy])

print(nn_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 256)               13056     
_________________________________________________________________
dropout_11 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_12 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 2)                 514       
Total params: 79,362
Trainable params: 79,362
Non-trainable params: 0
_________________________________________________________________
None


In [412]:
cats.shape

(62335, 2)

In [413]:
df['category one hot'].shape

(62335,)

In [414]:
X_in = vecs
Y_in = cats
print(X_in.shape, Y_in.shape)

(62335, 50) (62335, 2)


In [415]:
tensorboard_callback = TensorBoard(log_dir='C:/tmp/pp_run-'+time.strftime("%Y-%m-%d-%H%M%S"))
nn_model.fit(X_in, Y_in, validation_split=0.25, batch_size=128, epochs=32, verbose=1, callbacks=[tensorboard_callback])

Train on 46751 samples, validate on 15584 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x2e2ef1fb630>

In [433]:
test_text = 'We may disclose information'#'We do not share your profile with other third parties.'#'These tracking technologies may be deployed'#'we will not disclose your information'#'we use cookies or similar technologies'#'we collect your'#'share'
model.random = np.random.RandomState(1234)
test_vec = model.infer_vector(word_tokenize(test_text))
prediction_cat = nn_model.predict(np.array([test_vec])).round().astype(int)[0]
print(test_vec)
print(prediction_cat)
idx = 0
for one_hot_cat in one_hot_categories:
    if np.array_equal(one_hot_cat, prediction_cat):
        break
    idx += 1
print(idx)
print(chosen_categories[idx])

[-0.01903359  0.0275209  -0.01607004  0.03595443  0.00894574  0.00463449
  0.03586714  0.03182346 -0.0110043  -0.03470108 -0.00960489  0.05037734
 -0.04771339  0.02682758 -0.02292502  0.00050167 -0.03068542 -0.00863673
  0.00621102  0.05503413  0.0036117  -0.0116102   0.04798289  0.02048419
  0.03031054  0.05216712  0.01245458 -0.01594711 -0.01110764 -0.01099228
  0.00943156 -0.02660771  0.01610587  0.04945242  0.00134364 -0.02205838
 -0.01495113 -0.04680897  0.02272469 -0.00215368  0.00762396  0.04100169
 -0.04523159 -0.00052475 -0.02204664  0.00138375  0.00200034  0.04142748
 -0.02556044  0.00802804]
[1 0]
1
Third Party Sharing/Collection
