# August Karlstedt

In [92]:
%matplotlib inline

import os
import imp
import operator
import math
import glob
import json
import time

from IPython.display import IFrame

import nltk

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#import pickle
#from six.moves import urllib

import tensorflow

from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Dropout
from keras.layers import LSTM, TimeDistributed
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras import metrics
from keras.utils.np_utils import to_categorical

#import fasttext
# https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

import gensim
# https://nlp.stanford.edu/projects/glove/

In [93]:
files = []
data = []
header = ['Annotation ID', 'Batch ID', 'Annotator ID', 'Policy ID', 'Segment ID', 'Category Name', 'Attributes/Values', 'Policy URL', 'Date']
keep_columns = ['Segment ID', 'Category Name', 'Attributes/Values']
for file in glob.glob("data\\annotations/*.csv"):
    files.append(file[17:-4])
    data.append(pd.read_csv(file, names=header)[keep_columns])

In [94]:
files[:10]

['1017_sci-news.com',
 '1028_redorbit.com',
 '1034_aol.com',
 '1050_honda.com',
 '105_amazon.com',
 '1070_wnep.com',
 '1083_highgearmedia.com',
 '1089_freep.com',
 '1099_enthusiastnetwork.com',
 '1106_allstate.com']

In [95]:
print(len(data), len(data[0]))

115 58


In [96]:
data[0]

Unnamed: 0,Segment ID,Category Name,Attributes/Values
0,0,Other,"{""Other Type"": {""selectedText"": ""Sci-News.com ..."
1,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati..."
2,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati..."
3,2,Data Retention,"{""Personal Information Type"": {""selectedText"":..."
4,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele..."
5,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele..."
6,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele..."
7,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele..."
8,4,International and Specific Audiences,"{""Audience Type"": {""selectedText"": ""Sci-News.c..."
9,4,Other,"{""Other Type"": {""selectedText"": ""Parents or gu..."


In [97]:
policies = []
for file in files:
    with open("data\\sanitized_policies/{}.html".format(file)) as f:
        policies.append(f.readlines()[0].split('|||'))

In [98]:
print(len(policies), len(policies[0]), policies[0])

115 13 ['Privacy Policy <br> <br> Sci-News.com is committed to protecting and respecting your privacy. To better inform you of our policy concerning user privacy, we have adopted the following terms. Please note that these terms are subject to change, and any such changes will be included on this page. <br> <br>', 'Information that Sci-News.com May Collect Online <br> <br> Sci-News.com may collect and process the following data about you: <br> <br> - information that you provide by filling in forms on our site, including names, e-mail and website addresses; we may also ask you for information for other purposes, for example when you report a problem with our site; <br> <br>', '- if you contact us, we may keep a record of that correspondence; <br> <br>', '- details of your visits to our site including, but not limited to, traffic data, location data, weblogs and other communication data. <br> <br>', 'Sci-News.com does not knowingly collect or solicit personal information from anyone und

Okay, so we have loaded our CSV files and also our privacy policies. Now we can use the CSV data to index into each segment in each privacy policy. Let's try this out manually for one privacy policy just to start.

In [99]:
test_file_index = 64
test_csv_file = data[test_file_index] # our table of attribute/values/categories/segment ids
test_pp_file = policies[test_file_index]
files[test_file_index]

'414_washingtonian.com'

Okay, let's load a single annotation and display the highlighted text in the privacy policy

In [100]:
test_row_num = 4

In [101]:
test_segment_id = test_csv_file['Segment ID'][test_row_num]
test_segment_id

2

In [102]:
test_category_name = test_csv_file['Category Name'][test_row_num]
test_category_name

'First Party Collection/Use'

In [103]:
test_attributes_values = json.loads(test_csv_file['Attributes/Values'][test_row_num])
test_attributes_values

{'Action First-Party': {'endIndexInSegment': 44,
  'selectedText': 'we collect',
  'startIndexInSegment': 34,
  'value': 'Unspecified'},
 'Choice Scope': {'endIndexInSegment': -1,
  'selectedText': 'Not selected',
  'startIndexInSegment': -1,
  'value': 'Unspecified'},
 'Choice Type': {'endIndexInSegment': -1,
  'selectedText': 'Not selected',
  'startIndexInSegment': -1,
  'value': 'Unspecified'},
 'Collection Mode': {'endIndexInSegment': -1,
  'selectedText': 'null',
  'startIndexInSegment': -1,
  'value': 'not-selected'},
 'Does/Does Not': {'endIndexInSegment': -1,
  'selectedText': 'null',
  'startIndexInSegment': -1,
  'value': 'Does'},
 'Identifiability': {'endIndexInSegment': -1,
  'selectedText': 'null',
  'startIndexInSegment': -1,
  'value': 'not-selected'},
 'Personal Information Type': {'endIndexInSegment': 56,
  'selectedText': 'include',
  'startIndexInSegment': 49,
  'value': 'Other'},
 'Purpose': {'endIndexInSegment': -1,
  'selectedText': 'Not selected',
  'startIndexI

In [104]:
test_attributes_values.keys()

dict_keys(['Does/Does Not', 'Personal Information Type', 'Choice Scope', 'Collection Mode', 'User Type', 'Purpose', 'Identifiability', 'Choice Type', 'Action First-Party'])

In [105]:
test_start_index = test_attributes_values['Personal Information Type']['startIndexInSegment']
test_end_index = test_attributes_values['Personal Information Type']['endIndexInSegment']
test_value = test_attributes_values['Personal Information Type']['value']
test_selected_text = test_attributes_values['Personal Information Type']['selectedText']
print(test_start_index, test_end_index, test_value, test_selected_text)

49 56 Other include


In [106]:
test_pp_file[test_segment_id][test_start_index:test_end_index]

'include'

In [107]:
print("Segment ID: {}".format(test_segment_id))
print("Category Name: {}".format(test_category_name))
print("Type: {}".format('Personal Information Type'))
print("Value: {}".format(test_value))
print("Selected Text: {}".format(test_selected_text))

Segment ID: 2
Category Name: First Party Collection/Use
Type: Personal Information Type
Value: Other
Selected Text: include


Okay, a few things.

1. The text is already included in the CSV file. We don't necessarily need to index into the policy itself, unless we want to grab the entire sentence. Let's start by training the NN on *just* the selected text and not the whole sentence and see what we get.
2. The JSON in the `Attributes/Values` column has many different types. We need to grab them all so we know what to index into the dictionary for.
3. There's a value associated with each annotation OR it can be `Unspecified`

In [108]:
# categories = set()
# for datum in data:
#     cat = datum['Category Name']
#     categories.update(cat)
# categories

categories = [
 'Data Retention', # 0
 'Data Security', # 1
 'Do Not Track', # 2
 'First Party Collection/Use', # 3
 'International and Specific Audiences', # 4
 'Other', # 5
 'Policy Change', # 6
 'Third Party Sharing/Collection', # 7
 'User Access, Edit and Deletion', # 8
 'User Choice/Control', # 9
 'None' # 10
]

one_hot_categories = np.array([
 [0,0,0,0,0,0,0,0,0,0,1],
 [0,0,0,0,0,0,0,0,0,1,0],
 [0,0,0,0,0,0,0,0,1,0,0],
 [0,0,0,0,0,0,0,1,0,0,0],
 [0,0,0,0,0,0,1,0,0,0,0],
 [0,0,0,0,0,1,0,0,0,0,0],
 [0,0,0,0,1,0,0,0,0,0,0],
 [0,0,0,1,0,0,0,0,0,0,0],
 [0,0,1,0,0,0,0,0,0,0,0],
 [0,1,0,0,0,0,0,0,0,0,0],
 [1,0,0,0,0,0,0,0,0,0,0],
])

cat_dict = {
 categories[0]: one_hot_categories[0],
 categories[1]: one_hot_categories[1],
 categories[2]: one_hot_categories[2],
 categories[3]: one_hot_categories[3],
 categories[4]: one_hot_categories[4],
 categories[5]: one_hot_categories[5],
 categories[6]: one_hot_categories[6],
 categories[7]: one_hot_categories[7],
 categories[8]: one_hot_categories[8],
 categories[9]: one_hot_categories[9],
 categories[10]: one_hot_categories[10],
}

Let's first get all of the Attributes/Values types to index into our dictionary. In the OPP-115 paper, Figure 1 looks like it shows the types on the right side.

In [109]:
# attribute_value_types = set()
# attribute_value_values = set()
# for datum in data:
#     avs = datum['Attributes/Values']
#     for row in avs:
#         parsed = json.loads(row)
#         keys = list(parsed.keys())
#         attribute_value_types.update(keys)
#         for key in keys:
#             attribute_value_values.add(parsed[key]['value'])

attribute_value_types = ['Access Scope',
 'Access Type',
 'Action First-Party',
 'Action Third Party',
 'Audience Type',
 'Change Type',
 'Choice Scope',
 'Choice Type',
 'Collection Mode',
 'Do Not Track policy',
 'Does/Does Not',
 'Identifiability',
 'Notification Type',
 'Other Type',
 'Personal Information Type',
 'Purpose',
 'Retention Period',
 'Retention Purpose',
 'Security Measure',
 'Third Party Entity',
 'User Choice',
 'User Type']

attribute_value_values = ['Additional service/feature',
 'Advertising',
 'Aggregated or anonymized',
 'Analytics/Research',
 'Basic service/feature',
 'Both',
 'Browser/device privacy controls',
 'Californians',
 'Children',
 'Citizens from other countries',
 'Collect from user on other websites',
 'Collect in mobile app',
 'Collect on first party website/app',
 'Collect on mobile website',
 'Collect on website',
 'Collection',
 'Computer information',
 'Contact',
 'Cookies and tracking elements',
 'Data access limitation',
 'Deactivate account',
 'Delete account (full)',
 'Delete account (partial)',
 'Demographic',
 'Does',
 'Does Not',
 'Dont use service/feature',
 'Edit information',
 'Europeans',
 'Explicit',
 'Export',
 'Financial',
 'First party collection',
 'First party use',
 'First-party privacy controls',
 'General notice in privacy policy',
 'General notice on website',
 'Generic',
 'Generic personal information',
 'Health',
 'Honored',
 'IP address and device IDs',
 'Identifiable',
 'Implicit',
 'In case of merger or acquisition',
 'Indefinitely',
 'Introductory/Generic',
 'Legal requirement',
 'Limited',
 'Location',
 'Marketing',
 'Mentioned, but unclear if honored',
 'Merger/Acquisition',
 'Named third party',
 'No notification',
 'Non-privacy relevant change',
 'None',
 'Not honored',
 'Not mentioned',
 'Opt-in',
 'Opt-out',
 'Opt-out link',
 'Opt-out via contacting company',
 'Other',
 'Other data about user',
 'Other part of company/affiliate',
 'Other users',
 'Perform service',
 'Personal identifier',
 'Personal notice',
 'Personalization/Customization',
 'Practice not covered',
 'Privacy contact information',
 'Privacy relevant change',
 'Privacy review/audit',
 'Privacy training',
 'Privacy/Security program',
 'Profile data',
 'Public',
 'Receive from other parts of company/affiliates',
 'Receive from other service/third-party (named)',
 'Receive from other service/third-party (unnamed)',
 'Receive/Shared with',
 'Secure data storage',
 'Secure data transfer',
 'Secure user authentication',
 'See',
 'Service Operation and Security',
 'Service operation and security',
 'Social media data',
 'Stated Period',
 'Survey data',
 'Third party sharing/collection',
 'Third party use',
 'Third-party privacy controls',
 'Track on first party website/app',
 'Track user on other websites',
 'Transactional data',
 'Unnamed third party',
 'Unspecified',
 'Use',
 'User Profile',
 'User account data',
 'User online activities',
 'User participation',
 'User profile',
 'User with account',
 'User without account',
 'View',
 'not-selected']

In [110]:
attribute_value_types

['Access Scope',
 'Access Type',
 'Action First-Party',
 'Action Third Party',
 'Audience Type',
 'Change Type',
 'Choice Scope',
 'Choice Type',
 'Collection Mode',
 'Do Not Track policy',
 'Does/Does Not',
 'Identifiability',
 'Notification Type',
 'Other Type',
 'Personal Information Type',
 'Purpose',
 'Retention Period',
 'Retention Purpose',
 'Security Measure',
 'Third Party Entity',
 'User Choice',
 'User Type']

In [111]:
attribute_value_values

['Additional service/feature',
 'Advertising',
 'Aggregated or anonymized',
 'Analytics/Research',
 'Basic service/feature',
 'Both',
 'Browser/device privacy controls',
 'Californians',
 'Children',
 'Citizens from other countries',
 'Collect from user on other websites',
 'Collect in mobile app',
 'Collect on first party website/app',
 'Collect on mobile website',
 'Collect on website',
 'Collection',
 'Computer information',
 'Contact',
 'Cookies and tracking elements',
 'Data access limitation',
 'Deactivate account',
 'Delete account (full)',
 'Delete account (partial)',
 'Demographic',
 'Does',
 'Does Not',
 'Dont use service/feature',
 'Edit information',
 'Europeans',
 'Explicit',
 'Export',
 'Financial',
 'First party collection',
 'First party use',
 'First-party privacy controls',
 'General notice in privacy policy',
 'General notice on website',
 'Generic',
 'Generic personal information',
 'Health',
 'Honored',
 'IP address and device IDs',
 'Identifiable',
 'Implicit',
 '

Okay, we have all of our attribute value types! Now, let's construct a dataframe of ALL of our data.

It should have columns: 

1. Paragraph2Vec representation of the text span
2. One hot representation of category

Unused data:
1. Segment index
2. Start index
3. End index
4. One hot representation of attribute
5. One hot representation of attribute type
6. One hot representation of attribute value


In [112]:
from nltk.tokenize import word_tokenize

In [113]:
model = gensim.models.Doc2Vec(size=100)

In [114]:
# flattened_policies = []
# for item in policies:
#     flattened_policies.extend(item)
# len(flattened_policies)

In [115]:
df_columns = ['text', 'category', 'category one hot', 'text vec']
df = pd.DataFrame([], columns=df_columns)
series = []
documents = []
cats = []
idx = 0
chosen_categories = ['First Party Collection/Use', 
                     'Third Party Sharing/Collection', 
                     'Other', 
                     'User Choice/Control', 
                     'Data Security',
                     'International and Specific Audiences',
                     'User Access, Edit and Deletion',
                     'Policy Change',
                     'Data Retention',
                     'Do Not Track',
                     'None' # added by us, not in original corpus
                    ]
remove_text = ['null', 'Not selected']
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
for datum_idx in range(len(data)):
    datum = data[datum_idx]
    for idx in range(len(datum)):        
        category = datum['Category Name'][idx]
        
        if chosen_categories is None:
            continue
            
        if category not in chosen_categories:
            continue
        
        segment_id = datum['Segment ID'][idx]
        try:
            policy_text = policies[datum_idx][segment_id]
        except:
            print('Error:', datum_idx, segment_id)
            
        # ok, we have our policy text, now we need to 
        # remove all of the spans that are associated with a category
        # so we can attribute that text to the 'None' category
                
        parsed = json.loads(datum['Attributes/Values'][idx])
        for value in attribute_value_types:
            if value in parsed.keys():
                attributes = parsed[value]
                has_selected_text = 'selectedText' in attributes
                has_start_idx = 'startIndexInSegment' in attributes
                has_end_idx = 'endIndexInSegment' in attributes
                if has_selected_text and has_start_idx and has_end_idx:
                    text = attributes['selectedText']
                    start_idx = attributes['startIndexInSegment']
                    end_idx = attributes['endIndexInSegment']
                    
                    if text in remove_text or start_idx == -1 or end_idx == -1:
                        continue
                    
                    # extract the string and set policy text to the parts
                    # that were not annotated
                    prev_text = policy_text[:start_idx]
                    next_text = policy_text[end_idx:]
                    policy_text = prev_text + next_text
                    
                    policy_text = policy_text.lower().replace("<br>", " ").strip()   
                    text = text.lower()
                    
                    processed_policy_text = word_tokenize(policy_text)
                    processed_policy_text = [lemmatizer.lemmatize(word) for word in processed_policy_text]
                        
                    processed_text = word_tokenize(text)
                    #processed_text = [stemmer.stem(word) for word in processed_text]
                    processed_text = [lemmatizer.lemmatize(word) for word in processed_text]

                    doc = gensim.models.doc2vec.TaggedDocument(processed_policy_text, [idx])
                    documents.append(doc)
                    cats.append(cat_dict['None'])
                    text = ' '.join(processed_policy_text)
                    series.append(pd.Series([text, 'None', cat_dict['None'], None], index=df_columns))
                    
                    doc = gensim.models.doc2vec.TaggedDocument(processed_text, [idx])
                    documents.append(doc)
                    cats.append(cat_dict[category])
                    text = ' '.join(processed_text)
                    series.append(pd.Series([text, category, cat_dict[category], None], index=df_columns))
                    
                    idx += 1
cats = np.array(cats)

47 4
47 4
47 4
47 4
47 4
47 4
47 4
47 5
47 5
47 5
47 5
47 5
47 5
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 6
47 7
47 7
47 7
47 7
47 7
47 7
47 8
47 8
47 8
47 9
47 9
47 9
47 9
47 9
47 9
47 9
47 9
47 9
47 9
47 9
47 9
47 10
47 10
47 10
47 10
47 10
47 10
47 10
47 10
47 10
47 10
47 10
47 10
47 11
47 11
47 11
47 11
47 11
47 11
47 11
47 11
47 11
47 11
47 11
47 12
47 12
47 12
47 12
47 12
47 12
47 12
47 12
47 12
47 12
47 12
47 12
47 13
47 13
47 13
47 13
47 13
47 13
47 13
47 13
47 13
47 13
47 13
47 14
47 14
47 14
47 14
47 14
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 15
47 16
47 16
47 16
47 16
47 16
47 16
47 16
47 16
47 16
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 17
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 18
47 19
47 19
47 19
47 19
47 19
47 19
47 19
47 20
47 20
47 20
47 20
47 20
47 21

In [116]:
documents[0]

TaggedDocument(words=['privacy', 'policy'], tags=[0])

In [117]:
df = df.append(series, ignore_index=True)                    
df

Unnamed: 0,text,category,category one hot,text vec
0,privacy policy,,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",
1,sci-news.com is committed to protecting and re...,Other,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",
2,information that sci-news.com may collect onli...,,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",
3,sci-news.com may collect and process,First Party Collection/Use,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",
4,information that sci-news.com may collect onli...,,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",
5,nformation that you provide by filling in form...,First Party Collection/Use,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",
6,information that sci-news.com may collect onli...,,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",
7,including,First Party Collection/Use,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",
8,information that sci-news.com may collect onli...,,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",
9,"other purpose , for example when you report a ...",First Party Collection/Use,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",


In [118]:
df.shape

(155812, 4)

In [119]:
model.build_vocab(documents)

In [120]:
model.train(documents, total_examples=len(documents), epochs=1)

5272282

In [121]:
# setting model.random before infer_vector is required
# for determininistic behavior as described in 
# https://github.com/RaRe-Technologies/gensim/issues/447#issuecomment-138994654
model.random = np.random.RandomState(1234)
test_vec = model.infer_vector(word_tokenize('this is a test'))
print(test_vec.shape, '\n', test_vec)

(100,) 
 [ 0.08417413 -0.0075465  -0.03761142  0.03726314 -0.01932174 -0.00371569
 -0.06478154  0.02322414 -0.07497223  0.02693135 -0.03022596  0.02806931
 -0.06931741  0.06975242  0.10365339 -0.06788568  0.0024898  -0.07098911
 -0.1169481   0.06146821 -0.04647142 -0.13347702 -0.00232718 -0.02486847
  0.05867615  0.0262397  -0.0717935   0.07437395  0.04312051  0.03772339
  0.09851807  0.07333487  0.06507745 -0.01374978 -0.03784891  0.02418258
  0.04956978 -0.01811005 -0.05612132  0.07283252  0.06724891 -0.04931923
 -0.04288157  0.00683967  0.04661451 -0.02796188 -0.03153907 -0.03375927
  0.07053074  0.0552861  -0.05998477  0.05854597 -0.08552095  0.06194466
 -0.07292381  0.01506165 -0.02363598  0.03026196 -0.04866464  0.01258939
  0.01941918  0.03675984 -0.04692562 -0.06252577 -0.06831784 -0.07673521
  0.03753974  0.03148334  0.04826692  0.03712487  0.06886642  0.05530741
  0.02793279 -0.02420807  0.02568835  0.03082828 -0.0399214   0.02794833
  0.11868624 -0.10883427  0.04813946 -0.01

In [122]:
vecs = []
for row in df.itertuples():
    category_not_chosen = chosen_categories is None
    categorgy_chosen_and_matches = chosen_categories is not None and row.category in chosen_categories
    if categorgy_chosen_and_matches or category_not_chosen:
        model.random = np.random.RandomState(1234)
        vecs.append(np.array(model.infer_vector(word_tokenize(row.text))))
        
vecs = np.array(vecs)

In [124]:
vecs.shape

(155812, 100)

In [125]:
# df = pd.concat([df, pd.DataFrame(vecs)], axis=1)

In [126]:
# Keras version
nn_model = Sequential()
nn_model.reset_states()
nn_model.add(Dense(256, batch_input_shape=(None, 100, ), activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(256, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(11, activation='softmax'))

optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
nn_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[metrics.mae, metrics.categorical_accuracy])
#nn_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[metrics.mae, metrics.binary_accuracy])

print(nn_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               25856     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 11)                2827      
Total params: 94,475
Trainable params: 94,475
Non-trainable params: 0
_________________________________________________________________
None


In [127]:
cats.shape

(155812, 11)

In [128]:
df['category one hot'].shape

(155812,)

In [129]:
X_in = vecs
Y_in = cats
print(X_in.shape, Y_in.shape)

(155812, 100) (155812, 11)


In [130]:
tensorboard_callback = TensorBoard(log_dir='C:/tmp/pp_run-'+time.strftime("%Y-%m-%d-%H%M%S"))
nn_model.fit(X_in, Y_in, validation_split=0.25, batch_size=128, epochs=128, verbose=1, callbacks=[tensorboard_callback])

Train on 116859 samples, validate on 38953 samples
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128


Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128


Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/128
Epoch 75/128
Epoch 76/128
Epoch 77/128
Epoch 78/128
Epoch 79/128
Epoch 80/128
Epoch 81/128
Epoch 82/128
Epoch 83/128
Epoch 84/128
Epoch 85/128
Epoch 86/128
Epoch 87/128
Epoch 88/128
Epoch 89/128
Epoch 90/128
Epoch 91/128
Epoch 92/128
Epoch 93/128
Epoch 94/128
Epoch 95/128
Epoch 96/128
Epoch 97/128
Epoch 98/128
Epoch 99/128
Epoch 100/128
Epoch 101/128
Epoch 102/128
Epoch 103/128


Epoch 104/128
Epoch 105/128
Epoch 106/128
Epoch 107/128
Epoch 108/128
Epoch 109/128
Epoch 110/128
Epoch 111/128
Epoch 112/128
Epoch 113/128
Epoch 114/128
Epoch 115/128
Epoch 116/128
Epoch 117/128
Epoch 118/128
Epoch 119/128
Epoch 120/128
Epoch 121/128
Epoch 122/128
Epoch 123/128
Epoch 124/128
Epoch 125/128
Epoch 126/128
Epoch 127/128
Epoch 128/128


<keras.callbacks.History at 0x1cdbd91b588>

In [131]:
test_text = 'We may disclose information'
#test_text = 'We do not share your profile with other third parties.'
#test_text = 'These tracking technologies may be deployed'
#test_text = 'we will not disclose your information'
#test_text = 'we use cookies or similar technologies'
#test_text = 'we collect your'
#test_text = 'share'

#test_text = [stemmer.stem(word) for word in word_tokenize(test_text.lower())]
test_text = [lemmatizer.lemmatize(word) for word in word_tokenize(test_text.lower())]
print(test_text)
model.random = np.random.RandomState(1234)
test_vec = model.infer_vector(test_text)
prediction_cat = nn_model.predict(np.array([test_vec])).round().astype(int)[0]
print(test_vec)
print(prediction_cat)
idx = 0
for one_hot_cat in one_hot_categories:
    if np.array_equal(one_hot_cat, prediction_cat):
        break
    idx += 1
print(idx)
print(chosen_categories[idx])

['we', 'may', 'disclose', 'information']
[ 0.03530358 -0.01507751 -0.01247788  0.0225808   0.02562725 -0.05893671
 -0.02526408  0.04218696 -0.07718476  0.00120358  0.03843549  0.02974659
 -0.03096849  0.02195295  0.01187804 -0.07208186 -0.03014539 -0.07863555
 -0.05328407 -0.00857365  0.00962165 -0.01778396 -0.02301499 -0.00082013
  0.01544286 -0.00712203 -0.00306138  0.08116971  0.00535893  0.04521177
  0.03022733  0.03510654  0.07270603 -0.00739933 -0.04361444  0.0445812
  0.08460516 -0.0004672  -0.0208342   0.01682387  0.05159039  0.00919086
  0.02740796 -0.06693022 -0.01035921 -0.01516946 -0.0768647  -0.10798656
  0.03130743  0.07199154 -0.03683877  0.04923306 -0.07043765  0.03152582
 -0.00704568 -0.04266111 -0.03270567  0.01676793  0.00906926  0.00926929
  0.06691784 -0.01142184 -0.05278565 -0.09492948  0.02520428 -0.01249292
  0.01883915  0.02535829  0.03329401  0.00512398  0.06866264  0.07812385
 -0.04012113 -0.0372388  -0.07440734 -0.02724568 -0.03397125  0.03077291
  0.1294352