# Notice and Choice?: Automated Privacy Policy Analysis with BERT

## I. Load Packages

In [1]:
import numpy as np
import pandas as pd
from random import sample, seed
from sklearn.metrics import classification_report
from transformers import TFBertModel, AutoTokenizer
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam

  from .autonotebook import tqdm as notebook_tqdm


## II. Load Data

Data is read in as it was produced by the *data_processing.ipynb* notebook.

Data is split into training, validation, and test sets. To avoid training data being leaked into the test or validation sets that could occur when different segments of the same policy appear in multiple sets, the split is performed at the policy level (75 policies in training, the remaining split evenly into test and validation)

In [2]:
# Load the Data
data = pd.read_csv('../data.csv')
baseline = pd.read_csv('../baseline.csv')

# Set the seed to fix the samples in place
seed(0)

splits = {'training':75,'val':20,'test':20}
# splits = {'training':28,'val':8,'test':4}

# Sample 75 of the 115 unique policy_id values, then save the other 40 for test
policies_for_training = sample(list(set(data.policy_id)), splits['training'])
train = data[data.policy_id.isin(policies_for_training)]
train = list(train.drop(['policy_id', 'segment_id'], axis=1).itertuples(index=False, name=None))
train_segments = [row[0] for row in train]
train_flags = [(row[1], row[2], row[3], row[4]) for row in train]
train_flags = {
    'first_party_data_output': tf.convert_to_tensor([x[0] for x in train_flags]),
    'third_party_sharing_output': tf.convert_to_tensor([x[1] for x in train_flags]),
    'first_party_choice_output': tf.convert_to_tensor([x[2] for x in train_flags]),
    'third_party_choice_output': tf.convert_to_tensor([x[3] for x in train_flags]),
    }

# # Sample 20 of the remaining for validation
policies_for_val = sample(list(set(data[~data.policy_id.isin(policies_for_training)].policy_id)), splits['val'])
val = data[data.policy_id.isin(policies_for_val)]
val = list(val.drop(['policy_id', 'segment_id'], axis=1).itertuples(index=False, name=None))
val_segments = [row[0] for row in val]
val_flags = [(row[1], row[2], row[3], row[4]) for row in val]
val_flags = {
        'first_party_data_output': tf.convert_to_tensor([x[0] for x in val_flags]),
        'third_party_sharing_output': tf.convert_to_tensor([x[1] for x in val_flags]),
        'first_party_choice_output': tf.convert_to_tensor([x[2] for x in val_flags]),
        'third_party_choice_output': tf.convert_to_tensor([x[3] for x in val_flags]),
        }

# # Sample 20 of the remaining for test
test = data[~data.policy_id.isin(policies_for_training+policies_for_val)]
test = list(test.drop(['policy_id', 'segment_id'], axis=1).itertuples(index=False, name=None))
test_segments = [row[0] for row in test]
test_flags_simple = [(row[1], row[2], row[3], row[4]) for row in test]
test_flags = {
        'first_party_data_output': tf.convert_to_tensor([x[0] for x in test_flags_simple]),
        'third_party_sharing_output': tf.convert_to_tensor([x[1] for x in test_flags_simple]),
        'first_party_choice_output': tf.convert_to_tensor([x[2] for x in test_flags_simple]),
        'third_party_choice_output': tf.convert_to_tensor([x[3] for x in test_flags_simple]),
        }

test_baseline = baseline[~data.policy_id.isin(policies_for_training+policies_for_val)].to_dict()

del data, policies_for_training, train, val, test, baseline

## III. Set Hyperparameters

In [3]:
# Embedding hyperparameters
MODEL_CHECKPOINT = 'nlpaueb/legal-bert-base-uncased' # Only uncased is available
# MODEL_CHECKPOINT = 'bert-base-uncased' # Only uncased is available
MAX_LENGTH = 150

# Model hyperparameters
HIDDEN_LAYER_SIZE = 128
DROPOUT_RATE = 0.5
LEARNING_RATE = 0.005

# Train-time hyperparameters
BATCH_SIZE = 50
EPOCHS = 3

# Classification hyperparameters
THRESHOLD = 0.025

## IV. Tokenize Text from Policy Segments

In [4]:
# tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_CHECKPOINT)

# tokenize the dataset, truncate when passed `MAX_LENGTH`, 
# and pad with 0's when less than `MAX_LENGTH` and return a tf Tensor
tr_encodings = tokenizer(
    train_segments,
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    return_tensors='tf'
)

val_encodings = tokenizer(
    val_segments, 
    truncation=True, 
    padding=True, 
    max_length=MAX_LENGTH, 
    return_tensors='tf'
)

test_encodings = tokenizer(
    test_segments, 
    truncation=True, 
    padding=True, 
    max_length=MAX_LENGTH, 
    return_tensors='tf'
)

## V. Define the Model

In [5]:
def create_model(
        checkpoint=MODEL_CHECKPOINT,
        hidden_size=HIDDEN_LAYER_SIZE, 
        dropout=DROPOUT_RATE,
        learning_rate=LEARNING_RATE
    ):
    """
    Build a multi-label classification model using Legal-BERT
    One output head per binary label we wish to classify
    """
    bert_model = TFBertModel.from_pretrained(checkpoint)                                              

    # Train all layers in BERT
    bert_model.trainable = True

    # Define the BERT inputs
    input_ids = Input(shape=(MAX_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = Input(shape=(MAX_LENGTH,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = Input(shape=(MAX_LENGTH,), dtype=tf.int64, name='attention_mask_layer')
    bert_inputs = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    }

    # Define the pooler output from BERT to pass to our hidden layer
    pooler_token = bert_model(bert_inputs)[1]

    # Create a hidden layer
    hidden = Dense(
        units=hidden_size, activation='relu', name='hidden_layer'
    )(pooler_token)

    # Add a dropout layer
    hidden = Dropout(dropout)(hidden)

    # Output layers for each of the 4 labels
    labs = list(train_flags.keys())
    outputs = {}

    for i in range(len(labs)):
        outputs[labs[i]] = Dense(units=1, activation='sigmoid', name=labs[i])(hidden)

    classification_model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask], 
        outputs=outputs
    )
    
    classification_model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy', 
        metrics='accuracy'
    )

    return classification_model

bert_model = create_model()
bert_model.summary()

Some layers from the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask_layer (InputLay  [(None, 150)]       0           []                               
 er)                                                                                              
                                                                                                  
 input_ids_layer (InputLayer)   [(None, 150)]        0           []                               
                                                                                                  
 token_type_ids_layer (InputLay  [(None, 150)]       0           []                               
 er)                                                                                              
                                                                                              

## VI. Train the Model

In [6]:
bert_model_history = bert_model.fit(
    [tr_encodings.input_ids, tr_encodings.token_type_ids, tr_encodings.attention_mask], 
    train_flags,
    validation_data=(
        [val_encodings.input_ids, val_encodings.token_type_ids, val_encodings.attention_mask], 
        val_flags
    ),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## VII. Evaluate the Model

In [7]:
test_predictions = bert_model.predict([
    test_encodings.input_ids, 
    test_encodings.token_type_ids, 
    test_encodings.attention_mask
])



In [14]:
test_predictions2 = [
    (a.item() > THRESHOLD, b.item() > THRESHOLD, c.item() > THRESHOLD, d.item() > THRESHOLD) 
    for a, b, c, d in zip(
        test_predictions['first_party_data_output'],
        test_predictions['third_party_sharing_output'],
        test_predictions['first_party_choice_output'],
        test_predictions['third_party_choice_output'])
]

print(classification_report(
    test_flags_simple,
    test_predictions2
))

              precision    recall  f1-score   support

           0       0.15      1.00      0.26        97
           1       0.15      1.00      0.27       100
           2       0.04      1.00      0.07        23
           3       0.00      0.00      0.00        12

   micro avg       0.11      0.95      0.20       232
   macro avg       0.08      0.75      0.15       232
weighted avg       0.13      0.95      0.23       232
 samples avg       0.11      0.30      0.16       232



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
score = bert_model.evaluate([
    test_encodings.input_ids, 
    test_encodings.token_type_ids, 
    test_encodings.attention_mask
    ],
    test_flags
)

print('Test loss:', round(score[0], 5)) 
print('Test accuracy:', round(score[1], 5))

