# [Title]

[Description]

## I. Setup

In [None]:
import pandas as pd
import tensorflow as tf

from random import sample, seed

from transformers import AutoTokenizer, AutoModel

## II. Load Data

#### Split Data into Training, Validation, and Test Sets (Based on Policies)

In [None]:
# Read the full dataset of policy segments from disk
data = pd.read_csv('../data.csv')

# Seed Python's RNG so the policy sampling below draws the same values each run
seed(0)

# Number of policies per split. Only 'training' and 'val' are read below;
# the test split is simply every policy left over after those two draws.
splits = {'training':75,'val':20,'test':20}
# splits = {'training':28,'val':8,'test':4}


def _segments_and_flags(frame):
    """Return (segments, flags) lists for the rows of `frame`.

    After dropping `policy_id` and `segment_id`, the first remaining column is
    taken as the segment and the next four columns as its tuple of flags.
    """
    segments, flags = [], []
    remaining = frame.drop(['policy_id', 'segment_id'], axis=1)
    for row in remaining.itertuples(index=False, name=None):
        segments.append(row[0])
        flags.append((row[1], row[2], row[3], row[4]))
    return segments, flags


# Training: sample whole policies so no policy is shared between splits.
# NOTE(review): the population handed to `sample` comes from iterating a `set`,
# so repeatability across sessions relies on that order being stable for the
# policy_id dtype - confirm.
all_policies = list(set(data.policy_id))
policies_for_training = sample(all_policies, splits['training'])
in_training = data.policy_id.isin(policies_for_training)
train_segments, train_flags = _segments_and_flags(data[in_training])

# Validation: sample from the policies that were not picked for training
leftover_policies = list(set(data[~in_training].policy_id))
policies_for_val = sample(leftover_policies, splits['val'])
val_segments, val_flags = _segments_and_flags(data[data.policy_id.isin(policies_for_val)])

# Test: every policy that is in neither the training nor the validation sample
held_out = ~data.policy_id.isin(policies_for_training + policies_for_val)
test_segments, test_flags = _segments_and_flags(data[held_out])

# Drop the intermediates; only the *_segments / *_flags lists are needed from here on
del data, policies_for_training, all_policies, leftover_policies, in_training, held_out

## III. BERT Model

In [None]:
# Sequence length shared by the tokenizer calls (as their `max_length`)
# and by the Keras `Input` layers in `create_model` (as their shape).
max_length = 400

In [None]:
# Checkpoint name used for both the tokenizer and the pretrained weights
model_checkpoint = 'nlpaueb/legal-bert-base-uncased'
# Tokenizer matching the checkpoint; used to encode the train/val/test segments
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): `model` is not referenced by the cells visible here (`create_model`
# loads its own `TFBertModel`) - confirm whether this load is still needed.
model = AutoModel.from_pretrained(model_checkpoint)

In [None]:
# Tokenize each split into TensorFlow tensors of width exactly `max_length`:
# longer segments are truncated and shorter ones are padded up to `max_length`.
# `padding='max_length'` is required here rather than `padding=True`, which only
# pads to the longest sequence in the call and so can yield a width different
# from the fixed `(max_length,)` input shape the Keras model is built with.
tokenizer_kwargs = dict(
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='tf',
)

train_encodings = tokenizer(train_segments, **tokenizer_kwargs)

val_encodings = tokenizer(val_segments, **tokenizer_kwargs)

test_encodings = tokenizer(test_segments, **tokenizer_kwargs)

### TEST ZONE STARTS HERE

In [None]:
from tensorflow import keras
from keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel

def create_model(
        checkpoint=model_checkpoint,
        num_labels=1,
        hidden_size=201,
        dropout=0.5,
        learning_rate=0.0001
    ):
    """
    Build and compile a multi-output classification model on top of BERT.

    The BERT pooler output feeds one shared ReLU hidden layer followed by
    dropout; four independent sigmoid heads then read from that hidden layer.

    Args:
        checkpoint: Name or path passed to `TFBertModel.from_pretrained`.
        num_labels: Number of units in each sigmoid output head.
        hidden_size: Number of units in the shared hidden layer.
        dropout: Dropout rate applied after the hidden layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled `keras.Model` whose inputs are
        `[input_ids, token_type_ids, attention_mask]` (each of length
        `max_length`, read from module scope) and whose outputs are a dict
        keyed by 'first_party_data_output', 'third_party_sharing_output',
        'first_party_choice_output' and 'third_party_choice_output'.
    """
    # Named `encoder` so it does not shadow the module-level `bert_model`
    encoder = TFBertModel.from_pretrained(checkpoint)

    # Train all layers in BERT
    encoder.trainable = True

    # Define the BERT inputs
    input_ids = Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')
    bert_inputs = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    }

    # Element 1 of the BERT output is the pooler output, which feeds the hidden layer
    pooler_token = encoder(bert_inputs)[1]

    # Shared hidden layer of `hidden_size` units
    hidden = Dense(
        units=hidden_size,
        activation='relu',
        name='hidden_layer'
    )(pooler_token)

    # Add a dropout layer
    hidden = Dropout(dropout)(hidden)

    # One sigmoid head per flag; each layer name doubles as the key of the
    # model's output dict, so labels must be supplied under the same names.
    output_names = (
        'first_party_data_output',
        'third_party_sharing_output',
        'first_party_choice_output',
        'third_party_choice_output',
    )
    outputs = {
        name: Dense(
            units=num_labels,
            activation='sigmoid',
            name=name
        )(hidden)
        for name in output_names
    }

    classification_model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=outputs
    )

    # `metrics` is passed as a list, the documented form for `compile`
    classification_model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return classification_model

# Build the classifier with the default hyperparameters and print its layer summary
bert_model = create_model()
bert_model.summary()

In [None]:
import numpy as np

# TODO: move this label preparation up next to the data-loading cell
# Position of each flag inside a flags tuple -> name of the model output it trains
flag_names = (
    'first_party_data_output',
    'third_party_sharing_output',
    'first_party_choice_output',
    'third_party_choice_output',
)

# For every split, build one tensor per output, keyed by that output's name
train_flags2, val_flags2, test_flags2 = (
    {
        name: tf.convert_to_tensor([row[position] for row in split_flags])
        for position, name in enumerate(flag_names)
    }
    for split_flags in (train_flags, val_flags, test_flags)
)

# Model inputs, in the order the model's input list was declared
train_inputs = [
    train_encodings.input_ids,
    train_encodings.token_type_ids,
    train_encodings.attention_mask,
]
val_inputs = [
    val_encodings.input_ids,
    val_encodings.token_type_ids,
    val_encodings.attention_mask,
]

# Fine-tune for 3 epochs in batches of 12, validating on the validation split
bert_model_history = bert_model.fit(
    train_inputs,
    train_flags2,
    validation_data=(val_inputs, val_flags2),
    batch_size=12,
    epochs=3
)

In [None]:
# Evaluate the fitted model on the test split
score = bert_model.evaluate([
    test_encodings.input_ids,
    test_encodings.token_type_ids,
    test_encodings.attention_mask
    ],
    test_flags2
)

# The model has four outputs, so `score` holds more than one loss and one
# accuracy, and its second entry is not an overall accuracy. Print each value
# under the label the model itself reports for it in `metrics_names`.
for metric_name, metric_value in zip(bert_model.metrics_names, score):
    print(f'Test {metric_name}:', round(metric_value, 5))

In [None]:
# Predict on the test split; `predictions` is indexed by output name below
test_inputs = [
    test_encodings.input_ids,
    test_encodings.token_type_ids,
    test_encodings.attention_mask,
]
predictions = bert_model.predict(test_inputs)

# NOTE(review): 0.05 is an unusually low cut-off for sigmoid outputs -
# confirm it is intentional and not a typo for 0.5.
threshold = 0.05

# One column of predictions per output, in the same order as the flags tuples
prediction_columns = [
    predictions[output_name]
    for output_name in (
        'first_party_data_output',
        'third_party_sharing_output',
        'first_party_choice_output',
        'third_party_choice_output',
    )
]

# Turn each row of four scores into a tuple of booleans (True when above the threshold)
predictions2 = [
    tuple(value.item() > threshold for value in row)
    for row in zip(*prediction_columns)
]

from sklearn.metrics import classification_report

# Compare the thresholded predictions with the test flags
report = classification_report(test_flags, predictions2)
print(report)