Importing libraries


In [None]:
import pandas as pd
import tensorflow as tf
#import facts_preprocessing
import keras
from keras import layers
from keras.preprocessing.sequence import pad_sequences
import json
import numpy as np
from keras.preprocessing.text import Tokenizer

Upload cleaned CSV file, source: https://github.com/DavidOB1/supreme-court-nlp/

In [None]:
# Read the cleaned case file (Windows-1252 encoded) and discard every row
# that has a missing value in any column.
data = (
    pd.read_csv("clean_data.csv", encoding="Windows-1252")
    .dropna()
)

In [None]:
# Starts empty; load_data() adds the "facts", "first_party_won" and
# "ideologies" columns, and get_complete_data() rebinds it after dropna().
data_for_model = pd.DataFrame()

Define some variables

In [None]:
# Mini-batch size handed to model.fit().
batch_size = 4

# Width of each embedding vector.
embedding_dim = 100

# Length every facts sequence is padded/truncated to.
# NOTE(review): the original note "1000 for character level" sat directly
# above this line; presumably it refers to seq_length - confirm.
seq_length = 250

# Vocabulary cap given to the tokenizer and number of embedding rows.
# NOTE(review): the original note "75 for character level" sat directly
# above this line; presumably it refers to max_features - confirm.
max_features = 20000

# All-zero placeholder; build_embedding() replaces it with real vectors.
embedding_matrix = np.zeros(shape=(max_features, embedding_dim))

Keras Tokenizer


In [None]:
def tokenize_with_keras(text_data, char_level = False):
  """Fit a Keras ``Tokenizer`` on ``text_data`` and return the fitted tokenizer.

  Args:
    text_data: object exposing ``.values`` (the texts to fit on).
    char_level: forwarded to ``Tokenizer``; True tokenizes per character.

  Returns:
    The fitted ``Tokenizer``. Converting texts to id sequences and padding
    them is left to the callers (see load_data / get_complete_data).

  Side effects:
    Rebinds the module-level ``max_features`` to the tokenizer's ``num_words``.
  """
  global max_features
  fitted = Tokenizer(
      num_words=max_features,
      split=' ',
      char_level=char_level,
  )
  fitted.fit_on_texts(text_data.values)
  # num_words is the cap that was passed to the constructor above.
  max_features = fitted.num_words
  return fitted

Prepare the data

In [None]:
def load_data(use_keras = True):
  """Vectorize the case facts and assemble the model-ready columns.

  Fills the module-level ``data_for_model`` frame from the module-level
  ``data`` frame with three columns: ``facts`` (token-id sequences),
  ``first_party_won`` (1 or 0) and ``ideologies`` (parsed from JSON strings).

  Args:
    use_keras: if True, tokenize with ``tokenize_with_keras`` at character
      level; otherwise use the ``facts_preprocessing`` helpers and update the
      module-level ``max_features`` from the resulting vectorizer.

  Returns:
    A ``(data_for_model, vectorizer)`` tuple.
  """
  global max_features

  if use_keras:
    vectorizer = tokenize_with_keras(data["facts"], char_level = True)
    # texts_to_sequences returns a plain list. Assigning a bare list would
    # give the column a fresh 0..n-1 index, while the Series assigned below
    # carry the index of `data` (which has gaps after dropna()). pandas aligns
    # column assignment on index labels, so rows would be mispaired and padded
    # with NaN. Attach data's index explicitly to keep every row aligned.
    data_for_model["facts"] = pd.Series(
        vectorizer.texts_to_sequences(data["facts"].values),
        index=data.index,
    )
  else:
    # The module-level import of facts_preprocessing is commented out, so the
    # module is imported here, only when this branch is actually requested.
    import facts_preprocessing
    vectorizer = facts_preprocessing.create_vectorization_model(data["facts"])
    data_for_model["facts"] = data["facts"].apply(
        lambda text: facts_preprocessing.vectorize_string(text, vectorizer))
    max_features = vectorizer.num_words

  # `== True` is kept deliberately so non-boolean cell values map to 0 exactly
  # as before.
  data_for_model["first_party_won"] = data["first_party_won"].apply(
      lambda won: 1 if won == True else 0)
  data_for_model["ideologies"] = data["ideologies"].apply(json.loads)

  return data_for_model, vectorizer

If we're using Gensim, we need to make a matrix of vectors for use in the embedding layer

In [None]:
def build_embedding(vectorizer):
  """Fill the module-level ``embedding_matrix`` from a word-vector model.

  Args:
    vectorizer: object exposing ``num_words`` and ``wv`` (with
      ``key_to_index`` and item lookup by word).

  Side effects:
    Rebinds the module-level ``max_features`` to ``vectorizer.num_words`` and
    ``embedding_matrix`` to a new (max_features, embedding_dim) array, then
    prints how many rows were filled.
  """
  global max_features, embedding_matrix
  max_features = vectorizer.num_words
  # Rows that are never assigned below stay all-zero.
  embedding_matrix = np.zeros((max_features, embedding_dim))

  found = 0
  missing = 0
  for token, row in vectorizer.wv.key_to_index.items():
    vector = vectorizer.wv[token]
    # NOTE(review): every token comes from key_to_index itself, so this
    # lookup presumably never yields None - confirm whether the miss
    # counter can ever be non-zero.
    if vector is None:
      missing += 1
      continue
    embedding_matrix[row] = vector
    found += 1
  print("Converted %d words (%d misses)" % (found, missing))

Organize data into vectors for use in the model

In [None]:
def get_complete_data():
  """Turn ``data_for_model`` into the arrays fed to the model.

  Side effects:
    Rebinds the module-level ``data_for_model`` to itself with rows
    containing missing values removed.

  Returns:
    ``(X, I, Y)``: the facts sequences padded/truncated at the end to
    ``seq_length``, the ideology sequences padded to length 9, and the
    ``first_party_won`` column.
  """
  global data_for_model
  complete_rows = data_for_model.dropna()
  data_for_model = complete_rows

  facts_padded = pad_sequences(
      complete_rows["facts"],
      maxlen=seq_length,
      padding="post",
      truncating="post",
  )
  # NOTE(review): no dtype is passed here, so pad_sequences uses its default;
  # confirm that default is appropriate for the ideology values.
  ideologies_padded = pad_sequences(complete_rows["ideologies"], maxlen = 9)
  outcomes = complete_rows["first_party_won"]
  return facts_padded, ideologies_padded, outcomes

Build the actual model

In [None]:
def build_model(use_keras = True):
  """Build and compile the two-input CNN classifier.

  One branch embeds the padded facts sequence and runs it through three
  Conv1D stages followed by global average pooling; the other passes the 9
  ideology values through a small dense stack. Both branches are concatenated
  and reduced to a single sigmoid unit.

  Reads the module-level ``seq_length``, ``max_features``, ``embedding_dim``
  and, when ``use_keras`` is False, ``embedding_matrix``.

  Args:
    use_keras: True creates an embedding trained from scratch. False
      initialises the embedding from ``embedding_matrix`` and freezes it.

  Returns:
    The compiled ``keras.Model`` with inputs ``(text_input, ideology_input)``.
    The model summary is printed as a side effect.
  """
  # shape must be a tuple; `(seq_length)` without a comma is just an int.
  text_input = layers.Input(shape=(seq_length,))

  # Exactly one embedding is created. Without the else, the frozen pretrained
  # embedding would be silently replaced by the trainable one.
  if use_keras:
    embedding_layer = layers.Embedding(max_features, embedding_dim, input_length=seq_length)
  else:
    embedding_layer = layers.Embedding(
        max_features,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )
  embedding = embedding_layer(text_input)

  # Facts branch: conv -> pool -> batch norm, twice, then a final conv.
  conv_layer_1 = layers.Conv1D(1024,8,padding="same",activation="relu")(embedding)
  pool_1 = layers.MaxPool1D()(conv_layer_1)
  norm_1 = layers.BatchNormalization()(pool_1)
  conv_layer_2 = layers.Conv1D(1024,8,padding="same",activation="relu")(norm_1)
  pool_2 = layers.MaxPool1D(pool_size=2)(conv_layer_2)
  norm_2 = layers.BatchNormalization()(pool_2)
  conv_layer_3 = layers.Conv1D(1024,8,padding="same",activation="relu")(norm_2)
  text_pool = layers.GlobalAveragePooling1D()(conv_layer_3)

  # Ideology branch: 9 values through a narrowing dense stack.
  ideology_input = layers.Input(shape=(9,))
  i_1 = layers.Dense(100,activation="relu")(ideology_input)
  i_2 = layers.Dense(50,activation="relu")(i_1)
  i_3 = layers.Dense(20,activation="relu")(i_2)

  # Only the facts and ideology branches feed the classifier; the party-name
  # inputs are not part of this model.
  combined = layers.concatenate([text_pool,i_3])

  dense_1 = layers.Dense(1024,activation="relu")(combined)
  dense_2 = layers.Dense(128,activation="relu")(dense_1)
  norm_d = layers.BatchNormalization()(dense_2)
  dense_3 = layers.Dense(16,activation="relu")(norm_d)
  output = layers.Dense(1,activation="sigmoid")(dense_3)

  mod = keras.Model(inputs=(text_input, ideology_input),outputs=output)
  mod.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  mod.summary()
  return mod

In [None]:
# Build the model with a freshly trained (non-pretrained) embedding.
m = build_model(use_keras=True)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 250)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 250, 100)     2000000     ['input_2[0][0]']                
                                                                                                  
 conv1d_2 (Conv1D)              (None, 250, 1024)    820224      ['embedding_1[0][0]']            
                                                                                                  
 max_pooling1d_1 (MaxPooling1D)  (None, 125, 1024)   0           ['conv1d_2[0][0]']               
                                                                                              

In [None]:
# Render the architecture of `m` to model_plot.png, annotated with tensor
# shapes and layer names.
tf.keras.utils.plot_model(
    m,
    to_file='model_plot.png',
    show_shapes=True,
    show_layer_names=True,
)

Train and test the model

In [None]:
def train_model(use_keras = True):
  """Run the full pipeline: load, vectorize, build and fit the model.

  Args:
    use_keras: True uses the Keras tokenizer with a trainable embedding.
      False uses the facts_preprocessing vectorizer and a frozen embedding
      built from its word vectors.

  Returns:
    The fitted model.
  """
  # Only the vectorizer is needed here; load_data fills data_for_model itself.
  _, vec = load_data(use_keras)
  print("loaded data")
  X, I, Y = get_complete_data()
  print("vectorized data")
  if (not use_keras):
    # Must run before build_model: build_embedding sets the max_features and
    # embedding_matrix values that build_model reads for the frozen embedding.
    build_embedding(vec)
    print("built embeddings")
  # Forward use_keras so the model matches the chosen vectorization path.
  model = build_model(use_keras)
  print("built model")
  model.fit(x=(X,I), y = Y, batch_size=batch_size,epochs=10,validation_split=0.3)
  return model

In [None]:
# Display the current vocabulary cap; tokenize_with_keras, load_data and
# build_embedding can all rebind this module-level value.
max_features

20000

In [None]:
# Run the whole pipeline (load, vectorize, build, fit) on the Keras tokenizer path.
m = train_model(use_keras = True)

loaded data
vectorized data
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, 250)]        0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 250, 100)     2000000     ['input_10[0][0]']               
                                                                                                  
 conv1d_12 (Conv1D)             (None, 250, 1024)    820224      ['embedding_3[0][0]']            
                                                                                                  
 max_pooling1d_5 (MaxPooling1D)  (None, 125, 1024)   0           ['conv1d_12[0][0]']              
                                                                

KeyboardInterrupt: ignored

In [None]:
# Sanity check: how many ideology rows remain once missing values are dropped
# and each row is padded to 9 entries. (The previous bare .tolist() call was
# removed: its result was discarded, so it had no effect on this output.)
len(pad_sequences(data_for_model["ideologies"].dropna(), maxlen = 9))

3067

# Let's try something else - BERT

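BERT is a pretrained Transformer language model. Unlike GPT, which is trained to predict the next word in a sequence, BERT is trained to fill in masked-out words using the context on both sides. Its pretrained encoder can also be adapted for other purposes, like classifying text data. Below we load Legal-BERT (`nlpaueb/legal-bert-base-uncased`).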

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 41.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 47.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
from transformers import AutoTokenizer, TFAutoModel
# Single source of truth for the checkpoint so the tokenizer and the model
# can never drift apart.
BERT_CHECKPOINT = "nlpaueb/legal-bert-base-uncased"

# Both calls fetch the checkpoint (downloading it if it is not cached).
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = TFAutoModel.from_pretrained(BERT_CHECKPOINT)

Some layers from the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
def build_model(use_keras = True):
  """Build and compile the two-input CNN classifier.

  NOTE: once this cell is run, this definition replaces the ``build_model``
  defined earlier in the notebook.

  One branch embeds the padded facts sequence and runs it through three
  Conv1D stages followed by global average pooling; the other passes the 9
  ideology values through a small dense stack. Both branches are concatenated
  and reduced to a single sigmoid unit.

  Reads the module-level ``seq_length``, ``max_features``, ``embedding_dim``
  and, when ``use_keras`` is False, ``embedding_matrix``.

  Args:
    use_keras: True creates an embedding trained from scratch. False
      initialises the embedding from ``embedding_matrix`` and freezes it.

  Returns:
    The compiled ``keras.Model`` with inputs ``(text_input, ideology_input)``.
    The model summary is printed as a side effect.
  """
  # shape must be a tuple; `(seq_length)` without a comma is just an int.
  text_input = layers.Input(shape=(seq_length,))

  # Exactly one embedding is created. Without the else, the frozen pretrained
  # embedding would be silently replaced by the trainable one.
  if use_keras:
    embedding_layer = layers.Embedding(max_features, embedding_dim, input_length=seq_length)
  else:
    embedding_layer = layers.Embedding(
        max_features,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )
  embedding = embedding_layer(text_input)

  # Facts branch: conv -> pool -> batch norm, twice, then a final conv.
  conv_layer_1 = layers.Conv1D(1024,8,padding="same",activation="relu")(embedding)
  pool_1 = layers.MaxPool1D()(conv_layer_1)
  norm_1 = layers.BatchNormalization()(pool_1)
  conv_layer_2 = layers.Conv1D(1024,8,padding="same",activation="relu")(norm_1)
  pool_2 = layers.MaxPool1D(pool_size=2)(conv_layer_2)
  norm_2 = layers.BatchNormalization()(pool_2)
  conv_layer_3 = layers.Conv1D(1024,8,padding="same",activation="relu")(norm_2)
  text_pool = layers.GlobalAveragePooling1D()(conv_layer_3)

  # Ideology branch: 9 values through a narrowing dense stack.
  ideology_input = layers.Input(shape=(9,))
  i_1 = layers.Dense(100,activation="relu")(ideology_input)
  i_2 = layers.Dense(50,activation="relu")(i_1)
  i_3 = layers.Dense(20,activation="relu")(i_2)

  # Only the facts and ideology branches feed the classifier; the party-name
  # inputs are not part of this model.
  combined = layers.concatenate([text_pool,i_3])

  dense_1 = layers.Dense(1024,activation="relu")(combined)
  dense_2 = layers.Dense(128,activation="relu")(dense_1)
  norm_d = layers.BatchNormalization()(dense_2)
  dense_3 = layers.Dense(16,activation="relu")(norm_d)
  output = layers.Dense(1,activation="sigmoid")(dense_3)

  mod = keras.Model(inputs=(text_input, ideology_input),outputs=output)
  mod.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  mod.summary()
  return mod

In [None]:
# Take the first layer of the loaded model by index. Indexing states the
# intent (read, not remove) more clearly than pop(0).
# NOTE(review): assumes bert.layers returns a fresh list on each access, so
# the earlier pop(0) never actually removed anything from the model - confirm.
bert_layer = bert.layers[0]

In [None]:
# Display the extracted layer to check its type.
bert_layer

<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7fb4737759d0>