# BERT is not all you need - at least in LegalTech - draft

This short notebook was inspired by this post: https://www.linkedin.com/feed/update/urn:li:activity:6782558075611037697/ and by this article / master's thesis: https://arxiv.org/pdf/2103.11792.pdf

In [25]:
import json
import lzma
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import f1_score

def calculate_accuracy_f1_score_for_set(x, y, name):
    """Print accuracy and F1 for one labelled split.

    Uses the notebook-level ``vectorizer`` and ``model`` that are current at
    call time, so it always scores the most recently fitted pair.

    Parameters
    ----------
    x : iterable of str
        Raw texts to vectorize and classify.
    y : array-like
        True labels aligned with ``x``.
    name : str
        Split name used only in the printed messages.
    """
    features = vectorizer.transform(x)
    y_pred = model.predict(features)

    # Fraction of exact label matches.
    n_correct = sum(y == y_pred)
    acc = n_correct / len(y)
    print(f"Accuracy for {name} set {acc}")

    # F1 is reported with 'majority' treated as the positive class.
    score = f1_score(y, y_pred, pos_label='majority')
    print(f"F1 for {name} set {score}")

def get_number_of_opinions(row):
    """Return how many opinions a case record holds.

    Parameters
    ----------
    row : object
        Record exposing a ``casebody`` attribute shaped like
        ``{"data": {"opinions": [...]}}``.

    Returns
    -------
    int or None
        Number of opinions, or ``None`` (after printing the record) when an
        ``AttributeError`` is raised while reading it.
    """
    try:
        opinions = row.casebody["data"]["opinions"]
        count = len(opinions)
    except AttributeError:
        # Show the offending record so it can be inspected in the cell output.
        print(row)
        return None
    else:
        return count
    
def get_label_and_text(row):
    """Flatten one ``DataFrame.iterrows()`` item into per-opinion tuples.

    Parameters
    ----------
    row : tuple
        ``(index, record)`` pair where ``record.casebody`` is shaped like
        ``{"data": {"opinions": [{"type": ..., "text": ...}, ...]}}``.

    Returns
    -------
    list of tuple
        One ``(index, opinion["type"], opinion["text"])`` tuple per opinion.
        An empty list is returned when the record cannot be read, so the
        result can always be passed straight to ``list.extend``.
    """
    try:
        case_id, case = row[0], row[1]
        opinions = case.casebody["data"]["opinions"]
        return [(case_id, opinion["type"], opinion["text"]) for opinion in opinions]
    except (AttributeError, KeyError, IndexError, TypeError):
        # Malformed record: only data-shape errors are swallowed (a bare
        # ``except`` would also hide KeyboardInterrupt and real bugs).
        return []

# Load data

In [2]:
# Parse every case record from the xz-compressed JSON Lines dumps
# (one file per jurisdiction) into a single list of dicts.
data = []
for file in ["./data_legal/Arkansas-20200302-text//data/data.jsonl.xz",
             "./data_legal/Illinois-20200302-text///data/data.jsonl.xz",
             "./data_legal/New Mexico-20200302-text///data/data.jsonl.xz",
             "./data_legal/North Carolina-20200416-text///data/data.jsonl.xz"]:
    # Stream the archive in text mode instead of read() + decode() + splitlines():
    # str.splitlines() also breaks on characters such as U+2028/U+0085, which may
    # appear unescaped inside JSON strings and would cut a record in two. Line
    # iteration only splits on real newlines and avoids holding extra full copies.
    with lzma.open(file, 'rt', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) carry no record and are skipped.
        data.extend(json.loads(line) for line in f if line.strip())

In [3]:
# Turn the list of parsed JSON records into a DataFrame (rebinds `data`).
data = pd.DataFrame(data)

In [4]:
# Opinion count per case; None where the record has no usable `casebody`.
data["length"] = [get_number_of_opinions(case) for _, case in data.iterrows()]

In [5]:
# Distinct opinion counts that occur across cases.
data["length"].unique()

array([1, 2, 3, 4, 0, 5, 8, 7, 6])

In [6]:
# Flatten cases into one (case index, opinion type, opinion text) tuple per opinion.
all_cases = []
for row in data.iterrows():
    # get_label_and_text may return None for an unreadable record;
    # `or []` keeps extend() from raising TypeError in that case.
    all_cases.extend(get_label_and_text(row) or [])

In [7]:
# Rebind `data`: from here on it holds one row per opinion instead of one row per case.
data = pd.DataFrame(all_cases)

In [8]:
# Name the tuple positions produced by get_label_and_text.
data.columns = ["case_id", "label", "text"]

In [9]:
# Number of opinions labelled as dissents.
(data["label"] == 'dissent').sum()

18650

In [10]:
# Show the per-opinion frame (the notebook renders a truncated view).
data

Unnamed: 0,case_id,label,text
0,0,majority,OPINION OF THE COIÍRT. This is an action of de...
1,1,majority,OPINION OP THE COURT. This is an appeal from t...
2,2,majority,"CROSS, Judge.\nThe record in this case shows t..."
3,3,majority,"W. H.“Dub” Arnold, Chief Justice.\nThis is a c..."
4,3,rehearing,SUPPLEMENTAL OPINION ON DENIAL OF REHEARING\nW...
...,...,...,...
388634,358814,majority,PER CURIAM.\nJustice EDMUNDS took no part in t...
388635,358815,majority,"HUDSON, Justice.\nHere we are asked to determi..."
388636,358816,majority,1. State’s Motion for Temporary Stay (COA14-41...
388637,358817,majority,1. State’s Motion for Temporary Stay (COA15-15...


# Prepare smaller dataset

In [11]:
# Class balance across all opinion types.
data.label.value_counts()

majority                                     358489
dissent                                       18650
concurrence                                    7765
concurring-in-part-and-dissenting-in-part      2060
rehearing                                      1663
on-motion-to-strike-cost-bill                     4
on-the-merits                                     4
remittitur                                        3
unanimous                                         1
Name: label, dtype: int64

In [12]:
# Keep only the two classes of interest; every other opinion type is dropped.
is_majority_or_dissent = data["label"].isin(["majority", "dissent"])
data_to_check = data[is_majority_or_dissent]

In [13]:
# Down-sample the majority class to 30,000 opinions (seeded so the draw is repeatable).
majority_only = data_to_check[data_to_check["label"] == "majority"]
majority_samples = majority_only.sample(n=30000, random_state=123)

In [14]:
# Sampled majority opinions plus every dissent from the full per-opinion frame.
dissent_only = data[data["label"] == "dissent"]
data_sampled = pd.concat([majority_samples, dissent_only])

In [15]:
# Keep only the first row seen for each case_id.
# NOTE(review): the train/test split cells below read `data_sampled`, not this
# de-duplicated frame — confirm whether `data_to_check` was meant to be used.
data_to_check = data_sampled.drop_duplicates(subset="case_id")

In [17]:
# Class balance after de-duplicating by case_id.
data_to_check.label.value_counts()

majority    30000
dissent     16198
Name: label, dtype: int64

# "Experiment" on the original opinions

In [18]:
# Hold out 30% of the sampled opinions, stratified by label; the held-out part
# is divided into test and validation sets in the next cell.
x_train, x_test, y_train, y_test = train_test_split(
    data_sampled["text"],
    data_sampled["label"],
    test_size=0.3,
    stratify=data_sampled["label"],
    random_state=123,
)

In [19]:
# Split the held-out 30% in half: one part stays the test set, the other becomes
# the validation set.
# NOTE: this cell overwrites x_test / y_test, so re-running it on its own halves
# the test set again — re-run the previous split cell first.
x_test, x_valid, y_test, y_valid = train_test_split(
    x_test,
    y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=123,
)

In [20]:
# Sizes of the train / test / validation splits.
len(x_train), len(x_test), len(x_valid)

(34055, 7297, 7298)

In [21]:
# A callable `analyzer` replaces scikit-learn's whole preprocess / tokenize /
# stop-word pipeline, so the `tokenizer`, `preprocessor` and `stop_words`
# arguments previously passed here were ignored. They are dropped to make the
# actual behaviour explicit: lowercase + whitespace split, punctuation kept
# attached to tokens, and English stop words NOT removed.
vectorizer = TfidfVectorizer(analyzer=lambda text: text.lower().split())

In [22]:
# Fit the TF-IDF vectorizer on the training texts only and transform them in one step.
x_train_vectorized = vectorizer.fit_transform(x_train.tolist())

In [23]:
# XGBoost classifier with library defaults apart from a fixed seed.
model = XGBClassifier(random_state=123)
# Trailing expression: the cell displays the fitted estimator's parameters.
model.fit(x_train_vectorized, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=20, num_parallel_tree=1,
              random_state=123, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

### Results

In [26]:
# Score the test split with the currently fitted `vectorizer` / `model`.
calculate_accuracy_f1_score_for_set(x_test, y_test, "test")

Accuracy for test set 0.9946553378100589
F1 for test set 0.9956700344176751


In [27]:
# Score the validation split with the currently fitted `vectorizer` / `model`.
calculate_accuracy_f1_score_for_set(x_valid, y_valid, "valid")

Accuracy for valid set 0.9950671416826528
F1 for valid set 0.9960053262316911


### Words that are most important

In [28]:
# Column indices of features whose importance exceeds the 0.02 cut-off.
is_important = model.feature_importances_ > 0.02
words_ids = np.flatnonzero(is_important)

In [29]:
# Invert the vectorizer's token -> column-index mapping so indices can be turned back into tokens.
vocab = {index: token for token, index in vectorizer.vocabulary_.items()}

In [30]:
# Print the token behind each important feature, one per line.
for word_id in words_ids:
    token = vocab[word_id]
    print(token)

concur.
dissenting:
i
majority
you


# "Experiment" on the truncated opinions (the first 100 characters of each opinion — where the word "dissenting" typically appears in dissents — are removed)

### Removing "dissenting"

In [31]:
# Per label, how many opinions mention "dissenting" within their first 100 characters?
# Falsy texts (e.g. empty strings) count as "not present".
mentions_dissenting = data_sampled["text"].apply(
    lambda text: "dissenting" in text[:100] if text else False
)
data_sampled[mentions_dissenting].label.value_counts()

dissent     17824
majority       18
Name: label, dtype: int64

In [32]:
# Drop the first 100 characters of every opinion (not just the word "dissenting"),
# so the header that usually names the opinion type is no longer visible to the model.
data_sampled["truncated_text"] = data_sampled["text"].map(lambda text: text[100:])

### And the same procedure...

In [33]:
# Same 70/30 stratified split as before, this time on the truncated texts.
x_train, x_test, y_train, y_test = train_test_split(
    data_sampled["truncated_text"],
    data_sampled["label"],
    test_size=0.3,
    stratify=data_sampled["label"],
    random_state=123,
)

In [34]:
# Halve the held-out part into test and validation sets.
# NOTE: this cell overwrites x_test / y_test, so re-running it on its own halves
# the test set again — re-run the previous split cell first.
x_test, x_valid, y_test, y_valid = train_test_split(
    x_test,
    y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=123,
)

In [35]:
# A callable `analyzer` replaces scikit-learn's whole preprocess / tokenize /
# stop-word pipeline, so the `tokenizer`, `preprocessor` and `stop_words`
# arguments previously passed here were ignored. They are dropped to make the
# actual behaviour explicit: lowercase + whitespace split, punctuation kept
# attached to tokens, and English stop words NOT removed.
vectorizer = TfidfVectorizer(analyzer=lambda text: text.lower().split())

In [36]:
# Fit the fresh TF-IDF vectorizer on the truncated training texts and transform them.
x_train_vectorized = vectorizer.fit_transform(x_train.tolist())

In [37]:
# New classifier for the truncated-text experiment (rebinds `model`); same seed as before.
model = XGBClassifier(random_state=123)
# Trailing expression: the cell displays the fitted estimator's parameters.
model.fit(x_train_vectorized, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=20, num_parallel_tree=1,
              random_state=123, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [38]:
# NOTE(review): this re-defines the helper of the same name from the first code
# cell with an identical body; a single definition would be enough.
def calculate_accuracy_f1_score_for_set(x, y, name):
    """Print accuracy and F1 for one labelled split.

    Uses the notebook-level ``vectorizer`` and ``model`` that are current at
    call time.

    Parameters
    ----------
    x : iterable of str
        Raw texts to vectorize and classify.
    y : array-like
        True labels aligned with ``x``.
    name : str
        Split name used only in the printed messages.
    """
    features = vectorizer.transform(x)
    y_pred = model.predict(features)

    # Fraction of exact label matches.
    n_correct = sum(y == y_pred)
    acc = n_correct / len(y)
    print(f"Accuracy for {name} set {acc}")

    # F1 is reported with 'majority' treated as the positive class.
    score = f1_score(y, y_pred, pos_label='majority')
    print(f"F1 for {name} set {score}")


### Results

In [39]:
# Score the truncated-text test split with the currently fitted `vectorizer` / `model`.
calculate_accuracy_f1_score_for_set(x_test, y_test, "test")

Accuracy for test set 0.9693024530629025
F1 for test set 0.9752977503308337


In [40]:
# Score the truncated-text validation split with the currently fitted `vectorizer` / `model`.
calculate_accuracy_f1_score_for_set(x_valid, y_valid, "valid")

Accuracy for valid set 0.9693066593587284
F1 for valid set 0.9753629564452265


### Important words

In [41]:
# Column indices of features whose importance exceeds the 0.02 cut-off.
is_important = model.feature_importances_ > 0.02
words_ids = np.flatnonzero(is_important)

In [42]:
# Invert the vectorizer's token -> column-index mapping so indices can be turned back into tokens.
vocab = {index: token for token, index in vectorizer.vocabulary_.items()}

In [43]:
# Print the token behind each important feature, one per line.
for word_id in words_ids:
    token = vocab[word_id]
    print(token)

concur.
i
majority
you
