# Train Legal Classifiers

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

In [0]:
from johnsnowlabs import * 

# Multilabel classifier training

## Loading the data

In [0]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Legal/data/finance_data.csv

dbutils.fs.cp("file:/databricks/driver/finance_data.csv", "dbfs:/") 

In [0]:
import pandas as pd
df = pd.read_csv('/dbfs/finance_data.csv')
df['label'] = df['label'].apply(eval)

In [0]:
data = spark.createDataFrame(df)

# If you have a single dataset, then split it or else you can load the test dataset the same way that you load the train data.
train, test = data.randomSplit([0.8, 0.2], seed = 123)

In [0]:
train.show(truncate=50)

In [0]:
from pyspark.sql.functions import col

test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

## With Universal Encoder

In [0]:
document_assembler = nlp.DocumentAssembler() \
      .setInputCol("provision") \
      .setOutputCol("document") \
      .setCleanupMode("shrink")

embeddings = nlp.UniversalSentenceEncoder.pretrained() \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")

classsifierdl = nlp.MultiClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("label")\
      .setMaxEpochs(30)\
      .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        embeddings,
        classsifierdl
    ])

In [0]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

In [0]:
preds = clf_pipelineModel.transform(test)

In [0]:
preds_df = preds.select('label','provision',"class.result").toPandas()
preds_df.head()

Unnamed: 0,label,provision,result
0,"[waivers, amendments]",(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]"
1,[assignments],"(a) Seller, the Agent, each Managing Agent, ea...","[successors, assignments]"
2,[waivers],(a) Any provision of this Agreement may be wai...,"[waivers, amendments]"
3,[notices],(a) Except where telephonic instructions or no...,[notices]
4,"[governing laws, entire agreements]","(a) THIS AGREEMENT AND ANY CLAIM, CONTROVERSY,...","[governing laws, entire agreements]"


In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(preds_df['label'])
y_pred = mlb.fit_transform(preds_df['result'])


print("Classification report: \n", (classification_report(y_true, y_pred)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


## With RoBerta Embeddings

We do not have have any specific Legal Sentence Embeddings, but we can use Legal RoBerta Embeddings and then average them.

In [0]:
embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setMaxSentenceLength(512)

In [0]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("provision") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddingsSentence = nlp.SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = nlp.MultiClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label")\
    .setMaxEpochs(1)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl
    ])

In [0]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

In [0]:
preds = clf_pipelineModel.transform(test)

In [0]:
preds_df = preds.select('provision','label',"class.result").toPandas()

In [0]:
preds_df.head()

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]",[waivers]
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],"[successors, assigns]"
2,(a) Any provision of this Agreement may be wai...,[waivers],[]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,"(a) THIS AGREEMENT AND ANY CLAIM, CONTROVERSY,...","[governing laws, entire agreements]","[governing laws, entire agreements]"


In [0]:
import os
log_file_name = os.listdir("/root/annotator_logs")[0]

with open("/root/annotator_logs/"+log_file_name, "r") as log_file :
    print(log_file.read())

Training started - epochs: 6 - learning_rate: 0.001 - batch_size: 64 - training_examples: 22042 - classes: 15
Epoch 0/6 - 6.60s - loss: 0.096051626 - acc: 0.9705064 - batches: 345
Epoch 1/6 - 4.51s - loss: 0.039551556 - acc: 0.98879695 - batches: 345
Epoch 2/6 - 4.74s - loss: 0.03486474 - acc: 0.9903383 - batches: 345
Epoch 3/6 - 4.52s - loss: 0.0321748 - acc: 0.99127585 - batches: 345
Epoch 4/6 - 4.64s - loss: 0.030249653 - acc: 0.99202967 - batches: 345
Epoch 5/6 - 4.48s - loss: 0.028734822 - acc: 0.9925955 - batches: 345



In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(preds_df['label'])
y_pred = mlb.fit_transform(preds_df['result'])


print("Classification report: \n", (classification_report(y_true, y_pred)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


### Saving & loading back the trained model

In [0]:
clf_pipelineModel.stages

In [0]:
clf_pipelineModel.stages[-1].write().overwrite().save('/dbfs/MultilabelClfRoBerta')

In [0]:
# Load back  saved Multilabel Classifier Model
MultilabelClfModel = nlp.MultiClassifierDLModel.load('/dbfs/MultilabelClfRoBerta')

In [0]:
ld_pipeline = Pipeline(stages=[document_assembler, tokenizer, embeddings, embeddingsSentence, MultilabelClfModel])
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("provision"))

In [0]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [0]:
ld_preds_df = ld_preds.select('provision','label',"class.result").toPandas()

In [0]:
ld_preds_df.head(10)

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]",[waivers]
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],"[successors, assigns]"
2,(a) Any provision of this Agreement may be wai...,[waivers],[]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,"(a) THIS AGREEMENT AND ANY CLAIM, CONTROVERSY,...","[governing laws, entire agreements]","[governing laws, entire agreements]"
5,(a) This Agreement may be executed by one or m...,[counterparts],[counterparts]
6,A Any notice which either party hereto may be ...,[notices],[notices]
7,A counterpart original of this Amendment duly ...,[amendments],[counterparts]
8,Advisor represents that Advisor’s services und...,[representations],[]
9,"After the Termination Date, this Agreement sha...",[terminations],"[survival, terminations]"


# Multiclass classifier training

## Loading the data

In [0]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/legal/data/finance_clf_data.csv

In [0]:
import pandas as pd
df = pd.read_csv('/dbfs/finance_clf_data.csv')

In [0]:
df.head()

Unnamed: 0,text,label,len
0,Presently we do not believe any U S or State r...,business,402
1,\r\nnetwork outages or performance degradatio...,risk_factors,496
2,Available Information\r\nOur reports filed wit...,business,356
3,\r\n 42 530\r\n \r\n \r\n \r\n \r\n \r\n 42 5...,financial_statements,359
4,8\r\nTable of Contents\r\ndevelopment employee...,business,582


In [0]:
df['label'].value_counts()

In [0]:
data = spark.createDataFrame(df)

train, test = data.randomSplit([0.8, 0.2], seed = 100)

In [0]:
from pyspark.sql.functions import col

train.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

In [0]:
from pyspark.sql.functions import col

test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

## With Universal Encoder

In [0]:
document_assembler = nlp.DocumentAssembler() \
      .setInputCol("text") \
      .setOutputCol("document") 

embeddings = nlp.UniversalSentenceEncoder.pretrained() \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")

classsifierdl = legal.ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("label")\
      .setMaxEpochs(30)\
      .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        embeddings,
        classsifierdl
    ])

In [0]:
clf_pipelineModel = clf_pipeline.fit(train)

In [0]:
preds = clf_pipelineModel.transform(test)

In [0]:
preds_df = preds.select('label','text',"class.result").toPandas()
preds_df.head()

Unnamed: 0,label,text,result
0,risk_factors,\r\n\r\n \r\n\r\n\r\nNet cash provided by oper...,[financial_statements]
1,financial_statements,\r\n\r\n\r\n \r\n \r\n \r\n Identification of...,[financial_statements]
2,form_10k_summary,\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\...,[financial_statements]
3,financial_statements,\r\n \r\n 120\r\n \r\n \r\n \r\n 202\r\n \r\n...,[financial_statements]
4,risk_factors,\r\n \r\nAn assertion by a third party that w...,[risk_factors]


In [0]:
# The result is an array since in Spark NLP you can have multiple sentences.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [0]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['result']))

### Saving & loading back the trained model

In [0]:
clf_pipelineModel.stages

In [0]:
clf_pipelineModel.stages[-1].write().overwrite().save('/dbfs/Clf_Use')

In [0]:
# Load back  saved Classifier Model
ClfModel = legal.ClassifierDLModel.load('/dbfs/Clf_Use')

In [0]:
ld_pipeline = Pipeline(stages=[document_assembler, embeddings,ClfModel])
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))

In [0]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [0]:
ld_preds_df = ld_preds.select('text','label',"class.result").toPandas()

In [0]:
ld_preds_df.head()

Unnamed: 0,text,label,result
0,\r\n\r\n \r\n\r\n\r\nNet cash provided by oper...,risk_factors,[financial_statements]
1,\r\n\r\n\r\n \r\n \r\n \r\n Identification of...,financial_statements,[financial_statements]
2,\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\...,form_10k_summary,[financial_statements]
3,\r\n \r\n 120\r\n \r\n \r\n \r\n 202\r\n \r\n...,financial_statements,[financial_statements]
4,\r\n \r\nAn assertion by a third party that w...,risk_factors,[risk_factors]


## With RoBerta Embeddings

We do not have Legal Sentence Embeddings yet, But we can use the Legal RoBerta Embeddings and then average them.

In [0]:
embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setMaxSentenceLength(512)

In [0]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddingsSentence = nlp.SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = legal.ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label")\
    .setMaxEpochs(8)\
    .setLr(0.001)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl
    ])

In [0]:
clf_pipelineModel = clf_pipeline.fit(train)

In [0]:
preds = clf_pipelineModel.transform(test)

In [0]:
preds_df = preds.select('label','text',"class.result").toPandas()

In [0]:
preds_df.head()

Unnamed: 0,label,text,result
0,risk_factors,\r\n\r\n \r\n\r\n\r\nNet cash provided by oper...,[financial_statements]
1,financial_statements,\r\n\r\n\r\n \r\n \r\n \r\n Identification of...,[financial_statements]
2,form_10k_summary,\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\...,[financial_statements]
3,financial_statements,\r\n \r\n 120\r\n \r\n \r\n \r\n 202\r\n \r\n...,[financial_statements]
4,risk_factors,\r\n \r\nAn assertion by a third party that w...,[risk_factors]


In [0]:
log_files = os.listdir("/root/annotator_logs")

with open("/root/annotator_logs/"+log_files[0], "r") as log_file :
    print(log_file.read())

Training started - epochs: 8 - learning_rate: 0.001 - batch_size: 64 - training_examples: 9783 - classes: 14
Epoch 0/8 - 1.31s - loss: 352.6541 - acc: 0.57318145 - batches: 153
Epoch 1/8 - 1.09s - loss: 348.1564 - acc: 0.5952826 - batches: 153
Epoch 2/8 - 1.09s - loss: 347.85394 - acc: 0.596105 - batches: 153
Epoch 3/8 - 1.08s - loss: 347.59656 - acc: 0.5982637 - batches: 153
Epoch 4/8 - 1.06s - loss: 347.57974 - acc: 0.59982246 - batches: 153
Epoch 5/8 - 1.11s - loss: 347.59897 - acc: 0.600542 - batches: 153
Epoch 6/8 - 1.10s - loss: 347.61884 - acc: 0.6012784 - batches: 153
Epoch 7/8 - 1.07s - loss: 347.6311 - acc: 0.601998 - batches: 153



In [0]:
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['result']))


                         precision    recall  f1-score   support

               business       0.00      0.00      0.00       420
    controls_procedures       0.00      0.00      0.00        51
                 equity       0.00      0.00      0.00        49
             executives       0.00      0.00      0.00        39
executives_compensation       0.00      0.00      0.00        80
               exhibits       0.00      0.00      0.00        15
   financial_conditions       0.00      0.00      0.00       105
   financial_statements       0.58      0.96      0.72       743
       form_10k_summary       0.00      0.00      0.00       106
      legal_proceedings       0.00      0.00      0.00        22
            market_risk       0.00      0.00      0.00        46
             properties       0.00      0.00      0.00        24
           risk_factors       0.59      0.97      0.73       760
     security_ownership       0.00      0.00      0.00        14

               accuracy

# Save model and Zip it for Modelshub Upload/Downloads

In [0]:
# Save a Spark NLP model
clf_pipelineModel.stages[-1].write().overwrite().save('ClfBert')

# cd into saved dir and zip
! cd /content/ClfBert ; zip -r /content/ClfBert.zip *

  adding: classifierdl_tensorflow (deflated 58%)
  adding: fields/ (stored 0%)
  adding: fields/datasetParams/ (stored 0%)
  adding: fields/datasetParams/.part-00003.crc (stored 0%)
  adding: fields/datasetParams/.part-00002.crc (stored 0%)
  adding: fields/datasetParams/part-00000 (deflated 27%)
  adding: fields/datasetParams/.part-00000.crc (stored 0%)
  adding: fields/datasetParams/part-00002 (deflated 27%)
  adding: fields/datasetParams/._SUCCESS.crc (stored 0%)
  adding: fields/datasetParams/part-00003 (deflated 32%)
  adding: fields/datasetParams/part-00001 (deflated 26%)
  adding: fields/datasetParams/.part-00001.crc (stored 0%)
  adding: fields/datasetParams/_SUCCESS (stored 0%)
  adding: metadata/ (stored 0%)
  adding: metadata/part-00000 (deflated 39%)
  adding: metadata/.part-00000.crc (stored 0%)
  adding: metadata/._SUCCESS.crc (stored 0%)
  adding: metadata/_SUCCESS (stored 0%)
