![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings_JSL/Legal/80.0.Use_case_Capital_Calls.ipynb)

# Use Case: Extracting Contact and Financial entities from Capital Calls

# Installation

In [1]:
! pip install -q johnsnowlabs

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.7/74.7 KB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.4/95.4 KB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m453.8/453.8 KB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m570.6/570.6 KB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.3/82.3 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for databricks-cli (setup.py) ... [?25l[?25hdone


## Automatic Installation
Using my.johnsnowlabs.com SSO

In [1]:
from johnsnowlabs import nlp, finance

# nlp.install(force_browser=True)

## Manual downloading
If you are not registered in my.johnsnowlabs.com, you received a license via e-mail, or you are using Safari, you may need to upload the license manually.

- Go to my.johnsnowlabs.com
- Download your license
- Upload it using the following command

In [2]:
# `google.colab` is imported here, so this cell is meant to run inside Google Colab.
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
# The value returned by the upload is kept in `license_keys`.
license_keys = files.upload()

Please Upload your John Snow Labs License using the button below


- Install it

In [4]:
# Run the John Snow Labs installer. Per the log printed below, it detects the
# license file and installs PySpark plus the licensed library wheels.
nlp.install()

👌 Detected license file /content/4.2.8.json
📋 Stored John Snow Labs License in /root/.johnsnowlabs/licenses/license_number_0_for_Spark-Healthcare_Spark-OCR.json
👷 Setting up  John Snow Labs home in /root/.johnsnowlabs, this might take a few minutes.
Downloading 🐍+🚀 Python Library spark_nlp-4.2.8-py2.py3-none-any.whl
Downloading 🐍+💊 Python Library spark_nlp_jsl-4.2.8-py3-none-any.whl
Downloading 🫘+🚀 Java Library spark-nlp-assembly-4.2.8.jar
Downloading 🫘+💊 Java Library spark-nlp-jsl-4.2.8.jar
🙆 JSL Home setup in /root/.johnsnowlabs
Installing pyspark to /usr/bin/python3
👌 Detected license file /content/4.2.8.json
Installing /root/.johnsnowlabs/py_installs/spark_nlp_jsl-4.2.8-py3-none-any.whl to /usr/bin/python3
Installed 2 products:
🐍+⚡ PySpark==3.1.2 installed! ✅ The big data Engine 
💊 Spark-Healthcare==4.2.8 installed! ✅ Heal the planet with NLP! 


# Starting

In [3]:
# Start the session; `spark` is the handle used below to build DataFrames.
spark = nlp.start()

👌 Detected license file /content/4.2.8.json
👌 Launched [92mcpu optimized[39m session with with: 🚀Spark-NLP==4.2.8, 💊Spark-Healthcare==4.2.8, running on ⚡ PySpark==3.1.2


# Pipeline

In [5]:
# Each stage reads the output column(s) of the previous ones:
# text -> document -> sentence -> token -> embeddings -> ner -> ner_chunk.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    nlp.SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)

embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    .setMaxSentenceLength(512)
)

ner = (
    finance.NerModel.pretrained('finner_capital_calls', 'en', 'finance/models')
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

converter = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

pipeline = nlp.Pipeline(stages=[
    documentAssembler,
    sentence,
    tokenizer,
    embeddings,
    ner,
    converter,
])

# The pipeline is fitted on a one-row DataFrame holding an empty string;
# the resulting `model` is what the example cells call `transform` on.
df = spark.createDataFrame([[""]]).toDF("text")

model = pipeline.fit(df)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]
finner_capital_calls download started this may take some time.
[OK!]


# Some text examples

## Example 1: A short, semi-structured Capital Call Notice

In [None]:
from pyspark.sql import functions as F

In [10]:
# Notice made of "label: value" lines followed by a free-text paragraph.
text1 = """Capital Call Notice:
Fund: LMNOP Fund, GP
Amount Called: $15,000,000
Account Name: Dynamic Designs LLC
Bank Name: Wells Fargo Bank
Account Number: 15122142155
Routing Number: 992531023
Due Date: May 15th, 2022

Please be advised that this capital call is mandatory and it is important to meet the due date in order to avoid any penalties or default under the terms of the partnership agreement. If you need assistance or have any questions, please contact our office at 555-555-5555 or email us at admin@lmnopfund.com.

Thank you for your prompt attention to this matter."""

result1 = model.transform(spark.createDataFrame([[text1]]).toDF("text"))

# Zip each chunk text with its metadata, explode to one row per chunk, then
# project the text, the entity label and the confidence.
(
    result1
    .select(
        F.explode(
            F.arrays_zip(result1.ner_chunk.result, result1.ner_chunk.metadata)
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
        F.expr("cols['1']['confidence']").alias("confidence"),
    )
    .show(truncate=False)
)
                

+-------------------+--------------+----------+
|chunk              |ner_label     |confidence|
+-------------------+--------------+----------+
|LMNOP Fund         |FUND          |0.89905   |
|$15,000,000        |AMOUNT        |0.9819    |
|Dynamic Designs LLC|ACCOUNT_NAME  |0.9999333 |
|Wells Fargo Bank   |BANK_NAME     |1.0       |
|15122142155        |ACCOUNT_NUMBER|0.999     |
|992531023          |ABA           |0.7875    |
|May 15th, 2022     |DUE_DATE      |1.0       |
|555-555-5555       |PHONE         |1.0       |
|admin@lmnopfund.com|EMAIL         |0.9997    |
+-------------------+--------------+----------+



## Example 2: A short, unstructured Capital Call Notice with some additional information

In [13]:
# Letter-style notice: the payment details are embedded in running text.
text2 = """Greetings Sophia Martinez,

This is an urgent reminder regarding your investment in Innovative Investments LLC. A capital call for $30000.00 has been issued and is due on Jan 1 2023.

We highly recommend electronic transfer to the following account:

Account Name Innovative Investments LLC.
Account Number 1234567-6XX
ABA 42100191
Ally Financial

Please act swiftly to ensure the full funding of your investment. If you have any questions or need assistance, our team is available at thomasann@example.com.

Thank you for your cooperation and support.

Best regards,
Michael D. Davis"""

result2 = model.transform(spark.createDataFrame([[text2]]).toDF("text"))

# Zip each chunk text with its metadata, explode to one row per chunk, then
# project the text, the entity label and the confidence.
(
    result2
    .select(
        F.explode(
            F.arrays_zip(result2.ner_chunk.result, result2.ner_chunk.metadata)
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
        F.expr("cols['1']['confidence']").alias("confidence"),
    )
    .show(truncate=False)
)

+---------------------------+--------------+----------+
|chunk                      |ner_label     |confidence|
+---------------------------+--------------+----------+
|Sophia Martinez            |OTHER_PERSON  |1.0       |
|Innovative Investments LLC |ORG           |0.99986666|
|$30000.00                  |AMOUNT        |1.0       |
|Jan 1 2023                 |DUE_DATE      |1.0       |
|Innovative Investments LLC.|ACCOUNT_NAME  |0.99942505|
|1234567-6XX                |ACCOUNT_NUMBER|1.0       |
|42100191                   |ABA           |1.0       |
|Ally Financial             |BANK_NAME     |0.99915004|
|thomasann@example.com      |EMAIL         |1.0       |
|Michael D. Davis           |OTHER_PERSON  |0.980025  |
+---------------------------+--------------+----------+



## Example 3: A long Capital Call Notice with much additional information

In [28]:
# Long notice mixing the call itself with other amounts, dates and parties.
text3 = """The Next Big Thing Inc is calling 800000 USD from Great Oaks Venture Capital (the Fund) as a net contribution for an investment, management
fees and expenses, offset by a cash distribution. Please refer to the attached schedule for a breakdown
of the components of the amount, and to the narrative below for a description of the components
referred to in this notice.
Your portion of the call is 800000 USD. After this notice, you will have contributed 5% of your
commitment of 4000000 EUR, and your unfunded commitment will be USD 80000,000. You have received
USD 50000,000 in distributions, of which USD 30000 is considered Recallable Capital per Section 3.3 of the
LPA. Please refer to the Schedule for further details.
Please wire your funds to:
Date Due 2026.01.01
Amount Due USD 800000
Bank Name Citigroup
Bank Contact Charlotte Davis
Bank Address 00025 Cynthia Row Suite 132
ABA # 44000024
Credit to Next Big Thing Inc
Account # 1234567-2XX
Next Big Thing Inc (The Company) is calling USD 800000 from Great Oaks Venture Capital for an investment. The Next Big Thing Inc. is a manufacturer of widgets located in Akron, Ohio and will use the capital to expand into the Canadian market. The deal is expected
to close on 02.02.2023, and is expected to consist of a total financing of USD 4000000,000, with $ 800000
coming from the Citigroup in the form of a senior secured note and the remaining 500000 USD
coming from investment partner Great Oaks Venture Capital. Both Next Big Thing Inc and Great Oaks Venture Capital will receive
participating preferred stock with a 1x liquidation preference and a cumulative 1% dividend.

If you have any questions or concerns, please contact Michael D. Davis at (213)555-0303 or via email michaeldavis@example.com.
Best Regards,
Andew Smith"""


result3 = model.transform(spark.createDataFrame([[text3]]).toDF("text"))

# Zip each chunk text with its metadata, explode to one row per chunk, then
# project the text, the entity label and the confidence.
(
    result3
    .select(
        F.explode(
            F.arrays_zip(result3.ner_chunk.result, result3.ner_chunk.metadata)
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
        F.expr("cols['1']['confidence']").alias("confidence"),
    )
    .show(truncate=False)
)

+---------------------------+----------------+----------+
|chunk                      |ner_label       |confidence|
+---------------------------+----------------+----------+
|The Next Big Thing Inc     |ORG             |0.9851    |
|800000 USD                 |AMOUNT          |0.99934995|
|Great Oaks Venture Capital |FUND            |0.99969995|
|800000 USD                 |AMOUNT          |1.0       |
|5%                         |OTHER_PERCENTAGE|1.0       |
|4000000 EUR                |OTHER_AMOUNT    |1.0       |
|USD 80000,000              |OTHER_AMOUNT    |1.0       |
|USD 50000,000              |OTHER_AMOUNT    |1.0       |
|USD 30000                  |OTHER_AMOUNT    |1.0       |
|2026.01.01                 |DUE_DATE        |1.0       |
|USD 800000                 |AMOUNT          |0.99975   |
|Citigroup                  |BANK_NAME       |0.9997    |
|Charlotte Davis            |BANK_CONTACT    |0.9997    |
|00025 Cynthia Row Suite 132|BANK_ADDRESS    |0.99990004|
|44000024     

# Let's visualize the results with Spark NLP Viz library
To do that, we need Light Pipelines

In [29]:
# Wrap the fitted pipeline model in a LightPipeline; the next cell applies it
# directly to a Python string.
lp = nlp.LightPipeline(model)

In [30]:
# Annotate the Example 3 text. The cells below read `res[0]['ner_chunk']`,
# where each item exposes `.result` and `.metadata`.
res = lp.fullAnnotate(text3)

In [31]:
from johnsnowlabs import viz
# Visualizer object used to render the annotated text.
nerviz = viz.NerVisualizer()

# Display the Example 3 annotation, using the 'ner_chunk' output as labels.
nerviz.display(res[0], label_col='ner_chunk')

# You can even create a graph to store the results in a Graph Database!

In [49]:
import networkx as nx

# A newly constructed Graph has no nodes or edges, so re-running this cell
# always resets `G` to an empty graph.
G = nx.Graph()
G.nodes()

NodeView(())

## Using Plotly to show the graph

In [50]:
import plotly.graph_objects as go
import random

def get_nodes_from_graph(graph, pos, node_color):
  """Build the Plotly scatter trace that draws the nodes of a networkx graph.

  Args:
    graph: graph whose nodes each carry an 'attr_dict' attribute holding an
      'entity' key.
    pos: mapping from node to an (x, y) pair.
    node_color: marker colour applied to every node.

  Returns:
    A `go.Scatter` trace in 'markers+text' mode: the node name is the visible
    text and the node's entity is the hover text.
  """
  labels = list(graph.nodes())
  entities = [graph.nodes[label]['attr_dict']['entity'] for label in labels]
  points = [pos[label] for label in labels]
  xs = [x for x, _ in points]
  ys = [y for _, y in points]

  marker_style = {'color': node_color, 'size': 40, 'line_width': 2}
  return go.Scatter(
    x=xs,
    y=ys,
    text=labels,
    hovertext=entities,
    mode='markers+text',
    hoverinfo='text',
    marker=marker_style)


def get_edges_from_graph(graph, pos, edge_color):
  """Build the Plotly traces that draw the edges of a networkx graph.

  Args:
    graph: graph whose edges each carry an 'attr_dict' attribute holding a
      'relation' key.
    pos: mapping from node to an (x, y) pair.
    edge_color: colour used for both the edge lines and their text labels.

  Returns:
    A tuple `(edge_trace, labels_trace)`: a 'lines' trace with one segment per
    edge, and a 'text' trace that puts each relation name at the midpoint of
    its edge.
  """
  edge_x = []
  edge_y = []
  relations = []
  label_x = []
  label_y = []
  for edge in graph.edges():
    relation = graph.edges[edge]['attr_dict']['relation']
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    # The trailing None separates this segment from the next one, so edges
    # are drawn as independent lines instead of one connected path.
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])
    relations.append(relation)
    label_x.append((x0 + x1) / 2)
    label_y.append((y0 + y1) / 2)

  edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=2, color=edge_color),
    mode='lines')

  # The label text is the relation name, so the hover box says "relation"
  # (it previously said "weight", although no edge weight is stored).
  labels_trace = go.Scatter(x=label_x, y=label_y, mode='text',
                            textfont={'color': edge_color},
                            marker_size=0.5,
                            text=relations,
                            textposition='top center',
                            hovertemplate='relation: %{text}')
  return edge_trace, labels_trace


def show_graph_in_plotly(graph, node_color='white', edge_color='grey', seed=None):
  """Lay out a networkx graph with a spring layout and display it with Plotly.

  Args:
    graph: graph to draw; its nodes and edges must carry the 'attr_dict'
      attributes read by `get_nodes_from_graph` / `get_edges_from_graph`.
    node_color: marker colour of the nodes.
    edge_color: colour of the edge lines and of their labels.
    seed: forwarded to `nx.spring_layout`. Leave as None for a different
      layout on every call, or pass an int to get a reproducible picture.

  Returns:
    None. The figure is rendered through `fig.show()`.
  """
  pos = nx.spring_layout(graph, seed=seed)
  node_trace = get_nodes_from_graph(graph, pos, node_color)
  edge_trace, labels_trace = get_edges_from_graph(graph, pos, edge_color)

  hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
  fig = go.Figure(data=[edge_trace, node_trace, labels_trace],
                  layout=go.Layout(
                      # `title.font` replaces the deprecated `titlefont_size`
                      # layout shorthand.
                      title=dict(text='Visualization', font=dict(size=16)),
                      showlegend=False,
                      width=1600,
                      height=1000,
                      xaxis=dict(hidden_axis),
                      yaxis=dict(hidden_axis)))
  # NOTE(review): the selector only matches traces whose mode is exactly
  # 'markers'; the node trace is built with 'markers+text', so this restyle
  # may not apply to it - confirm whether that is intended.
  fig.update_traces(marker=dict(size=12,
                                line=dict(width=2,
                                          color='DarkSlateGrey')),
                    selector=dict(mode='markers'))
  fig.show()

## Getting the center of our graph: the company requesting the call

In [51]:
# Take the first ORG chunk in document order as the center of the graph.
# Indexing into a set() of strings picks an arbitrary element when several
# distinct ORG mentions exist, so the result could change between runs.
center = next(
    (chunk.result for chunk in res[0]['ner_chunk'] if chunk.metadata['entity'] == 'ORG'),
    None,
)
if center is None:
  raise ValueError("No ORG entity was detected, so the graph has no center node")
center

'The Next Big Thing Inc'

In [52]:
# The attributes are stored under the 'attr_dict' key, which is the key the
# plotting helper reads the node's entity from.
G.add_node(center,  attr_dict={'entity': 'ORG'})

## Creating an "OTHER" node for additional information, not capital-call related

In [53]:
# Name of the extra node that groups entities whose label starts with 'OTHER'.
OTHER = "OTHER"

In [54]:
# Add the grouping node, using the same 'attr_dict' layout as the center node.
G.add_node(OTHER,  attr_dict={'entity': 'OTHER'})

In [55]:
# Link the center to the grouping node; the relation name is stored under the
# 'attr_dict' key, which is where the plotting helper reads edge labels from.
G.add_edge(center, OTHER, attr_dict={'relation': "has_additional_info"})  

## Getting the rest of the entities and relations

In [56]:
# Add one node per detected chunk and connect it either to the center or,
# for labels starting with 'OTHER', to the OTHER grouping node.
for x in res[0]['ner_chunk']:
  ent_type = x.metadata['entity']
  ent_text = x.result
  ent_conf = x.metadata['confidence']

  # The center node already exists; processing its own chunk would only add
  # an edge from the center to itself (a self-loop).
  if ent_text == center:
    continue

  G.add_node(ent_text,  attr_dict={'entity': ent_type, 'confidence': ent_conf})
  relation = f'has_{ent_type}'

  parent = OTHER if ent_type.startswith('OTHER') else center
  G.add_edge(parent, ent_text, attr_dict={'relation': relation.lower()})

In [57]:

# Draw the graph with the default colours (white nodes, grey edges).
show_graph_in_plotly(G)