In [9]:
import numpy as np
import pandas as pd
import pickle
import bcolz
import mysql.connector

from nltk import word_tokenize
from sklearn.cluster import KMeans
from sentence_transformers import util

## Load Data

In [2]:
chunksize = 100000
chunks = []
chunk_no=0
for chunk in pd.read_csv('./actor_action_desc.csv', chunksize=chunksize):
    print(chunk_no)
    chunks.append(chunk)
    chunk_no+=1
    if chunk_no==2:
        break

0
1


In [3]:
df_case_action = pd.concat(chunks,ignore_index=True)
del chunks
df_case_action.columns = ['Actor','Action','Description']
df_case_action.head()

Unnamed: 0,Actor,Action,Description
0,Plaintiff,Application filed,Application filed on 04/27/2011 at Boston Muni...
1,Plaintiff,Appearance filed,"Appearance for Potfolio Recovery Associates, L..."
2,Plaintiff,Application filed,Supplementary process application filed; filin...
3,Court,Event scheduled,HEARING SCHEDULED for 06/09/2011 10:00 AM.
4,Court,Return of Service,Return of service on complaint and summons to ...


In [17]:
df_case_action.tail()

Unnamed: 0,Actor,Action,Description
53505511,,Return of Service,Return of service Summons ret. w/serv. RE: CRA...
53505512,,Return of Service,Return of service Summons ret. w/serv. RE: NCO...
53505513,,Return of Service,Return of service Summons ret. w/serv. RE PORT...
53505514,,Event resulted,Event Resulted The following event: Case Mana...
53505515,,,Motion Receiver's mot. for approval of propose...


In [16]:
df_case_action['Actor'].notnull() & df_case_action['Action'].notnull()

0            True
1            True
2            True
3            True
4            True
            ...  
53505511    False
53505512    False
53505513    False
53505514    False
53505515    False
Length: 53505516, dtype: bool

In [69]:
mydb = mysql.connector.connect(host='73.38.248.152', user='buspark', password='U@5p1r3!')

if (mydb):
    print("Connection Successful")

else:
    print("Connection Unsuccessful")

mycursor = mydb.cursor()

mycursor.execute("Show databases")

for db in mycursor:
    print(db)

Connection Successful
('information_schema',)
('civica_courtdocs',)
('wp_courtdocs',)
('wp_courtdocs_NORMALIZED',)


In [54]:
df = pd.read_sql("SELECT * FROM wp_courtdocs.cdocs_case_action_index as c_a_index \
                          WHERE c_a_index.action != ' ' and c_a_index.actor != ' ' LIMIT 0, 100000;"
                          ,con = mydb)

## Load Embeddings

In [4]:
vectors = bcolz.open('./glove.6B.50d.dat')[:]
words = pickle.load(open('./glove.6B.50d_words.pkl', 'rb'))
word2idx = pickle.load(open('./glove.6B.50d_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

## Encode Sentences

In [8]:
text = df_case_action['Description'].astype('str').values
text_embeddings = np.zeros((text.shape[0],50))
for i,sentence in enumerate(text):
    tokens = word_tokenize(sentence)
    sentence_emb = []
    for token in tokens:
        try:
            sentence_emb.append(glove[token.lower()])
        except:
            continue
    if len(sentence_emb) == 0:
        print(sentence)
    else:
        text_embeddings[i] = np.mean(np.array(sentence_emb),axis=0)
    

Order-mailed
INVETORY
Order-mailed
Order-mailed 11-30-2010
Agreement-JB
Affidavit-Kiernan Joliat
Default-No Capias
Order-mailed
Order-mailed
Order-mailed


In [11]:
text_embeddings.shape

(200000, 50)

## Clustering

In [12]:
def community_detection(embeddings, threshold=0.75, min_community_size=10, init_max_size=1000):
    """
    Function for Fast Community Detection
    Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
    Returns only communities that are larger than min_community_size. The communities are returned
    in decreasing order. The first element in each list is the central point in the community.
    """

    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # Minimum size for a community
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    # Filter for rows >= min_threshold
    extracted_communities = []
    for i in range(len(top_k_values)):
        if top_k_values[i][-1] >= threshold:
            new_cluster = []

            # Only check top k most similar entries
            top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
            top_idx_large = top_idx_large.tolist()
            top_val_large = top_val_large.tolist()

            if top_val_large[-1] < threshold:
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break

                    new_cluster.append(idx)
            else:
                # Iterate over all entries (slow)
                for idx, val in enumerate(cos_scores[i].tolist()):
                    if val >= threshold:
                        new_cluster.append(idx)

            extracted_communities.append(new_cluster)

    # Largest cluster first
    extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True)

    # Step 2) Remove overlapping communities
    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        add_cluster = True
        for idx in community:
            if idx in extracted_ids:
                add_cluster = False
                break

        if add_cluster:
            unique_communities.append(community)
            for idx in community:
                extracted_ids.add(idx)

    return unique_communities

In [22]:
clusters = community_detection(text_embeddings[:10000], min_community_size=2, threshold=0.95)

In [23]:
df_case_action.iloc[:10000]['Action'].value_counts()

Event scheduled                1220
Fee due                         597
Event resulted                  547
Appearance filed                412
Judgment                        396
                               ... 
Motion to amend judgment          1
Deposit in Court                  1
Amended Answer filed              1
Arbitration                       1
Must file to avoid judgment       1
Name: Action, Length: 146, dtype: int64

In [26]:
len(clusters)
df = df_case_action.iloc[:10000]

In [37]:
print(df.iloc[clusters[5]]['Action'].value_counts())
print(df.iloc[clusters[5]]['Actor'].value_counts())
df.iloc[clusters[5]]

Counsel added       165
Appearance filed      1
Name: Action, dtype: int64
Plaintiff     139
Defendant      12
Intervenor      1
Name: Actor, dtype: int64


Unnamed: 0,Actor,Action,Description
8850,Plaintiff,Counsel added,On this date Wed Feb 22 00:00:00 EST 2012 Wils...
5783,Plaintiff,Counsel added,Filed\nOn this date Wed Jan 29 00:00:00 EST 20...
8705,Plaintiff,Counsel added,Filed\nOn this date Mon Dec 17 00:00:00 EST 20...
8348,Plaintiff,Counsel added,Filed\nOn this date Mon Oct 06 00:00:00 EDT 20...
7625,Plaintiff,Counsel added,Filed\nOn this date Mon Oct 06 00:00:00 EDT 20...
...,...,...,...
6688,Plaintiff,Counsel added,Filed\nOn this date Wed Feb 12 00:00:00 EST 20...
4133,Plaintiff,Counsel added,Filed\nOn this date Sat Apr 12 00:00:00 EDT 20...
3563,Plaintiff,Counsel added,Filed\nOn this date Mon Oct 07 00:00:00 EDT 20...
3550,Plaintiff,Counsel added,Filed\nOn this date Mon Oct 07 00:00:00 EDT 20...


In [53]:
kmeans = KMeans(n_clusters = 161, init = 'k-means++', random_state = 42)
kmeans.fit(text_embeddings)

KMeans(n_clusters=161, random_state=42)

In [38]:
preds = kmeans.predict(text_embeddings)

In [44]:
for i in range(len(preds)):
    print(text[i])
    print('predicted : ',preds[i])
    print('actual : ',df.at[i,'actor'])
    print('\n')
    if i==20:
        break

Certificate of Death
predicted :  4
actual :  Administrator/rix CTA


Appointment of Agent
predicted :  0
actual :  Administrator/rix CTA


Petition for Administration CTA
predicted :  3
actual :  Administrator/rix CTA


Certificate of Death (Mary Lorraine Vernaglia)
predicted :  2
actual :  Administrator/rix CTA


Petition for Sale of Real Estate
predicted :  0
actual :  Administrator/rix CTA


Petition for Administration CTA
predicted :  3
actual :  Administrator/rix CTA


Copy of Will dated 09/19/1989 3p.
predicted :  2
actual :  Administrator/rix CTA


Certificate of Death
predicted :  4
actual :  Administrator/rix CTA


Petition for Order of Complete Settlement
predicted :  0
actual :  Administrator/rix CTA


Petition for Administration
predicted :  3
actual :  Administrator/rix CTA


Certificate of Death
predicted :  4
actual :  Administrator/rix CTA


Petition for Administration CTA
predicted :  3
actual :  Administrator/rix CTA


Certificate of Death
predicted :  4
actual :  Ad