# Creating Embeddings and Storing them in a Pinecone Index

Here the SBert and irish_law2_vec embeddings are created using the judgment data stored in the "irish_judgment_HC-CA-SC_refined_clean_df_v2.csv" file. This also includes training my own word2Vec model (called irish_law2Vec). Following this, the SBert embeddings were stored in the vector database known as Pinecone.

In [None]:
!pip install sentence-transformers
!pip install transformers
!pip install tokenizer
!pip install s3fs
!pip install boto3

In [2]:
# standard library imports
import sys
import pickle

# related third party imports
import torch
import gensim
import nltk
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# imports for accessing s3 bucket
import s3fs
import boto3
from getpass import getpass



In [3]:
# input private aws credentials if using Google Colab
print('Input AWS access key ID:')
aws_access_key_id = getpass()
print('Input AWS secret access key:')
aws_secret_access_key = getpass()

Input AWS access key ID:
··········
Input AWS secret access key:
··········


In [4]:
# this cell is for s3 bucket access when using Google Colab

# enter authentication credentials
s3 = boto3.resource('s3', aws_access_key_id = aws_access_key_id,
                          aws_secret_access_key = aws_secret_access_key)

# define bucket & file
my_bucket = s3.Bucket('legal-research-thesis-data')

# list bucket objects
for my_bucket_object in my_bucket.objects.all():
    print(my_bucket_object)





s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='SBert_embeddings_mpnet.pkl')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_judgment_HC-CA-SC_refined_clean_df.csv')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_judgment_HC-CA-SC_refined_clean_df_model_training.csv')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_judgment_HC-CA-SC_refined_clean_df_v2.csv')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_law2vec_embeddings.pkl')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_law2vec_model.txt')


In [5]:
# read file from s3 if using Google Colab
s3 = boto3.client('s3', aws_access_key_id = aws_access_key_id,
                          aws_secret_access_key = aws_secret_access_key)

# define bucket
my_bucket = 'legal-research-thesis-data'

training_object = s3.get_object(Bucket = my_bucket, Key = 'irish_judgment_HC-CA-SC_refined_clean_df_v2.csv')
judgment_object_clean = s3.get_object(Bucket = my_bucket, Key = 'irish_judgment_HC-CA-SC_refined_clean_df_v2.csv')

# read csv file from s3 into dataframes
training_df = pd.read_csv(training_object['Body'])
judgments_clean_df =pd.read_csv(judgment_object_clean['Body'])

print(training_df.head())
print(judgments_clean_df.head())

   Unnamed: 0  judgment_id neutral_citation  \
0           0            0  [2020] IEHC 628   
1           1            1   [2015] IESC 72   
2           2            2  [2013] IEHC 536   
3           3            3  [1997] IEHC 133   
4           4            4  [2019] IEHC 230   

                                      judgment_title judgment_date  \
0  TMT Digital centre Limited & anor  v  Grehan &...    2020-11-27   
1                  Fingal County Council  v  Kennedy    2015-07-31   
2      S.O & anor  v  Refugee Appeals Tribunal & ors    2013-11-01   
3                                  D.P.P. v. D. (J.)    1997-07-29   
4  X (a minor)  v  The Board of Management of Sch...    2019-03-29   

      court_name   judgment_by judgment_status  \
0     High Court    Twomey J.         Approved   
1  Supreme Court    Laffoy J.         Approved   
2     High Court     Clark J.         Approved   
3     High Court       No data         No data   
4     High Court   Barrett J.         Approved

In [None]:
print(len(judgments_clean_df))

17917


# Sentence Bert

In [6]:

# instantiate sentence level Bert from HuggingFace
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:

# method to create embeddings from judgment documents using Sentence Bert
def create_sentence_embeddings(model):

    # set sequence length to maximum
    model.max_seq_length = 512

    embeddings = model.encode(judgments_clean_df.judgment.to_list())
    print(embeddings)

    return embeddings

# call create_sentence_embeddings() method
sbert_embeddings = create_sentence_embeddings(model)

# save embeddings to new column in dataframe
judgments_clean_df['sbert_embeddings']=sbert_embeddings.tolist()
print(judgments_clean_df.iloc[0])


[[ 0.00275445  0.00612127  0.02597547 ...  0.02404665 -0.05809844
   0.04707154]
 [-0.04703782 -0.02117731  0.02813832 ...  0.00569782 -0.08960135
   0.03052621]
 [ 0.02815215 -0.03501812 -0.01074056 ...  0.03119799  0.01989095
   0.06181829]
 ...
 [-0.03423378 -0.0234972   0.0310467  ...  0.01787127 -0.03799784
   0.04735771]
 [ 0.00434431 -0.03618313 -0.00158401 ...  0.0240287   0.00039362
   0.04975927]
 [ 0.01038131  0.00534071  0.04947361 ...  0.03058869 -0.08950339
   0.0023316 ]]
Unnamed: 0                                                          0
judgment_id                                                         0
neutral_citation                                      [2020] IEHC 628
judgment_title      TMT Digital centre Limited & anor  v  Grehan &...
judgment_date                                              2020-11-27
court_name                                                 High Court
judgment_by                                                Twomey J. 
judgment_status   

In [None]:
# save embeddings from google colab
from google.colab import files
import pickle as pkl

# write SBert embeddings to pickle file
with open('SBert_embeddings_mpnet.pkl', 'wb') as pkl:
    pickle.dump(sbert_embeddings, pkl)

# download file locally
files.download('SBert_embeddings_mpnet.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# read Sbert Embeddings from S3 into Google Colab
s3 = boto3.client('s3', aws_access_key_id = aws_access_key_id,
                          aws_secret_access_key = aws_secret_access_key)

# define bucket & files
my_bucket = 'legal-research-thesis-data'
sbert_embeddings_object = 'SBert_embeddings_mpnet.pkl'

# save embeddings from bucket to file
with open('SBert_embeddings_mpnet.pkl', 'wb') as embeddings:
    s3.download_fileobj(my_bucket, sbert_embeddings_object, embeddings)

# Load embeddings from pickle file
with open('SBert_embeddings_mpnet.pkl', 'rb') as embeddings:
    sbert_embeddings = pickle.load(embeddings)

print(sbert_embeddings)


[[ 0.00275445  0.00612127  0.02597547 ...  0.02404665 -0.05809844
   0.04707154]
 [-0.04703782 -0.02117731  0.02813832 ...  0.00569782 -0.08960135
   0.03052621]
 [ 0.02815215 -0.03501812 -0.01074056 ...  0.03119799  0.01989095
   0.06181829]
 ...
 [-0.03423376 -0.02349717  0.03104671 ...  0.01787126 -0.03799781
   0.04735773]
 [ 0.0043443  -0.03618313 -0.00158402 ...  0.0240287   0.00039365
   0.04975929]
 [ 0.01038132  0.00534074  0.04947361 ...  0.03058868 -0.08950338
   0.0023316 ]]


In [12]:
print(len(sbert_embeddings))

17917


In [8]:
# method to embed user queries with sentence bert
def create_sb_query_embeddings(query, model):

    query_embedding = model.encode(query)

    return query_embedding


In [9]:
query = "caused an accident driving while drinking"

In [10]:
# method to find closest query documents using cosine similarity

# define k
top_k_docs = 10

def cosine_sim_documents(query, model, sbert_embeddings, judgments_clean_df, top_k_docs):

    # create query embedding
    query_embedding = create_sb_query_embeddings(query, model)

    # find cosine similarity
    cos_sim_scores = util.cos_sim(query_embedding, sbert_embeddings)

    # return top scores
    doc_values,doc_ids = torch.topk(cos_sim_scores, k = top_k_docs)

    # convert values to dict
    top_judgments_dict = {
    'cos_sim_scores': doc_values.tolist(),
    'judgment_id': doc_ids.tolist()
  }

    # Get values from dict
    cos_sim_scores_list = top_judgments_dict['cos_sim_scores'][0]
    judgment_ids_list = top_judgments_dict['judgment_id'][0]

    # Add values to similarity  DF
    similarities_df = pd.DataFrame({'judgment_id': judgment_ids_list, 'cos_sim_scores': cos_sim_scores_list})

    # return top judgments from df using ids held in similairties_df
    judgment_results =judgments_clean_df[judgments_clean_df.index.isin(similarities_df['judgment_id'])]

    # merge dataframes
    ranked_judgments_df = pd.merge(similarities_df, judgment_results, on="judgment_id")

    # drop unnamed column
    ranked_judgments_df = ranked_judgments_df.drop(columns=['Unnamed: 0'])

    # sort by cos_sim_scores value
    ranked_judgments_df.sort_values(by='cos_sim_scores',ascending=False,inplace=True)

    return ranked_judgments_df

cosine_sim_documents(query, model, sbert_embeddings, judgments_clean_df, top_k_docs)




Unnamed: 0,judgment_id,cos_sim_scores,neutral_citation,judgment_title,judgment_date,court_name,judgment_by,judgment_status,judgment,clean_judgment,judgment_url
0,119,0.605365,[2020] IECA 350,Director of Public Prosecutions v Whelan,2020-10-14,Court of Appeal,McCarthy J.,Approved,\nTHE COURT OF APPEAL\n[97/2018]\nEdwards J.\n...,court appeal edward mccarthy kennedy j people ...,https://courts.ie/acc/alfresco/a43f99b3-46cd-4...
1,11599,0.603368,[2019] IECA 12,Director of Public Prosecutions v Moran,2019-01-21,Court of Appeal,Birmingham P.,Approved,THE COURT OF APPEAL [67/17] The Presid...,court appeal president edward mcgovern j peopl...,https://courts.ie/acc/alfresco/a69162f3-d91a-4...
2,5770,0.598771,[2009] IESC 1,Hussey v Twomey & ors,2009-01-21,Supreme Court,Kearns J.,Approved,THE SUPREME COURT Geoghegan J. Kearns J. F...,supreme court geoghegan kearns finnegan j 103 ...,https://courts.ie/acc/alfresco/70a2c5ee-87bd-4...
3,15871,0.592904,[2005] IEHC 17,Hussey v Twomey & ors,2005-01-18,High Court,Peart J.,Approved,Neutral Citation No: [2005] IEHC 17 THE HIG...,neutral citation 2005 iehc 17 high court recor...,https://courts.ie/acc/alfresco/825789dc-a6ce-4...
4,15152,0.591752,[2016] IECA 326,Director of Public Prosecutions v Power,2016-11-01,Court of Appeal,Sheehan J.,Approved,THE COURT OF APPEAL Birmingham J. Sheeha...,court appeal birmingham sheehan mahon j people...,https://courts.ie/acc/alfresco/087ad6f3-a196-4...
5,2275,0.586602,[2016] IECA 305,Director of Public Prosecutions v Kazinski,2016-10-24,Court of Appeal,Birmingham J.,Approved,THE COURT OF APPEAL Birmingham J. Sheehan...,court appeal birmingham sheehan edward j peopl...,https://courts.ie/acc/alfresco/3e771a81-e13a-4...
6,2196,0.582143,[2017] IEHC 3,Director of Public Prosecutions v Laing,2017-01-13,High Court,Eagar J.,Approved,Neutral Citation [2017] IEHC 3 THE HIGH COURT...,neutral citation 2017 iehc 3 high court 2016 7...,https://courts.ie/acc/alfresco/d65ed287-4b9e-4...
7,11785,0.576436,[2014] IEHC 514,O'Brien v Reilly & Anor,2014-10-31,High Court,Herbert J.,Approved,Neutral Citation: [2014] IEHC 514 THE HIGH C...,neutral citation 2014 iehc 514 high court 2009...,https://courts.ie/acc/alfresco/8b3731b1-2254-4...
8,6776,0.57472,[2001] IEHC 88,Fitzgerald v. D.P.P.,2001-05-04,High Court,No data,No data,THE HIGH COURT JUDICIAL REVIEW No.299/19...,high court judicial review brian fitzgerald ap...,https://www.bailii.org/ie/cases/IEHC/2001/88.html
9,9453,0.569209,[2017] IECA 31,Director of Public Prosecutions v Eric Ryan Jnr,2017-02-06,Court of Appeal,Birmingham J.,Approved,THE COURT OF APPEAL Record No. 136CJA/16 ...,court appeal record birmingham sheehan mahon j...,https://courts.ie/acc/alfresco/8427fabb-c86d-4...


# Word2Vec

In [None]:
# ensure no NaN rows exist
training_df = training_df[training_df['clean_judgment'].notnull()]

# split into list of words
sentences =  [row.split() for row in training_df['clean_judgment']]


In [None]:
# define word2vec parameters with skip-gram(i.e., 1)
irish_law2vec_model = Word2Vec(min_count = 5, size = 100, sg = 1)

In [None]:
# build vocab
irish_law2vec_model.build_vocab(sentences)

In [None]:
# train word2vec model on irish case law
irish_law2vec_model.train(sentences, total_examples = irish_law2vec_model.corpus_count, epochs = 15)

(765069913, 853253565)

In [None]:
# save model on google colab
with open('irish_law2vec.txt', 'wb') as pkl:
    pickle.dump(irish_law2vec_model, pkl)

In [6]:
# load model from S3 into Google Colab

from gensim.models import Word2Vec

# read file from s3
s3 = boto3.client('s3', aws_access_key_id = aws_access_key_id,
                          aws_secret_access_key = aws_secret_access_key)

# define bucket
my_bucket = 'legal-research-thesis-data'

s3.download_file(my_bucket, 'irish_law2vec_model.txt', 'irish_law2vec_model.txt')

# load model
irish_law2vec_model = gensim.models.KeyedVectors.load_word2vec_format('irish_law2vec_model.txt', binary=False)




In [18]:
print('Vocabulary size:', len(irish_law2vec_model))

Vocabulary size: 54930


In [28]:
# test model
irish_law2vec_model.most_similar(positive=["assault"])

[('sexual', 0.7925685048103333),
 ('indecent', 0.7815971374511719),
 ('rape', 0.7701523303985596),
 ('burglary', 0.7306751608848572),
 ('incident', 0.7244697213172913),
 ('robbery', 0.7178521156311035),
 ('arson', 0.7156529426574707),
 ('assaulting', 0.7063090801239014),
 ('battery', 0.7040494084358215),
 ('sexually', 0.6990309953689575)]

In [7]:
# method to get embeddings for each judgment or query
def get_w2v_embeddings(document):

    embeddings = []

    # check length of document
    if len(document)>1:
      for token in document:

        # check if token in model vocab
        if token in irish_law2vec_model:

          # append token vector
          embeddings.append(irish_law2vec_model.get_vector(token))
        else:
          embeddings.append(np.random.rand(100))
    else:
      return np.zeros(100)

    # return mean of embeddings for document
    return np.mean(embeddings, axis=0)



In [8]:
# Get judgment law2vec embeddings
judgments_clean_df['law2vec_embeddings']=judgments_clean_df['clean_judgment'].apply(lambda x :get_w2v_embeddings(x.split()))
print(judgments_clean_df.head())

   Unnamed: 0  judgment_id neutral_citation  \
0           0            0  [2020] IEHC 628   
1           1            1   [2015] IESC 72   
2           2            2  [2013] IEHC 536   
3           3            3  [1997] IEHC 133   
4           4            4  [2019] IEHC 230   

                                      judgment_title judgment_date  \
0  TMT Digital centre Limited & anor  v  Grehan &...    2020-11-27   
1                  Fingal County Council  v  Kennedy    2015-07-31   
2      S.O & anor  v  Refugee Appeals Tribunal & ors    2013-11-01   
3                                  D.P.P. v. D. (J.)    1997-07-29   
4  X (a minor)  v  The Board of Management of Sch...    2019-03-29   

      court_name   judgment_by judgment_status  \
0     High Court    Twomey J.         Approved   
1  Supreme Court    Laffoy J.         Approved   
2     High Court     Clark J.         Approved   
3     High Court       No data         No data   
4     High Court   Barrett J.         Approved

In [14]:
print(judgments_clean_df.head())

   Unnamed: 0  judgment_id neutral_citation  \
0           0            0  [2020] IEHC 628   
1           1            1   [2015] IESC 72   
2           2            2  [2013] IEHC 536   
3           3            3  [1997] IEHC 133   
4           4            4  [2019] IEHC 230   

                                      judgment_title judgment_date  \
0  TMT Digital centre Limited & anor  v  Grehan &...    2020-11-27   
1                  Fingal County Council  v  Kennedy    2015-07-31   
2      S.O & anor  v  Refugee Appeals Tribunal & ors    2013-11-01   
3                                  D.P.P. v. D. (J.)    1997-07-29   
4  X (a minor)  v  The Board of Management of Sch...    2019-03-29   

      court_name   judgment_by judgment_status  \
0     High Court    Twomey J.         Approved   
1  Supreme Court    Laffoy J.         Approved   
2     High Court     Clark J.         Approved   
3     High Court       No data         No data   
4     High Court   Barrett J.         Approved

In [None]:
# save embeddings from google colab
from google.colab import files
import pickle as pkl

law2vec_embeddings = judgments_clean_df.law2vec_embeddings.to_list()

# write embeddings to pickle file
with open('irish_law2vec_embeddings.pkl', 'wb') as pkl:
    pickle.dump(law2vec_embeddings, pkl)

# download file locally
files.download('irish_law2vec_embeddings.pkl')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# read law2vec Embeddings from S3 into Google Colab
s3 = boto3.client('s3', aws_access_key_id = aws_access_key_id,
                          aws_secret_access_key = aws_secret_access_key)

# define bucket & files
my_bucket = 'legal-research-thesis-data'
law2vec_embeddings_object = 'irish_law2vec_embeddings.pkl'


# save embeddings from bucket to file
with open('irish_law2vec_embeddings.pkl', 'wb') as embeddings:
    s3.download_fileobj(my_bucket, law2vec_embeddings_object, embeddings)

# Load embeddings from pickle file
with open('irish_law2vec_embeddings.pkl', 'rb') as embeddings:
    law2vec_embeddings = pickle.load(embeddings)


In [26]:
query = "cases which involve someone drinking alcohol and causing a car accident"

In [27]:
# method to embed query and return ranked documents
def get_ranked_judgments(query):

  # preprocess query
  query = clean_doc_text(query)

  # generate query embeddings
  query_embeddings = get_w2v_embeddings(query.split())

  # assign ids from training_df['judgment_id'] to new df
  similarities_df = judgments_clean_df[['judgment_id'].copy()]

  # calculate judgment-query similarity
  similarities_df['similarity'] = judgments_clean_df['law2vec_embeddings'].apply(lambda x:
                                                                                 cosine_similarity(np.array(query_embeddings).reshape(1, -1),
                                                                                                   np.array(x).reshape(1, -1)).item())
  # merge dataframes
  ranked_judgments_df = pd.merge(similarities_df, judgments_clean_df, on="judgment_id")

  print(ranked_judgments_df.columns.tolist())

  ranked_judgments_df = ranked_judgments_df.drop(columns=['Unnamed: 0'])

  # sort by similarity value
  ranked_judgments_df.sort_values(by='similarity',ascending=False,inplace=True)

  # return top 10 judgments
  return ranked_judgments_df.head(10)

# call ranked judgmebnts method
get_ranked_judgments(query)

['judgment_id', 'similarity', 'Unnamed: 0', 'neutral_citation', 'judgment_title', 'judgment_date', 'court_name', 'judgment_by', 'judgment_status', 'judgment', 'clean_judgment', 'judgment_url', 'law2vec_embeddings']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similarities_df['similarity'] = judgments_clean_df['law2vec_embeddings'].apply(lambda x:


Unnamed: 0,judgment_id,similarity,neutral_citation,judgment_title,judgment_date,court_name,judgment_by,judgment_status,judgment,clean_judgment,judgment_url,law2vec_embeddings
11474,11474,0.84162,[2012] IEHC 437,Price v Connors & Anor,2012-10-30,High Court,O'Neill J.,Approved,Neutral Citation [2012] IEHC 437 THE HIGH CO...,neutral citation 2012 iehc 437 high court 2011...,https://courts.ie/acc/alfresco/4de3212e-b266-4...,"[-0.08117976016753937, 0.029067199642668313, -..."
336,336,0.83486,[2013] IEHC 100,Gallagher v McGeady,2013-03-08,High Court,Ryan J.,Approved,Neutral Citation [2013] IEHC 100 THE HIGH CO...,neutral citation 2013 iehc 100 high court 2010...,https://courts.ie/acc/alfresco/652c5583-e751-4...,"[-0.06562862542812907, 0.054004516028561794, -..."
17054,17054,0.83456,[2021] IEHC 689,Ryan v O'Sullivan & Anor,2021-10-27,High Court,Barr J.,Approved,\nTHE HIGH COURT\n[2021] IEHC 689\n[Record No....,high court 2021 iehc 689 record p john ryan pl...,https://courts.ie/acc/alfresco/f9bd7114-1dfb-4...,"[-0.055721637710883724, 0.07386373076401041, -..."
8856,8856,0.83384,[2010] IEHC 50,McDermott v McCormack,2010-02-26,High Court,Charleton J.,Approved,Neutral Citation Number: [2010] IEHC 50 THE ...,neutral citation number 2010 iehc 50 high cour...,https://courts.ie/acc/alfresco/60d055ae-0aeb-4...,"[-0.06238344428357434, 0.09324597769743682, -0..."
11715,11715,0.833428,[2001] IEHC 103,Rothwell v. Motor Insurers Bureau of Ireland,2001-07-06,High Court,No data,No data,THE HIGH COURT 1998 No. 3082P BETWEEN LIA...,high court 1998 3082p liam rothwell plaintiff ...,https://www.bailii.org/ie/cases/IEHC/2001/103....,"[-0.08186446908186719, 0.0037824464237002993, ..."
8131,8131,0.832442,[2006] IEHC 287,Devlin v Cassidy,2006-07-31,High Court,Peart J.,Approved,Neutral Citation Number [2006] IEHC 287 THE ...,neutral citation number 2006 iehc 287 high cou...,https://courts.ie/acc/alfresco/008e1bd0-eabf-4...,"[-0.05514773399742375, 0.015662478424552834, -..."
14536,14536,0.831229,[2019] IEHC 409,Volkova v Dunne,2019-05-30,High Court,O'Hanlon J.,Approved,THE HIGH COURT [2017 No. 4565 P.] BETWEEN A...,high court 2017 4565 adele volkova plaintiff p...,https://courts.ie/acc/alfresco/07380648-4f5c-4...,"[-0.06796597874820572, 0.1008557757730666, -0...."
13671,13671,0.830086,[2003] IEHC 142,Rogers v. Motor Insurers Bureau of Ireland,2003-12-05,High Court,No data,No data,Neutral Citation No: [2003] IEHC 142 THE HIGH...,neutral citation 2003 iehc 142 high court 1998...,https://www.bailii.org/ie/cases/IEHC/2003/142....,"[-0.0730854107756721, 0.07754188332221257, -0...."
15255,15255,0.829531,[2005] IEHC 394,Connolly v O'Donnell & anor,2005-11-22,High Court,Peart J.,Approved,[Neutral Citation Number: 2005 IEHC 394] T...,neutral citation number 2005 iehc 394 high cou...,https://courts.ie/acc/alfresco/d56f237e-1569-4...,"[-0.04129268415642463, 0.09683018784144014, -0..."
4606,4606,0.82897,[2017] IEHC 259,Power v Eustace & Ors,2017-05-03,High Court,Barr J.,Approved,THE HIGH COURT [2013 No. 5612P] BETWEEN VAL...,high court 2013 5612p valerie power plaintiff ...,https://courts.ie/acc/alfresco/f109c946-5581-4...,"[-0.06575423322955577, 0.0636417274742379, -0...."


In [11]:
# this file contains methods for extract and preprocessing documents and user input

!pip install unidecode

# standard library imports
import unidecode
import re

# related third party imports
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# import english stopwords and WordNetLemmatizer
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# method to get pos tags
def get_pos_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
# method to preprocess documents and user input for model training
def clean_doc_text(document):

    clean_doc = []

    # tokenize document
    tokenized_doc = word_tokenize(document)

    for token in tokenized_doc:

        # remove accents
        token = unidecode.unidecode(token)

        # convert to lowercase
        token = token.lower()

        clean_doc.append(token)

    # remove non-alphanumeric tokens
    clean_doc = [word for word in clean_doc if word.isalnum()]

    # remove stopwords
    clean_doc = [word for word in clean_doc if word not in stop]

    # get pos tags for cleaned document
    pos_tags = pos_tag(clean_doc)

    # update tags with own tagging function
    pos_tags = list(map(lambda x: (x[0], get_pos_tag(x[1])), pos_tags))

    lemmatized_doc = []

    for token, tag in pos_tags:
        if tag is None:

            # if no tag available, append token as is
            lemmatized_doc.append(token)
        else:

            # else lemmatize token according to pos tag
            lemmatized_doc.append(lemmatizer.lemmatize(token, tag))

    # merge tokens
    clean_doc = " ".join(lemmatized_doc)

    return clean_doc

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/235.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Upsert Embeddings into Pinecone

Once the SBert had been created for each judgment in the DataFrame, the SBert embeddings were uploaded to the vector database known as Pinecone.

In [None]:
pip install -U pinecone-client

In [None]:
import pinecone

In [None]:
# convert ids from int to string
judgments_clean_df['judgment_id'] = judgments_clean_df['judgment_id'].apply(str)

In [None]:
# input Pinecone API Key
print('Input Pinecone API Key:')
pinecone_api_key = getpass()

Input Pinecone API Key:
··········


A new index was created in Pinecone called "case-embeddings". This index was created with a dimensionality of 768 and assigned the "cosine" metric in order to perform cosine similarity on the embeddings.

In [None]:
index_name="case-embeddings"

# initalise pinecone
pinecone.init(api_key=pinecone_api_key, environment="us-west4-gcp")

# delete index if it already exists
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# create index
pinecone.create_index(
        index_name,
        dimension=768,
        metric="cosine"
    )


In [None]:
# connect to index
pinecone_index = pinecone.Index(index_name)

The following is a Batch Generator class provided by pinecone that is used to break down the Dataframe intp smaller chunks for upserting.

In [None]:
from typing import Iterator

class BatchGenerator:

   # initalise with self
    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size

    # Breaks the DataFrame down into chunks
    def to_chunks(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Calculates the number of chunks a DataFrame Contains
    def splits_num(self, elements: int) -> int:
        return round(elements / self.batch_size)

    __call__ = to_chunks

# Set batch size to 300
df_batcher = BatchGenerator(300)

In [None]:
judgments_clean_df['judgment_id'] = judgments_clean_df['judgment_id'].apply(str)

Using the BatchGenerator class mentioned above, the SBert embeddings were then inserted into the "SBert-embeddings" namespace of the "case-embeddings" index in Pinecone.

In [None]:
# Upsert embeddings case-embeddings index
print("Uploading SBert embeddings to index...")
for batch_df in df_batcher(judgments_clean_df):
    pinecone_index.upsert(vectors=zip(batch_df.judgment_id, batch_df.sbert_embeddings))

Uploading SBert embeddings to index...


As can be seen from the index stats, the vector count is 17917, which corresponds to the number of judgments in the judgments DataFrame above.

In [None]:
# Check index size for each namespace
pinecone_index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 17917}},
 'total_vector_count': 17917}

In [None]:
print(judgments_clean_df['sbert_embeddings'].iloc[17916])
pinecone_index.fetch(["17916"])

[0.010381313040852547, 0.005340714007616043, 0.049473606050014496, -0.015968509018421173, -0.06376010924577713, -0.0033715309109538794, 0.019100967794656754, 0.010959031991660595, -0.008545827120542526, -0.0015105693601071835, -0.0029569147154688835, 0.030744370073080063, -0.004309759475290775, -0.0009132148115895689, 0.03956124559044838, 0.068661168217659, 0.03147020936012268, 0.013268886134028435, -0.03974958136677742, -0.005051812157034874, 0.05022905021905899, 0.014079825021326542, 0.03762390837073326, 0.012219148688018322, -0.022782407701015472, 0.01665588468313217, -0.015579773113131523, 0.004717596340924501, -0.01610291749238968, -0.048665743321180344, 0.003963305149227381, -0.004627648275345564, 0.01747436262667179, -0.04457975924015045, 1.9748210888792528e-06, -0.024193348363041878, 0.004361958242952824, -0.045260392129421234, -0.07216587662696838, -0.021167591214179993, -0.017669111490249634, -0.013229082338511944, -0.03468330577015877, -0.006216044537723064, -0.0398852117359

{'namespace': '',
 'vectors': {'17916': {'id': '17916',
                       'values': [0.010381313,
                                  0.00534071401,
                                  0.0494736061,
                                  -0.015968509,
                                  -0.0637601092,
                                  -0.00337153091,
                                  0.0191009678,
                                  0.010959032,
                                  -0.00854582712,
                                  -0.00151056936,
                                  -0.00295691472,
                                  0.0307443701,
                                  -0.00430975948,
                                  -0.000913214812,
                                  0.0395612456,
                                  0.0686611682,
                                  0.0314702094,
                                  0.0132688861,
                                  -0.0397495814,
                  