# Update irish_judgment_HC-CA-SC_refined_clean_df

Before creating the SBert and law2Vec embeddings, i wanted to ensure that the judgments dataframe I using contained no NaN values as this could later cause corruption in my data.

In [1]:
!pip install s3fs
!pip install boto3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.14.1-py3-

In [3]:
# standard library imports
import sys
import pickle

# related third party imports
import pandas as pd

# imports for accessing s3 bucket
import s3fs
import boto3
from getpass import getpass



In [4]:
# input private aws credentials if using Google Colab
print('Input AWS access key ID:')
aws_access_key_id = getpass()
print('Input AWS secret access key:')
aws_secret_access_key = getpass()

Input AWS access key ID:
··········
Input AWS secret access key:
··········


In [5]:
# this cell is for s3 bucket access when using Google Colab

# enter authentication credentials
s3 = boto3.resource('s3', aws_access_key_id = aws_access_key_id, 
                          aws_secret_access_key = aws_secret_access_key)

# define bucket & file
my_bucket = s3.Bucket('legal-research-thesis-data')

# list bucket objects
for my_bucket_object in my_bucket.objects.all():
    print(my_bucket_object)





s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='SBert_embeddings_mpnet.pkl')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_judgment_HC-CA-SC_refined_clean_df.csv')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_judgment_HC-CA-SC_refined_clean_df_model_training.csv')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_law2vec_embeddings.pkl')
s3.ObjectSummary(bucket_name='legal-research-thesis-data', key='irish_law2vec_model.txt')


In [15]:
# read file from s3 if using Google Colab
s3 = boto3.client('s3', aws_access_key_id = aws_access_key_id, 
                          aws_secret_access_key = aws_secret_access_key) 

# define bucket & object
my_bucket = 'legal-research-thesis-data'
judgment_object_clean = s3.get_object(Bucket = my_bucket, Key = 'irish_judgment_HC-CA-SC_refined_clean_df.csv') 

# read csv file from s3 into dataframes
judgments_clean_df =pd.read_csv(judgment_object_clean['Body'])

print(len(judgments_clean_df))

print(judgments_clean_df.head())

17933
   judgment_id neutral_citation  \
0            0  [2020] IEHC 628   
1            1   [2015] IESC 72   
2            2  [2013] IEHC 536   
3            3  [1997] IEHC 133   
4            4  [2019] IEHC 230   

                                      judgment_title judgment_date  \
0  TMT Digital centre Limited & anor  v  Grehan &...    2020-11-27   
1                  Fingal County Council  v  Kennedy    2015-07-31   
2      S.O & anor  v  Refugee Appeals Tribunal & ors    2013-11-01   
3                                  D.P.P. v. D. (J.)    1997-07-29   
4  X (a minor)  v  The Board of Management of Sch...    2019-03-29   

      court_name   judgment_by judgment_status  \
0     High Court    Twomey J.         Approved   
1  Supreme Court    Laffoy J.         Approved   
2     High Court     Clark J.         Approved   
3     High Court       No data         No data   
4     High Court   Barrett J.         Approved   

                                            judgment  \
0  \n

In [16]:
# remove any NaN rows from judgments_clean_df
judgments_clean_df = judgments_clean_df[judgments_clean_df['clean_judgment'].notnull()]

Once the NaN rows had been removed from the judgments_clean_df dataframe, the index was reset in order to avoid discrepancies between the IDs for the SBert embeddings and law2Vec embeddings. As can be seen from the print statement in the cell below, there are now 17917 rows as opposed to 17933.

In [17]:
# Reset Index/Judgement IDs as rows we're removed
judgments_clean_df=judgments_clean_df.reset_index(drop=True)
judgments_clean_df['judgment_id']=judgments_clean_df.index
print(len(judgments_clean_df))

17917


Finally, the updated dataframe is saved from Google Colab. It is later added to the legal-research-thesis-data bucket manually. 

In [18]:
# save embeddings from google colab
from google.colab import files

# write embeddings to csv file
with open('irish_judgment_HC-CA-SC_refined_clean_df_v2.csv', 'wb') as f:
   judgments_clean_df.to_csv('irish_judgment_HC-CA-SC_refined_clean_df_v2.csv')

# download file locally
files.download('irish_judgment_HC-CA-SC_refined_clean_df_v2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

To verify that there was no corruption to the DataFrame during transfer to s3, the updated DataFrame was downloaded. As can be seen, there are still 17917 rows in the DataFrame.

In [19]:
# read file from s3 if using Google Colab
s3 = boto3.client('s3', aws_access_key_id = aws_access_key_id, 
                          aws_secret_access_key = aws_secret_access_key) 

# define bucket & object
my_bucket = 'legal-research-thesis-data'
judgment_object_clean = s3.get_object(Bucket = my_bucket, Key = 'irish_judgment_HC-CA-SC_refined_clean_df_v2.csv') 

# read csv file from s3 into dataframes
judgments_clean_df =pd.read_csv(judgment_object_clean['Body'])

print(len(judgments_clean_df))

print(judgments_clean_df.head())

17917
   Unnamed: 0  judgment_id neutral_citation  \
0           0            0  [2020] IEHC 628   
1           1            1   [2015] IESC 72   
2           2            2  [2013] IEHC 536   
3           3            3  [1997] IEHC 133   
4           4            4  [2019] IEHC 230   

                                      judgment_title judgment_date  \
0  TMT Digital centre Limited & anor  v  Grehan &...    2020-11-27   
1                  Fingal County Council  v  Kennedy    2015-07-31   
2      S.O & anor  v  Refugee Appeals Tribunal & ors    2013-11-01   
3                                  D.P.P. v. D. (J.)    1997-07-29   
4  X (a minor)  v  The Board of Management of Sch...    2019-03-29   

      court_name   judgment_by judgment_status  \
0     High Court    Twomey J.         Approved   
1  Supreme Court    Laffoy J.         Approved   
2     High Court     Clark J.         Approved   
3     High Court       No data         No data   
4     High Court   Barrett J.         Ap

In [20]:
print(len(judgments_clean_df))

17917
