In [1]:
# this file contains methods for extract and preprocessing documents and user input

# standard library imports
import unidecode
import re

# related third party imports
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet


# import english stopwords and WordNetLemmatizer
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# method to get pos tags
def get_pos_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
        
# method to preprocess documents and user input for model training
def clean_text(text):

    clean_doc = []

    # tokenize document
    tokenized_doc = word_tokenize(text)

    for token in tokenized_doc:

        # remove accents
        token = unidecode.unidecode(token)
       
        # convert to lowercase
        token = token.lower()

        clean_doc.append(token)

    # remove non-alphanumeric tokens
    clean_doc = [word for word in clean_doc if word.isalnum()]

    # remove stopwords
    clean_doc = [word for word in clean_doc if word not in stop]

    # get pos tags for cleaned document
    pos_tags = pos_tag(clean_doc)

    # update tags with own tagging function
    pos_tags = list(map(lambda x: (x[0], get_pos_tag(x[1])), pos_tags))

    lemmatized_doc = []

    for token, tag in pos_tags:
        if tag is None:

            # if no tag available, append token as is
            lemmatized_doc.append(token)
        else:

            # else lemmatize token according to pos tag
            lemmatized_doc.append(lemmatizer.lemmatize(token, tag))

    # merge tokens
    clean_doc = " ".join(lemmatized_doc)

    return clean_doc

# method for light preprocessing 
def light_clean_text(text):

    text = re.sub(r'\[\"', " ", text)
    text = re.sub(r'\"\]', " ", text)
    text = re.sub(r'\-', " ", text)   

    return text


    

In [30]:
# method to clean html judgment text stored under "grounds" column

# standard library imports
import csv

# related third party imports
import pandas as pd
from bs4 import BeautifulSoup


# field limit increased to avoid ParserError
csv.field_size_limit(20000000)


# method to remove html tags from text
def remove_html_tags():
    
    input_file_path = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_unclean_df.csv"
    output_file_path = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_unclean_df_no_html.csv"    

    # load data into DataFrame
    judgments_df = pd.read_csv(input_file_path, engine = "python")

    # Drop rows where grounds is NaN
    judgments_df =  judgments_df.dropna(axis = 0, subset = ["grounds"])

    # reset index
    judgments_df.reset_index(drop = True, inplace = True)

    # assign index value to judgment_id
    judgments_df.insert(0, "judgment_id", judgments_df.index)   

    # remove html tags from "grounds" column
    judgments_df["grounds"] = [BeautifulSoup(html_text).get_text() for html_text in judgments_df["grounds"]] 
     
    # save updated judgments_df to new csv file without index
    judgments_df.to_csv(output_file_path, index = False)

    return judgments_df 
  
# call remove_html_tags method
remove_html_tags()  

    


Unnamed: 0,judgment_id,doc_id,doc_name,doc_date,doc_type,composition_court,court_name,details,doc_date_delivery,doc_last_modification,...,doc_id_label,cites_from_blockquotes,celex_cites,cited_docs,record_numbers,document_relationships,pdf_url,pdf_text,view_url,category
0,0,IE-2020-IEHC-628,TMT Digital centre Limited & anor -v- Grehan &...,2020-11-27,IEHC,[],High Court,,2020-11-27 00:00:00,,...,[2020] IEHC 628,,[],[],"[{""hcr_doc_id"":""IEF-2019-P-4318"",""record_type""...","[{""to"":""IEF-2019-P-4318"",""from"":""IE-2020-IEHC-...",https://courts.ie/acc/alfresco/95020573-cc98-4...,[2020] IEHC 628\n ...,,
1,1,IE-2015-IESC-72-LAFFOY,Fingal County Council -v- Kennedy,2015-07-31,IESC,"[""Hardiman J."",""McKechnie J."",""Clarke J."",""Laf...",Supreme Court,Submissions directed.,2015-07-31 00:00:00,2015-09-01 00:00:00,...,[2015] IESC 72,[],[],"[{""label"":""[1980] IR 132"",""title"":""Brown v. Do...","[{""hcr_doc_id"":""IEF-2012-MCA-402"",""record_type...","[{""to"":""IEF-2012-MCA-402"",""from"":""IE-2015-IESC...",https://courts.ie/acc/alfresco/d685aed4-96cd-4...,,https://courts.ie/view/judgments/d685aed4-96cd...,
2,2,IE-2013-IEHC-536,S.O & anor -v- Refugee Appeals Tribunal & ors,2013-11-01,IEHC,[],High Court,,2013-11-01 00:00:00,2013-12-18 00:00:00,...,[2013] IEHC 536,[],[],"[{""label"":""(2008) 47 EHRR 39"",""title"":""N v. Th...","[{""hcr_doc_id"":""IEF-2009-JR-666"",""record_type""...","[{""to"":""IEF-2009-JR-666"",""from"":""IE-2013-IEHC-...",https://courts.ie/acc/alfresco/6a8e82f3-727d-4...,,https://courts.ie/view/judgments/6a8e82f3-727d...,
3,3,IE-1997-IEHC-133,D.P.P. v. D. (J.),1997-07-29,IEHC,,High Court,,,,...,[1997] IEHC 133,[],[],[],"[{""record_type"":""High Court Record Number"",""re...",[],,,,
4,4,IE-2019-IEHC-230,X (a minor) -v- The Board of Management of Sch...,2019-03-29,IEHC,[],High Court,,2019-03-29 00:00:00,2019-04-24 00:00:00,...,[2019] IEHC 230,[],[],"[{""label"":""[2012] IESC 49"",""title"":""Okunade v....","[{""hcr_doc_id"":""IEF-2019-JR-83"",""record_type"":...","[{""to"":""IEF-2019-JR-83"",""from"":""IE-2019-IEHC-2...",https://courts.ie/acc/alfresco/49e19a0f-8020-4...,,https://courts.ie/view/judgments/49e19a0f-8020...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17928,17928,IE-2022-IEHC-408,A -v- Minister for Justice and Equality,2022-07-04,IEHC,[],High Court,,2022-07-04 00:00:00,,...,[2022] IEHC 408,,[],[],"[{""hcr_doc_id"":""IEF-2021-JR-285"",""record_type""...","[{""to"":""IEF-2021-JR-285"",""from"":""IE-2022-IEHC-...",https://courts.ie/acc/alfresco/26169ec7-b51b-4...,THE HIGH COURT\n ...,,
17929,17929,IE-2022-IEHC-411,O’Connell -v- Solas,2022-05-31,IEHC,[],High Court,,2022-05-31 00:00:00,,...,[2022] IEHC 411,,[],[],"[{""hcr_doc_id"":""IEF-2018-P-8542"",""record_type""...","[{""to"":""IEF-2018-P-8542"",""from"":""IE-2022-IEHC-...",https://courts.ie/acc/alfresco/2cffb114-b931-4...,THE HIGH COURT\n ...,,
17930,17930,IE-2022-IEHC-412,Pysz -v- Ireland & Ors,2022-07-05,IEHC,[],High Court,,2022-07-05 00:00:00,,...,[2022] IEHC 412,,[],[],"[{""hcr_doc_id"":""IEF-2019-P-1444"",""record_type""...","[{""to"":""IEF-2019-P-1444"",""from"":""IE-2022-IEHC-...",https://courts.ie/acc/alfresco/47380771-787a-4...,THE HIGH COURT\n ...,,
17931,17931,IE-2022-IEHC-413,M -v- Minister for Justice and Equality,2022-07-01,IEHC,[],High Court,,2022-07-01 00:00:00,,...,[2022] IEHC 413,,[],[],"[{""hcr_doc_id"":""IEF-2020-JR-776"",""record_type""...","[{""to"":""IEF-2020-JR-776"",""from"":""IE-2022-IEHC-...",https://courts.ie/acc/alfresco/178e899c-69d4-4...,THE HIGH COURT\n ...,,


In [31]:
# standard library imports
import csv

# related third party imports
import pandas as pd

# field limit increased to avoid ParserError
csv.field_size_limit(20000000)


20000000

In [32]:
# method to extract and preprocess data for w2v model training 

def clean_model_data():

    input_file_path = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_unclean_df_no_html.csv"
    output_file_path = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_refined_clean_df_model_training.csv"    

    # load unclean data into DataFrame
    unclean_df = pd.read_csv(input_file_path, engine = "python")
          
    # deep clean "grounds" text data and assign to new column 
    unclean_df["clean_judgment"] = unclean_df["grounds"].apply(clean_text)

    # drop unnecessary columns and assign to new clean_df
    clean_df = unclean_df.drop(['doc_id', 'doc_name', 'doc_date', 'doc_type',
       'composition_court', 'court_name', 'details', 'doc_date_delivery',
       'doc_last_modification', 'doc_status', 'from', 'judgment_by',
       'judgment_date', 'judgment_status', 'last_updated_on',
       'result', 'url', 'year', 'supporting_docs',
       'related_docs', 'doc_id_label', 'cites_from_blockquotes', 'celex_cites',
       'cited_docs', 'record_numbers', 'document_relationships', 'pdf_url',
       'pdf_text', 'view_url', 'category', 'neutral_citation'], axis = 1)

    # save updated clean_df to new csv file without index
    clean_df.to_csv(output_file_path, index = False)

    return clean_df

# call clean_model_data() method
clean_model_data()






Unnamed: 0,judgment_id,grounds,clean_judgment
0,0,\n[2020] IEHC 628\nTHE HIGH COURT\n[2019 No. 4...,2020 iehc 628 high court 2019 4318 p tmt digit...
1,1,THE SUPREME COURT [Appeal No. 322/13] Hard...,supreme court appeal hardiman mckechnie clarke...
2,2,Neutral Citation: [2013] IEHC 536 THE HIGH ...,neutral citation 2013 iehc 536 high court judi...
3,3,CENTRAL CRIMINAL COURT Bill No. C.C. 0011 o...,central criminal court bill 0011 1977 plaintif...
4,4,THE HIGH COURT 2019 No. 83 JR Between: X (A...,high court 2019 83 jr x minor sue father next ...
...,...,...,...
17928,17928,\nTHE HIGH COURT\n[2022] IEHC 408\n[2021/285/J...,high court 2022 iehc 408 matter section 5 ille...
17929,17929,\nTHE HIGH COURT\n[2022] IEHC 411\n[2018 8542 ...,high court 2022 iehc 411 2018 8542 p fergal co...
17930,17930,\nTHE HIGH COURT\n[2022] IEHC 412\n[Record No....,high court 2022 iehc 412 record p tomasz pysz ...
17931,17931,\nTHE HIGH COURT\nJUDICIAL REVIEW\n[2022] IEHC...,high court judicial review 2022 iehc 413 recor...


In [33]:
# method to extract and preprocess results data returned to user

import pandas as pd

def clean_results_data():
    
    input_file_path = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_unclean_df_no_html.csv"
    output_file_path = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_refined_unclean_df.csv"

    # load unclean data into DataFrame
    unclean_df = pd.read_csv(input_file_path, engine = "python")
  
    # try replace NaN rows in "neutral_citation " column with data from "doc_id_label " column
    try:
        unclean_df["neutral_citation"] = unclean_df["neutral_citation"].fillna(unclean_df["doc_id_label"])
    except:
        print("an exception occured")
    
    # try replace NaN rows in "pdf_url" column with data from "url" column
    try:
        unclean_df["pdf_url"] = unclean_df["pdf_url"].fillna(unclean_df["url"])
    except:
        print("an exception occured")
    
    # try replace NaN rows with String
    try:
        unclean_df = unclean_df.fillna("No data")
    except:
        print("an exception occured")

    # lightly clean "judgment_by" and "doc_name" columns
    unclean_df["judgment_by"] = unclean_df["judgment_by"].apply(light_clean_text)
    unclean_df["doc_name"] = unclean_df["doc_name"].apply(light_clean_text)
  
    # drop unnecessary columns and assign to new clean_df
    clean_df = unclean_df.drop(['doc_id', 'doc_type', 'composition_court', 'details', 'result', 
        'doc_date_delivery', 'doc_last_modification', 'doc_status', 'from', 'pdf_text', 
        'judgment_date', 'last_updated_on','year', 'doc_id_label', 'cites_from_blockquotes', 
        'celex_cites', 'category', 'url', 'view_url', 'related_docs', 'cited_docs', 
        'supporting_docs', 'record_numbers', 'document_relationships'], axis = 1)

    # reorder clean_df columns
    clean_df = clean_df[['judgment_id', 'neutral_citation', 'doc_name', 'doc_date', 'court_name',
                    'judgment_by', 'judgment_status', 'grounds', 'pdf_url']]

    # rename columns
    clean_df = clean_df.rename(columns = { "grounds" : "judgment", "doc_name" : "judgment_title", 
                            "doc_date" : "judgment_date","pdf_url": "judgment_url"})
   
    # save updated clean_df to new csv file without index
    clean_df.to_csv(output_file_path, index = False)
  
    return clean_df
    
# call clean_results_data() method
clean_results_data()

Unnamed: 0,judgment_id,neutral_citation,judgment_title,judgment_date,court_name,judgment_by,judgment_status,judgment,judgment_url
0,0,[2020] IEHC 628,TMT Digital centre Limited & anor v Grehan &...,2020-11-27,High Court,Twomey J.,Approved,\n[2020] IEHC 628\nTHE HIGH COURT\n[2019 No. 4...,https://courts.ie/acc/alfresco/95020573-cc98-4...
1,1,[2015] IESC 72,Fingal County Council v Kennedy,2015-07-31,Supreme Court,Laffoy J.,Approved,THE SUPREME COURT [Appeal No. 322/13] Hard...,https://courts.ie/acc/alfresco/d685aed4-96cd-4...
2,2,[2013] IEHC 536,S.O & anor v Refugee Appeals Tribunal & ors,2013-11-01,High Court,Clark J.,Approved,Neutral Citation: [2013] IEHC 536 THE HIGH ...,https://courts.ie/acc/alfresco/6a8e82f3-727d-4...
3,3,[1997] IEHC 133,D.P.P. v. D. (J.),1997-07-29,High Court,No data,No data,CENTRAL CRIMINAL COURT Bill No. C.C. 0011 o...,https://www.bailii.org/ie/cases/IEHC/1997/133....
4,4,[2019] IEHC 230,X (a minor) v The Board of Management of Sch...,2019-03-29,High Court,Barrett J.,Approved,THE HIGH COURT 2019 No. 83 JR Between: X (A...,https://courts.ie/acc/alfresco/49e19a0f-8020-4...
...,...,...,...,...,...,...,...,...,...
17928,17928,[2022] IEHC 408,A v Minister for Justice and Equality,2022-07-04,High Court,Bolger J.,Approved,\nTHE HIGH COURT\n[2022] IEHC 408\n[2021/285/J...,https://courts.ie/acc/alfresco/26169ec7-b51b-4...
17929,17929,[2022] IEHC 411,O’Connell v Solas,2022-05-31,High Court,Egan J.,Approved,\nTHE HIGH COURT\n[2022] IEHC 411\n[2018 8542 ...,https://courts.ie/acc/alfresco/2cffb114-b931-4...
17930,17930,[2022] IEHC 412,Pysz v Ireland & Ors,2022-07-05,High Court,Bolger J.,Approved,\nTHE HIGH COURT\n[2022] IEHC 412\n[Record No....,https://courts.ie/acc/alfresco/47380771-787a-4...
17931,17931,[2022] IEHC 413,M v Minister for Justice and Equality,2022-07-01,High Court,O'Regan J.,Approved,\nTHE HIGH COURT\nJUDICIAL REVIEW\n[2022] IEHC...,https://courts.ie/acc/alfresco/178e899c-69d4-4...


In [2]:
# method to extract and preprocess data for transformer model training and user results

def clean_judgment_data():
    
    input_file_path = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_unclean_df_no_html.csv"
    output_file_path = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_refined_clean_df_v2.csv"

    # load unclean data into DataFrame
    unclean_df = pd.read_csv(input_file_path, engine = "python")
   
    # try replace NaN rows in "neutral_citation " column with data from "doc_id_label " column
    try:
        unclean_df["neutral_citation"] = unclean_df["neutral_citation"].fillna(unclean_df["doc_id_label"])
    except:
        print("an exception occured")
    
     # try replace NaN rows in "pdf_url" column with data from "url" column
    try:
        unclean_df["pdf_url"] = unclean_df["pdf_url"].fillna(unclean_df["url"])
    except:
        print("an exception occured")
    
    # try replace NaN rows with String
    try:
        unclean_df = unclean_df.fillna("No data")
    except:
        print("an exception occured")

    # ligthly clean columns
    unclean_df["judgment_by"] = unclean_df["judgment_by"].apply(light_clean_text)
    unclean_df["doc_name"] = unclean_df["doc_name"].apply(light_clean_text)
    
    # deep clean "grounds" text data for model training
    unclean_df["clean_judgment"] = unclean_df["grounds"].apply(clean_text)
   
    # drop unnecessary columns and assign to new clean_df
    clean_df = unclean_df.drop(['doc_id', 'doc_type', 'composition_court', 'details', 'result', 
        'doc_date_delivery', 'doc_last_modification', 'doc_status', 'from', 'pdf_text', 
        'judgment_date', 'last_updated_on','year', 'doc_id_label', 'cites_from_blockquotes', 
        'celex_cites', 'category', 'url', 'view_url', 'related_docs', 'cited_docs', 
        'supporting_docs', 'record_numbers', 'document_relationships'], axis = 1)

    # reorder clean_df columns
    clean_df = clean_df[['judgment_id', 'neutral_citation', 'doc_name', 'doc_date', 'court_name',
                    'judgment_by', 'judgment_status', 'grounds', 'clean_judgment', 'pdf_url']]

    # rename columns
    clean_df = clean_df.rename(columns = { "grounds" : "judgment", "doc_name" : "judgment_title", 
                            "doc_date" : "judgment_date","pdf_url": "judgment_url"})
    
    # remove any NaN rows from judgments_clean_df
    clean_df = clean_df[clean_df['clean_judgment'].notnull()]
   
    # save updated clean_df to new csv file without index
    clean_df.to_csv(output_file_path, index = False)
  
    return clean_df
    
# call clean_judgment_data() method
clean_judgment_data()

NameError: name 'pd' is not defined

In [35]:
# Method for exploring various datasets. Note that judgments only go as far as 07/07/2022

# standard library imports
import csv

# related third party imports
import pandas as pd

# field limit increased to avoid ParserError
csv.field_size_limit(20000000)

# path for judgment data
user_data_filepath = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_refined_clean_df.csv"
model_data_filepath = "C:\\Users\\Charlie\\Desktop\\thesis_data\\irish_jugments_HC-CA-SC_refined_clean_df_model_training.csv"  

# # open judgments file
# with open(user_data_filepath, "r", encoding="utf8") as judgments_file:
        
#     # load data into DataFrame
#     user_judgments_df = pd.read_csv(judgments_file, engine = "python")

# # print uncleaned user data
# print("*****User Data*****\n", user_judgments_df.judgment_url.iloc[18390])
# print(user_judgments_df.head())

# print("*****User Data*****\n", user_judgments_df.grounds.iloc[13319])


# open judgments file
with open(model_data_filepath, "r", encoding="utf8") as judgments_file:
        
    # load data into DataFrame
    model_judgments_df = pd.read_csv(judgments_file, engine = "python")
    print(model_judgments_df["clean_judgment"].isnull().sum())

# print cleaned model data
print("\n\n*****Cleaned Model Data*****\n\n", model_judgments_df.judgment_id, model_judgments_df.clean_judgment)
# print(model_judgments_df.columns)
    







 


16


*****Cleaned Model Data*****

 0            0
1            1
2            2
3            3
4            4
         ...  
17928    17928
17929    17929
17930    17930
17931    17931
17932    17932
Name: judgment_id, Length: 17933, dtype: int64 0        2020 iehc 628 high court 2019 4318 p tmt digit...
1        supreme court appeal hardiman mckechnie clarke...
2        neutral citation 2013 iehc 536 high court judi...
3        central criminal court bill 0011 1977 plaintif...
4        high court 2019 83 jr x minor sue father next ...
                               ...                        
17928    high court 2022 iehc 408 matter section 5 ille...
17929    high court 2022 iehc 411 2018 8542 p fergal co...
17930    high court 2022 iehc 412 record p tomasz pysz ...
17931    high court judicial review 2022 iehc 413 recor...
17932    record edwards kennedy ni raifeartaigh people ...
Name: clean_judgment, Length: 17933, dtype: object


: 