#pip install pypdf

In [234]:
import pypdf
import pandas as pd
import os

In [236]:
# Outcome Extraction: 
# “Outcome”, “Conclusion”, “Decision”, “Result”, “Orders”, “Summary”, “Judgment”
# 	Positive: “convict”, “sentence”, “allow”, “award” + “plaintiff”, “Judgment for” + “plaintiff”, “grant” + “plaintiff”, “grant” , “succeed” , “authorise"
# 	Negative: “vary”, “disallow”, “dismiss”, “deny”, “award” + “defendant”, “judgment for” + “defendant”, “grant” + “defendant”,
# 	No outcome: “retrial”
#   Arguments: “{A} liable to {B} = positive for B
# (Note on outcome: case may refer to decisions from other cases (The decision in…; xxx’s decision), therefore suggest extracting 
# decision from last paragraph onwards, in reverse order.)

# 1) Lower court’s decision may be referenced in the Judgment, therefore it would be more likely to find the outcome when searching from the end of the Judgment

# 2) If cross-appeal/cross-claim exists, outcome extraction may be affected 
#(e.g. allow cross-appeal but dismiss appeal, which screws with our extraction scheme). 
#Solution could be to look for either plaintiff/appellant or defendant/respondent near the keyword, 
#then assign the outcome based on the combination of keyword and party. If party does not exist, default to the keyword

# 3) Partial claims are an issue as well (e.g. allow claim/appeal wrt issue 1, but dismiss claim/appeal wrt issue 2)

# 4) In rare cases, the court might only state the court orders that they are making, 
#without using any identifiable keyword to indicate that the judgement is favourable to either the 
#pf or df. Will have to use GPT for this scenario

# 5) “My order in the present case does not on any basis suggest that a foreign accused person without roots in 
#Singapore must be denied bail.” This judgment contains the keyword denied, but does not in fact deny bail from this statement alone.

# 6) On the contrary, “in this case, it was not appropriate at this time to grant bail to the 
#respondent” this statement contains the keyword grant, but denies bail in effect

In [237]:
#sentence is also a positive word but less priority -> only if no keywords listed below is found sentence is checked.
#space and \n in front of the keyword are most commonly found in the cases
positive = [" convict", " allow", " grant", " succeed", " authorise", " award", "\nconvict", "\nallow", "\ngrant", "\nsucceed", "\nauthorise", "\naward"]
order_accordingly = ["order accordingly", "\norder accordingly", "orders accordingly", "\norders accordingly"]
negative = [" vary", " disallow", " dismiss", " deny", " denied", "\nvary", "\ndisallow", "\ndismiss", "\ndeny", "\ndenied"]
neutral = [" retrial", "\nretrial"]

#get person_keyword such as "plaintiff" and its nearby word, if nearby words has something positive then adds one point.
def counting(words, points, method1_used, person_keyword, opposite_person_keyword):
    indexes = [i for i, word in enumerate(words) if person_keyword in word]
    prev__index = None
    for index in indexes:
        #when there are both platiniff and defendant nearby dont do this method cos its tricky to understand context
        #e.g. I reject the plaintiff’s claim and allow the defendant’s counterclaim. (2020 sghc 252)
        #e.g. Plaintiffs` appeal dismissed; defendants` appeal allowed. (2000 sghc 268) expected unfavourable 
        if opposite_person_keyword != "NA" and opposite_person_keyword in ' '.join(words[max(0, index - 8):index + 9]): #estimated 8 nearby words
            continue
        
        # Only allow processing if the next e.g 'plaintiff' is more than 8 words (estimated) apart from the previous "plaintiff" to avoid double counting.
        # > 8 words because any words <= 8 will have subset nearby word of e.g. index 0 nearby word is 1 2 3 4, index 8 nearby word is 4 5 6 7
        #0          1     2    3    4     5      6    7      8         9
        #defendant has chosen to appeal against my order defendant defendant
        if prev__index is None or abs(prev__index - index) > 8:
            #accused special case, for criminal they will say i sentenced the accused/ i acquitted the accused
            if person_keyword == 'accused':
                words_nearby = ' '.join(words[max(0, index - 8):index + 9]) # Extract 8 nearby words, case like 2017_SGHC_107 
                #have sentence as nearby word (8 distance away) "The sentence under appeal was imposed on the Accused"
                if "sentence" in words_nearby:
                    points += 1
                    method1_used = True
                elif "acquit" in words_nearby:
                    points -= 1
                    method1_used = True
            else:   
                # Extract 4 nearby words, not any more since usually the keyword like Judgment for plaintiff or allow appeal for plaintiff is very near
                words_nearby = ' '.join(words[max(0, index - 4):index + 5]) 
                if any(phrase in words_nearby for phrase in positive + ['judgment for', 'sentence', 'favour of']):
                    if person_keyword == "plaintiff" or person_keyword == "appellant":
                        points += 1
                    elif person_keyword == "defendant" or person_keyword == "respondent":
                        points -= 1
                    method1_used = True
                if any(phrase in words_nearby for phrase in negative):
                    if person_keyword == "plaintiff" or person_keyword == "appellant":
                        points -= 1
                    elif person_keyword == "defendant" or person_keyword == "respondent":
                        points += 1
                    method1_used = True
        prev__index = index
    return [points, method1_used]


#Method 1 priority to "liable to" since its the most accurate e.g .{A} liable to {B} = positive for B
#Method 2 if method 1 fails, next priority will be accused, e.g. accused sentenced to... = positive
#Method 3 if 2 fails, search for "plaintiff" and its nearby word, if nearby words has something positive then adds one point.
#Method 4 if 3 fails, search for "appellant" and its nearby word, if nearby words has something positive then adds one point.
#Method 5 if all else fail, such for keyword
def extract_outcome(text):
    points, method1_used = 0, False
    text = text.lower()
    words = text.split()
    liable_to_method = False
    liable_indexes = [i for i, word in enumerate(words) if "liable" in word]
    prev__index = None

    #Method 1
    for index in liable_indexes:
        if prev__index is None or abs(prev__index - index) > 8: #same as counting function above, to avoid double counting
            if "to" == words[index + 1]:
                left_words_nearby = ' '.join(words[max(0, index - 4):index])
                right_words_nearby = ' '.join(words[index + 1:index + 5])
                if ("plaintiff" in left_words_nearby and "defendant" in right_words_nearby) or ("appallent" in left_words_nearby and "respondent" in right_words_nearby):
                    points -= 1
                    liable_to_method = True
                    method1_used = True
                elif ("defendant" in left_words_nearby and "plaintiff" in right_words_nearby) or ("respondent" in left_words_nearby and "appallent" in right_words_nearby):
                    points += 1
                    liable_to_method = True
                    method1_used = True
        prev__index = index

    
    if liable_to_method == False:
        #Method 2
        points, method1_used = counting(words, points, method1_used, "accused", "NA")
        #Method 3
        if method1_used == False:
            points, method1_used = counting(words, points, method1_used, "plaintiff", "defendant")
            points, method1_used = counting(words, points, method1_used, "defendant", "plaintiff")
            #Method 4
            if method1_used == False:
                points, method1_used = counting(words, points, method1_used,  "appallent", "respondent")
                points, method1_used = counting(words, points, method1_used,  "respondent", "appallent")

    if method1_used:
        if points == 0:
            if liable_to_method:
                return ['Neutral', "Method 1 liable to"]
            return ['Neutral', "Method 1"]
        elif points > 0:
            if liable_to_method:
                return ['Favourable', "Method 1 liable to"]
            return ['Favourable', "Method 1"]
        else:
            if liable_to_method:
                return ['Unfavourable', "Method 1 liable to"]
            return ['Unfavourable', "Method 1"]
    else:
        ##Method 5
        if any(phrase in text for phrase in positive):
            return ['Favourable', "Method 2"]
        elif any(phrase in text for phrase in negative):
            return ['Unfavourable', "Method 2"]
        elif any(phrase in text for phrase in neutral):
            return ['Neutral', "Method 2"]
        elif any(phrase in text for phrase in order_accordingly):
            return ['Order Accordingly', "Method 2"]
        elif "sentence" in text:
            return ['Favourable', "Method 2"]
        else:
            return ['NA', "NA"]

In [238]:
#I took out "\nJudgment:\n" because most of these cases with Judgment:, its ruling outcome is at the end of page. e.g. 2001sghc18, 2001sghc208, 2001sghc28
section_keyword  = ["\nOutcome:\n", "Conclusion\n", "\nSummary\n", "\nOrders\n", "\nResult\n", "\nDecision\n", "\nJudgment\n"] #some cases' extracted test has "Conclusion\n"
outcome_text = ""
def extract_ruling(pdf_file):
    with open(pdf_file, 'rb') as f:
        reader = pypdf.PdfReader(f)
        result, found, found_but_NA, final_keyword,num_pages = [], False, False, "", len(reader.pages)
        outcome_text = ""

        #to save time, extract only last page till 1/4 from the end and extract minimum 4 pages (some cases have section keyword at the last 4th page)
        #and skip first page as most cases has section keyword at the end.
        start_from = [i for i in range(num_pages-1, max(num_pages - int(num_pages * 0.25), 0) - 1, -1)]
        if num_pages <= 15:
            start_from = [i for i in range(num_pages-1, max(num_pages - 4, 0) - 1, -1)]
        if 0 in start_from:
            start_from.remove(0)

        #search for section keyword in page    
        for keyword in section_keyword:
            if found:
                break

            #---------------- search for section keyword for each page ----------------
            for page_num in start_from:
                final_keyword, outcome_text = "", ""
                page = reader.pages[page_num]
                page_text = page.extract_text()

                if keyword in page_text:
                    final_keyword = keyword
                
                #---------------- some special cases here identified here ----------------
                #some cases are in uppercase, skip for JUDGMENT as it contains [Revenue Law — Income taxation — Capital allowance] -> causing misclassification
                elif keyword != "\nJudgment\n" and keyword.upper() in page_text:
                    final_keyword = keyword.upper()
                #Some section keywords like "Conclusions" may have 's' 
                elif keyword != "\nOrders\n" and keyword != "\nOutcome:\n"  and keyword.rsplit('\n', 1)[0] + 's\n' in page_text:
                    final_keyword = keyword.rsplit('\n', 1)[0] + 's\n'
                #some cases has "Conclusion \n" from the extracted text
                elif keyword == "Conclusion\n"  and "Conclusion \n" in page_text:
                    final_keyword = "Conclusion \n"    
                    
                #---------------- keyword found then search for keywords for ruling decision ----------------
                if final_keyword != "":
                    index = page_text.find(final_keyword)
                    outcome_text += page_text[index:].strip()
                    if final_keyword == "Conclusion\n":
                        outcome_text = outcome_text.split("Costs\n")[0] #some pdf after Conclusion has "Costs"

                    #allow extraction of next pages, up to 3 pages
                    curr_page = page_num + 1
                    while curr_page != num_pages and page_num != 0:
                        if curr_page >= page_num + 4:
                            break
                        page = reader.pages[curr_page]
                        page_text = page.extract_text()
                        outcome_text += "\n" + page_text
                        curr_page += 1
                        
                    if len(outcome_text) > len(final_keyword): #if outcome_text < len(keywords) means nth is extracted
                            #columns=["filename", "target",  "method", "section_keyword", "text"]
                            result = [extract_outcome(outcome_text)[0], extract_outcome(outcome_text)[1], final_keyword.replace("\n", ""), outcome_text]
                            if result[0] != 'NA':
                                found = True
                            else:
                                found_but_NA = True
                                
        #if found_but_NA means section keyword found but cant determine the result. just return the text, may need manual/gpt to classify           
        if found_but_NA:
            return result
                
        #---------------- no section keyword found then extract last 2000 characters (most ruling decision at the end) ----------------
        if len(result) == 0 or result[0] == 'NA':
            last_2000_text = ""
            current_page_num = num_pages - 1
            while current_page_num >= 0 and len(last_2000_text) < 2000:
                page = reader.pages[current_page_num]
                last_2000_text = page.extract_text() + last_2000_text
                current_page_num -= 1
            #columns=["filename", "target",  "method", "section_keyword", "text"]
            result = [extract_outcome(last_2000_text)[0], extract_outcome(last_2000_text)[1], "Last Page", last_2000_text]
                    
        return result

In [239]:
results = []

# Iterate over files in the folder
for filename in os.listdir("raw-cases"):
    # if  "2018" in filename or "2019" in filename or"2020" in filename or "2021" in filename or "2022" in filename or "2023" in filename:
    # if not ("2000" in filename or "2001" in filename or "2002" in filename):
    # if "2017_SGHC_157.pdf" in filename:
    print(filename)
    try:
        pdf_file_path = os.path.join("raw-cases", filename)
        outcome = extract_ruling(pdf_file_path)
        result = [filename, outcome[0], outcome[1], outcome[2], outcome[3]]
        results.append(result)

    except Exception as e:
        print(f"Error processing {filename}: {e}")

df = pd.DataFrame(results, columns=["filename", "target",  "method", "section_keyword", "text"])
print(df)

2000_SGCA_1.pdf
2000_SGCA_10.pdf
2000_SGCA_11.pdf
2000_SGCA_12.pdf
2000_SGCA_13.pdf
2000_SGCA_14.pdf
2000_SGCA_15.pdf
2000_SGCA_16.pdf
2000_SGCA_17.pdf
2000_SGCA_18.pdf
2000_SGCA_19.pdf
2000_SGCA_2.pdf
2000_SGCA_20.pdf
2000_SGCA_21.pdf
2000_SGCA_22.pdf
2000_SGCA_23.pdf
2000_SGCA_24.pdf
2000_SGCA_25.pdf
2000_SGCA_26.pdf
2000_SGCA_27.pdf
2000_SGCA_28.pdf
2000_SGCA_29.pdf
2000_SGCA_3.pdf
2000_SGCA_30.pdf
2000_SGCA_31.pdf
2000_SGCA_32.pdf
2000_SGCA_33.pdf
2000_SGCA_34.pdf
2000_SGCA_35.pdf
2000_SGCA_36.pdf
2000_SGCA_37.pdf
2000_SGCA_38.pdf
2000_SGCA_39.pdf
2000_SGCA_4.pdf
2000_SGCA_40.pdf
2000_SGCA_41.pdf
2000_SGCA_42.pdf
2000_SGCA_43.pdf
2000_SGCA_44.pdf
2000_SGCA_45.pdf
2000_SGCA_46.pdf
2000_SGCA_47.pdf
2000_SGCA_48.pdf
2000_SGCA_49.pdf
2000_SGCA_5.pdf
2000_SGCA_50.pdf
2000_SGCA_51.pdf
2000_SGCA_52.pdf
2000_SGCA_53.pdf
2000_SGCA_54.pdf
2000_SGCA_55.pdf
2000_SGCA_56.pdf
2000_SGCA_57.pdf
2000_SGCA_58.pdf
2000_SGCA_59.pdf
2000_SGCA_6.pdf
2000_SGCA_60.pdf
2000_SGCA_61.pdf
2000_SGCA_62.pdf
200

In [240]:
df = pd.DataFrame(results, columns=["filename", "target", "method", "section_keyword", "text"])
print(df)
df.to_csv("ruling_extraction.csv")

              filename             target    method section_keyword  \
0      2000_SGCA_1.pdf         Favourable  Method 2        Outcome:   
1     2000_SGCA_10.pdf         Favourable  Method 2        Outcome:   
2     2000_SGCA_11.pdf  Order Accordingly  Method 2        Outcome:   
3     2000_SGCA_12.pdf       Unfavourable  Method 2        Outcome:   
4     2000_SGCA_13.pdf       Unfavourable  Method 2        Outcome:   
...                ...                ...       ...             ...   
8562  2023_SGHC_95.pdf                 NA        NA      Conclusion   
8563  2023_SGHC_96.pdf       Unfavourable  Method 1      Conclusion   
8564  2023_SGHC_97.pdf       Unfavourable  Method 1      Conclusion   
8565  2023_SGHC_98.pdf         Favourable  Method 2      Conclusion   
8566  2023_SGHC_99.pdf         Favourable  Method 2      Conclusion   

                                                   text  
0     Outcome:\nAppeals allowed.\nCopyright © Govern...  
1     Outcome:\nAppeal allowed.

Run another round of extraction for NA cases but search thru the whole pdf pages for section keyword

In [241]:
df = pd.read_csv("ruling_extraction.csv")
filtered_df = df[pd.isna(df['target'])]
second_extraction = filtered_df["filename"].tolist()
second_extraction

['2000_SGCA_25.pdf',
 '2000_SGHC_104.pdf',
 '2000_SGHC_112.pdf',
 '2000_SGHC_12.pdf',
 '2000_SGHC_130.pdf',
 '2000_SGHC_140.pdf',
 '2000_SGHC_183.pdf',
 '2000_SGHC_185.pdf',
 '2000_SGHC_197.pdf',
 '2000_SGHC_202.pdf',
 '2000_SGHC_222.pdf',
 '2000_SGHC_248.pdf',
 '2000_SGHC_257.pdf',
 '2000_SGHC_260.pdf',
 '2000_SGHC_283.pdf',
 '2000_SGHC_290.pdf',
 '2000_SGHC_42.pdf',
 '2000_SGHC_45.pdf',
 '2000_SGHC_82.pdf',
 '2001_SGCA_29.pdf',
 '2001_SGCA_66.pdf',
 '2001_SGCA_73.pdf',
 '2001_SGHC_101.pdf',
 '2001_SGHC_107.pdf',
 '2001_SGHC_108.pdf',
 '2001_SGHC_111.pdf',
 '2001_SGHC_118.pdf',
 '2001_SGHC_128.pdf',
 '2001_SGHC_130.pdf',
 '2001_SGHC_132.pdf',
 '2001_SGHC_148.pdf',
 '2001_SGHC_150.pdf',
 '2001_SGHC_151.pdf',
 '2001_SGHC_163.pdf',
 '2001_SGHC_168.pdf',
 '2001_SGHC_174.pdf',
 '2001_SGHC_184.pdf',
 '2001_SGHC_186.pdf',
 '2001_SGHC_199.pdf',
 '2001_SGHC_214.pdf',
 '2001_SGHC_215.pdf',
 '2001_SGHC_222.pdf',
 '2001_SGHC_224.pdf',
 '2001_SGHC_228.pdf',
 '2001_SGHC_229.pdf',
 '2001_SGHC_232.pd

In [242]:
section_keyword  = ["\nOutcome:\n", "Conclusion\n", "\nSummary\n", "\nOrders\n", "\nResult\n", "\nDecision\n", "\nJudgment\n"]
outcome_text = ""
def extract_ruling2(pdf_file):
    with open(pdf_file, 'rb') as f:
        reader = pypdf.PdfReader(f)
        result, found, found_but_NA, final_keyword,num_pages = [], False, False, "", len(reader.pages)
        outcome_text = ""

        start_from = [i for i in range(num_pages-1, 0, -1)]
        # if 0 in start_from:
        #     start_from.remove(0)

        for keyword in section_keyword:
            if found:
                break
            for page_num in start_from:
                final_keyword = ""
                outcome_text = ""
                page = reader.pages[page_num]
                page_text = page.extract_text()

                if keyword in page_text:
                    final_keyword = keyword
                elif keyword != "\nJudgment\n" and keyword.upper() in page_text:
                    final_keyword = keyword.upper()
                elif keyword != "\nOrders\n" and keyword != "\nOutcome:\n"  and keyword.rsplit('\n', 1)[0] + 's\n' in page_text:
                    final_keyword = keyword.rsplit('\n', 1)[0] + 's\n'
                elif keyword == "Conclusion\n"  and "Conclusion \n" in page_text:
                    final_keyword = "Conclusion \n"    

                if final_keyword != "":
                    index = page_text.find(final_keyword)
                    outcome_text += page_text[index:].strip()
                    if final_keyword == "Conclusion\n":
                        outcome_text = outcome_text.split("Costs\n")[0]
                        
                    curr_page = page_num + 1
                    while curr_page != num_pages and page_num != 0:
                        if curr_page >= page_num + 4:
                            break
                        page = reader.pages[curr_page]
                        page_text = page.extract_text()
                        outcome_text += "\n" + page_text
                        curr_page += 1
                        
                    if len(outcome_text) > len(final_keyword):
                            result = [extract_outcome(outcome_text)[0], extract_outcome(outcome_text)[1], final_keyword.replace("\n", ""), outcome_text]
                            if result[0] != 'NA':
                                found = True
                            else:
                                found_but_NA = True
          
        if found_but_NA:
            return result
                
        if len(result) == 0 or result[0] == 'NA':
            last_2000_text = ""
            current_page_num = num_pages - 1
            while current_page_num >= 0 and len(last_2000_text) < 2000:
                page = reader.pages[current_page_num]
                last_2000_text = page.extract_text() + last_2000_text
                current_page_num -= 1

            result = [extract_outcome(last_2000_text)[0], extract_outcome(last_2000_text)[1], "Last Page", last_2000_text]
                    
        return result

In [243]:
results = []
for filename in os.listdir("raw-cases"):
    if filename in second_extraction:
        print(filename)
        try:
            pdf_file_path = os.path.join("raw-cases", filename)
            outcome = extract_ruling2(pdf_file_path)
            result = [filename, outcome[0], outcome[1], outcome[2], outcome[3]]
            results.append(result)

        except Exception as e:
            print(f"Error processing {filename}: {e}")

df2 = pd.DataFrame(results, columns=["filename", "target", "method",  "section_keyword", "text"])
print(df2)

2000_SGCA_25.pdf
2000_SGHC_104.pdf
2000_SGHC_112.pdf
2000_SGHC_12.pdf
2000_SGHC_130.pdf
2000_SGHC_140.pdf
2000_SGHC_183.pdf
2000_SGHC_185.pdf
2000_SGHC_197.pdf
2000_SGHC_202.pdf
2000_SGHC_222.pdf
2000_SGHC_248.pdf
2000_SGHC_257.pdf
2000_SGHC_260.pdf
2000_SGHC_283.pdf
2000_SGHC_290.pdf
2000_SGHC_42.pdf
2000_SGHC_45.pdf
2000_SGHC_82.pdf
2001_SGCA_29.pdf
2001_SGCA_66.pdf
2001_SGCA_73.pdf
2001_SGHC_101.pdf
2001_SGHC_107.pdf
2001_SGHC_108.pdf
2001_SGHC_111.pdf
2001_SGHC_118.pdf
2001_SGHC_128.pdf
2001_SGHC_130.pdf
2001_SGHC_132.pdf
2001_SGHC_148.pdf
2001_SGHC_150.pdf
2001_SGHC_151.pdf
2001_SGHC_163.pdf
2001_SGHC_168.pdf
2001_SGHC_174.pdf
2001_SGHC_184.pdf
2001_SGHC_186.pdf
2001_SGHC_199.pdf
2001_SGHC_214.pdf
2001_SGHC_215.pdf
2001_SGHC_222.pdf
2001_SGHC_224.pdf
2001_SGHC_228.pdf
2001_SGHC_229.pdf
2001_SGHC_232.pdf
2001_SGHC_239.pdf
2001_SGHC_240.pdf
2001_SGHC_244.pdf
2001_SGHC_250.pdf
2001_SGHC_254.pdf
2001_SGHC_257.pdf
2001_SGHC_265.pdf
2001_SGHC_266.pdf
2001_SGHC_267.pdf
2001_SGHC_270.pdf


In [244]:
df2.to_csv("na_files.csv")

# Merge the original DataFrame with the new DataFrame based on the "filename" column
merged_df = df.merge(df2, on='filename', how='left', suffixes=('_original', '_updated'))

# Update values in the original DataFrame with the updated values from df2
df['method'] = merged_df['method_updated'].combine_first(merged_df['method_original'])
df['target'] = merged_df['target_updated'].combine_first(merged_df['target_original'])
df['section_keyword'] = merged_df['section_keyword_updated'].combine_first(merged_df['section_keyword_original'])
df['text'] = merged_df['text_updated'].combine_first(merged_df['text_original'])
df.to_csv("ruling_extraction_final.csv")