In [5]:
from bs4 import BeautifulSoup
import requests
import re
import nltk
import pandas as pd
from tqdm import tqdm


def remove_html_tags(text):
    clean_text = re.sub(r'<.*?>', '', text)
    return clean_text


<strong>JUDGMENT:</strong>


In [6]:
base_url = 'https://www.elitigation.sg/gd/gd/{year}_{court}_{case_id}'
    
years = range(2000, 2016)
courts = ['SGHC', 'SGCA']
case_ids = range(0, 430)

data = []
fact_issues_identifiers = ['Background facts', 'The facts', 'Background', 'Facts', 'Finding of Fact', 'The issues', 
                           'The appeal', 'Appeal', 'The issues on appeal', 'The claim', 'The present claim', 
                           'Background to the dispute', 'The substantive issue', 'Issues to be determined', 'The relevant issues']

# Iterate over all combinations of years, courts, and case_ids
for year in years:
    for court in courts:
        for case_id in tqdm(case_ids):
            url = base_url.format(year=year, court=court, case_id=case_id)
            response = requests.get(url)

            extracted_text = ''

            # Parse the HTML content of the webpage
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all <strong> tags
            strong_tags = soup.find_all('strong')
            #print(strong_tags)

            if response.status_code == 200: # Elitigation page returns status code 200 even for 404
                # Find all <h1> tags
                h1_tags = soup.find_all('h1')
                
                # Check if any of the <h1> tags contain the text 'Page Not Found'
                page_not_found = any(tag.get_text().strip() == 'Page Not Found' for tag in h1_tags)

                if not page_not_found:
                    #print(strong_tags)
                    
                    # Iterate through all <strong> tags
                    for strong_tag in strong_tags:
                        text = strong_tag.get_text()
                        # Check if keywords of facts and issues is in the text
                        if [identifier for identifier in fact_issues_identifiers if (identifier in text)]:
                            # Find the parent <p> tag of the current <strong> tag
                            parent_p_tag = strong_tag.find_parent('p')

                            #print(parent_p_tag)
                            if parent_p_tag:
                                
                                # Find all subsequent <p> tags
                                next_p_tags = parent_p_tag.find_next_siblings('p')
                                #print(len(next_p_tags))
                                # Extract the text content of each subsequent <p> tag until 'Conclusion' is found
                                for p_tag in next_p_tags:
                                    #print(p_tag.get_text)
                                    # Check if the 'Conclusion'paragraph is found
                                    if 'Conclusion' in p_tag.get_text():
                                        break  # Do not continue extraction of the paragraph text
                                    
                                    # If there is no 'Conclusion' 
                                    # Check word count of current paragraph
                                    words_count_current = len(p_tag.get_text().split())
                                    
                                    # Check word count of subsequent paragraphs
                                    words_count_next = len(' '.join([tag.get_text() for tag in p_tag.find_next_siblings('p')]).split())
                                    
                                    # Check if current paragraph has more than 10 words and subsequent paragraphs have no more than 10 words
                                    if words_count_current > 10 and words_count_next <= 10:
                                        break
                                    
                                    #print(p_tag.get_text().strip())
                                    extracted_text += p_tag.get_text().strip()

                                break  # Once all conditions fulfilled, stop extraction
                        
                    if extracted_text == '': # If extracted text is still empty
                        #print('here')
                        # Find all paragraphs with class attribute 'Judg-Heading-x'
                        judg_heading_paragraphs = soup.find_all('p', class_=lambda x: x and x.startswith('Judg-Heading-'))

                        justify_paragraphs = soup.find_all('p', align='justify')
                        # Iterate through each paragraph
                        for paragraph in judg_heading_paragraphs:

                            # Check if keywords of facts and issues is in the paragraph text
                            if [identifier for identifier in fact_issues_identifiers if (identifier in text)]:
                                
                                # Find all subsequent <p> tags
                                next_p_tags = paragraph.find_next_siblings('p')
                                
                                # Extract the text content of each subsequent <p> tag until 'Conclusion'is found
                                for p_tag in next_p_tags:
                                    # Check if the 'Conclusion' paragraph is found
                                    if 'Conclusion' in p_tag.get_text():
                                        break  # Stop extraction of the paragraph if 'Conclusion' is found

                                    # Using word count as stopping criteria for paragraphs without 'Conclusion

                                    # Check word cout of current paragraph
                                    words_count_current = len(p_tag.get_text().split())

                                    # Check word count of subsequent paragraphs
                                    words_count_next = len(' '.join([tag.get_text() for tag in p_tag.find_next_siblings('p')]).split())

                                    # Check if current paragraph has more than 10 words and subsequent paragraphs have no more than 10 words
                                    if words_count_current > 10 and words_count_next <= 10:
                                        break  # Stop iteration if condition is met
                                    extracted_text += p_tag.get_text().strip()

                    if extracted_text == '':
                        # instance where indentifier is in a alignment justified paragraph
                        justify_paragraphs = soup.find_all('p', align='justify')
                        #print(justify_paragraphs)
                        for paragraph in justify_paragraphs:
                            if paragraph.get_text(strip=True) == 'Background':
                                # Find all subsequent <p> tags
                                next_p_tags = paragraph.find_all_next('p')
                                #print(next_p_tags)
                                # Extract the text content of each subsequent <p> tag until 'Conclusion'is found
                                for p_tag in next_p_tags:
                                    # Check if the 'Conclusion' paragraph is found
                                    if 'Conclusion' in p_tag.get_text():
                                        break  # Stop extraction of the paragraph if 'Conclusion' is found

                                    # Using word count as stopping criteria for paragraphs without 'Conclusion

                                    # Check word cout of current paragraph
                                    words_count_current = len(p_tag.get_text().split())

                                    # Check word count of subsequent paragraphs
                                    words_count_next = len(' '.join([tag.get_text() for tag in p_tag.find_next_siblings('p')]).split())

                                    # Check if current paragraph has more than 10 words and subsequent paragraphs have no more than 10 words
                                    if words_count_current > 10 and words_count_next <= 10:
                                        break  # Stop iteration if condition is met
                                    extracted_text += p_tag.get_text().strip()

                    # Append the data to the list
                    data.append({'File Name': f'Case_{year}_{court}_{case_id}.pdf', 'text': remove_html_tags(extracted_text)})

df = pd.DataFrame(data)
# Save the DataFrame to a CSV file
df.to_csv('extracted_text_data.csv', index=False)


  0%|          | 0/430 [00:00<?, ?it/s]

100%|██████████| 430/430 [01:08<00:00,  6.27it/s]
100%|██████████| 430/430 [00:59<00:00,  7.28it/s]
100%|██████████| 430/430 [01:06<00:00,  6.43it/s]
100%|██████████| 430/430 [01:00<00:00,  7.08it/s]
100%|██████████| 430/430 [01:05<00:00,  6.55it/s]
100%|██████████| 430/430 [01:00<00:00,  7.07it/s]
100%|██████████| 430/430 [01:16<00:00,  5.59it/s]
100%|██████████| 430/430 [00:59<00:00,  7.25it/s]
100%|██████████| 430/430 [01:11<00:00,  6.05it/s]
100%|██████████| 430/430 [00:58<00:00,  7.37it/s]
100%|██████████| 430/430 [01:11<00:00,  6.00it/s]
100%|██████████| 430/430 [01:08<00:00,  6.29it/s]
100%|██████████| 430/430 [01:06<00:00,  6.49it/s]
100%|██████████| 430/430 [01:06<00:00,  6.51it/s]
100%|██████████| 430/430 [01:10<00:00,  6.12it/s]
100%|██████████| 430/430 [01:03<00:00,  6.72it/s]
100%|██████████| 430/430 [01:16<00:00,  5.65it/s]
100%|██████████| 430/430 [01:04<00:00,  6.70it/s]
100%|██████████| 430/430 [05:23<00:00,  1.33it/s]  
100%|██████████| 430/430 [02:05<00:00,  3.41it/s

In [8]:

df2 = pd.read_csv('extracted_text_data.csv')
df2['text'].isna().value_counts()


text
True     3162
False    2323
Name: count, dtype: int64

In [None]:
""" ARCHIVE """
#################### SECOND VERSION ####################
# # Send a GET request to the webpage
# response = requests.get(url)

# # Parse the HTML content of the webpage
# soup = BeautifulSoup(response.content, 'html.parser')

# # Initialize an empty string to store the extracted text
# extracted_text = ''

# # Find all <strong> tags
# strong_tags = soup.find_all('strong')

# # Iterate through all <strong> tags
# for strong_tag in strong_tags:
#     text = strong_tag.get_text()
#     # Check if 'background fact' is in the text
#     if any(keyword in text for keyword in ['Background facts', 'The facts', 'Background', 'Facts', 'Finding of Fact', 
#                                            'The issues', 'The appeal', 'Appeal', 'The issues on appeal', 'The claim', 'The present claim', 
#                                            'Background to the dispute', 'The substantive issue', 'Issues to be determined', 'The relevant issues']):
#         # Find the parent <p> tag of the current <strong> tag
#         parent_p_tag = strong_tag.find_parent('p')
#         if parent_p_tag:
#             # Find all subsequent <p> tags
#             next_p_tags = parent_p_tag.find_next_siblings('p')
#             # Extract the text content of each subsequent <p> tag until 'Conclusion' is found
#             for p_tag in next_p_tags:
#                 # Check if the 'Conclusion' paragraph is found
#                 if 'Conclusion' in p_tag.get_text():
#                     break  # Stop iteration if 'Conclusion' is found
#                 # Check word count of current paragraph
#                 words_count_current = len(p_tag.get_text().split())
#                 # Check word count of subsequent paragraphs
#                 words_count_next = len(' '.join([tag.get_text() for tag in p_tag.find_next_siblings('p')]).split())
#                 # Check if current paragraph has more than 10 words and subsequent paragraphs have no more than 10 words
#                 if words_count_current > 10 and words_count_next <= 10:
#                     break  # Stop iteration if condition is met
#                 extracted_text += p_tag.get_text().strip() + '\n'
#             break  # Stop searching once 'background fact' is found

# # Find all paragraphs with class attribute 'Judg-Heading-x'
# judg_heading_paragraphs = soup.find_all('p', class_=lambda x: x and x.startswith('Judg-Heading-'))

# # Iterate through each paragraph
# for paragraph in judg_heading_paragraphs:
#     # Check if 'background fact' is in the paragraph text
#     if any(keyword in paragraph.get_text() for keyword in ['Background facts', 'The facts', 'Background', 'Facts', 'Finding of Fact', 
#                                            'The issues', 'The appeal', 'Appeal', 'The issues on appeal', 'The claim', 'The present claim', 
#                                            'Background to the dispute', 'The substantive issue', 'Issues to be determined', 'The relevant issues']):
#         # Find all subsequent <p> tags
#         next_p_tags = paragraph.find_next_siblings('p')
#         # Extract the text content of each subsequent <p> tag until 'Conclusion' is found
#         for p_tag in next_p_tags:
#             # Check if the 'Conclusion' paragraph is found
#             if 'Conclusion' in p_tag.get_text():
#                 break  # Stop iteration if 'Conclusion' is found
#             # Check word count of current paragraph
#             words_count_current = len(p_tag.get_text().split())
#             # Check word count of subsequent paragraphs
#             words_count_next = len(' '.join([tag.get_text() for tag in p_tag.find_next_siblings('p')]).split())
#             # Check if current paragraph has more than 10 words and subsequent paragraphs have no more than 10 words
#             if words_count_current > 10 and words_count_next <= 10:
#                 break  # Stop iteration if condition is met
#             extracted_text += p_tag.get_text().strip() + '\n'

# # Print the extracted text
# print(index_paragraph_exclude_last_15(extracted_text))


#################### FIRST VERSION ####################

# url = 'https://www.elitigation.sg/gd/gd/2000_SGHC_5'

# # Send a GET request to the webpage
# response = requests.get(url)

# # Parse the HTML content of the webpage
# soup = BeautifulSoup(response.content, 'html.parser')

# # Find all <strong> tags
# strong_tags = soup.find_all('strong')
# #print(strong_tags)



# def index_paragraph_exclude_last_15(paragraph):
#     # Tokenize the paragraph into sentences
#     sentences = nltk.sent_tokenize(paragraph)

#     # Calculate the index range to exclude the last 10 sentences
#     start_index = 0
#     end_index = max(0, len(sentences) - 15)

#     # Reconstruct the paragraph excluding the last 10 sentences
#     indexed_paragraph = ' '.join(sentences[start_index:end_index])

#     return indexed_paragraph
# text = ''
# # Iterate through all <strong> tags
# for strong_tag in strong_tags:
#     #print(strong_tag)
#     text = strong_tag.get_text()

#     # Check if 'background fact' is in the text
#     if 'Background facts' in text or 'The facts' in text or 'Background' in text or 'Facts' in text or 'Finding of Fact' in text:
#         print(text)
#     if 'Background facts' or 'The facts' or 'Background' or 'Facts' or 'Finding of Fact' in text:
#         # Find the parent <p> tag of the current <strong> tag
#         parent_p_tag = strong_tag.find_parent('p')
#         if parent_p_tag:
#             # Find all subsequent <p> tags
#             next_p_tags = parent_p_tag.find_next_siblings('p')
#             #print(len(next_p_tags))
#             # Extract and print the text content of each subsequent <p> tag until 'Conclusion' is found
#             for p_tag in next_p_tags:
#                 # Check if the 'Conclusion' paragraph is found
#                 if 'Conclusion' in p_tag.get_text():
#                     break  # Stop iteration if 'Conclusion' is found
#                 text += p_tag.get_text().strip()
#             break  # Stop searching once 'background fact' is found

#print(remove_html_tags(index_paragraph_exclude_last_15(text)))