In [74]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

from tqdm.notebook import tqdm
from pathlib import Path
import re

## Data Scrapping

#### 1. 

In [154]:
# set up the path 
data_path = Path.cwd().parent / 'data'

data_path.mkdir(exist_ok=True)
print(data_path)


/Users/zoeyy/Desktop/xlab Intern/ds-ciss-colonial-law/data


In [155]:

# Function to scrape data from a given URL
def scrape_data(year, id_range):
    data = []
    for i in id_range:
        url = f"https://chanrobles.com/cralaw/{year}januarydecisions.php?id={i}"
        print(f"Scraping URL: {url}")
        
        # Define headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }

        # Send a GET request to the webpage with headers
        response = requests.get(url, headers=headers)

        # If status code 404, break the loop
        if response.status_code == 404:
            print(f"No more data available for {year} with id={i}")
            break

        # If any other status code, continue to the next ID
        if response.status_code != 200:
            print(f"Failed to retrieve the webpage for {year} with id={i}. Status code: {response.status_code}")
            continue

        # Parse the webpage content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the relevant section containing the case details
        main_content = soup.find('div', class_='topcontent')
        if not main_content:
            continue

        # Initialize variables to store case details
        case_details = {'year': year, 'url': url, 'id': i}

        # Find the date, plaintiffs, and defendants from the justified content
        justify_div = main_content.find('div', align='justify')
        if justify_div:
            text = justify_div.get_text(separator=' ', strip=True)
            #print(f"text info is '{text}'")

            parts = re.split(r' - |\]', text)
            #print(f"parts info is '{parts}'")
            
            if len(parts) >= 2:
                date_part = parts[0].strip()
                case_info = parts[1].strip()
            else:
                date_part = parts[0].strip()
                case_info = ''

            # Extract date and case number from date_part
            date_match = re.search(r'(\w+ \d{1,2}, \d{4})', date_part)
            if date_match:
                case_details['date'] = date_match.group(1)
            
            case_id_match = re.search(r'G\.? ?R\.? No\.? ?(\S+)', date_part)
            if case_id_match:
                case_details['case_NO'] = case_id_match.group(1)


                case_parts = re.split(r' v\. | vs\. ', case_info)
                #print(f"case_parts info is '{case_parts}'")
                if len(case_parts) == 2:
                    case_details['plaintiffs'] = case_parts[0].strip()
                    
                    # Remove extra text after the defendant's name
                    defendant_info = case_parts[1].strip()
                    defendant_name = re.match(r'^[^\d]+', defendant_info)
                    if defendant_name:
                        case_details['defendants'] = defendant_name.group(0).strip()
                    else:
                        case_details['defendants'] = defendant_info
        
        # Extract the lawyer for plaintiff
        center_divs = main_content.find_all('div', align='center')[1]
        lawyers = center_divs.find_all(string=re.compile(r', for'))
        if lawyers:
            case_details['lawyer_for_plaintiff'] = lawyers[0].strip(', for')
            if len(lawyers) > 1:
                case_details['lawyer_for_defendant'] = lawyers[1].strip(', for')


        # Extract syllabus
        syllabus_tag = main_content.find(string="SYLLABUS")
        if syllabus_tag:
            case_details['syllabus'] = syllabus_tag.find_next(string=True).strip()
        else:
            case_details['syllabus'] = "N/A"
        
        # Extract the content under align='justify' after decision
        decision_tag = main_content.find(string="D E C I S I O N")
        #print(f"decision_tag info is '{decision_tag}'")
        if decision_tag:
            writer_content = decision_tag.find_next(lambda tag: tag.name in ['p', 'div'] and tag.get('align') == 'right')
            if writer_content:
                writer = writer_content.get_text(separator=' ', strip=True)
                # match the text up to and including 'J'. 
                case_details['writer'] = re.match(r'.*?J\.?', writer).group(0)
            else:
                case_details['writer'] = "N/A"
                
            justify_content = decision_tag.find_next('div', align='justify')
            #print(f"justify_content info is '{justify_content}'")
            if justify_content:
                case_details['decision'] = justify_content.get_text(separator=' ', strip=True)
            else:
                p_tags = decision_tag.find_all_next('p', style=re.compile(r'margin: 6pt 0in; text-align: justify; line-height: normal;'))
                #print(f"p_tags info is '{p_tags}'")
                if p_tags:
                    decision_text = ' '.join(p.get_text(separator=' ', strip=True) for p in p_tags)
                    case_details['decision'] = decision_text
                    #print(f"decision_text info is '{decision_text}'")
                    #case_details['decision'] = "N/A"
            # find the concur tag
            concur_tag = soup.find(string=re.compile(r'concur\.'))
            concur_text = ""

            # Extract the text before "concur."
            if concur_tag:
                full_text = concur_tag.parent.get_text(separator=' ', strip=True)
                concur_text = re.search(r'(.*concur\.)', full_text).group(1).strip()
                case_details['concur'] = concur_text
            else:
                case_details['concur'] = "N/A"
                #print(f"concur_text info is '{concur_text}'")  

        # Extract separate opinions
        separate_opinion = main_content.find(string="Separate Opinions")
        if separate_opinion:
            case_details['separate_opinion'] = True
        else:
            case_details['separate_opinion'] = False

        # Extract endnotes  
        endnotes = main_content.find(string="Endnotes:")
        if endnotes:
            case_details['endnotes'] = True
        else:
            case_details['endnotes'] = False
        
        data.append(case_details)

    return data



In [156]:
# Define the range of years and ID range
years = range(1901, 1947)  
id_range = range(1,401)

# Initialize a list to store all scraped data
all_data = []

# Scrape data for each year
for year in tqdm(years, total=len(years)):
    data = scrape_data(year, id_range)
    all_data.extend(data)

# Create a DataFrame from the scraped data
df = pd.DataFrame(all_data)
df.head()

#df_clean = df.drop_duplicates(inplace = True)
#print(f"'{df_clean.shape[0]}' rows, without duplicates")


  0%|          | 0/46 [00:00<?, ?it/s]

Scraping URL: https://chanrobles.com/cralaw/1901januarydecisions.php?id=1
Scraping URL: https://chanrobles.com/cralaw/1901januarydecisions.php?id=2
Scraping URL: https://chanrobles.com/cralaw/1901januarydecisions.php?id=3
Scraping URL: https://chanrobles.com/cralaw/1901januarydecisions.php?id=4
Scraping URL: https://chanrobles.com/cralaw/1901januarydecisions.php?id=5
justify_content info is '<div align="justify">This is an appeal from the judgment of the Court of First Instance of Intramuros (Manila) in an action for an accounting instituted by Walter Jackson against Paul Blum, H. Blum, W. A. Whaley, and L. M. Johnson. The matter involved is a leasehold interest in the business property known as the  "Alhambra," situated on the Escolta in Manila, together with the furniture and fixtures and other appurtenances. <br/><br/>In August, 1898, Señor Roca took a lease from the owner of the Alhambra and a short time afterwards transferred the same to Evans, Jackson, and Williams. Williams conv

AttributeError: 'NoneType' object has no attribute 'group'

In [None]:

# Save the DataFrame to a CSV file
df.to_csv(data_path / 'case_details.csv', index=False)

print("Scraping completed and data saved to case_details.csv")

In [None]:
# data preview 
df.head()

Unnamed: 0,year,url,syllabus,date,ID,plaintiffs,defendants,plaintiffs_backup,defendants_backup,lawyer_for_plaintiff,lawyer_for_defendant,decision
0,1909,https://chanrobles.com/cralaw/1909januarydecis...,,,,,,,,,,
1,1910,https://chanrobles.com/cralaw/1910januarydecis...,"1. VIOLATION OF THE ELECTION LAW. — Held, That...","Supreme Court, Jurisprudence",Jurisprudence,UNITED STATES,SALUSTIANO PULIDO ET AL. 017 Phil 579:,THE UNITED STATES,v. SALUSTIANO PULIDO ET AL.,"THE UNITED STATES,",", v. SALUSTIANO PULIDO ET AL.,",The defendants in this case were accused of a ...


## EDA

In [59]:
# check the quality of data scraping 
# number of row with non-na values

print(f"Number of rows: '{df.count()}'")

print(df.groupby('year').count())
      





Number of rows: 'year                 13455
url                  13455
syllabus             13455
date                  8622
ID                    8622
decision              8651
plaintiffs            8529
defendants            8529
plaintiffs_backup     5650
defendants_backup     6162
dtype: int64'
      url  syllabus  date   ID  decision  plaintiffs  defendants  \
year                                                               
1901  299       299    31   31        31          28          28   
1902  299       299   160  160       160         155         155   
1903  299       299   217  217       217         214         214   
1904  299       299   138  138       138         137         137   
1905  299       299   253  253       256         249         249   
1906  299       299   299  299       299         297         297   
1907  299       299   286  286       286         285         285   
1908  299       299   299  299       299         295         295   
1909  299       299