In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

from tqdm.notebook import tqdm
from pathlib import Path
import re
import time

## Data Scrapping

#### 1. 

In [4]:
# set up the path 
data_path = Path.cwd().parent / 'data'

data_path.mkdir(exist_ok=True)
print(data_path)


/Users/zoeyy/Desktop/xlab Intern/ds-ciss-colonial-law/data


In [3]:


# Function to scrape data from a given URL
def scrape_data(year, id_range):
    data = []
    for i in id_range:
        url = f"https://chanrobles.com/cralaw/{year}januarydecisions.php?id={i}"
        print(f"Scraping URL: {url}")
        
        # Define headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }

        try:
            # Send a GET request to the webpage with headers and timeout
            response = requests.get(url, headers=headers, timeout=10)
            
            # If status code 404, break the loop
            if response.status_code == 404:
                print(f"No more data available for {year} with id={i}")
                break

            # If any other status code, continue to the next ID
            if response.status_code != 200:
                print(f"Failed to retrieve the webpage for {year} with id={i}. Status code: {response.status_code}")
                continue

            # Parse the webpage content
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the relevant section containing the case details
            main_content = soup.find('div', class_='topcontent')
            if not main_content:
                continue

            # Initialize variables to store case details
            case_details = {'year': year, 'url': url, 'id': i}

            # Find the date, plaintiffs, and defendants from the justified content
            justify_div = main_content.find('div', align='justify')
            if justify_div:
                text = justify_div.get_text(separator=' ', strip=True)

                parts = re.split(r' - |\]', text)
                
                if len(parts) >= 2:
                    date_part = parts[0].strip()
                    case_info = parts[1].strip()
                else:
                    date_part = parts[0].strip()
                    case_info = ''

                # Extract date and case number from date_part
                date_match = re.search(r'(\w+ \d{1,2}, \d{4})', date_part)
                if date_match:
                    case_details['date'] = date_match.group(1)
                
                case_id_match = re.search(r'G\.? ?R\.? No\.? ?(\S+)', date_part)
                if case_id_match:
                    case_details['case_NO'] = case_id_match.group(1)

                    case_parts = re.split(r' v\. | vs\. ', case_info)
                    if len(case_parts) == 2:
                        case_details['plaintiffs'] = case_parts[0].strip()
                        
                        # Remove extra text after the defendant's name
                        defendant_info = case_parts[1].strip()
                        defendant_name = re.match(r'^[^\d]+', defendant_info)
                        if defendant_name:
                            case_details['defendants'] = defendant_name.group(0).strip()
                        else:
                            case_details['defendants'] = defendant_info
            
            # Extract the lawyer for plaintiff
            center_divs = main_content.find_all('div', align='center')[1]
            lawyers = center_divs.find_all(string=re.compile(r', for'))
            if lawyers and len(lawyers[0].strip(', for'))<50:
                case_details['lawyer_for_plaintiff'] = lawyers[0].strip(', for')
                if len(lawyers) > 1:
                    case_details['lawyer_for_defendant'] = lawyers[1].strip(', for')
            else:
                lawyers_text = center_divs.find_all(string=re.compile(r' for '))
                if lawyers_text:
                    case_details['lawyer_for_plaintiff'] = lawyers_text[0].strip(' for ')
                    if len(lawyers_text) > 1:
                        case_details['lawyer_for_defendant'] = lawyers_text[1].strip(' for ')

            # Extract syllabus
            syllabus_tag = main_content.find(string="SYLLABUS")
            if syllabus_tag:
                case_details['syllabus'] = syllabus_tag.find_next(string=True).strip()
            else:
                case_details['syllabus'] = "N/A"
            
            # Extract the content under align='justify' after decision
            decision_tag = main_content.find(string="D E C I S I O N")
            if decision_tag:
                writer_content = decision_tag.find_next(lambda tag: tag.name in ['p', 'div'] and tag.get('align') == 'right')
                if writer_content:
                    writer = writer_content.get_text(separator=' ', strip=True)
                    # match the text up to and including 'J'. 
                    match = re.match(r'.*?J\.?', writer)
                    if match and len(match.group(0)) > 5:
                        case_details['writer'] = match.group(0)
                    else:
                        case_details['writer'] = writer
                    
                justify_content = decision_tag.find_next('div', align='justify')
                if justify_content:
                    case_details['decision'] = justify_content.get_text(separator=' ', strip=True)
                else:
                    p_tags = decision_tag.find_all_next('p', style=re.compile(r'margin: 6pt 0in; text-align: justify; line-height: normal;'))
                    if p_tags:
                        decision_text = ' '.join(p.get_text(separator=' ', strip=True) for p in p_tags)
                        case_details['decision'] = decision_text

                # Find the concur tag
                concur_tag = soup.find(string=re.compile(r'concur\.'))
                concur_text = ""

                # Extract the text before "concur."
                if concur_tag:
                    full_text = concur_tag.parent.get_text(separator=' ', strip=True)
                    concur_text = re.search(r'(.*concur\.)', full_text).group(1).strip()
                    if len(concur_text) < 200:
                        case_details['concur'] = concur_text
                    else:
                        # the concur text is too long, extract the last 100 characters
                        case_details['concur'] = concur_text[-100:]

                else:
                    case_details['concur'] = "N/A"

            # Extract separate opinions
            separate_opinion = main_content.find(string="Separate Opinions")
            if separate_opinion:
                case_details['separate_opinion'] = True
            else:
                case_details['separate_opinion'] = False

            # Extract endnotes  
            endnotes = main_content.find(string="Endnotes:")
            if endnotes:
                case_details['endnotes'] = True
            else:
                case_details['endnotes'] = False
            
            data.append(case_details)
        
        except requests.exceptions.Timeout:
            print(f"Request timed out for URL: {url}")
            continue
        except requests.exceptions.RequestException as e:
            print(f"Failed to retrieve the webpage for URL: {url}. Error: {e}")
            continue

    return data


In [5]:
# Define the range of years and ID range
years = range(1906, 1910)  
id_range = range(1,800)

# Initialize a list to store all scraped data
all_data = []

# Scrape data for each year
for year in tqdm(years, total=len(years)):
    data = scrape_data(year, id_range)
    all_data.extend(data)

# Create a DataFrame from the scraped data
df = pd.DataFrame(all_data)
df.head(10)

#df_clean = df.drop_duplicates(inplace = True)
#print(f"'{df_clean.shape[0]}' rows, without duplicates")


  0%|          | 0/4 [00:00<?, ?it/s]

Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=1
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=2
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=3
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=4
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=5
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=6
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=7
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=8
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=9
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=10
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=11
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=12
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=13
Scraping URL: https://chanrobles.c

Unnamed: 0,year,url,id,date,case_NO,plaintiffs,defendants,lawyer_for_plaintiff,lawyer_for_defendant,syllabus,writer,decision,concur,separate_opinion,endnotes
0,1906,https://chanrobles.com/cralaw/1906januarydecis...,1,"January 2, 1906",2070,W.H. TIPTON,RAMON A. MARTINEZ,Solicitor-General Araneta,Coudert Brothers,1. ADMINISTRATOR; SPECIAL POWER. — Under artic...,"MAPA, J.","On the 30th day of October, 1899, Vicente Agui...",e court below for action in conformity herewit...,True,False
1,1906,https://chanrobles.com/cralaw/1906januarydecis...,2,"January 2, 1906",2227,MAXIMINO ESPIRITU,JOSE LUIS,W.J. Rohde,R. Moren,"1. CONTRACT; ""PACTO DE RETRO."" — The contract...","WILLARD, J.",The contract involved in this case is as follo...,the court below for execution. So ordered. Ar...,False,False
2,1906,https://chanrobles.com/cralaw/1906januarydecis...,3,"January 2, 1906",3021,"LEONISA YTURRALDE, ET AL.","ALBINO SANTOS, ET AL.",Mariano Monroy,Vicente Miranda,1. JUDGMENT; NEW TRIAL; BILL OF EXCEPTIONS. — ...,"WILLARD, J.",This was a motion to dismiss a bill of excepti...,costs against the Appellant . So ordered. Are...,False,True
3,1906,https://chanrobles.com/cralaw/1906januarydecis...,4,"January 4, 1906",2030,ALFRED DAVID OEHLERS,ROBERT HARTWIG,"Hartigan, Marple, Rohde and Gutierrez",A.D. Gibbs,1. CONTRACT LABOR; ACTION TO RECOVER PENALTIES...,"WILLARD, J.","Section 5 of the act of Congress of March 3, 1...",rt below for execution thereof. So ordered. Ar...,False,True
4,1906,https://chanrobles.com/cralaw/1906januarydecis...,5,"January 4, 1906",2050,UNITED STATES,ROHILLA MARU,Pillsbury & Sut,Attorney-General Wilfley,"1. ADMIRALTY; CHINESE EXCLUSION ACT, VIOLATION...","WILLARD, J.",Seven Chinese laborers arrived in Manila Bay f...,will be allowed in this court. So ordered. Ar...,False,False
5,1906,https://chanrobles.com/cralaw/1906januarydecis...,6,"January 4, 1906",2236,UNITED STATES,NETA SHIYOKISHI,Thomas L. McGi,Attorney-General Wilfley,1. COURT OF CUSTOMS APPEALS; JURISDICTION. — T...,"WILLARD, J.",The defendant in this case was prosecuted in t...,nstance against the Appellant . So ordered. Ar...,False,True
6,1906,https://chanrobles.com/cralaw/1906januarydecis...,7,"January 4, 1906",2397,LO SUI,HARDEE WYATT,W.L. Wright,Coudert Brothers,"1. PLEADING; COMPLAINT, SUFFICIENCY OF. — Wher...","JOHNSON, J. :",This was an action by the plaintiff to recover...,"ly, 1904, with the costs in both instances. Ar...",False,False
7,1906,https://chanrobles.com/cralaw/1906januarydecis...,8,"January 4, 1906",2555,UNITED STATES,ANDRES SALAZAR,Frederick Garfield Waite,Solicitor-General Araneta,1. CRIMINAL LAW; USURPATION OF PUBLIC FUNCTION...,"JOHNSON, J. :",The defendant was charged with the crime of us...,of trial in the inferior court. So ordered. Ar...,False,False
8,1906,https://chanrobles.com/cralaw/1906januarydecis...,9,"January 4, 1906",2567,UNITED STATES,"GERMAN DE TORRES, ET AL.",Pablo Barbon,Solicitor-General Araneta,1. CRIMINAL PROCEDURE; INFORMATION. — Where th...,"JOHNSON, J. :",This defendant was charged with the crime of r...,"ty pesos, and to pay the costs. So ordered. Ar...",False,False
9,1906,https://chanrobles.com/cralaw/1906januarydecis...,10,"January 5, 1906",1449,"VICENTE GOMEZ GARCIA, ET AL.","JACINTA HIPOLITO, ET AL.",Manuel Torres,Jos. N. Wolfson,1. POSSESSORY INFORMATION TITLES; PRESCRIPTION...,"CARSON, J.",Both the plaintiffs and defendants claim title...,ein it originated for proper action. So ordere...,False,True


In [6]:

# Save the DataFrame to a CSV file
df.to_csv(data_path/ 'scrap' / 'case_details_0607_1906_1909.csv', index=False)

print("Scraping completed and data saved to case_details.csv")
# data preview 
df.head()

Scraping completed and data saved to case_details.csv


Unnamed: 0,year,url,id,date,case_NO,plaintiffs,defendants,lawyer_for_plaintiff,lawyer_for_defendant,syllabus,writer,decision,concur,separate_opinion,endnotes
0,1906,https://chanrobles.com/cralaw/1906januarydecis...,1,"January 2, 1906",2070,W.H. TIPTON,RAMON A. MARTINEZ,Solicitor-General Araneta,Coudert Brothers,1. ADMINISTRATOR; SPECIAL POWER. — Under artic...,"MAPA, J.","On the 30th day of October, 1899, Vicente Agui...",e court below for action in conformity herewit...,True,False
1,1906,https://chanrobles.com/cralaw/1906januarydecis...,2,"January 2, 1906",2227,MAXIMINO ESPIRITU,JOSE LUIS,W.J. Rohde,R. Moren,"1. CONTRACT; ""PACTO DE RETRO."" — The contract...","WILLARD, J.",The contract involved in this case is as follo...,the court below for execution. So ordered. Ar...,False,False
2,1906,https://chanrobles.com/cralaw/1906januarydecis...,3,"January 2, 1906",3021,"LEONISA YTURRALDE, ET AL.","ALBINO SANTOS, ET AL.",Mariano Monroy,Vicente Miranda,1. JUDGMENT; NEW TRIAL; BILL OF EXCEPTIONS. — ...,"WILLARD, J.",This was a motion to dismiss a bill of excepti...,costs against the Appellant . So ordered. Are...,False,True
3,1906,https://chanrobles.com/cralaw/1906januarydecis...,4,"January 4, 1906",2030,ALFRED DAVID OEHLERS,ROBERT HARTWIG,"Hartigan, Marple, Rohde and Gutierrez",A.D. Gibbs,1. CONTRACT LABOR; ACTION TO RECOVER PENALTIES...,"WILLARD, J.","Section 5 of the act of Congress of March 3, 1...",rt below for execution thereof. So ordered. Ar...,False,True
4,1906,https://chanrobles.com/cralaw/1906januarydecis...,5,"January 4, 1906",2050,UNITED STATES,ROHILLA MARU,Pillsbury & Sut,Attorney-General Wilfley,"1. ADMIRALTY; CHINESE EXCLUSION ACT, VIOLATION...","WILLARD, J.",Seven Chinese laborers arrived in Manila Bay f...,will be allowed in this court. So ordered. Ar...,False,False


In [7]:
# drop rows if case_NO is missing
print(f"Before droping, the data has '{df.shape[0]}' rows")
df = df.dropna(subset=['case_NO'])
print(f"After droping, the data has '{df.shape[0]}' rows")


Before droping, the data has '3196' rows
After droping, the data has '2220' rows


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2220 entries, 0 to 2982
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   year                  2220 non-null   int64 
 1   url                   2220 non-null   object
 2   id                    2220 non-null   int64 
 3   date                  2197 non-null   object
 4   case_NO               2220 non-null   object
 5   plaintiffs            2194 non-null   object
 6   defendants            2194 non-null   object
 7   lawyer_for_plaintiff  2143 non-null   object
 8   lawyer_for_defendant  2024 non-null   object
 9   syllabus              2220 non-null   object
 10  writer                2203 non-null   object
 11  decision              2210 non-null   object
 12  concur                2210 non-null   object
 13  separate_opinion      2220 non-null   bool  
 14  endnotes              2220 non-null   bool  
dtypes: bool(2), int64(2), object(11)
memor

In [9]:
# store the url list where the decision is mission
missing_url = df[(df['decision'].isnull()) | (df['syllabus']=="N/A")]['url'].tolist()

print(f"Number of missing decision: '{len(missing_url)}'")

Number of missing decision: '85'


In [10]:

# Scrapping the url for decision is empty. 
def scrape_missing_data(url_list):
    data = []
    missing_url_list = []
    for url in url_list:
        print(f"Scraping URL: {url}")
        
        # Define headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }

        try:
            # Send a GET request to the webpage with headers and timeout
            response = requests.get(url, headers=headers, timeout=10)
            
            # If status code 404, break the loop
            if response.status_code == 404:
                print(f"No more data available for {url}")
                break

            # If any other status code, continue to the next ID
            if response.status_code != 200:
                print(f"Failed to retrieve the webpage for {url}. Status code: {response.status_code}")
                continue
            pattern = re.compile(r'/cralaw/(\d{4})januarydecisions\.php\?id=(\d+)')

            # Search for the pattern in the URL
            match = pattern.search(url)

            # Extract and print the year and id if the pattern is found
            if match:
                year = match.group(1)
                id_ = match.group(2)

            # Parse the webpage content
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the relevant section containing the case details
            main_content = soup.find('div', class_='topcontent')
            if not main_content:
                continue

            # Initialize variables to store case details
            case_details = {'year': year, 'url': url, 'id': id_}

            # Parse the HTML content with BeautifulSoup

            # Find the <p> tag with align="right" and style containing "margin"
            decision_tag = main_content.find(string="D E C I S I O N")

            if not decision_tag:
                missing_url_list.append(url)
                print(f"Decision not found for URL: {url}")
                continue

            # Find the next sibling tags after the second "DECISION"
            next_siblings = decision_tag.find_all_next('p', class_="MsoNormal", style=lambda value: 'margin' in value if value else False)

            # Extract and print the text from these <p> tags
            extracted_texts = [tag.get_text(strip=True) for tag in next_siblings]

            # Adding the text together
            case_details['decision'] = ' '.join(extracted_texts)
            
            data.append(case_details)

        except requests.exceptions.Timeout:
            print(f"Request timed out for URL: {url}")
            continue
        except requests.exceptions.RequestException as e:
            print(f"Failed to retrieve the webpage for URL: {url}. Error: {e}")
            continue

    return data, missing_url_list


In [11]:

# Scrape the missing data
missing_data = pd.DataFrame(scrape_missing_data(missing_url)[0])

missing_url_list = scrape_missing_data(missing_url)[1]

Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=65
Decision not found for URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=65
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=74
Decision not found for URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=74
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=77
Decision not found for URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=77
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=222
Decision not found for URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=222
Scraping URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=414
Decision not found for URL: https://chanrobles.com/cralaw/1906januarydecisions.php?id=414
Scraping URL: https://chanrobles.com/cralaw/1907januarydecisions.php?id=1
Scraping URL: https://chanrobles.com/cralaw/1907januarydecisions.php?id=2
Scraping URL: https://chanro

In [12]:
# Create a dictionary from missing_data for quick lookup
missing_data['unique_id'] = missing_data['year'].astype(str) + missing_data['id'].astype(str)
df['unique_id'] = df['year'].astype(str) + df['id'].astype(str)

unique_id = missing_data['unique_id']
missing_dict = dict(zip(missing_data['unique_id'], missing_data['decision']))

# Fill in the decision value from missing_data if the url is in missing_url
for key in missing_dict: 
    df.loc[df['unique_id'] == key, 'decision'] = missing_dict[key]


print(f"Before adding missing data, the data has '{df[df['syllabus'].isnull()].shape[0]}' rows")



Before adding missing data, the data has '0' rows


In [243]:
df[df['unique_id'] == '19052']

Unnamed: 0,year,url,id,syllabus,separate_opinion,endnotes,date,case_NO,lawyer_for_plaintiff,writer,decision,concur,plaintiffs,defendants,lawyer_for_defendant,unique_id
1996,1905,https://chanrobles.com/cralaw/1905januarydecis...,2,,False,False,"November 2, 1905",1207.0,,"ARELLANO, C.J.","ARELLANO,C.J.: 1.Dalmacio Arquiza died on the ...","Torres, Mapa, Johnson and Carson, JJ., concur.","PIA BASA, ET AL., Plaintiffs-Appellants,","JOSE CLARO ARQUIZA, ET AL., Defendants-Appelle...",,19052


In [13]:
df.to_csv(data_path / 'cleaned_case_details_0607_1906_1910.csv', index=False)

### Data Cleaning for Scrapping Data

In [14]:
# Read the data
data = pd.read_csv(data_path / 'cleaned_case_details_0607_1906_1910.csv')

# Reorder the column
data = data[['unique_id', 'year', 'date', 'case_NO', 'plaintiffs', 'defendants', 'lawyer_for_plaintiff', 'lawyer_for_defendant', 'writer', 'syllabus', 'decision', 'concur', 'separate_opinion', 'endnotes', 'url']]
# Clean the column "concur"

# Change the datatype
data['unique_id'] = data['unique_id'].astype(str)
data['year'] = data['year'].astype(str)
data['date'] = data['date'].astype(str)
data['case_NO'] = data['case_NO'].astype(str)
data['concur'] = data['concur'].astype(str)

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2220 entries, 0 to 2219
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   unique_id             2220 non-null   object
 1   year                  2220 non-null   object
 2   date                  2220 non-null   object
 3   case_NO               2220 non-null   object
 4   plaintiffs            2194 non-null   object
 5   defendants            2194 non-null   object
 6   lawyer_for_plaintiff  2143 non-null   object
 7   lawyer_for_defendant  2024 non-null   object
 8   writer                2203 non-null   object
 9   syllabus              2142 non-null   object
 10  decision              2208 non-null   object
 11  concur                2220 non-null   object
 12  separate_opinion      2220 non-null   bool  
 13  endnotes              2220 non-null   bool  
 14  url                   2220 non-null   object
dtypes: bool(2), object(13)
memory usage: 2

In [15]:
# Clean the column "concur"
for i, row in data.iterrows():
    text = row['concur']
    
    if len(text) == 100:
        
        text = text.replace("J. ,", "J.,")
        index = text.find("JJ.")

        # Find the position of the period before "JJ."
        period_index = text.rfind(". ", 0, index)

        # Extract the text between the period and "JJ."
        extracted_text = text[period_index + 2:index + 3]
        data.at[i, 'concur'] = extracted_text

data.head()

Unnamed: 0,unique_id,year,date,case_NO,plaintiffs,defendants,lawyer_for_plaintiff,lawyer_for_defendant,writer,syllabus,decision,concur,separate_opinion,endnotes,url
0,19061,1906,"January 2, 1906",2070,W.H. TIPTON,RAMON A. MARTINEZ,Solicitor-General Araneta,Coudert Brothers,"MAPA, J.",1. ADMINISTRATOR; SPECIAL POWER. — Under artic...,"On the 30th day of October, 1899, Vicente Agui...",,True,False,https://chanrobles.com/cralaw/1906januarydecis...
1,19062,1906,"January 2, 1906",2227,MAXIMINO ESPIRITU,JOSE LUIS,W.J. Rohde,R. Moren,"WILLARD, J.","1. CONTRACT; ""PACTO DE RETRO."" — The contract...",The contract involved in this case is as follo...,"Arellano, C.J., Mapa, Johnson and Carson, JJ.",False,False,https://chanrobles.com/cralaw/1906januarydecis...
2,19063,1906,"January 2, 1906",3021,"LEONISA YTURRALDE, ET AL.","ALBINO SANTOS, ET AL.",Mariano Monroy,Vicente Miranda,"WILLARD, J.",1. JUDGMENT; NEW TRIAL; BILL OF EXCEPTIONS. — ...,This was a motion to dismiss a bill of excepti...,"Arellano, C.J., Mapa, Johnson, and Carson, JJ.",False,True,https://chanrobles.com/cralaw/1906januarydecis...
3,19064,1906,"January 4, 1906",2030,ALFRED DAVID OEHLERS,ROBERT HARTWIG,"Hartigan, Marple, Rohde and Gutierrez",A.D. Gibbs,"WILLARD, J.",1. CONTRACT LABOR; ACTION TO RECOVER PENALTIES...,"Section 5 of the act of Congress of March 3, 1...","Arellano, C.J., Mapa, Johnson and Carson, JJ.",False,True,https://chanrobles.com/cralaw/1906januarydecis...
4,19065,1906,"January 4, 1906",2050,UNITED STATES,ROHILLA MARU,Pillsbury & Sut,Attorney-General Wilfley,"WILLARD, J.","1. ADMIRALTY; CHINESE EXCLUSION ACT, VIOLATION...",Seven Chinese laborers arrived in Manila Bay f...,"Arellano, C.J., Mapa, Johnson and Carson, JJ.",False,False,https://chanrobles.com/cralaw/1906januarydecis...


In [22]:
backup_data.groupby('year').size()

year
1906    536
1907    677
1908    422
1909    585
dtype: int64

In [35]:
# read the data v1
data_v1 = pd.read_csv(data_path / 'final_case_details_0604_v1.csv')
backup_data = pd.read_csv(data_path / 'cleaned_case_details_0607_1906_1910.csv')

# Delete the rows with year 1906, 1907, 1908, 1909
data_v1['year'] = data_v1['year'].astype(int)
data_v2 = data_v1[data_v1['year'].isin([1906, 1907, 1908, 1909]) == False].append(backup_data)

data_v2.groupby('year').size()

  data_v2 = data_v1[data_v1['year'].isin([1906, 1907, 1908, 1909]) == False].append(backup_data)


year
1901     31
1902    160
1903    217
1904    140
1905    389
1906    536
1907    677
1908    422
1909    585
1910    226
1911    279
1912    234
1913     94
1914    238
1915    310
1916    313
1917    268
1918    214
1919    129
1920    107
1921    119
1922    168
1923    164
1924    200
1925    151
1926    210
1927    174
1928    196
1929    213
1930    201
1931    151
1932    151
1933    233
1934    257
1935    225
1936    141
1937    135
1938    164
1939    334
1940    278
1941    318
1942     94
1943     71
1944     33
1945     32
1946    182
dtype: int64

In [36]:
# save to csv
data.to_csv(data_path / 'final_case_details_0607.csv', index=False)

## EDA

In [37]:

# Preliminary scrapping data results 
data = pd.read_csv(data_path / 'final_case_details_0607.csv')

print(f"After droping, the data has '{data.shape[0]}' rows")

data.groupby('year').size()




After droping, the data has '10164' rows


year
1901     31
1902    160
1903    217
1904    140
1905    389
1906    536
1907    677
1908    422
1909    585
1910    226
1911    279
1912    234
1913     94
1914    238
1915    310
1916    313
1917    268
1918    214
1919    129
1920    107
1921    119
1922    168
1923    164
1924    200
1925    151
1926    210
1927    174
1928    196
1929    213
1930    201
1931    151
1932    151
1933    233
1934    257
1935    225
1936    141
1937    135
1938    164
1939    334
1940    278
1941    318
1942     94
1943     71
1944     33
1945     32
1946    182
dtype: int64

In [227]:
# how many columns that do not have year from 1901 to 1946
data['year'] = data['year'].astype(int)
data[(data['year'] < 1901) | (data['year'] > 1946)].shape[0]

0