# Extracts and organizes jurisprudence documents
* This notebook extracts all labor-related and non-labor-related jurisprudence documents and organizes them into a single folder for easy access, using the `juris_meta.csv` metadata file.

In [102]:
import pandas as pd
import numpy as np
import re

from utils.data_loaders import load_juris_meta

# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Remove pandas' row-display cap: printed Series/DataFrames show every row.
# NOTE(review): with no cap, printing a full column emits one line per row.
pd.set_option('display.max_rows', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [103]:
# Load the jurisprudence metadata from juris_meta.csv via the project helper.
# NOTE(review): load_juris_meta is defined in utils.data_loaders (not shown here);
# any parsing or cleaning it performs should be confirmed there.
df = load_juris_meta('./juris_meta.csv')
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,file_path,answer,title,file_name,year,month,day,gr_number,division,case_code
0,d:\Projects\To Github\LaRJ-Corpus\raw jurispru...,NOT LABOR RELATED,"LONGINOS JAVIER, PLAINTIFF AND APPELLEE, VS. S...",126281645830902.html,1906,10,18,g.r. no. 2812,,g.r. no. 2812
1,d:\Projects\To Github\LaRJ-Corpus\raw jurispru...,TIME_LIMIT_REACHED,"LORENZA PAEZ, PETITIONER AND APPELLEE, VS. JOS...",126281645831590.html,1906,10,26,g.r. no. 3547,,6 phil. 521
2,d:\Projects\To Github\LaRJ-Corpus\raw jurispru...,RATE_LIMIT_ERROR,"THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",126281645832666.html,1906,10,9,g.r. no. 2977,,6 phil. 486
3,d:\Projects\To Github\LaRJ-Corpus\raw jurispru...,NOT LABOR RELATED,"THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",12628164584264.html,1906,10,2,g.r. no. 3038,,6 phil. 480
4,d:\Projects\To Github\LaRJ-Corpus\raw jurispru...,NOT LABOR RELATED,"ELENA JAVIER, WIDOW OF NER, PLAINTIFF AND APPE...",12628164585412.html,1906,10,3,g.r. no. 2875,,6 phil. 484


In [104]:
# Show the file path stored in the first row. Positional .iloc avoids the chained
# head()[...][0] lookup, which treats 0 as an index *label* and would raise a
# KeyError if the frame's index did not contain the label 0.
df['file_path'].iloc[0]

'd:\\Projects\\To Github\\LaRJ-Corpus\\raw jurisprudence/juris/1901-1920/126281645830902.html'

We check whether a specific citation made by a case appears in our jurisprudence metadata, starting with the `gr_number` column.

In [105]:
# Preview the gr_number column. A bounded head() as the cell's last expression
# gives a readable sample instead of printing every row of the column.
df['gr_number'].head(20)

0                                            g.r. no. 2812
1                                            g.r. no. 3547
2                                            g.r. no. 2977
3                                            g.r. no. 3038
4                                            g.r. no. 2875
5                                            g.r. no. 2589
6                                            g.r. no. 1664
7                                            g.r. no. 1468
8                                            g.r. no. 1476
9                                            g.r. no. 1491
10                                           g.r. no. 1543
11                                           g.r. no. 1550
12                                           g.r. no. 1560
13                                           g.r. no. 1582
14                                           g.r. no. 1575
15                                           g.r. no. 1655
16                                           g.r. no. 15

In [106]:
# Inspect the row at position 26409. In the recorded output its gr_number is written
# "g.r.no. 204261" (no space before "no."), unlike the usual "g.r. no. <number>" form,
# so the cleaning steps must cope with that variant.
df.iloc[26409]

file_path    d:\Projects\To Github\LaRJ-Corpus\raw jurispru...
answer                                           LABOR RELATED
title        EDWARD C. DE CASTRO AND MA. GIRLIE F. PLATON, ...
file_name                             1484788557913314112.html
year                                                      2016
month                                                       10
day                                                          5
gr_number                                       g.r.no. 204261
division                                       SECOND DIVISION
case_code                                        796 phil. 681
Name: 26409, dtype: object

In [107]:
# Inspect the row at position 23004. In the recorded output this is a LABOR RELATED
# case whose gr_number uses the regular "g.r. no. <number>" form.
df.iloc[23004]

file_path    d:\Projects\To Github\LaRJ-Corpus\raw jurispru...
answer                                           LABOR RELATED
title        BPI EMPLOYEES UNION-DAVAO CITY-FUBU (BPIEU-DAV...
file_name                            13765461411240584061.html
year                                                      2013
month                                                        7
day                                                         24
gr_number                                      g.r. no. 174912
division                                        THIRD DIVISION
case_code                                         715 phil. 35
Name: 23004, dtype: object

# Goal now is to extract information...
such that the `gr_number` column can be easily cross-referenced against the `data_transformed.xlsx` file, which contains metadata about each case, such as the case law it cites and, in particular, the labor-related jurisprudence it cites.

* isolating it and removing consecutive numbers of the case
* removing the gr, adm, am, no, per, rec etc. and isolating the numbers

constraints:
* for jurisprudence with gr_number "a.c. no. 8608 [formerly cbd case no. 11-2907]": replace "formerly" with "or" → "a.c. no. 8608 [or cbd case no. 11-2907]", then remove the "[" and "]" characters → "a.c. no. 8608 or cbd case no. 11-2907"
* there will be gr_numbers with:
a. "g.r. no. 46802-46812", which covers a range of numbers
b. "g.r. no. 42590, 42591", which lists two numbers
c. "g.r. no. 43522, 43523, 43751-43753", which mixes individual numbers with a range
d. "per. rec. no. 714-a", whose number carries a hyphen and a letter
e. "g.r. no. l-5984 and l-5985", which joins two hyphenated, letter-prefixed numbers with "and"
f. "g.r. no. l-11319-20; l-13504; l-13507-8", which lists two or more hyphenated, letter-prefixed numbers separated by semicolons

In [108]:
# Representative gr_number values covering the formats the cleaning steps must handle:
# ranges, comma/"and"/semicolon-separated lists, letter-prefixed and letter-suffixed
# numbers, and bracketed "formerly ..." aliases. The a.m. entry is kept exactly as
# pasted, including its trailing "..." truncation.
sample_gr_nos = [
    "g.r. no. 46802-46812",
    "g.r. no. 42590, 42591",
    "adm. case no. 879",
    "g.r. no. 45274 and 45275",
    "g.r. no. 43522, 43523, 43751-43753",
    "per. rec. no. 714-a",
    "g.r. no. l-5984 and l-5985",
    "g.r. no. l-11319-20; l-13504; l-13507-8",
    "a.c. no. 9906",
    "a.m. no. p-14-3233 [formerly oca ipi no. 12-37...",
    "a.c. no. 8608 [formerly cbd case no. 11-2907]",
]

# Alternation of the docket-type words found in gr_number values, written without dots
# (so it is meant for strings whose "." characters have already been removed).
# The former trailing empty alternative `|()` matched the empty string at every
# position, which made re.search/finditer/split succeed on any input; it is dropped.
# The words are intentionally not anchored with \b, so a fused form such as "grno"
# (from "g.r.no.") is still matched as "gr" followed by "no".
regex = r'(am)|(adm)|(no)|(per)|(rec)|(gr)|(ac)|(cbd)|(formerly)|(oca)|(ipi)'

# Remove the . char (and any brackets or parentheses) first to simplify things

In [109]:
# Drop dots, square brackets and parentheses so that e.g. "g.r. no. 2812" becomes
# "gr no 2812". The vectorised .str accessor applies the same pattern as before but
# propagates missing values as NaN instead of raising a TypeError inside re.sub.
gr_numbers = df['gr_number'].str.replace(r'[.\[\]\(\)]', '', regex=True)
# Show a bounded sample rather than printing every row.
gr_numbers.head(20)

0                                               gr no 2812
1                                               gr no 3547
2                                               gr no 2977
3                                               gr no 3038
4                                               gr no 2875
5                                               gr no 2589
6                                               gr no 1664
7                                               gr no 1468
8                                               gr no 1476
9                                               gr no 1491
10                                              gr no 1543
11                                              gr no 1550
12                                              gr no 1560
13                                              gr no 1582
14                                              gr no 1575
15                                              gr no 1655
16                                              gr no 15

# Remove the gr, no, am, per, rec prefixes

In [110]:
# Strip the docket-type words (gr, adm, am, ac, no/nos, per, rec) as whole tokens.
# The previous pattern r'[gramno]{,2}' was a character class with a 0-to-2 quantifier,
# so it deleted every g/r/a/m/n/o letter anywhere in the string: "per rec no 714-a"
# became "pe ec  714-", losing the letter suffix that identifies the case.
# The tokens are deliberately not anchored with \b so the fused form "grno"
# (from "g.r.no. 204261") is still removed; "nos?" also covers a plural "nos"
# should it occur (assumption — confirm against the data).
# The final strip() removes the whitespace left behind by the deleted words.
gr_numbers = (
    gr_numbers
    .str.replace(r'gr|adm|am|ac|nos?|per|rec', '', regex=True)
    .str.strip()
)
# Show a bounded sample rather than printing every row.
gr_numbers.head(20)

0                                                     2812
1                                                     3547
2                                                     2977
3                                                     3038
4                                                     2875
5                                                     2589
6                                                     1664
7                                                     1468
8                                                     1476
9                                                     1491
10                                                    1543
11                                                    1550
12                                                    1560
13                                                    1582
14                                                    1575
15                                                    1655
16                                                    15