### Unzip all files (extract every archive into a single folder, `extract_pdf`, then delete all doclist files)


```bash
# Extract every entry found under task3Data/ into the shared extract_pdf/ folder.
# NOTE(review): the doclist clean-up mentioned in the heading is not performed
# by this loop — presumably it is done by hand afterwards; confirm.
for f in task3Data/*
do
    # "$f" is quoted so archive names containing spaces are passed as one argument.
    unzip "$f" -d extract_pdf
    # echo $f
done
```





In [1]:
import os
from tqdm import tqdm
import PyPDF2
import pandas as pd
import re

In [2]:
PDF_FILE_DIR='./extract_pdf'

def parse_by_PyPDF2(pdf_path):
    """Extract the text of every page, plus the first-page link URL, from a PDF.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.

    Returns
    -------
    tuple
        ``(page_txts, page_url)`` where ``page_txts`` is a list holding the
        extracted text of each page in page order, and ``page_url`` is the
        ``/URI`` of the first annotation on the first page, or ``None`` when
        that annotation is missing or cannot be read.

    Notes
    -----
    Uses the legacy PyPDF2 names (``PdfFileReader``, ``numPages``, ``getPage``,
    ``extractText``).
    NOTE(review): these names were removed in PyPDF2 3.x — pin PyPDF2<3 or
    migrate to ``PdfReader`` / ``pages`` / ``extract_text``; confirm the
    installed version.
    """
    # All reads happen inside the `with` block so the file handle is released
    # on return and on error instead of being left open for every parsed PDF.
    with open(pdf_path, mode='rb') as pdf_stream:
        pdf_document = PyPDF2.PdfFileReader(pdf_stream)

        # The URL is best-effort: any failure to walk
        # page 0 -> /Annots[0] -> /A -> /URI simply yields None.
        try:
            first_annotation = pdf_document.getPage(0).getObject()['/Annots'][0]
            page_url = first_annotation.getObject()['/A']['/URI']
        except Exception:
            page_url = None

        page_txts = [
            pdf_document.getPage(page_index).extractText()
            for page_index in range(pdf_document.numPages)
        ]

    return page_txts, page_url


# Maps each PDF file name in PDF_FILE_DIR to the (page_txts, page_url) tuple
# returned by parse_by_PyPDF2.
PDF_PAGES={}
for fname in tqdm(os.listdir(PDF_FILE_DIR),desc='pdf converting'):
    # Only entries with a '.pdf' suffix are parsed; everything else is skipped.
    if fname.endswith('.pdf'):
        PDF_PAGES[fname]=parse_by_PyPDF2(os.path.join(PDF_FILE_DIR,fname))

pdf文件解析中: 100%|██████████| 503/503 [01:11<00:00,  7.03it/s]


In [3]:
# Field-extraction patterns used by extract_info.
# Every pattern is a raw string so that regex escapes (\d, \s, \W, \[, \.) are
# handed to the regex engine untouched instead of being treated as (invalid)
# Python string escapes, which newer interpreters warn about.
# All patterns are case-insensitive and let '.' cross line breaks (re.S).

# Four digits, then lazily up to the next run of 4-5 digits,
# e.g. "2015 Immig. Rptr. LEXIS 29901".
p_title=re.compile(r'\d{4}.*?\d{4,5}',re.IGNORECASE|re.S)
# Month word (optionally dotted), day, optional comma, four-digit year,
# e.g. "APR 22, 2015" or "December 13, 2016".
p_date=re.compile(r'[a-z]+\.* \d{1,2},* \d{4}',re.IGNORECASE|re.S)
# The line that directly follows a "Reporter" line.
p_citation=re.compile(r'Reporter\n(.*?)\n',re.IGNORECASE|re.S)

# A standalone "MATTER OF X-X-X-" line (three single word characters).
p_matterSMP=re.compile(r'\nMATTER OF \w-\w-\w-\n',re.IGNORECASE|re.S)
# "ID# <digits> (AAO ...)": group 1 is the number, group 2 the parenthesised text.
p_number_aao=re.compile(r'ID#\s+(\d+)\s+\((AAO.*?)\)',re.IGNORECASE|re.S)
# Text between "ORDER:" and the first following period.
p_status=re.compile(r'ORDER\s*:(.*?)\.',re.IGNORECASE|re.S)
# "Counsel ... ON BEHALF OF APPLICANT|RESPONDENT: <rest of line>".
p_counsel=re.compile(r'Counsel.*?ON BEHALF OF (APPLICANT|RESPONDENT):\s+([\S ]+)\n',re.IGNORECASE|re.S)
# The word "motion" delimited by non-word characters on both sides.
p_motion=re.compile(r'\Wmotion\W',re.IGNORECASE|re.S)

# Phrases that open the summary; entries are regex fragments
# (",*" in the first one makes the comma optional).
summary_begin=['The Applicant,* a native and citizen',
               'The applicant seeks',
               'The applicant is',
               'The Applicant, an',
               'The Applicant, who',
               'The respondent',
               'the petitioner',
               'The Obligor',
               ]
# From an opening phrase up to the first period that ends a line (or end of text).
p_summary=re.compile(rf"({'|'.join(summary_begin)}).*?(\.\n|$)",re.IGNORECASE|re.S)

# Markers that terminate the "Core Terms" section; entries are regex fragments.
core_terms_end=[
    'opinion',
    'headnotes',
    r'\[',
    'counsel',
]
p_core_terms=re.compile(rf"Core Terms(.*?)({'|'.join(core_terms_end)})",re.IGNORECASE|re.S)


# Template of one output record; extract_info copies it and fills in the values.
BASE_DICT = {
    'title': None,
    'date': None,
    'appealOrMotion':None,
    'core_terms':None,
    'citation': None,
    'S-M-P':None,
    'number': None,
    'aao': None,
    'counsel': None,
    'status': None,
    'summary': None,
}

# Removes page-break markers that the extracted text injects in the middle of
# a value, e.g. "AAO\n [*6] \n Dec. 13, 2016" -> "AAO Dec. 13, 2016".
_P_CLEAN_TEXT=re.compile(r'\n\s+\[\*\d+\]\s+\n',re.IGNORECASE|re.S)


def _first_match(pattern, texts):
    """Return the first match of ``pattern`` over ``texts`` (in order), or None."""
    for text in texts:
        found=pattern.search(text)
        if found:
            return found
    return None


def extract_info(pages):
    """Build one metadata record from the page texts of a single decision.

    Parameters
    ----------
    pages : list of str
        Extracted text of each page, in page order. Must contain at least one
        page; a single-page document is handled.

    Returns
    -------
    dict
        A copy of ``BASE_DICT`` with these keys filled in:

        - ``title``, ``date``, ``citation``, ``core_terms``: taken from the
          first page; ``None`` when the corresponding pattern does not match.
        - ``S-M-P``: ``True`` when the first page has a "MATTER OF X-X-X-" line.
        - ``number``, ``aao``: from the "ID# ... (AAO ...)" line on the last page
          or the one before it, searched only when ``S-M-P`` is true;
          otherwise ``None`` and ``'UNKNOWN'``.
        - ``counsel``: ``'not represented'``, ``'represented'``, the raw matched
          counsel text, or ``'UNKNOWN'`` when no counsel line is found.
        - ``status``: text after "ORDER:" on the last page or the one before it.
        - ``appealOrMotion``: ``'motion'`` or ``'appeal'``.
        - ``summary``: first summary sentence from the first or second page.

    Raises
    ------
    IndexError
        If ``pages`` is empty.
    """
    page_first=pages[0]
    # Closing material is looked up on the last page first, then on the page
    # before it; slicing keeps this safe for single-page documents.
    tail_pages=list(reversed(pages[-2:]))
    # The summary is looked up on the first page, then on the second if present.
    head_pages=pages[:2]

    title_match=p_title.search(page_first)
    title=title_match.group() if title_match else None
    date_match=p_date.search(page_first)
    date=date_match.group() if date_match else None
    citation_match=p_citation.search(page_first)
    citation=citation_match.group(1) if citation_match else None

    # core terms
    core_match=p_core_terms.search(page_first)
    core_terms=core_match.group(1).replace('\n','') if core_match else None

    SMP_FLAG=bool(p_matterSMP.search(page_first))
    number,aao=None,'UNKNOWN'
    if SMP_FLAG:
        id_match=_first_match(p_number_aao,tail_pages)
        if id_match:
            number,aao=id_match.groups()

    counsel_match=p_counsel.search(page_first)
    if counsel_match:
        # Default to the raw matched text; the branches below collapse the
        # recognised cases into a label.
        counsel=counsel_match.group()
        # NOTE(review): presumably matches wording such as "self-represented" /
        # "no representative" — confirm against the source documents.
        if 'REPRESENTATIVE' in counsel.upper():
            counsel='not represented'
        elif 'redacted' in counsel.lower():
            counsel='represented'
        # 9020 special case
        elif 'RESPONDENT' in counsel.upper():
            counsel='represented'
    else:
        counsel='UNKNOWN'

    status_match=_first_match(p_status,tail_pages)
    status=status_match.group(1).strip() if status_match else None

    # Prefer the order text to decide appeal vs. motion; compare in lower case
    # so "Motion"/"MOTION" are recognised like the case-insensitive p_motion.
    A_or_M=None
    if status:
        status_lower=status.lower()
        if 'motion' in status_lower:
            A_or_M='motion'
        elif 'appeal' in status_lower:
            A_or_M='appeal'
    if A_or_M is None:
        # Fall back to whether the first page mentions a motion at all.
        A_or_M='motion' if p_motion.search(page_first) else 'appeal'

    summary_match=_first_match(p_summary,head_pages)
    summary=summary_match.group() if summary_match else None

    # Strip page-break markers from the free-text fields.
    aao=_P_CLEAN_TEXT.sub('',aao) if aao else aao
    summary=_P_CLEAN_TEXT.sub('',summary) if summary else summary
    status=_P_CLEAN_TEXT.sub('',status) if status else status

    THIS_DIC=BASE_DICT.copy()
    THIS_DIC.update({
        'title':title,
        'date':date,
        'appealOrMotion':A_or_M,
        'core_terms':core_terms,
        'citation':citation,
        'number':number,
        'aao':aao,
        'counsel':counsel,
        'status':status,
        'summary':summary,
        'S-M-P':SMP_FLAG,
    })
    return THIS_DIC


In [4]:
import traceback

# Run extract_info over every parsed PDF and collect one record per file.
ALL_DICT=[]
for fname,(page_txts,page_url) in tqdm(PDF_PAGES.items(),desc='matching'):
    try:
        this_dict=extract_info(page_txts)
    except Exception:
        # Report the offending file with its traceback and stop at the first
        # failure so it can be inspected. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit propagate normally.
        print(fname)
        traceback.print_exc()
        break
    this_dict['URL']=page_url
    ALL_DICT.append(this_dict)

# One row per successfully processed file; columns are the record keys plus 'URL'.
df=pd.DataFrame(ALL_DICT)
df

信息匹配解析中: 100%|██████████| 502/502 [00:00<00:00, 2174.67it/s]


Unnamed: 0,title,date,appealOrMotion,core_terms,citation,S-M-P,number,aao,counsel,status,summary,URL
0,2015 Immig. Rptr. LEXIS 29901,"APR 22, 2015",appeal,"redact, traffic, teacher, recruiter, applicant...",2015 Immig. Rptr. LEXIS 29901 *,False,,UNKNOWN,not represented,The appeal will be dismissed,The applicant seeks nonimmigrant classificatio...,https://plus.lexis.com/api/document?collection...
1,2016 Immig. Rptr. LEXIS 7578,"December 13, 2016",appeal,"traffic, nonimmigrant",2016 Immig. Rptr. LEXIS 7578 *,True,62410,"AAO Dec. 13, 2016",UNKNOWN,"The initial decision of the Director, Vermont ...","The Applicant seeks ""T-1"" nonimmigrant classif...",https://plus.lexis.com/api/document?collection...
2,2007 Immig. Rptr. LEXIS 11984,"SEP 28, 2007",appeal,"traffic, redact, severe form, slavery, border,...",2007 Immig. Rptr. LEXIS 11984 *,False,,UNKNOWN,represented,The appeal is dismissed,The applicant is a native and citizen of China...,https://plus.lexis.com/api/document?collection...
3,2005 Immig. Rptr. LEXIS 34475,"JUN 30, 2005",appeal,"redact, traffic, severe form, endorsement, inv...",2005 Immig. Rptr. LEXIS 34475 *,False,,UNKNOWN,UNKNOWN,The decision of the director is withdrawn,The applicant is a native and citizen of Armen...,https://plus.lexis.com/api/document?collection...
4,2019 Immig. Rptr. LEXIS 3800,"FEB 27, 2019",appeal,"redact, traffic, severe form, teacher, visa, s...",2019 Immig. Rptr. LEXIS 3800 *,False,,UNKNOWN,UNKNOWN,The appeal is dismissed,The Applicant seeks T-1 nonimmigrant classific...,https://plus.lexis.com/api/document?collection...
...,...,...,...,...,...,...,...,...,...,...,...,...
497,2014 Immig. Rptr. LEXIS 6910,"August 23, 2014",appeal,"visa, closure, immigrate, removal proceedings,...",2014 Immig. Rptr. LEXIS 6910 *,False,,UNKNOWN,represented,The DHS's appeal is sustained,"the respondent's applications for ""T"" and ""U"" ...",https://plus.lexis.com/api/document?collection...
498,2006 Immig. Rptr. LEXIS 18892,"FEB 16, 2006",appeal,"traffic, severe form, endorsement, involuntary...",2006 Immig. Rptr. LEXIS 18892 *,False,,UNKNOWN,represented,The appeal is dismissed,The applicant is a native and citizen of India...,https://plus.lexis.com/api/document?collection...
499,2006 Immig. Rptr. LEXIS 18267,"MAR 17, 2006",appeal,"traffic, severe form, endorsement, involuntary...",2006 Immig. Rptr. LEXIS 18267 *,False,,UNKNOWN,represented,The appeal is dismissed,The applicant is a native and citizen of India...,https://plus.lexis.com/api/document?collection...
500,2006 Immig. Rptr. LEXIS 18785,"JAN 31, 2006",appeal,"traffic, severe form, endorsement, involuntary...",2006 Immig. Rptr. LEXIS 18785 *,False,,UNKNOWN,represented,The appeal is dismissed,The applicant is a native and citizen of India...,https://plus.lexis.com/api/document?collection...


In [104]:
# Write the result relative to the working directory (like PDF_FILE_DIR)
# instead of the filesystem root, which is normally not writable.
# index=False states explicitly that the row index is not written.
df.to_csv('./tvisa_final.csv',index=False)