In [1]:
import pandas as pd
import fitz #PyMuPDF
import re

from pdfminer.high_level import extract_text

In [2]:
test_file = '/Users/johnathonsmith/Downloads/400TH PENDING CIVIL.pdf'

In [3]:
#open the pdf file
document = fitz.open(test_file)

In [4]:
page = document.load_page(0)

In [5]:
text = page.get_text('text')
text

'Case Number\nLocation\nJudicial Officer\nCase\nCategory\nCase Type\nStyle\nCase Status\nCase Subtype\n06-DCV-151861\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCivil\nContract -\nConsumer/Commercial/Debt\nHealix Infusion Therapy, Inc. Plaintiff V. Associates of\nPulmonary & Critical Care Medicine, P.A. and Alan R.\nVarraux, M.D.\nRe-Opened\n18-DCV-254831\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCivil\nInjury or Damage - Other Injury\nor Damage\nDarrell Hall Vs. Texas Department of Criminal Justice\nRe-Opened\n18-DCV-255138\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCivil\nInjury or Damage Involving Motor\nVehicle\nSusana Tena Individually and as Next Friend of Minor vs\nGerald Murray\nRe-Opened\n18-DCV-255138A\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCivil\nInjury or Damage Involving Motor\nVehicle\nSusana Tena Individually and as Next Friend of Lilyann\nRodriguez, Minor v. Gerald Murray\nActive\n18-DCV-256820\n400th Judicial\nDistrict Court\

In [10]:
#Cut off the end of the page after the last case's info
text = text[:text.find('\nCase Filed Date\nDate Range Type:\n')]
text

'Case Number\nLocation\nJudicial Officer\nCase\nCategory\nCase Type\nStyle\nCase Status\nCase Subtype\n06-DCV-151861\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCivil\nContract -\nConsumer/Commercial/Debt\nHealix Infusion Therapy, Inc. Plaintiff V. Associates of\nPulmonary & Critical Care Medicine, P.A. and Alan R.\nVarraux, M.D.\nRe-Opened\n18-DCV-254831\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCivil\nInjury or Damage - Other Injury\nor Damage\nDarrell Hall Vs. Texas Department of Criminal Justice\nRe-Opened\n18-DCV-255138\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCivil\nInjury or Damage Involving Motor\nVehicle\nSusana Tena Individually and as Next Friend of Minor vs\nGerald Murray\nRe-Opened\n18-DCV-255138A\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCivil\nInjury or Damage Involving Motor\nVehicle\nSusana Tena Individually and as Next Friend of Lilyann\nRodriguez, Minor v. Gerald Murray\nActive\n18-DCV-256820\n400th Judicial\nDistrict Court\

In [11]:
#Find all cause numbers
cause_nums = re.findall(r'[0-9]{2}-[DCVR]{3}-[0-9A-Z]*', text)
cause_nums

['06-DCV-151861',
 '18-DCV-254831',
 '18-DCV-255138',
 '18-DCV-255138A',
 '18-DCV-256820',
 '18-DCV-258117',
 '19-DCV-264381',
 '20-DCV-272533',
 '20-DCV-273051']

In [28]:
case = text[text.find(cause_nums[0]):text.find(cause_nums[1])]
case = case[case.find('\nCivil'):].strip()
case = case[case.find('\n'):].strip()
case

'Contract -\nConsumer/Commercial/Debt\nHealix Infusion Therapy, Inc. Plaintiff V. Associates of\nPulmonary & Critical Care Medicine, P.A. and Alan R.\nVarraux, M.D.\nRe-Opened'

In [12]:
num_cases = len(cause_nums)
num_cases

9

In [34]:
#Create empty lists to store info
case_types = []
styles = []
statuses = []

single_line_case_types = [
    'Other Civil',
    'Other Tax',
    'Expunction',
    'Judgment NISI',
    'Contract - Other Contract',
    'Seizure and Forfeiture',
    'Non-Disclosure',
    'Sexual Predator'
]

In [35]:
#Now loop through each cases info by segmenting the text with the locations of each cause num
for i, cause_num in enumerate(cause_nums):
    #Check if this is the last case in the list
    if i == len(cause_nums) - 1:
        case_info = text[text.find(cause_num):].strip()
    else:
        case_info = text[text.find(cause_num):text.find(cause_nums[i+1])].strip()
    
    #We already have cause numbers
    #Location, Officer, and Category are all static in this report
    #So those will be hardcoded and added later
    case_info = case_info[case_info.find('\nCivil'):].strip()
    case_info = case_info[case_info.find('\n'):].strip()
    
    #We need to gather the case type, style, and status
    #Status is everything from the last '\n' to the end of the string
    statuses.append(case_info[case_info.rfind('\n'):].strip())
    
    #Now remove that portion of the string from case_info
    case_info = case_info[:case_info.rfind('\n')]
    
    #If the string up to the first '\n' matches any of the single line case types, then that is the entire case type
    #If it does not, then the case type is comprised of the strings from the first TWO '\n' characters
    case_type = case_info[:case_info.find('\n')]
    case_info = case_info[case_info.find('\n') + 1:]
    
    if case_type in single_line_case_types:
        case_types.append(case_type.strip())
    else:
        case_types.append(case_type + case_info[:case_info.find('\n')].strip())
        case_info = case_info[case_info.find('\n') + 1:]
    
    #At this point, the rest of the string should be the style
    styles.append(case_info.strip())

In [36]:
case_types

['Contract -Consumer/Commercial/Debt',
 'Injury or Damage - Other Injuryor Damage',
 'Injury or Damage Involving MotorVehicle',
 'Injury or Damage Involving MotorVehicle',
 'Other Civil',
 'Other Civil',
 'Injury or Damage Involving MotorVehicle',
 'Other Tax',
 'Injury or Damage Involving MotorVehicle']

In [37]:
statuses

['Re-Opened',
 'Re-Opened',
 'Re-Opened',
 'Active',
 'Pending',
 'Pending',
 'Re-Opened',
 'Pending',
 'Re-Opened']

In [38]:
styles

['Healix Infusion Therapy, Inc. Plaintiff V. Associates of\nPulmonary & Critical Care Medicine, P.A. and Alan R.\nVarraux, M.D.',
 'Darrell Hall Vs. Texas Department of Criminal Justice',
 'Susana Tena Individually and as Next Friend of Minor vs\nGerald Murray',
 'Susana Tena Individually and as Next Friend of Lilyann\nRodriguez, Minor v. Gerald Murray',
 'Momentum Project Controls, LLC d/b/a Momentum\nGeneral Contractors v. Booflies to Breefras LLC, D/B/A\nKiddie Academy, Ian Baierlipp, Individually and Teri\nBaierlipp, Individually',
 'Douglas Duncan Lewis and Elizabeth Pamela Lewis vs\nDimension Homes, Inc. and Jeffrey Dziuk',
 'Habtemariam Kubrom, Individually and A/N/F of Minor\nChildren T.K and S.K vs Mindy Gail Gilbert',
 'Fort Bend County vs Destiny Faith Johnson, Et Al',
 'Eucharia Nwabufo V. Holly Marie Arensman']

In [75]:
def parse_civil_report(document):
    """
    This function takes in the text of the PDF report and generates a df.
    The df will still need to be cleaned up, and other static columns added.
    """
    #Set up single line case types
    single_line_case_types = [
        'Other Civil',
        'Other Tax',
        'Expunction',
        'Judgment NISI',
        'Contract - Other Contract',
        'Seizure and Forfeiture',
        'Non-Disclosure',
        'Sexual Predator',
        'Filing of Fraudulent Lien - Civil'
    ]
    
    #Create an empty df. We will use this one to concat each page's dataframe to
    df = pd.DataFrame()
    
    for page_num, page in enumerate(document):
        #Plus one because page_num starts at zero
        page_num = page_num + 1
        
        #Create empty lists to store info
        case_types = []
        styles = []
        statuses = []
        
        #Rest my_dict
        my_dict = {}
        
        #Get page text
        text = page.get_text('text')
        
        #Cut off the end of the page after the last case's info
        #If last page, remove Total Report Count as well
        if page_num == document.page_count:
            text = text[:text.find('\nTotal Report Count\nCase Filed Date\nDate Range Type:\n')]
            text = text[:text.rfind('\n')]
        else:
            text = text[:text.find('\nCase Filed Date\nDate Range Type:\n')]
        
        #Find all cause numbers on this page
        cause_nums = re.findall(r'[0-9]{2}-[DCVR]{3}-[0-9A-Z]*', text)
        
        #Now loop through each case's info by segmenting the text with the locations of each cause num
        for i, cause_num in enumerate(cause_nums):
            #Check if this is the last case in the list
            if i == len(cause_nums) - 1:
                case_info = text[text.find(cause_num):].strip()
            else:
                case_info = text[text.find(cause_num):text.find(cause_nums[i+1])].strip()

            #We already have cause numbers
            #Location, Officer, and Category are all static in this report
            #So those will be hardcoded and added later
            case_info = case_info[case_info.find('\nCivil'):].strip()
            case_info = case_info[case_info.find('\n'):].strip()

            #We need to gather the case type, style, and status
            #Status is everything from the last '\n' to the end of the string
            statuses.append(case_info[case_info.rfind('\n'):].strip())

            #Now remove that portion of the string from case_info
            case_info = case_info[:case_info.rfind('\n')]

            #If the string up to the first '\n' matches any of the single line case types, then that is the entire case type
            #If it does not, then the case type is comprised of the strings from the first TWO '\n' characters
            case_type = case_info[:case_info.find('\n')]
            case_info = case_info[case_info.find('\n') + 1:]

            if case_type in single_line_case_types:
                case_types.append(case_type.strip())
            else:
                case_types.append(case_type + ' ' + case_info[:case_info.find('\n')].strip())
                case_info = case_info[case_info.find('\n') + 1:]

            #At this point, the rest of the string should be the style
            styles.append(case_info.strip())
            
        #For error checking, verify that each list is the same length. They should all have the same number of values
        num_cases = len(cause_nums)

        if num_cases != len(case_types):
            print(f"Error on Page {page_num}!")
            print('Num Cases Does Not Equal Number of Case Type Values')
            break

        if num_cases != len(styles):
            print(f"Error on Page {page_num}!")
            print('Num Cases Does Not Equal Number of Style Values')
            break

        if num_cases != len(statuses):
            print(f"Error on Page {page_num}!")
            print('Num Cases Does Not Equal Number of Status Values')
            break
        
        #Create a dictionary. Make the keys the names of the columns, and the values the lists.
        my_dict = {
            'Cause Number': cause_nums,
            'Case Type': case_types,
            'Style': styles,
            'Status': statuses
        }
        
        page_df = pd.DataFrame(my_dict)
        
        #Once we have the current page's dataframe, add it to the overall dataframe
        df = pd.concat([df,page_df], ignore_index = True)
        
    return df

In [76]:
df = parse_civil_report(document)
df

Unnamed: 0,Cause Number,Case Type,Style,Status
0,06-DCV-151861,Contract - Consumer/Commercial/Debt,"Healix Infusion Therapy, Inc. Plaintiff V. Ass...",Re-Opened
1,18-DCV-254831,Injury or Damage - Other Injury or Damage,Darrell Hall Vs. Texas Department of Criminal ...,Re-Opened
2,18-DCV-255138,Injury or Damage Involving Motor Vehicle,Susana Tena Individually and as Next Friend of...,Re-Opened
3,18-DCV-255138A,Injury or Damage Involving Motor Vehicle,Susana Tena Individually and as Next Friend of...,Active
4,18-DCV-256820,Other Civil,"Momentum Project Controls, LLC d/b/a Momentum\...",Pending
...,...,...,...,...
1465,25-DCV-336453,Injury or Damage Involving Motor Vehicle,"Donald Mcdaniel. Sylvia Idrogo, and Cynthia De...",Pending
1466,25-DCV-336455,Contract - Other Contract,Jimmy Hernandez and Hanna Nguyen V. State Farm...,Pending
1467,25-DCV-336456,Other Tax,Fort Bend Independent School District Vs. John...,Pending
1468,25-DCV-336479,Contract - Consumer/Commercial/Debt,JWB Architects Vs C3 Wellness Texas LLC,Pending


In [78]:
df['Case Type'].unique()

array(['Contract - Consumer/Commercial/Debt',
       'Injury or Damage - Other Injury or Damage',
       'Injury or Damage Involving Motor Vehicle', 'Other Civil',
       'Other Tax', 'Injury or Damage Medical Malpractice',
       'Real Property - Other Real Property', 'Contract - Other Contract',
       'Seizure and Forfeiture', 'Judgment NISI',
       'Filing of Fraudulent Lien - Civil',
       'Injury or Damage Involving Motor Vehicl', 'Expunction',
       'Injury or Damage Other Product Liability', 'Non-Disclosure',
       'Sexual Predator'], dtype=object)

In [79]:
df['Officer'] = 'Krenek, Edward M.'
df['Category'] = 'Civil'
df['Location'] = '400th District Judicial Court'

In [81]:
df = df[[
    'Cause Number',
    'Location',
    'Officer',
    'Category',
    'Case Type',
    'Style',
    'Status'
]]

In [82]:
df

Unnamed: 0,Cause Number,Location,Officer,Category,Case Type,Style,Status
0,06-DCV-151861,400th District Judicial Court,"Krenek, Edward M.",Civil,Contract - Consumer/Commercial/Debt,"Healix Infusion Therapy, Inc. Plaintiff V. Ass...",Re-Opened
1,18-DCV-254831,400th District Judicial Court,"Krenek, Edward M.",Civil,Injury or Damage - Other Injury or Damage,Darrell Hall Vs. Texas Department of Criminal ...,Re-Opened
2,18-DCV-255138,400th District Judicial Court,"Krenek, Edward M.",Civil,Injury or Damage Involving Motor Vehicle,Susana Tena Individually and as Next Friend of...,Re-Opened
3,18-DCV-255138A,400th District Judicial Court,"Krenek, Edward M.",Civil,Injury or Damage Involving Motor Vehicle,Susana Tena Individually and as Next Friend of...,Active
4,18-DCV-256820,400th District Judicial Court,"Krenek, Edward M.",Civil,Other Civil,"Momentum Project Controls, LLC d/b/a Momentum\...",Pending
...,...,...,...,...,...,...,...
1465,25-DCV-336453,400th District Judicial Court,"Krenek, Edward M.",Civil,Injury or Damage Involving Motor Vehicle,"Donald Mcdaniel. Sylvia Idrogo, and Cynthia De...",Pending
1466,25-DCV-336455,400th District Judicial Court,"Krenek, Edward M.",Civil,Contract - Other Contract,Jimmy Hernandez and Hanna Nguyen V. State Farm...,Pending
1467,25-DCV-336456,400th District Judicial Court,"Krenek, Edward M.",Civil,Other Tax,Fort Bend Independent School District Vs. John...,Pending
1468,25-DCV-336479,400th District Judicial Court,"Krenek, Edward M.",Civil,Contract - Consumer/Commercial/Debt,JWB Architects Vs C3 Wellness Texas LLC,Pending


In [83]:
df['Style'] = df['Style'].str.replace('\n', ' ')

In [86]:
df

Unnamed: 0,Cause Number,Location,Officer,Category,Case Type,Style,Status
0,06-DCV-151861,400th District Judicial Court,"Krenek, Edward M.",Civil,Contract - Consumer/Commercial/Debt,"Healix Infusion Therapy, Inc. Plaintiff V. Ass...",Re-Opened
1,18-DCV-254831,400th District Judicial Court,"Krenek, Edward M.",Civil,Injury or Damage - Other Injury or Damage,Darrell Hall Vs. Texas Department of Criminal ...,Re-Opened
2,18-DCV-255138,400th District Judicial Court,"Krenek, Edward M.",Civil,Injury or Damage Involving Motor Vehicle,Susana Tena Individually and as Next Friend of...,Re-Opened
3,18-DCV-255138A,400th District Judicial Court,"Krenek, Edward M.",Civil,Injury or Damage Involving Motor Vehicle,Susana Tena Individually and as Next Friend of...,Active
4,18-DCV-256820,400th District Judicial Court,"Krenek, Edward M.",Civil,Other Civil,"Momentum Project Controls, LLC d/b/a Momentum ...",Pending
...,...,...,...,...,...,...,...
1465,25-DCV-336453,400th District Judicial Court,"Krenek, Edward M.",Civil,Injury or Damage Involving Motor Vehicle,"Donald Mcdaniel. Sylvia Idrogo, and Cynthia De...",Pending
1466,25-DCV-336455,400th District Judicial Court,"Krenek, Edward M.",Civil,Contract - Other Contract,Jimmy Hernandez and Hanna Nguyen V. State Farm...,Pending
1467,25-DCV-336456,400th District Judicial Court,"Krenek, Edward M.",Civil,Other Tax,Fort Bend Independent School District Vs. John...,Pending
1468,25-DCV-336479,400th District Judicial Court,"Krenek, Edward M.",Civil,Contract - Consumer/Commercial/Debt,JWB Architects Vs C3 Wellness Texas LLC,Pending


In [88]:
df.to_csv('/Users/johnathonsmith/Downloads/400th_pending_civil_report.csv', index=False)