In [38]:
import csv
from datetime import datetime, timedelta
import shutil
from dateutil.relativedelta import relativedelta
from dateutil.parser import parse, parser
from pathlib import Path

import os
import pandas as pd
import pdfquery
import re




def is_date_within_a_year(order_date : str):

    prev_year = datetime.now() - relativedelta(years = 1)
    #if (order_date > (order_date - relativedelta(years = 1))): 
    order_date =datetime.strptime(order_date, "%m/%d/%Y")
    if (order_date > prev_year):   
        return True
    else:
        return False


def date_fixer(date_string, format="%m/%d/%Y"):    
    return parse(str(date_string)).strftime(format)
   

def get_date_mmddyyyy(label : pdfquery):
    m = re.search(r"\d\d/\d\d/\d\d\d\d",label)
    try:
        if m:
            return date_fixer(m[0])
    except:
        raise Exception(
                f"🔥 Error getting report date: '{ValueError}' - '{ValueError.text}'"
        )
   
def get_date_bddyyyy(label : pdfquery):

    # m = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-zA-Z.,-]*[\s-]?(\d{1,2})?[,\s-]?[\s]?\d{4}',
    #                                              label,re.I|re.M)
                                                 
    #m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-zA-Z]*[\s]?(\d{1,2})?[,\s]?[\s]?\d{4}',
    #m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[\s]\d{1,2}?[,\s]?[\s]\d{4}',label)

    # m = re.search(r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)*[\s](\d{1,2})?[,\s](\d{4})",label)

    m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[\s]\d{1,2}?[,\s]?[\s]\d{4}', label)
    try:
        if m is not None:
            #strdate =  datetime.strptime(m[0], '%b %d, %Y').strftime('%m/%d/%Y')
            sdate =  datetime.strptime(m[0], '%b %d, %Y')
        else:
            # Check for this date format  dd mon yyyy
            m = re.search(r'\d{1,2}?[\s](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[\s]\d{4}', label)
            #strdate =  datetime.strptime(m[0], '%d %b %Y').strftime('%m/%d/%Y') 
            sdate =  datetime.strptime(m[0], '%d %b %Y')
    except:
        raise Exception(
                f"🔥 Error getting report date: '{ValueError}' - '{ValueError.text}'"
        )  

    return date_fixer(sdate)

def report_completeddate(filepath: Path):
    pdf = pdfquery.PDFQuery(filepath)
    pdf.load()
    datelabel = pdf.pq('LTTextLineHorizontal:contains("Report completed on")')
    pdf.file.close()
    if (datelabel.attr('x0')) is not None:
        left_corner = float(datelabel.attr('x0'))
        bottom_corner = float(datelabel.attr('y0'))
        x1_corner = float(datelabel.attr('x1'))
        y1_corner = float(datelabel.attr('y1'))
        ldate = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner, bottom_corner, x1_corner, y1_corner)).text()
        report_date = get_date_bddyyyy(ldate)
        return report_date
    else:
        return ''
    
def report_orderdate(filepath: Path):
    pdf = pdfquery.PDFQuery(filepath)
    pdf.load()
    datelabel = pdf.pq('LTTextLineHorizontal:contains("Order date")')
    pdf.file.close()
    if (datelabel.attr('x0')) is not None:
        left_corner = float(datelabel.attr('x0'))
        bottom_corner = float(datelabel.attr('y0'))
        x1_corner = float(datelabel.attr('x1'))
        y1_corner = float(datelabel.attr('y1'))
        ldate = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner, bottom_corner, x1_corner, y1_corner)).text()
        report_date = get_date_mmddyyyy(ldate)   #different format than in report_completeddate
        return report_date
    else:
        return ''
 
def find_violations(filepath: Path):
    pdf = pdfquery.PDFQuery(filepath)
    pdf.load()
    label = pdf.pq('LTTextLineHorizontal:contains("There are no violations in the report.")')
    pdf.file.close()
    if (label.attr('x0')) is not None:
        return  'Pass'
    else:
        return 'Fail'

def find_status(filepath: Path):
    pdf = pdfquery.PDFQuery(filepath)
    pdf.load()
    label = pdf.pq('LTTextLineHorizontal:contains("Status")') 
    pdf.file.close()
    if (label.attr('x0')) is not None:
        left_corner = float(label.attr('x0'))
        bottom_corner = float(label.attr('y0'))
        status = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner, bottom_corner-30, left_corner+150, bottom_corner)).text()
        if (re.search('Clear', status)):
            return 'Pass'    
        else:
            return 'Fail'
    else:
        return 'Fail'
        
def find_pattern(filename: str):
    print(filename)
    m = re.search('_(BGC|MVR|DMV).pdf$', filename)
    #m = re.search('_(BGC|MVR|DMV).pdf[\S]*', filename)
    #m = re.search('_[\S]*', filename)
    if (m is not None):
        return m.group()
    return None

def write_csv(results: list[dict]):
    localpath = Path(r"C:\Projects\Pdfanalyzer")
    with open(localpath / 'drivers.csv', 'w', newline ='') as csv_file:  
        writer = csv.DictWriter(csv_file, fieldnames = ['Ftpfile', 'Localfile', 'Status', 'Renamedfile', 'Reportdate', 'Action'] ) 
        writer.writeheader()
        for row in results:
            writer.writerow(row)

def rename_file(results: list[dict]):

    for row in results:
        sourcefile = Path(row["Localfile"])
        targetfile = Path(row["Renamedfile"])
        print(f"🚗 {sourcefile} to {targetfile}")
        shutil.move(sourcefile, targetfile)
        row.update(Action='Renamed')
        print(f"🚗 {row}")
   
    return results

def download_files():
    files = []
    localpath = Path(r"C:\Projects\Pdfanalyzer")
    for ftpfile in  os.listdir(localpath):
        files.append(ftpfile)
    print(files)
    return files

def analyze_file(file_list: list[str]):    

    results = []
    filecount = 0
    trace = datetime.now()
    localpath = Path(r"C:\Projects\Pdfanalyzer")
    for ftpfile in  file_list:
        match_filename = find_pattern(ftpfile)
    
        if match_filename is not None:
            if ftpfile.endswith(match_filename):
                localfile = localpath / Path(ftpfile).name
            
                #get_reported_date(localfile)
                if match_filename in ['_BGC.pdf','_MVR.pdf']:
                    status = find_status(localfile)
                    reportdate = report_completeddate(localfile)
                else:
                    if match_filename in ['_DMV.pdf']:
                        status = find_violations(localfile)
                        reportdate = report_completeddate(localfile)
                    # else:
                    #     print('Not DMV, BGC or MVR file')
                    #     status = 'Not DMV, BGC or MVR file'
                if status in 'Pass':
                    pass_filename  = match_filename.split('.')[0]+ "_Pass" + ".pdf"

                    renamedfile = re.sub(match_filename,pass_filename,str(localfile),flags=re.IGNORECASE)    
                    results.append({"Ftpfile": ftpfile, "Localfile": str(localfile),"Status": "Pass", "Renamedfile": renamedfile, "Reportdate": reportdate, "Action": 'Validated'})
                else:
                    fail_filename  = match_filename.split('.')[0]+ "_Fail" + ".pdf"
                    renamedfile = re.sub(match_filename,fail_filename,str(localfile),flags=re.IGNORECASE)  
                    results.append({"Ftpfile": ftpfile, "Localfile": str(localfile), "Status": "Fail", "Renamedfile": renamedfile, "Reportdate": reportdate, "Action": 'Validated'})
                print(f"{filecount}) {results[filecount]}")
                filecount +=1          
    return results
   
write_csv(rename_file(analyze_file(download_files())))
  

['01072388_MVR.pdf', '035B041008_BGC.pdf', '108D355002_BGC.pdf', '108D355002_DL.jpg', '108D355002_DMV.pdf', '108D355002_secondaryDMV.pdf', '1204667_MVR.pdf', '134228_BGC.pdf', '134228_MVR.pdf', '1954237_ASEInspection.jpg', '1954237_BGC.pdf', '1954237_DL.jpg', '1954237_DMV.pdf', '215ZZ8790_BGC.pdf', '4362577_BGC.pdf', 'drivers.csv', 'Original', 'Redacted TNC Good_DMV.pdf']
01072388_MVR.pdf
0) {'Ftpfile': '01072388_MVR.pdf', 'Localfile': 'C:\\Projects\\Pdfanalyzer\\01072388_MVR.pdf', 'Status': 'Pass', 'Renamedfile': 'C:\\Projects\\Pdfanalyzer\\01072388_MVR_Pass.pdf', 'Reportdate': '03/22/2023', 'Action': 'Validated'}
035B041008_BGC.pdf
1) {'Ftpfile': '035B041008_BGC.pdf', 'Localfile': 'C:\\Projects\\Pdfanalyzer\\035B041008_BGC.pdf', 'Status': 'Pass', 'Renamedfile': 'C:\\Projects\\Pdfanalyzer\\035B041008_BGC_Pass.pdf', 'Reportdate': '09/20/2022', 'Action': 'Validated'}
108D355002_BGC.pdf
2) {'Ftpfile': '108D355002_BGC.pdf', 'Localfile': 'C:\\Projects\\Pdfanalyzer\\108D355002_BGC.pdf', 'St

AttributeError: type object 'ValueError' has no attribute 'text'