In [1]:
import cv2
import pytesseract
import os
from os import walk
import pandas as pd
import re
from PIL import Image
from PIL.ExifTags import GPSTAGS, TAGS
import numpy as np
import pygsheets


# Diagnostics switch: when truthy, later cells print/display their intermediate results (set to 1 to enable)
displayDebugInfo = 0
# Export switch: when truthy, the final cell writes the resulting dataframe to Sheets (set to 1 to enable)
exportFlag = 0

In [2]:
# STEP ONE: collect the screenshot file names from the target directory

# Screenshots are expected in a "ScreenShots" folder under the current working directory
dir1 = os.getcwd() + "/ScreenShots"
# walk() yields (current path, sub-directories, files) tuples, so element [2] of the first
# tuple is the file list. The default handed to next() supplies an empty file list when
# walk() yields nothing at all.
filenames = sorted(next(walk(dir1), (None, None, []))[2])

In [3]:
#STEP TWO, extract EXIF data, create df of file name and time stamp

# Each timestamp is stored together with the file it was read from. Collecting the
# timestamps in their own list and zipping them with `filenames` afterwards would shift
# every later timestamp onto the wrong file as soon as one file had no EXIF data, had no
# DateTimeOriginal tag, or could not be opened.
dfTuples = []
for file in filenames:
    path = dir1 + "/" + file
    try:
        # The context manager releases the file handle once the metadata has been read
        with Image.open(path) as image:
            # Not every Pillow image class is guaranteed to offer _getexif(); a missing
            # method is treated the same as missing EXIF data
            readExif = getattr(image, "_getexif", None)
            exifData = readExif() if readExif is not None else None
    except IOError:
        print("File format not supported!", path)
        continue

    if exifData is None:
        print(f"{path} contains no exif data.")
        continue

    for tag, value in exifData.items():
        if TAGS.get(tag) == "DateTimeOriginal":
            dfTuples.append((file, value))
            break

# Files without a usable timestamp are simply absent from timeDF
timestampList = [timestamp for _, timestamp in dfTuples]
timeDF = pd.DataFrame(dfTuples, columns=["FileName", "Time"])


# DEBUG #
if displayDebugInfo:
    display(timeDF)

In [4]:
#STEP THREE : 
### For each file, read text from file
### Establish if image represents a delivery offer or completed delivery
### Strips large amount of unwanted text (input screenshots often include image of map with street names, etc)
### Add text as list, to list of values representing offers OR completions
### Add file name to offer or completion list depending on contents
### Return 4 lists: offer text, completion text, offer files, completion files

def testFunc(files_to_process_list):
    theList = []
    completionList = []
    offerFiles = []
    compFiles = []
    for file in files_to_process_list:
        #file to image
        img = cv2.imread(dir1 + "/" + file)
        #image to text
        data = pytesseract.image_to_string(img)
        
        # There are, to date, three classes of images.
        # One for orders through Marketplace, one for orders through Drive, and one for completions
        # Marketplace is in app, Drive is the white label solution
        # "Completed" should only appear in images of completed deliveries
        # "Guaranteed" should only appear in images from Marketplace offers
        # "items" should only appear in images from Drive offers 
        
        #MARKETPLACE DELIVERY OFFER
        if("Guaranteed" in data):
            # Add file to offer list
            # If $ found, strip text from before $, else store all text
            offerFiles.append(file)
            if("$" in data):
                position = data.index("$")
                textsplit = data[position:].splitlines()
                while ("" in textsplit):
                    textsplit.remove("")
                theList.append(textsplit)
            else:
                data = data.splitlines()
                theList.append(data)
        #DELIVERY COMPLETIONS
        elif("Completed" in data):
            # For these images, extraction works better when filters are applied, so they are reread with filters
            compFiles.append(file)
            img = cv2.imread(dir1 + "/" + file)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
            data = pytesseract.image_to_string(thresh, lang='eng',config='--psm 6')

            # The below section works as before to exclude anything read from the map
            if("$" in data):
                position = data.index("$")
                textsplit = data[position:].splitlines()
                while("" in textsplit):
                    textsplit.remove("")
                completionList.append(textsplit)
            else:
                print("Error in", file, "$ not found")
                data = data.splitlines()
                completionList.append(data)
        #DRIVE DELIVERY OFFER
        elif("item" in data):
            # The interface has a slightly different format, so split the text into list of lines and process later
            offerFiles.append(file)
            data = data.splitlines()
            theList.append(data)                   
        else:
            print(file)
            print("ERROR WITH THIS FILE")
    return theList, completionList, offerFiles, compFiles

# Run the OCR pass once, then give each of the four returned lists its own name
x = testFunc(filenames)
offersList, completionList, offerFiles, compFiles = x

# DEBUG #
if displayDebugInfo:
    for label, payload in (
        ("Offer Files: ", offerFiles),
        ("Completed Files: ", compFiles),
        ("Text from offers: ", offersList),
        ("Text from completions: ", completionList),
    ):
        print(label, payload)

In [5]:
# STEP 4: OFFER PROCESSING

In [6]:
# STEP 4.1: trim each offer's OCR lines down to the part that is processed later
# - Marketplace offers: keep the lines from "Guaranteed" through "Customer dropoff"
# - Drive offers (a line containing "item"): pass all lines along unchanged
# Offers that fit neither pattern are collected in error_list for examination

offer_value_list = []
error_list = []
for offerLines in offersList:
    startVal = endVal = 0
    for position, line in enumerate(offerLines):
        # Drive order: keep everything and stop scanning
        if "item" in line:
            offer_value_list.append(offerLines)
            break

        # Marketplace order: remember where the interesting span starts and ends
        if "Guaranteed" in line:
            startVal = position
        if "Customer dropoff" in line:
            endVal = position
        # A non-zero end position means the closing marker has been seen
        if endVal != 0:
            offer_value_list.append(offerLines[startVal: endVal+1])
            break
    else:
        # The scan ran to the end without a break: not a Drive order and no closing marker
        scanned = len(offerLines)
        print("Error found, printing diagnostics and updating error_list", scanned, scanned, startVal, endVal)
        error_list.append(offerLines)

# DEBUG #
if displayDebugInfo:
    print("Offer Values List: ", offer_value_list)
    print("Error List: ", error_list)

In [7]:
# STEP 4.2: move the strings that hold the relevant values into the leading columns

offerProcessDF = pd.DataFrame(offer_value_list)
offerProcessDF.fillna("", inplace=True)

# Walk the frame column by column. A cell containing "DoorDash pay" is the anchor:
# the values read 2, 4 and 6 columns to its left (same row) are the offer amount,
# the mileage text and the restaurant name, which are copied into columns 0, 1 and 4.
for anchorCol in offerProcessDF:
    for anchorRow, cellText in offerProcessDF[anchorCol].items():
        if "DoorDash pay" not in cellText:
            continue
        offerVal, mileVal, restVal = (
            offerProcessDF[anchorCol - shift][anchorRow] for shift in (2, 4, 6)
        )
        offerProcessDF.at[anchorRow, 0] = offerVal + " Guaranteed"
        offerProcessDF.at[anchorRow, 1] = mileVal
        offerProcessDF.at[anchorRow, 3] = "Pickup"
        offerProcessDF.at[anchorRow, 4] = restVal

# DEBUG #
if displayDebugInfo:
    display(offerProcessDF)

In [8]:
# Step 4.3: reduce the string in the first column to just its dollar amount ("$#.##" format)
def getDollarString(df):
    """Reduce each string in column 0 of ``df`` to its dollar amount, in place.

    For every cell of column 0 that contains "$", spaces and "+" are removed and the
    cell is replaced by the text from the "$" up to (not including) "Guaranteed".
    When "Guaranteed" does not follow the "$", everything from the "$" onwards is kept
    instead of raising. Cells without "$" are left untouched.

    Rows are addressed by their index label, so frames whose index is not 0..n-1 are
    updated correctly rather than having new rows added.

    Returns None; ``df`` is modified directly.
    """
    # Snapshot the (label, value) pairs first because the column is rewritten while looping
    for label, text in list(df[0].items()):
        if "$" not in text:
            continue
        compact = text.replace(" ", "").replace("+", "")
        start = compact.index("$")
        end = compact.find("Guaranteed", start)
        df.at[label, 0] = compact[start:end] if end != -1 else compact[start:]
# Rewrites column 0 of offerProcessDF directly; the function returns nothing
getDollarString(offerProcessDF)

# DEBUG #
if displayDebugInfo:
    display(offerProcessDF)

In [9]:
# Step 4.4: Extract mileage value and stack status into two new columns

# NOTE: This checks for "mi" as a substring, but technically, it can show up as yards


# At this point, one of two columns will have the mileage information, so they are combined and the value extracted
# If the offer is part of a staggered stack, it is flagged in a new column with a 1
def mi2fl(df, columns):
    """Add "Mileage" and "Stacked Order" columns to ``df``, in place.

    The two columns named in ``columns`` are joined with a space; one of them is
    expected to hold the mileage text. For each row:

    * if the joined text contains "mi", the word directly before "mi" is converted to
      a float and stored in "Mileage";
    * if that text also contains "Additional", "Stacked Order" is set to 1;
    * if there is no "mi", no word before it, or the word is not a number, "Mileage"
      is set to the string "ERROR" instead of raising.

    Rows are addressed by index label, so the frame's index does not have to be 0..n-1.

    Returns None; ``df`` is modified directly.
    """
    # object dtype lets floats and the "ERROR" marker share one column
    df["Mileage"] = (df[columns[0]] + " " + df[columns[1]]).astype(object)
    df["Stacked Order"] = 0
    # Snapshot the (label, value) pairs because the column is rewritten while looping
    for label, text in list(df["Mileage"].items()):
        mileage = "ERROR"
        if "mi" in text:
            # REGEX: \S+ is a run of non-space characters, \s+ is whitespace,
            # so this finds one 'word' appearing directly before 'mi'
            match = re.search(r'(\S+\s+){1}(?=mi)', text)
            if match is not None:
                try:
                    mileage = float(match.group(0))
                except ValueError:
                    # The word before "mi" was not a number
                    mileage = "ERROR"
            if "Additional" in text:
                df.at[label, "Stacked Order"] = 1
        df.at[label, "Mileage"] = mileage
# Columns 1 and 2 are the two candidates that can hold the mileage text
mi2fl(offerProcessDF, [1,2])


# DEBUG #
if displayDebugInfo:
    display(offerProcessDF)

In [10]:
# STEP 4.5: Extract one or two restaurant names from each row, store values in new columns

# The main idea here, is to pull the restaurant name(s) from a range of four possible columns
### A "natural stack" offer will have Pickup appear twice

def getRestNames(df):
    """Add "RestNames" and "Rest 2" columns holding each row's restaurant name(s).

    "Pickup" is looked for in column 3 first, then in column 4. Where it is found at
    column c, the first restaurant name is column c+1 with any "| " removed. If column
    c+2 also contains "Pickup" (a natural stack), the second name is column c+3;
    otherwise the second name is an empty string. Rows with no "Pickup" in either
    column print "Error detected" and get "ERROR" for both names.

    Returns None; ``df`` is modified directly.
    """
    first_names = []
    second_names = []
    for _, record in df.iterrows():
        for pickup_col in (3, 4):
            if "Pickup" in record[pickup_col]:
                first_names.append(record[pickup_col + 1].replace("| ", ""))
                has_second = "Pickup" in record[pickup_col + 2]
                second_names.append(record[pickup_col + 3] if has_second else "")
                break
        else:
            # Neither candidate column held "Pickup"
            print("Error detected")
            first_names.append("ERROR")
            second_names.append("ERROR")
    df["RestNames"] = first_names
    df["Rest 2"] = second_names
# Adds the "RestNames" and "Rest 2" columns to offerProcessDF; prints "Error detected" for rows it cannot parse
getRestNames(offerProcessDF)

# DEBUG #
if displayDebugInfo:
    display(offerProcessDF)

Error detected


In [11]:
# STEP 4.6: keep only the useful columns in a compact dataframe
# Column 0 holds the dollar string by now, so give it a readable name first
offerProcessDF = offerProcessDF.rename(columns={0: "Dollars"})
offerDF = offerProcessDF.loc[:, ["Dollars", "Mileage", "RestNames", "Rest 2", "Stacked Order"]].copy()

# DEBUG #
if displayDebugInfo:
    display(offerDF)

In [12]:
# STEP 5: Delivery Completions Processing

In [13]:
# STEP 5.1: List to DF, replace empty with empty string

#cDF is processing dataframe for completions

# One row per completion screenshot, one column per OCR line; short rows are padded with ""
cDF = pd.DataFrame(completionList).fillna("")

In [14]:
# STEP 5.2: Extract all relevant values to completedDF

# NOTE: Due to the data extracted from these images being more structured than the text from offer images
### the process was simpler


# index1 is the row label, index2 is the column label
# "Customer Tips" as anchor, if found anchorFlag set to 1


completedDF = pd.DataFrame(columns = ["Base Pay", "Tip", "Peak Pay", 
                                          "Restaurant Name", "Restaurant Name 2", "Tip 2", "Total"])


def _fromDollarSign(text):
    """Return ``text`` starting at its first "$" (raises ValueError when there is none)."""
    return text[text.index("$"):]


def _splitNameAndTip(text):
    """Split "<name>$<tip>" at the first "$" into (name, tip); the tip is "$0.00" without a "$"."""
    if "$" in text:
        cut = text.index("$")
        return text[:cut], text[cut:]
    return text, "$0.00"


# Each row of cDF is one completion screenshot. The cell containing "Customer Tips" is the
# anchor; the other values are read at fixed offsets from it:
#   anchor-1 : "Peak Pay" line when present (then base pay is at anchor-2), else base pay
#   anchor+1 : restaurant name followed by its tip
#   anchor+2 : "Total" line, or a second restaurant + tip (then "Total" is looked for at anchor+3)
for index1, row in cDF.iterrows():

    # FLAGS #
    anchorFlag = False
    stackFlag = False

    # BASE CASE ELEMENT ATTRIBUTES
    # These are reset for every row. `total` in particular must be reset here: it is the
    # name written into the output row, and without a per-row default a screenshot with no
    # "Total" line would silently reuse the previous row's total.
    basePay = 0
    tipPay = 0
    peakPay = 0
    total = 0
    rest2 = "N/A"
    tip2 = "N/A"

    for index2, value in row.items():
        if "Customer Tips" not in value:
            continue
        anchorFlag = True

        # PEAK PAY: decides which layout this screenshot uses
        previousCell = cDF[index2-1][index1]
        hasPeakPay = "Peak Pay" in previousCell
        if hasPeakPay:
            peakPay = _fromDollarSign(previousCell) if "$" in previousCell else "ERROR"
            basePayCol = index2 - 2
        else:
            peakPay = "$0.00"
            basePayCol = index2 - 1

        # RESTAURANT NAME + TIP
        nameCell = cDF[index2+1][index1]
        restName, tipPay = _splitNameAndTip(nameCell)
        # In the peak-pay layout a name without a tip amount is cut at its first "-".
        # The cut is skipped when there is no "-" so such a name no longer raises.
        if hasPeakPay and "$" not in nameCell and "-" in restName:
            restName = restName[:restName.index("-")]

        # BASE PAY
        basePay = _fromDollarSign(cDF[basePayCol][index1])

        # TOTAL + RESTAURANT 2 + TIP 2
        nextCell = cDF[index2+2][index1]
        if "Total" in nextCell:
            total = _fromDollarSign(nextCell)
        else:
            rest2, tip2 = _splitNameAndTip(nextCell)
            afterNextCell = cDF[index2+3][index1]
            if "Total" in afterNextCell:
                stackFlag = True
                total = _fromDollarSign(afterNextCell)
            else:
                stackFlag = False
                print("ERROR 2")
            # A ")" without its "(" is not treated as a second restaurant
            if ")" in rest2 and "(" not in rest2:
                rest2 = "N/A"
                tip2 = "N/A"
                stackFlag = False

    # Rows without the anchor are kept as all-zero placeholders so row positions still line up
    if not anchorFlag:
        newRowList = [0, 0, 0, 0, 0, 0, 0]
    else:
        newRowList = [basePay, tipPay, peakPay, restName, rest2, tip2, total]
    completedDF.loc[len(completedDF)] = newRowList


# DEBUG #
if displayDebugInfo:
    display(completedDF)

In [15]:
# STEP 6: attach the source image's timestamp to every row of offerDF and completedDF

def _withTimestamp(frame, sourceFiles, timestampColumn):
    """Return ``frame`` with one extra column holding each row's source-file timestamp.

    ``sourceFiles`` supplies the file name for each row; the file name is looked up in
    timeDF, its "Time" value is stored under ``timestampColumn``, and the temporary
    "FileName" column is dropped again.
    """
    return (
        frame.assign(FileName=sourceFiles)
        .join(timeDF.set_index('FileName'), on="FileName")
        .rename(columns={"Time" : timestampColumn})
        .drop(["FileName"], axis=1)
    )

offerDF = _withTimestamp(offerDF, offerFiles, "Offer Timestamp")
completedDF = _withTimestamp(completedDF, compFiles, "Completed Timestamp")

In [16]:
# STEP 7: Join offers and completions together
# join() is called without `on`, so rows are paired by index label: completion row i is
# matched with offer row i, and completion rows without a matching offer label get NaN.
# NOTE(review): this presumably relies on the screenshots yielding offers and completions
# in the same order and number - confirm against the input data.
totalDF = completedDF.join(offerDF)

# DEBUG #
if displayDebugInfo:
    display(totalDF)

In [17]:
# STEP 8: Pre-Export Transformations

In [18]:
# STEP 8.1: Convert strings with $ to floats

# Work on a copy so totalDF itself is left as it was joined
testOutputDF = totalDF.copy()
def dollarToFloatCols(df, cols):
    """Replace every value in the given columns of ``df`` with a Python float, in place.

    Conversion rules, applied to each cell:

    * a string containing "$": the text after the first "$" is parsed (thousands
      separators "," are ignored); text that still cannot be parsed becomes 0.0
    * any other string ("N/A", "ERROR", ...): 0.0
    * a number: kept as a float; NaN becomes 0.0
    * anything else (e.g. None): 0.0

    Missing (NaN) or already-numeric cells therefore no longer raise TypeError, and
    unparseable dollar text no longer raises ValueError.

    Returns None; ``df`` is modified directly.
    """
    def to_float(value):
        if isinstance(value, str):
            if "$" not in value:
                return 0.0
            try:
                return float(value[value.index("$") + 1:].replace(",", ""))
            except ValueError:
                return 0.0
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float that differs from itself
        return 0.0 if number != number else number

    for col in cols:
        # Snapshot the (label, value) pairs because the column is rewritten while looping
        for label, value in list(df[col].items()):
            df.at[label, col] = to_float(value)
                
                
colsForFunc = ["Base Pay", "Tip", "Peak Pay", "Tip 2", "Total", "Dollars"]
dollarToFloatCols(testOutputDF, colsForFunc)
# Every converted column is then given the float dtype
testOutputDF = testOutputDF.astype({column: 'float' for column in colsForFunc})

# DEBUG #
if displayDebugInfo:
    display(testOutputDF)

In [19]:
# STEP 8.2: Convert rows with stacked orders to individual rows

# At this point, only the second half of a staggered stack is flagged in the data, so the
# row before each flagged row is flagged as well. The flagged labels are collected first so
# the frame is not modified while it is being scanned.
stackedLabels = [label for label, flag in testOutputDF["Stacked Order"].items() if flag == 1]
for label in stackedLabels:
    # Without this check a flagged first row would make .at create a new row labelled -1
    if label - 1 in testOutputDF.index:
        testOutputDF.at[label - 1, "Stacked Order"] = 1

# The main idea:
# find rows with a natural stack (two restaurants in one offer)
# extract and establish values for a new row describing the second delivery
# update the old row so it describes only the first delivery

for index, row in testOutputDF.iterrows():
    B_rest1 = row["RestNames"]           # offer: first restaurant
    B_rest2 = row["Rest 2"]              # offer: second restaurant ("" when not a natural stack)
    A_rest1 = row["Restaurant Name"]     # completion: first restaurant
    A_rest2 = row["Restaurant Name 2"]   # completion: second restaurant

    if B_rest2 == "":
        continue
    # Rows whose names are not text (NaN from an unmatched join, 0 from a completion
    # without an anchor) cannot be matched; leave them unchanged instead of raising
    if not all(isinstance(name, str) for name in (A_rest1, A_rest2, B_rest1, B_rest2)):
        continue

    # LOGIC SECTION: work out which offer name belongs to which completion name
    oldRestName = B_rest1
    if B_rest1 in A_rest1:
        newRestName = B_rest2
    elif B_rest1 in A_rest2 or B_rest2 in A_rest1:
        newRestName = B_rest1
        oldRestName = B_rest2
    elif B_rest2 in A_rest2:
        newRestName = B_rest2
    else:
        # No offer name matches a completion name: leave the row as it is
        continue

    halfBase = row["Base Pay"]/2

    # NEW ROW #
    # Keyed by column name so the values cannot be shifted by a change in column order
    newRow = {
        "Base Pay": halfBase,
        "Tip": row["Tip 2"],
        "Peak Pay": row["Peak Pay"],
        "Restaurant Name": A_rest2,
        "Restaurant Name 2": "",
        "Tip 2": 0.0,
        "Total": row["Total"],
        "Completed Timestamp": row["Completed Timestamp"],
        "Dollars": row["Dollars"],
        "Mileage": row["Mileage"],
        "RestNames": newRestName,
        "Rest 2": "",
        "Stacked Order": 1,
        "Offer Timestamp": row["Offer Timestamp"],
    }
    # The new index is .5 above the current index, which places the new row directly
    # after the old one once the frame is sorted by index
    testOutputDF.loc[index + 0.5] = [newRow[column] for column in testOutputDF.columns]

    # OLD ROW #
    testOutputDF.at[index, "Restaurant Name"] = A_rest1
    testOutputDF.at[index, "Restaurant Name 2"] = ""
    testOutputDF.at[index, "RestNames"] = oldRestName
    testOutputDF.at[index, "Rest 2"] = ""
    testOutputDF.at[index, "Base Pay"] = halfBase
    testOutputDF.at[index, "Stacked Order"] = 1

# re index the dataframe to maintain desired order
testOutputDF = testOutputDF.sort_index().reset_index(drop=True)


# DEBUG #
if displayDebugInfo:
    display(testOutputDF)

In [20]:
# Columns carried forward into the export frame (copied so testOutputDF stays untouched)
exportDeliveriesDF = testOutputDF.loc[:, [
    "RestNames", "Total", "Base Pay", "Tip", "Peak Pay",
    "Stacked Order", "Offer Timestamp", "Mileage", "Completed Timestamp",
]].copy()

In [21]:
# STEP 8.3: Extract Date, Start Time, and End Time

# Because the timestamp is directly from exif data, it always has the same structure
# yyyy:mm:dd hh:mm:ss
# so fixed slices pick out each part.


# Extract Date: the first offer's date, rewritten as mm/dd/yyyy, is used for every row
dateFromDF = exportDeliveriesDF["Offer Timestamp"].iloc[0]
dateFromDF = dateFromDF[5:10] + "/" + dateFromDF[0:4]
dateFromDF = dateFromDF.replace(":", "/")
exportDeliveriesDF["Date"] = dateFromDF

# Set to 0 to keep 24 hour "hh:mm" times
convertTo12HrFlag = 1


def _clockTime(timestamp, twelveHour):
    """Return the time of day from an EXIF timestamp as "hh:mm", or "h:mm" on a 12 hour clock."""
    hourMinute = timestamp[11:16]
    if not twelveHour:
        return hourMinute
    # 13-23 become 1-11; hours 0 and 12 both read 12 on a 12 hour clock
    hour = int(hourMinute[0:2]) % 12
    if hour == 0:
        hour = 12
    return str(hour) + hourMinute[2:5]


# Extract Start and End Time
# The columns are created directly from strings, so they are never int columns that
# later have strings written into them
exportDeliveriesDF["Start Time"] = [
    _clockTime(timestamp, convertTo12HrFlag) for timestamp in exportDeliveriesDF["Offer Timestamp"]
]
exportDeliveriesDF["End Time"] = [
    _clockTime(timestamp, convertTo12HrFlag) for timestamp in exportDeliveriesDF["Completed Timestamp"]
]


# DEBUG #
if displayDebugInfo:
    display(exportDeliveriesDF)

In [22]:
# STEP 8.4: Calculate Duration
# Duration is the number of minutes between the offer and the completion, both taken to the
# minute from the full "yyyy:mm:dd hh:mm:ss" timestamps. Using the whole timestamp (rather
# than only the minute-of-hour shown in Start Time / End Time) keeps deliveries of an hour
# or more, and deliveries that cross midnight, correct.
offerMinute = pd.to_datetime(exportDeliveriesDF["Offer Timestamp"].str[:16], format="%Y:%m:%d %H:%M")
completedMinute = pd.to_datetime(exportDeliveriesDF["Completed Timestamp"].str[:16], format="%Y:%m:%d %H:%M")
# A negative value means the completion is stamped before its offer, i.e. the rows are mispaired
exportDeliveriesDF["Duration"] = ((completedMinute - offerMinute).dt.total_seconds() // 60).astype(int)

# DEBUG #
if displayDebugInfo:
    display(exportDeliveriesDF)

In [23]:
# STEP 8.5: Setup pairing to be exported alongside Delivery DF
# Copies the restaurant name from the offer text ("RestNames") next to the one from the
# completion text ("Restaurant Name") for every row
rawDataDF = testOutputDF[["RestNames", "Restaurant Name"]].copy()

In [24]:
# STEP 9: Export

# Authorizes pygsheets with the service file 'file.json', opens the 'DashData' spreadsheet,
# and loads the worksheet at index 1 (the existing Deliveries data, used in the next cell)
# as a dataframe
gc = pygsheets.authorize(service_file='file.json')
sh = gc.open('DashData')
sheets_DelDF       = sh[1].get_as_df()

In [25]:
# STEP 9.1: Establish ID

# The ID in the last row of the Deliveries worksheet, plus 1, is the first new ID.
# Consecutive IDs from there, one per row to be exported, become the new "ID" column.

lastExportedID = sheets_DelDF["ID"].iloc[-1]
newIndexForExport = lastExportedID + 1
ID_array = np.arange(newIndexForExport, newIndexForExport+len(exportDeliveriesDF))
exportDeliveriesDF["ID"] = ID_array

# Final column order for the export
exportDeliveriesDF_1 = exportDeliveriesDF.loc[:, [
    "ID", "Date", "RestNames", "Total", "Base Pay", "Tip",
    "Peak Pay", "Stacked Order", "Start Time", "Mileage", "End Time", "Duration",
]].copy()

In [26]:
# STEP 9.2: Set Version to 3
# Every exported row gets the same constant value.
# NOTE(review): what version 3 denotes is not shown in this notebook - confirm with the sheet's owner.

exportDeliveriesDF_1["Version"] = 3

In [27]:
# STEP 9.3: Replace Stacked Order flag with the ID of the delivery the order was stacked with

# Flagged rows are handled as consecutive pairs: a row flagged with 1 points at the next ID,
# and the row right after it points back at the previous ID. secondHalf records that the
# previous row opened such a pair.
secondHalf = False
for rowLabel, record in exportDeliveriesDF_1.iterrows():
    if not secondHalf and not record["Stacked Order"] == 1:
        continue
    partnerOffset = -1 if secondHalf else 1
    exportDeliveriesDF_1.at[rowLabel, "Stacked Order"] = record["ID"] + partnerOffset
    secondHalf = not secondHalf

In [28]:
# STEP 9.4: Join export DF and raw data

# I can understand why this might seem odd to do, but its nice to just have them side by side in sheets currently
# "RestNames" (from the offer text) is renamed to "Name" and "Restaurant Name" (from the
# completion text) to "raw data"; the export frame already has a "RestNames" column, so the
# rename also keeps the joined column names distinct
rawDataDF = rawDataDF.rename(columns={"RestNames":"Name", "Restaurant Name": "raw data"})
exportDeliveriesDF_1 = exportDeliveriesDF_1.join(rawDataDF)

# DEBUG #
if displayDebugInfo:
    display(exportDeliveriesDF_1)

In [29]:
# STEP 9.5: Apply string mapping and export
# Every cell is converted with str(). DataFrame.applymap is deprecated in favour of
# DataFrame.map in newer pandas, so use map when this pandas version provides it.
_toStrings = getattr(exportDeliveriesDF_1, "map", None) or exportDeliveriesDF_1.applymap
exportDeliveriesDF_1 = _toStrings(str)
if exportFlag:
    # Write the frame to the worksheet at index 7, starting at cell (1, 1)
    wks = sh[7]
    wks.set_dataframe(exportDeliveriesDF_1,(1,1))
else:
    # A bare expression inside an if/else branch is never rendered by the notebook,
    # so the preview has to be displayed explicitly
    display(exportDeliveriesDF_1)