In [51]:
import datetime # For sorting screenshots into correct times
import os       # Used to delete old .png images
import shutil   # Used to move processed screenshots into the archive folder
import time

import cv2 # Computer Vision
import numpy as np # Basic numper arithmetic
import pandas as pd
from dateutil.parser import parse # Needed for turning date + time strings into datetime format
from PIL import Image # For converting .png to .jpg
from skimage import data, img_as_float
from skimage.measure import compare_ssim as ssim # Sctructural Similarity algorithm, used to get numbers from times

In [52]:
# Folder holding the screenshots to process. The functions below build file names
# with plain string concatenation (path + name), so the trailing "/" is required.
# The archive folder for processed screenshots is located relative to this folder.
path = "/Users/CollinHeist/Documents/GitHub/Personal-Projects/Statistical Formatting/Screenshots/"
# Destination of the exported CSV file.
csv_path = "/Users/CollinHeist/Documents/GitHub/Personal-Projects/Statistical Formatting/data.csv"
# Folder holding the reference digit images, read as "<references><digit>.jpg" for
# digits 0-9; the trailing "/" is required here as well.
references = '/Users/CollinHeist/Documents/GitHub/Personal-Projects/Statistical Formatting/Reference Images/'

In [53]:
def getImageList(path):
    """Return the names of the image files found directly inside ``path``.

    Only entries whose name ends in ".jpg" or ".png" (case-sensitive) are kept.
    The names are returned without the directory prefix, in the order reported
    by ``os.listdir``.
    """
    imageExtensions = (".jpg", ".png")
    image_list = []
    for entry in os.listdir(path):
        if entry.endswith(imageExtensions):
            image_list.append(entry)

    return image_list

In [54]:
def convertToJPG(imageList, verbose=False):
    """Convert every .png file named in ``imageList`` to a .jpg file.

    Each name is resolved against the module-level ``path`` folder. The image
    is re-saved in RGB mode under the same name with a "jpg" extension, after
    which the original .png file is deleted. Names that do not end in ".png"
    are ignored.

    Parameters:
        imageList: iterable of file names located in ``path``.
        verbose: when True, print a progress message before converting.
    """
    if verbose:
        print ("Converting all .png images to .jpg files..")

    for name in imageList: # Loop through all images
        if not name.endswith(".png"):
            continue

        source = path + name
        # The context manager releases the file handle before the file is removed
        with Image.open(source) as png:
            png.convert("RGB").save(path + name[:-3] + "jpg") # Change extension

        # Delete the old .png directly instead of through a shell command, so
        # names containing quotes or other shell metacharacters are handled
        os.remove(source)

In [55]:
def cropImage(image):
    """Crop a fixed region out of a screenshot and archive the original.

    The region (0, 575)-(1125, 795) of ``path + image`` is saved next to the
    screenshot as "<name without extension> - Cropped.jpg". The original file
    is then moved into the "Processed" folder that sits beside the screenshot
    folder; that folder is created when it does not exist yet.

    Parameters:
        image: file name (with a three-letter extension) inside ``path``.
    """
    source = path + image
    cropRegion = (0, 575, 1125, 795) # (startX, startY, endX, endY)
    with Image.open(source) as im:
        im.crop(cropRegion).save(path + image[:-4] + " - Cropped.jpg")

    # Resolve the sibling folder from the parent directory rather than slicing a
    # fixed number of characters off ``path``
    processedDir = os.path.join(os.path.dirname(os.path.normpath(path)), "Processed")
    os.makedirs(processedDir, exist_ok=True)
    shutil.move(source, os.path.join(processedDir, image)) # Move the OG image into /Processed/

In [56]:
def subdivideImage(cropped_image):
    """Split a cropped image into five fixed boxes, one file per box.

    Each box is cut out of ``path + cropped_image`` and saved in ``path`` as
    "<name without extension> <label>.jpg".

    Parameters:
        cropped_image: file name of the cropped image inside ``path``.

    Returns:
        A 5-tuple with the file names of the saved boxes, in the order
        minL, minR, secL, secR, milSecL.
    """
    # (label, (startX, startY, endX, endY)) in the order the names are returned
    digitBoxes = (
        ("minL", (33, 0, 188, 220)),
        ("minR", (188, 0, 343, 220)),
        ("secL", (403, 0, 558, 220)),
        ("secR", (558, 0, 713, 220)),
        ("milSecL", (773, 0, 928, 220)),
    )

    stem = cropped_image[:-4]
    fileNames = []
    # Crop and save the new images while the source file is open
    with Image.open(path + cropped_image) as im:
        for label, box in digitBoxes:
            fileName = "{} {}.jpg".format(stem, label)
            im.crop(box).save(path + fileName)
            fileNames.append(fileName)

    return tuple(fileNames)

In [57]:
def grabTimeData(image):
    """Return the date and 24-hour time encoded in a screenshot file name.

    The name must consist of exactly seven space-separated fields; the 2nd to
    5th are the month, the day (followed by one character that is dropped),
    the hour and the minute, and the 7th starts with "AM" or "PM". The 1st and
    6th fields are ignored.
    (Presumably names look like "Photo Jan 05, 7 32 15 PM.jpg" - confirm.)

    Returns:
        A tuple ('MON DD', 'HH:MM'); the hour is not zero-padded except for
        12 AM, which becomes "00".

    Raises:
        ValueError: when the name does not split into exactly seven fields.
    """
    _unusedHead, month, dayToken, hour, minute, _unusedMid, suffix = image.split(" ")
    meridiem = suffix[:2]

    # Afternoon hours (1 PM - 11 PM) move to 13 - 23
    if meridiem == "PM" and int(hour) < 12:
        hour = str(int(hour) + 12)
    # Midnight is hour zero on the 24-hour clock
    if meridiem == "AM" and hour == "12":
        hour = "00"

    return (month + " " + dayToken[:-1], hour + ":" + minute)

In [58]:
def testZeroNine(image):
    """Return 9 or 0 depending on how many pure-white pixels a probe box holds.

    The box (40, 107)-(115, 152) of ``path + image`` is inspected and every
    pixel equal to (255, 255, 255) is counted.

    Returns:
        9 when more than 150 such pixels are found, otherwise 0.
    """
    probeBox = (40, 107, 115, 152)
    pureWhite = (255, 255, 255)
    probe = Image.open(path + image).crop(probeBox)

    # Count the white cells in the middle region
    whitePixels = sum(1 for pixel in probe.getdata() if pixel == pureWhite)

    # Arbitrary test of >150 seems to work all the time | initial test had 250
    if whitePixels > 150:
        return 9
    return 0

In [59]:
def structuralSimilarityIndex(image):
    """Return the digit (0-9) whose reference image best matches ``image``.

    ``path + image`` is compared against "<references><digit>.jpg" for every
    digit using the structural similarity index, and the digit with the
    highest score wins. When the winner is 0 or 9 and the scores of those two
    digits differ by less than 0.12, the decision is delegated to
    ``testZeroNine``.

    Parameters:
        image: file name of a single-digit image inside ``path``.

    Returns:
        The recognised digit as an int.
    """
    # The candidate is converted once; it does not change between comparisons
    with Image.open(path + image) as curr_img:
        candidate = img_as_float(curr_img)

    closeness = []
    # Compare the passed image to 0 through 9 to see which is closest
    for digit in range(10):
        with Image.open("{}{}.jpg".format(references, digit)) as ref_img:
            reference = img_as_float(ref_img)
        closeness.append(ssim(reference, candidate, multichannel=True))

    best = int(np.argmax(closeness))
    # The doubt check must compare the scores of 0 and 9 themselves, i.e.
    # closeness[9]; index 8 is the score of the digit 8
    if best in (0, 9) and abs(closeness[0] - closeness[9]) < .12:
        return testZeroNine(image)
    return best

In [60]:
def _roundUpOneSecond(digits):
    """Add one second, in place, to a duration held as [minL, minR, ":", secL, secR]."""
    if digits[4] != "9":
        digits[4] = str(int(digits[4]) + 1)
        return
    digits[4] = "0" # Round over to the next 10s second
    if digits[3] != "5":
        digits[3] = str(int(digits[3]) + 1)
        return
    digits[3] = "0" # Round over to the next minute
    if digits[1] != "9":
        digits[1] = str(int(digits[1]) + 1)
        return
    digits[1] = "0"
    digits[0] = str(int(digits[0]) + 1)


def obtainData(verbose = False):
    """Read the end date, end time and duration out of every screenshot in ``path``.

    All .png files are first converted to .jpg. Each image is then cropped
    (the original is archived by ``cropImage``), split into five digit images
    and every digit is recognised with ``structuralSimilarityIndex``. The
    first four digits form the duration "MM:SS"; the fifth is only used to
    round the duration to the nearest second. The cropped image and the digit
    images are deleted once they have been read.

    Parameters:
        verbose: when True, print progress messages.

    Returns:
        A tuple (endDateList, endTimeList, durationList) of three parallel
        lists of strings: dates as 'MON DD', times as 'HH:MM' and durations
        as 'M:SS' or 'MM:SS' (a leading "0" minute digit is dropped).
    """
    images = getImageList(path)
    convertToJPG(images, verbose)
    images = getImageList(path) # Refresh the list, the extensions have changed

    if verbose:
        print ("Grabbing data from images..")

    # Fill each dataList with the corresponding data for each image
    endDateList = []
    endTimeList = []
    durationList = []
    for image in images:
        endDate, endTime = grabTimeData(image) # ("MON DD", "HH:MM")

        cropImage(image) # Archives the original and writes the cropped copy
        croppedName = image[:-4] + " - Cropped.jpg"

        digits = []
        for digitFile in subdivideImage(croppedName):
            digits.append(structuralSimilarityIndex(digitFile)) # Closest digit
            os.remove(path + digitFile) # Delete the old file
        os.remove(path + croppedName) # Delete the old file

        minL, minR, secL, secR, tenths = digits
        # Use a list object so the seconds place can be altered for rounding
        tempDur = [str(minL), str(minR), ":", str(secL), str(secR)]
        # The decimal value is not part of the result | Just use it for rounding
        if tenths >= 5:
            _roundUpOneSecond(tempDur)

        if tempDur[0] == "0": # Remove the leading 0 on the duration
            tempDur.pop(0)

        endDateList.append(endDate)
        endTimeList.append(endTime)
        durationList.append(''.join(tempDur))

    return (endDateList, endTimeList, durationList)

In [61]:
def formatData(dateList, verbose = False):
    """Convert dates from 'MON DD' to 'MM/DD'.

    The first three characters of every entry are looked up as an English
    month abbreviation ("Jan" ... "Dec") and replaced by the two-digit month
    number; everything from the fifth character on is kept as the day.

    Parameters:
        dateList: iterable of 'MON DD' strings.
        verbose: when True, print a progress message.

    Returns:
        A new list with one 'MM/DD' string per entry of ``dateList``.

    Raises:
        KeyError: when an entry does not start with a known month abbreviation.
    """
    if verbose:
        print ("Formatting date values, changing months to ordinal values..")

    monthNames = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    # "Jan" -> "01", ..., "Dec" -> "12"
    monthNumber = {name: "{:02d}".format(position)
                   for position, name in enumerate(monthNames, start=1)}

    return [monthNumber[entry[:3]] + "/" + entry[4:] for entry in dateList]

In [62]:
def sortDataFrame(df, verbose = False):
    """Return a copy of ``df`` sorted chronologically by its Date and Time columns.

    Every row's "Date" and "Time" strings are joined with a space and parsed
    into a datetime, which is used as the sort key. The passed frame is left
    untouched, so the function can safely be called again on the same frame.

    Parameters:
        df: DataFrame with string columns "Date" and "Time".
        verbose: accepted for symmetry with the other helpers; unused.

    Returns:
        A new DataFrame with the same columns, sorted ascending and indexed
        0..n-1. Rows with identical timestamps keep their original order.
    """
    timestamps = [parse(day + " " + clock) for day, clock in zip(df.Date, df.Time)]

    return (
        df.assign(DateTime=timestamps)                 # Temporary sort key
          .sort_values(by="DateTime", kind="mergesort") # Stable sort
          .reset_index(drop=True)                      # Index 0->n-1 after the sorting
          .drop(columns="DateTime")                    # Remove the sort key again
    )

In [63]:
def addDurationYear(df, year = 2019, verbose = False):
    """Append the year to every date and prefix every duration with "0:".

    "/<year>" is appended to each value of the "Date" column. Each value of
    the "Duration" column gets "0:0" in front when its second character is
    ":" (single-digit minutes) and "0:" otherwise.

    The passed DataFrame is modified in place and also returned.

    Parameters:
        df: DataFrame with string columns "Date" and "Duration".
        year: year to append to the dates.
        verbose: when True, print a progress message.
    """
    if verbose:
        print ("Appending year to all dates, changing duration values..")

    def withHours(duration):
        # Add "0:0" or "0:" to durations
        if duration[1] == ":":
            return "0:0" + duration
        return "0:" + duration

    yearSuffix = "/" + str(year) # Add year
    df["Date"] = df["Date"] + yearSuffix
    df["Duration"] = [withHours(duration) for duration in df["Duration"]]

    return df

In [64]:
def formatForOutput(df, verbose = False):
    """Return a copy of ``df`` laid out for export: one date label per day, no gaps.

    Every calendar day between the date of the first row and the date of the
    last row is visited:
      * a day without any row gets a placeholder row with empty "Time" and
        "Duration";
      * a day with several rows keeps its date on the first of those rows
        only; the "Date" of the remaining rows is blanked.
    The result is sorted chronologically and re-indexed 0..n-1. The passed
    frame is left untouched.

    Parameters:
        df: DataFrame with exactly the string columns "Date" ('MM/DD/YYYY'),
            "Time" and "Duration", ordered chronologically.
        verbose: accepted for symmetry with the other helpers; unused.

    Returns:
        A new DataFrame with the columns "Date", "Time" and "Duration". An
        empty frame is returned as an empty copy.
    """
    if df.empty:
        return df.copy()

    # Work on a copy with a fresh 0..n-1 index so that ``out.loc[len(out)]``
    # below always appends a row instead of overwriting an existing label
    out = df.reset_index(drop=True)
    timestamps = [parse(day + " " + clock) for day, clock in zip(out.Date, out.Time)]
    out.insert(0, "DateTime", timestamps) # Sort key, dropped again before returning

    current = parse(out.Date.iloc[0])    # The first date of the DataFrame (as datetime)
    last_date = parse(out.Date.iloc[-1]) # The last date of the DataFrame (as datetime)
    # A bounded loop cannot run forever when the frame is not in chronological order
    while current <= last_date:
        label = current.strftime("%m/%d/%Y")
        sameDay = out.Date == label
        rowCount = int(sameDay.sum())

        if rowCount == 0: # If the date doesn't exist in the DF
            out.loc[len(out)] = [current, label, "", ""] # Add an empty date
        elif rowCount > 1: # If there is more than one item per-date in the DF
            out.loc[sameDay, "Date"] = [label] + [""] * (rowCount - 1) # Remove repeat dates

        current += datetime.timedelta(days=1) # Move onto the next day

    # A stable sort keeps the labelled row first among rows sharing a timestamp
    out = out.sort_values(by="DateTime", kind="mergesort").reset_index(drop=True)

    return out.drop(columns="DateTime") # Remove the datetime column, not used in export

## Read images, place into DataFrame, format for Google Sheets

In [65]:
# Read every screenshot: three parallel lists of end dates, end times and durations
date, Time, duration = obtainData(verbose=True)
# Turn all string months to numeric months
date = formatData(date, verbose=True)
# Convert each array of data to a combined DataFrame
df = pd.DataFrame(data=np.c_[date, Time, duration], columns=["Date", "Time", "Duration"])
# Sort the set of data by the combined date and time
df = sortDataFrame(df, verbose=True)
# Append the year (2020 here) to every date and prefix every duration with "0:" / "0:0"
df = addDurationYear(df, year=2020, verbose=True)
# Format the DataFrame so it can be directly exported to Google Sheets
df = formatForOutput(df)

Converting all .png images to .jpg files..
Grabbing data from images..
Formatting date values, changing months to ordinal values..

Appending year to all dates, changing duration values..


## Export DataFrame to CSV

In [66]:
# Write the formatted DataFrame to csv_path; index=False leaves the row index out of the file
df.to_csv(csv_path, index=False)