In [3]:
import io
import os
from google.cloud import vision
from google.cloud.vision import types
import pandas as pd
import matplotlib.pyplot as plt
import time
import memory_profiler as mem_profile
import gc

In [4]:
# Directory holding the Wayne image files; passed to get_image_file_chunks
# to group the files by chunk.
DIR_PATH = "/Volumes/Cooper_TB_Drive/research/readmissions/image_files/Wayne/"

def get_image_file_chunks(DIR_PATH):
    """Group the files in a directory by their chunk label.

    The chunk label is the first two underscore-separated tokens of the
    file name (a name of the form ``a_b_c.png`` gets the label ``a_b``).

    Args:
        - DIR_PATH (str): the path to the directory, with or without a
            trailing path separator.

    Returns:
        - chunks (dict): keys are chunk labels, values are lists of image
            filepaths for that chunk. Labels and filepaths are in sorted
            file-name order.
    """

    chunks = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # chunk order and the per-chunk file order reproducible between runs.
    for f in sorted(os.listdir(DIR_PATH)):
        chunk = "_".join(f.split("_")[:2])

        # os.path.join only inserts a separator when DIR_PATH lacks one, so
        # the result is unchanged for paths that already end in "/".
        chunks.setdefault(chunk, []).append(os.path.join(DIR_PATH, f))

    return chunks

# group the Wayne image files (DIR_PATH) by chunk label
chunks = get_image_file_chunks(DIR_PATH)

# report how many files ended up in each chunk
for k in chunks:
    v = chunks[k]
    print(f"{k} has {len(v)} files ")


chunk_1 has 1000 files 
chunk_2 has 1000 files 
chunk_3 has 1000 files 
chunk_4 has 1000 files 
chunk_5 has 1000 files 
chunk_6 has 1000 files 
chunk_7 has 1000 files 
chunk_8 has 695 files 


In [5]:
def list_chunker(arr, chunk_size):
    """ lazily yield consecutive slices of `arr`, each at most `chunk_size` long """
    starts = range(0, len(arr), chunk_size)
    yield from (arr[lo:lo + chunk_size] for lo in starts)

In [6]:
def get_response(filepath, features):
    """ Send one image to the Google Vision API and return its annotation.

    Args: 
        - filepath (str): a valid image filepath
        - features (list of dict): the enums.Feature.Type entries to request
            for the image

    Returns:
        - response: whatever `client.annotate_image` returns for the request

    Note:
        `client` is looked up as a global when the function is called, so it
        must be defined before the first call.
    """
    # the request carries the raw bytes of the image file
    with open(filepath, mode='rb') as fh:
        payload = fh.read()

    request = {'image': types.Image(content=payload), 'features': features}
    return client.annotate_image(request)


def response_formatter(response, filepath, county, chunk):
    """Flatten one API response into a single record for a dataframe.

    Args:
        - response: the API response; its label annotations and dominant
            image colors are read
        - filepath (str): the file name for accounting
        - county (str): the county for accounting
        - chunk (str): the chunk label; accepted but not written to the row

    Returns:
        - row (dict): a dictionary for fast conversion to a pd.DataFrame
    """

    def _rgb(swatch):
        # color channels are truncated to whole numbers
        c = swatch.color
        return (int(c.red), int(c.green), int(c.blue))

    # one list entry per detected label, descriptions and scores kept in step
    annotations = response.label_annotations
    row = {
        'file': filepath,
        'county': county,
        'labels': [a.description for a in annotations],
        'label_scores': [a.score for a in annotations],
    }

    # one list entry per dominant color of the image
    swatches = response.image_properties_annotation.dominant_colors.colors
    row['colors'] = [_rgb(s) for s in swatches]
    row['color_pixel_fraction'] = [s.pixel_fraction for s in swatches]
    row['color_scores'] = [s.score for s in swatches]

    return row



In [7]:
%%time
"""
SUB_CHUNK_SIZE is the number of images per Google Vision API request. 

The API returns in ~30 seconds, so running all 1000 images in a chunk at once would require ~8 hours.
By breaking them up I can monitor status more effectively.
"""
# NOTE(review): the loop below calls get_response once per image, so
# SUB_CHUNK_SIZE does not batch images into a request; it only sets how often
# a status line is printed (every SUB_CHUNK_SIZE-th image of a chunk).

COUNTY = 'Wayne'
DIR_PATH =  f"/Volumes/Cooper_TB_Drive/research/readmissions/image_files/{COUNTY}/"
SAVEPATH = f"/Volumes/Cooper_TB_Drive/research/readmissions/google_features/"
SUB_CHUNK_SIZE = 100 # how often to print status messages

# `client` is a module-level name; get_response looks it up when it is called
chunks = get_image_file_chunks(DIR_PATH)
client = vision.ImageAnnotatorClient()

# the features to include in the response
features = [{"type": vision.enums.Feature.Type.LABEL_DETECTION},
            {"type": vision.enums.Feature.Type.IMAGE_PROPERTIES}]

# iterate through all chunks of 1000 files
for chunk, chunk_files in chunks.items():
    # for storing results, per chunk
    new_rows = [] 

    # collect garbage before measuring so the "Before" figure is not inflated
    # by objects left over from the previous chunk
    gc.collect()
    start_time = time.time()
    print(f'Memory (Before): {mem_profile.memory_usage()} MB')

    for idx, file in enumerate(chunk_files):

        # print status
        if idx % SUB_CHUNK_SIZE == 0:
            gc.collect()
            print(f"Working {COUNTY} {chunk}.{idx}")

        try:
            # the dirty work
            response = get_response(file, features)
            row = response_formatter(response, file, COUNTY, chunk)
        except Exception as e:
            # best effort: a failed image is reported and skipped, so it will
            # be missing from this chunk's CSV
            print(f"ERROR on {chunk}.{idx}: {e}")
            continue
        
        new_rows.append(row)

    # one CSV per chunk, named after the county and the chunk label
    print(f"building {chunk} dataframe...")
    df = pd.DataFrame(new_rows)
    print(df.shape)
    f_name = f"{SAVEPATH}{COUNTY}_features_{chunk}.csv"
    df.to_csv(f_name, index=False)

    print(f'Memory (After): {mem_profile.memory_usage()} MB')
    # despite its name, end_time holds the elapsed seconds for this chunk;
    # the line below reports it in minutes
    end_time = time.time() - start_time
    print(f"{chunk} took: {end_time/60:.2f}")

Memory (Before): [117.48828125] MB
Working Wayne chunk_1.0
Working Wayne chunk_1.100
Working Wayne chunk_1.200
Working Wayne chunk_1.300
Working Wayne chunk_1.400
Working Wayne chunk_1.500
Working Wayne chunk_1.600
Working Wayne chunk_1.700
ERROR on chunk_1.740: 503 GOAWAY received
Working Wayne chunk_1.800
Working Wayne chunk_1.900
building chunk_1 dataframe...
(999, 7)
Memory (After): [209.0078125] MB
chunk_1 took: 279.79
Memory (Before): [209.0078125] MB
Working Wayne chunk_2.0
Working Wayne chunk_2.100
ERROR on chunk_2.190: 503 GOAWAY received
Working Wayne chunk_2.200
Working Wayne chunk_2.300
Working Wayne chunk_2.400
Working Wayne chunk_2.500
Working Wayne chunk_2.600
Working Wayne chunk_2.700
Working Wayne chunk_2.800
Working Wayne chunk_2.900
building chunk_2 dataframe...
(999, 7)
Memory (After): [209.3359375] MB
chunk_2 took: 418.79
Memory (Before): [209.3359375] MB
Working Wayne chunk_3.0
Working Wayne chunk_3.100
Working Wayne chunk_3.200
Working Wayne chunk_3.300
Working W

In [8]:
# Find the Wayne images that have no row in any saved feature CSV, so they
# can be sent to the API again.
ALL_DIR = "/Volumes/Cooper_TB_Drive/research/readmissions/image_files/Wayne/"
FEAT_DIR = "/Volumes/Cooper_TB_Drive/research/readmissions/google_features/"

# every image path already recorded in the 'file' column of a feature CSV
all_feats = []

for file in os.listdir(FEAT_DIR):
    if '.csv' in file:
        open_path = f"{FEAT_DIR}{file}"

        tmp = pd.read_csv(open_path, usecols=['file'])
        all_feats.extend(tmp['file'].tolist())

# a set makes each membership test below constant-time instead of a scan of
# the whole list for every image
processed = set(all_feats)

# total rows vs. distinct files: the two numbers differ if a file was
# recorded more than once
print(len(all_feats))
print(len(processed))

recalls = []

for file in os.listdir(ALL_DIR):
    if '.png' in file:
        # must be built the same way as the stored 'file' values to compare equal
        full_path = f"{ALL_DIR}{file}"

        if full_path not in processed:
            recalls.append(full_path)


print(len(os.listdir(ALL_DIR)))
print(len(recalls))


13654
13654
7695
5


In [9]:
# Re-request every image listed in `recalls` and save the rows to a separate
# RECALLS file next to the per-chunk feature CSVs.
SAVEPATH = "/Volumes/Cooper_TB_Drive/research/readmissions/google_features/"
new_rows = []

for file in recalls:
    response = get_response(file, features)
    # response_formatter does not store the chunk label, so pass a fixed tag
    # instead of relying on a loop variable left over from another cell
    row = response_formatter(response, file, COUNTY, 'RECALLS')
    new_rows.append(row)

if new_rows:
    df = pd.DataFrame(new_rows)
    # name the file after COUNTY so it matches the county written into the rows
    f_name = f"{SAVEPATH}{COUNTY}_features_RECALLS.csv"
    df.to_csv(f_name, index=False)
else:
    # an empty frame would be saved without a 'file' column, which the
    # usecols=['file'] read of the feature directory requires
    print("no recalls to save")


In [10]:
# show everything currently stored in the feature output directory
feature_dir_entries = os.listdir("/Volumes/Cooper_TB_Drive/research/readmissions/google_features/")
for file in feature_dir_entries:
    print(file)

.DS_Store
first_pass
Washtenaw_features_chunk_1.csv
Washtenaw_features_chunk_2.csv
Washtenaw_features_chunk_3.csv
Washtenaw_features_chunk_4.csv
Washtenaw_features_chunk_5.csv
Washtenaw_features_chunk_6.csv
Washtenaw_features_RECALLS.csv
Wayne_features_chunk_1.csv
Wayne_features_chunk_2.csv
Wayne_features_chunk_3.csv
Wayne_features_chunk_4.csv
Wayne_features_chunk_5.csv
Wayne_features_chunk_6.csv
Wayne_features_chunk_7.csv
Wayne_features_chunk_8.csv
Wayne_features_RECALLS.csv


In [46]:
# f_test = '/Volumes/Cooper_TB_Drive/research/readmissions/google_features/Washtenaw_features_chunk_5.csv'

# test = pd.read_csv(f_test)
# print(test.shape)
# test.head()

In [47]:
# FEATURE_DIR = '/Volumes/Cooper_TB_Drive/research/readmissions/google_features/' 
# IMAGE_dir = "/Volumes/Cooper_TB_Drive/research/readmissions/image_files/Washtenaw/"

# in_feature_results = []


# for feature_file in os.listdir(FEATURE_DIR):
#     if 'csv' in feature_file:
#         f_path = f"{FEATURE_DIR}{feature_file}"
#         feat_df = pd.read_csv(f_path)

#         in_feature_results += feat_df['file'].to_list()

# print(len(in_feature_results))
# in_feature_results[:10]