In [1]:
import os
import pandas as pd
import multiprocessing
import sys
# Switch the working directory to the project checkout before the project
# imports below.
# NOTE(review): presumably required so that `config` (and later `src`) resolve
# as top-level packages — confirm. The path is absolute (a SageMaker home
# directory), so the notebook will not run unchanged on another machine.
os.chdir("/home/ec2-user/SageMaker/foodi_ml/")
from config import configs as conf

In [2]:
from importlib import reload

In [3]:
import src.utils.utils_aws as utils_aws

In [16]:
# Re-import the project modules so that edits made to them on disk are picked
# up without restarting the kernel (development convenience).
# Objects instantiated before a reload keep referencing the old class
# definitions, so the cells below must be re-run after executing this one.
utils_aws = reload(utils_aws)
conf = reload(conf)

# AWS

In [5]:
# AWS classes
# All three helpers come from src.utils.utils_aws; the bucket name is read
# from conf.S3_BUCKET.
aws_con = utils_aws.AWSConnector(conf.S3_BUCKET)  # connector built from the bucket name
awstools = utils_aws.AWSTools(aws_con)  # wraps the connector; used below to list cities and download the CSVs
aws_basics = utils_aws.AWSBasics(conf.S3_BUCKET)  # NOTE(review): not used in the visible cells

# List cities

In [6]:
# Build the list of cities from the S3 key prefix conf.S3K_imgs and print it.
# NOTE(review): presumably one entry per city folder under that prefix —
# confirm in AWSTools.create_list_cities.
l_cities = awstools.create_list_cities(conf.S3K_imgs)
print(l_cities)

['BCN', 'CUG']


# Download samples CSV

In [7]:
# Download the samples CSV (conf.S3_file_samples) of every city in `l_cities`
# into the local folder conf.pth_dwn_samples.
# `l_cities` is rebound to the returned list; judging by the logged output, a
# city whose CSV key is not found in S3 is removed from it.
l_cities = awstools.downloading_city_csv(
    l_cities=l_cities,
    s3_key_prefix=conf.S3K_imgs,
    csv_name=conf.S3_file_samples,
    local_folder=conf.pth_dwn_samples,
    verbose=True
)

Key artifacts/002/BCN/training_data.csv not found in S3
Removing from l_cities city BCN
City CUG correctly downloaded to /home/ec2-user/SageMaker/tmp/training_data/CUG.csv


# Read and concatenate CSV

In [8]:
# Collect the city CSVs present in the download folder, keeping only entries
# whose name ends with ".csv".
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the order in which the files are read and concatenated reproducible
# across runs and machines.
l_csv = sorted(
    file_name
    for file_name in os.listdir(conf.pth_dwn_samples)
    if file_name.endswith(".csv")
)

In [9]:
# Concatenation to a single dataframe
# `samples` starts as an empty DataFrame, which is also its value when there
# is no city CSV to read.
samples = pd.DataFrame()

In [10]:
# Read every city CSV, tag its rows with the city code and concatenate once.
# Collecting the frames in a list first avoids re-copying `samples` on every
# iteration, and makes the cell idempotent: re-running it rebuilds `samples`
# from scratch instead of appending the same rows a second time.
l_df_city = []
for city_csv_file in l_csv:
    path_csv = os.path.join(conf.pth_dwn_samples, city_csv_file)
    df_city_csv = pd.read_csv(path_csv)
    # City code = file name up to the ".csv" suffix (e.g. "CUG.csv" -> "CUG"),
    # stored as the first column.
    df_city_csv.insert(loc=0, column="city", value=city_csv_file.split(".csv")[0])
    l_df_city.append(df_city_csv)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no CSV was found. Per-file row indices are kept as-is (no reindexing).
samples = pd.concat(l_df_city) if l_df_city else pd.DataFrame()

# Download images specified in samples CSV

In [17]:
# Parallel S3 image downloader configured with conf.pth_dwn_pictures as its
# base path.
# The original note states that instantiating it creates the folder the images
# are dumped into — NOTE(review): not verifiable from this notebook, confirm in
# utils_aws.ImageDownloaderParallelS3.
img_dwn_paral = utils_aws.ImageDownloaderParallelS3(
    base_path=conf.pth_dwn_pictures
)

In [18]:
# Create iterable of jobs
# Built from the concatenated `samples` frame; each element is later handed to
# img_dwn_paral.download_images by pool.map.
# NOTE(review): what one job covers (an image, a row, a chunk) is decided inside
# create_jobs — confirm in utils_aws.
jobs = img_dwn_paral.create_jobs(samples)

In [19]:
# Spin up one worker process per CPU core. Each worker calls
# img_dwn_paral.initialize(conf.S3_BUCKET) once at start-up, before it starts
# taking jobs.
n_workers = multiprocessing.cpu_count()
pool = multiprocessing.Pool(
    processes=n_workers,
    initializer=img_dwn_paral.initialize,
    initargs=(conf.S3_BUCKET,),
)

In [None]:
%%time
pool.map(img_dwn_paral.download_images, jobs)
pool.close()
pool.join()

# [ARCHIVE]

## Image Downloader S3 without parallelization

In [18]:
"""
image_downloader = utils_aws.ImageDownloaderS3(
    bucket=conf.S3_BUCKET, 
    base_path=conf.pth_dwn_pictures)

city = l_cities[0]

samples_city = samples[samples["city"] == city].copy()

%%time
samples_city = image_downloader.download_imgs_in_disk(samples_city)
"""


'\nimage_downloader = utils_aws.ImageDownloaderS3(\n    bucket=conf.S3_BUCKET, \n    base_path=conf.pth_dwn_pictures)\n\ncity = l_cities[0]\n\nsamples_city = samples[samples["city"] == city].copy()\n\n%%time\nsamples_city = image_downloader.download_imgs_in_disk(samples_city)\n'