## Satellite Image Collection
- Summary: In this part, I'm going to download corresponding satellite images of each census tract through mapbox API. For each census tract, we randomly sample 10 images with each image covering an area of 50m * 50m. In the follow step, we are going to use these collected image to calculate the average house density of each census tract.
- Parallel strategy: To parallel this collection process, we use AWS lambda and step function to subdivide the work into 50 batches. And then we store the collected image into s3 bucket for following steps.

1. randomly sample 10 points within each census tract

In [2]:
import geopandas as gpd

In [3]:
# read the shape file of census tract
shp = gpd.read_file("data/boundaries/Boundaries - Census Tracts - 2010/geo_export_e157c189-cb89-47dd-97ed-50066e7f7496.shp")

In [4]:
shp.head(5)

Unnamed: 0,commarea,commarea_n,countyfp10,geoid10,name10,namelsad10,notes,statefp10,tractce10,geometry
0,44,44.0,31,17031842400,8424,Census Tract 8424,,17,842400,"POLYGON ((-87.62405 41.73022, -87.62405 41.730..."
1,59,59.0,31,17031840300,8403,Census Tract 8403,,17,840300,"POLYGON ((-87.68608 41.82296, -87.68607 41.823..."
2,34,34.0,31,17031841100,8411,Census Tract 8411,,17,841100,"POLYGON ((-87.62935 41.85280, -87.62934 41.852..."
3,31,31.0,31,17031841200,8412,Census Tract 8412,,17,841200,"POLYGON ((-87.68813 41.85569, -87.68816 41.856..."
4,32,32.0,31,17031839000,8390,Census Tract 8390,,17,839000,"POLYGON ((-87.63312 41.87449, -87.63306 41.874..."


In [5]:
# funtion of geting random points within an area
import random
from shapely.geometry import Point

def random_point_within_tract(polygon, num_points):
    minx, miny, maxx, maxy = polygon.bounds
    random_points = []
    while len(random_points) < num_points:
        point = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if polygon.contains(point):
            random_points.append((point.x, point.y))
    return random_points

In [8]:
# generate point_dict that mappes the points with census tract
point_dict = {}
for index, row in shp.iterrows():
    polygon = row['geometry']
    tract_id = row['geoid10']
    random_points = random_point_within_tract(polygon, 10)
    point_dict[tract_id] = random_points

In [10]:
point_dict['17031839000']

[(-87.63207951764853, 41.8674468882685),
 (-87.63520021621466, 41.870293603007184),
 (-87.63223583146228, 41.8740454775901),
 (-87.63240682838804, 41.87138106013676),
 (-87.63431964303369, 41.87202384395054),
 (-87.62784487680285, 41.86887605123645),
 (-87.62879137792132, 41.86833447796616),
 (-87.63160305511266, 41.87423099069787),
 (-87.63316905782128, 41.87037330477799),
 (-87.63322848633837, 41.868337438672825)]

In [18]:
point_pair_list = list(point_dict.items())
print(point_pair_list[0])

('17031842400', [(-87.63123144402078, 41.74966849626187), (-87.63191075061613, 41.735700144424484), (-87.62819662536639, 41.74628108249361), (-87.62778939521435, 41.73772859363239), (-87.62951971266116, 41.738454740450464), (-87.62554271848447, 41.73877773993926), (-87.63128479628858, 41.746245501958725), (-87.62467137316379, 41.74657195405231), (-87.63164422809653, 41.74198321475861), (-87.62522677156231, 41.74475000308879)])


In [19]:
# length of the point_pair_list
len(point_pair_list)

801

2. Create lambda function of collecting corresponding satellite images given the coordintates of the points

In [11]:
import boto3

In [39]:
# create lambda function
aws_lambda = boto3.client('lambda')
iam_client = boto3.client('iam')
role = iam_client.get_role(RoleName='LabRole')

# Open our Zipped directory
with open('final-scraping-package.zip', 'rb') as f:
    lambda_zip = f.read()

try:
    # If function hasn't yet been created, create it
    response = aws_lambda.create_function(
        FunctionName='finalscraping',
        Runtime='python3.9',
        Role=role['Role']['Arn'],
        Handler='lambda_function.lambda_handler',
        Code=dict(ZipFile=lambda_zip),
        Timeout=800
    )
except aws_lambda.exceptions.ResourceConflictException:
    # If function already exists, update it based on zip
    # file contents
    response = aws_lambda.update_function_code(
    FunctionName='finalscraping',
    ZipFile=lambda_zip
    )

lambda_arn = response['FunctionArn']

In [40]:
lambda_arn

'arn:aws:lambda:us-east-1:990765961068:function:finalscraping'

In [41]:
sfn = boto3.client('stepfunctions')

def make_def(lambda_arn):
    definition = {
      "Comment": "My State Machine",
      "StartAt": "Map",
      "States": {
        "Map": {
          "Type": "Map",
          "End": True,
          "Iterator": {
            "StartAt": "Lambda Invoke",
            "States": {
              "Lambda Invoke": {
                "Type": "Task",
                "Resource": "arn:aws:states:::lambda:invoke",
                "OutputPath": "$.Payload",
                "Parameters": {
                  "Payload.$": "$",
                  "FunctionName": lambda_arn
                },
                "Retry": [
                  {
                    "ErrorEquals": [
                      "Lambda.ServiceException",
                      "Lambda.AWSLambdaException",
                      "Lambda.SdkClientException",
                      "Lambda.TooManyRequestsException",
                      "States.TaskFailed"
                    ],
                    "IntervalSeconds": 2,
                    "MaxAttempts": 6,
                    "BackoffRate": 2
                  }
                ],
                "End": True
              }
            }
          }
        }
      }
    }
    return definition

sf_def = make_def(lambda_arn)

In [42]:
# Create step function
import json
try:
    response = sfn.create_state_machine(
        name='final',
        definition=json.dumps(sf_def),
        roleArn=role['Role']['Arn'],
        type='EXPRESS'
    )
except sfn.exceptions.StateMachineAlreadyExists:
    response = sfn.list_state_machines()
    state_machine_arn = [sm['stateMachineArn'] 
                         for sm in response['stateMachines'] 
                         if sm['name'] == 'final'][0]
    response = sfn.update_state_machine(
        stateMachineArn=state_machine_arn,
        definition=json.dumps(sf_def),
        roleArn=role['Role']['Arn']
    )

In [21]:
n = len(point_pair_list) // 50 
n

16

In [31]:
# subdivide the work into 50 equal batches
n = len(point_pair_list) // 50 
batches = [{'point_map': point_pair_list[i:i + n]} for i in range(0, len(point_pair_list), n)]

print(len(batches), len(batches[0]['point_map']))

51 16


In [38]:
batches[0]['point_map']

[('17031842400',
  [(-87.63123144402078, 41.74966849626187),
   (-87.63191075061613, 41.735700144424484),
   (-87.62819662536639, 41.74628108249361),
   (-87.62778939521435, 41.73772859363239),
   (-87.62951971266116, 41.738454740450464),
   (-87.62554271848447, 41.73877773993926),
   (-87.63128479628858, 41.746245501958725),
   (-87.62467137316379, 41.74657195405231),
   (-87.63164422809653, 41.74198321475861),
   (-87.62522677156231, 41.74475000308879)]),
 ('17031840300',
  [(-87.68680065363297, 41.831536409743514),
   (-87.68348014608517, 41.83484014997596),
   (-87.68365854120312, 41.830744959021686),
   (-87.67879457649504, 41.831151669350476),
   (-87.68564892937115, 41.83345747669575),
   (-87.6825666152255, 41.83538925652648),
   (-87.6755234624851, 41.8354968041975),
   (-87.67692553205215, 41.83062375578313),
   (-87.67829644164131, 41.83095800160686),
   (-87.67547550816002, 41.834781442448765)]),
 ('17031841100',
  [(-87.6390604955867, 41.853664572584776),
   (-87.631722306

In [None]:
import time
# Get arn for Step Function state machine
response = sfn.list_state_machines()
state_machine_arn = [sm['stateMachineArn']
                     for sm in response['stateMachines'] 
                     if sm['name'] == 'final'][0]

# Spread batches across Lambda workers
start = time.time()
response = sfn.start_sync_execution(
    stateMachineArn=state_machine_arn,
    name='final',
    input=json.dumps(batches)
)
time_elapsed = time.time() - start

In [44]:
import time
# Get arn for Step Function state machine
response = sfn.list_state_machines()
state_machine_arn = [sm['stateMachineArn']
                     for sm in response['stateMachines'] 
                     if sm['name'] == 'final'][0]

# Spread batches across Lambda workers
start = time.time()
response = sfn.start_sync_execution(
    stateMachineArn=state_machine_arn,
    name='final',
    input=json.dumps(batches)
)
time_elapsed = time.time() - start

In [46]:
print(time_elapsed)

74.87236475944519
