# Scraping Satellite Images for Model 2

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
#from google.colab import files
import io
import seaborn as sns
import urllib, os
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Setting parameters: bounding box of the study area and grid resolution, all in degrees
lat_min, lat_max = 51.257, 51.719
lon_min, lon_max = -0.542, 0.291
grid_size = 0.0005

# Number of grid lines along each axis, derived from the bounding box and grid_size
# (925 latitudes x 1667 longitudes for the values above) instead of hand-counted constants
n_lats = int(round((lat_max - lat_min) / grid_size)) + 1
n_lons = int(round((lon_max - lon_min) / grid_size)) + 1
lats = np.linspace(lat_min, lat_max, n_lats)
lons = np.linspace(lon_min, lon_max, n_lons)

# Round to 4 dp to strip floating-point noise from linspace; adding 0.0 turns a
# possible -0.0 into 0.0 so that a grid square label can never read "-0.0"
lats_4dp = np.round(lats, 4) + 0.0
lons_4dp = np.round(lons, 4) + 0.0

# One row per grid square, latitude-major (every longitude for the first latitude,
# then every longitude for the second latitude, and so on)
coords = pd.DataFrame({
    'lat_4dp': np.repeat(lats_4dp, n_lons),
    'long_4dp': np.tile(lons_4dp, n_lats),
})

# Grid square name is "lat,long"; labels are built once per axis value and then
# combined in the same latitude-major order as the rows above
lat_labels = [str(v) for v in lats_4dp.tolist()]
lon_labels = [str(v) for v in lons_4dp.tolist()]
coords['grid_square'] = [lat + "," + lon for lat in lat_labels for lon in lon_labels]

In [4]:
# Total number of grid squares (rows in the coords dataframe)
coords.shape[0]

1541975

Each grid square spans 0.0005° in both directions, which around London corresponds to squares roughly 56m high (latitude) and 35m wide (longitude).

# Importing and mapping the data

In [5]:
# Load the merged London accident dataset and preview the first rows
accidents_csv_path = 'data/accidents/London_accidents_merged.csv'
london_accidents_merged = pd.read_csv(accidents_csv_path)
london_accidents_merged.head(5)

Unnamed: 0,Accident_Index,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,1st_Road_Class,Road_Type,...,LSOA,population_per_hectare,bicycle_aadf,motorbike_aadf,car_aadf,bus_aadf,light_goods_vehicle_aadf,heavy_goods_vehicle_aadf,Road,RCat
0,201301BS70003,-0.171402,51.486361,Serious,2,1,2013-01-02,Wednesday,A,Single carriageway,...,E01002844,110.8,1634.4,860.4,14888.0,1139.8,2297.0,352.0,A3217,PA
1,201301BS70005,-0.173356,51.495115,Slight,1,2,2013-01-04,Friday,A,Single carriageway,...,E01002821,74.6,559.6,1516.0,28505.6,1396.2,3868.6,1003.0,A4,PA
2,201301BS70006,-0.210767,51.518353,Slight,1,1,2013-01-07,Monday,B,Single carriageway,...,E01002878,133.4,2.6,3898.2,63274.8,763.4,15253.6,3185.8,A40,PA
3,201301BS70007,-0.209675,51.516808,Slight,2,1,2013-01-10,Thursday,B,Single carriageway,...,E01002831,179.2,2.6,3898.2,63274.8,763.4,15253.6,3185.8,A40,PA
4,201301BS70009,-0.194332,51.492922,Slight,2,1,2013-01-04,Friday,A,One way street,...,E01002851,272.3,869.2,1229.8,20478.6,897.2,4951.6,1251.4,A3220,PA


In [6]:
# Number of accidents (rows) in the merged dataset
london_accidents_merged.shape[0]

147089

In [7]:
# Print the column names, non-null counts and dtypes of the accident dataset
london_accidents_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147089 entries, 0 to 147088
Data columns (total 35 columns):
Accident_Index              147089 non-null object
Longitude                   147089 non-null float64
Latitude                    147089 non-null float64
Accident_Severity           147089 non-null object
Number_of_Vehicles          147089 non-null int64
Number_of_Casualties        147089 non-null int64
Date                        147089 non-null object
Day_of_Week                 147089 non-null object
1st_Road_Class              147089 non-null object
Road_Type                   147089 non-null object
Speed_limit                 147089 non-null float64
Junction_Detail             147089 non-null object
2nd_Road_Class              147089 non-null object
Light_Conditions            147089 non-null object
Weather_Conditions          147089 non-null object
Road_Surface_Conditions     147089 non-null object
Urban_or_Rural_Area         147089 non-null object
Hour                 

In [8]:
def myround(x, base=.0005):
    """Snap ``x`` to the nearest multiple of ``base``.

    ``x`` is divided by ``base``, passed to the built-in ``round`` and scaled
    back up by ``base``. It therefore accepts anything that supports division
    and ``round`` (the notebook calls it on pandas Series). The result can
    carry floating-point noise from the final multiplication.
    """
    n_steps = round(x / base)
    return base * n_steps

In [9]:
# Adding rounded lat and long columns and a grid square column to the London accident dataset.
# Coordinates are snapped to the 0.0005-degree grid with myround and then rounded to 4 dp so the
# *_4dp columns hold clean 4-decimal values (myround's multiplication can leave float noise).
london_accidents_merged['lat_4dp'] = myround(london_accidents_merged['Latitude']).round(4)
# Adding 0.0 turns -0.0 into 0.0 so a longitude label never reads "-0.0". The earlier
# `.replace(-0, 0)` only worked incidentally, because the literal -0 is just the integer 0.
london_accidents_merged['long_4dp'] = myround(london_accidents_merged['Longitude']).round(4) + 0.0
# Grid square name is "lat,long", matching the labels used for the full grid
london_accidents_merged['grid_square'] = (
    london_accidents_merged['lat_4dp'].map(str) + "," + london_accidents_merged['long_4dp'].map(str)
)

In [10]:
# Creating a list of all squares
all_squares = coords['grid_square']

# Creating a list of danger squares (each grid square with an accident, listed once).
# drop_duplicates() returns a new Series, so the accident dataframe's own column is never
# modified in place
danger_squares = london_accidents_merged['grid_square'].drop_duplicates()

# Creating a list of safe squares (all_squares minus danger_squares). Filtering keeps the
# grid order of all_squares, so the list is identical on every run (a set difference is not)
safe_squares = all_squares[~all_squares.isin(danger_squares)].tolist()

In [11]:
# Number of distinct grid squares that contain at least one accident
len(danger_squares)

69757

In [12]:
# Number of grid squares left after removing the danger squares
len(safe_squares)

1472218

In [13]:
# Sanity check: danger squares plus safe squares should account for every grid square,
# so the displayed difference is expected to be 0
n_all_squares = len(coords['grid_square'])
n_all_squares - (len(danger_squares) + len(safe_squares))

0

In [14]:
# Turn the danger squares into a plain Python list of "lat,long" labels
danger_squares = [square for square in danger_squares]

In [15]:
# Draw 5000 safe squares without replacement. A fixed seed, applied to the sorted labels so
# the result does not depend on the order of safe_squares, makes the download list reproducible.
safe_squares_rng = np.random.RandomState(42)
safe_squares_download = safe_squares_rng.choice(sorted(safe_squares), size=5000, replace=False).tolist()

In [16]:
def GetImage(coord,SaveLoc):
    """Download the satellite image centred on ``coord`` and save it in ``SaveLoc``.

    Parameters
    ----------
    coord : str
        "lat,long" string. It is appended to the Static Maps URL as the
        ``center`` value and is also used as the file name (``<coord>.jpg``).
    SaveLoc : str
        Directory the image is written to; it is created if it does not exist.

    Notes
    -----
    Relies on a module-level ``key`` string of the form ``"&key=<API key>"``
    being defined before the function is called.
    The URL requests ``format=png`` while the file is saved with a ``.jpg``
    extension; the extension is kept so existing file names do not change.
    Any exception raised by ``urllib.request.urlretrieve`` propagates to the caller.
    """
    # `import urllib` on its own does not guarantee the `request` submodule is loaded
    import urllib.request

    base = "https://maps.googleapis.com/maps/api/staticmap?size=375x600&zoom=20&scale=1&maptype=satellite&format=png&visual_refresh=true&center="
    MyUrl = base + coord + key
    file = coord + ".jpg"
    # urlretrieve does not create missing directories
    os.makedirs(SaveLoc, exist_ok=True)
    urllib.request.urlretrieve(MyUrl, os.path.join(SaveLoc,file))

In [24]:
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so the credential is
# never stored in the notebook (a key that was previously committed should be revoked)
key = "&key=" + os.environ["GOOGLE_MAPS_API_KEY"]
# Image root can be overridden with SATELLITE_IMAGE_ROOT; the default is the original local folder
myloc = os.path.join(
    os.environ.get("SATELLITE_IMAGE_ROOT", r"/Users/sabatinochen/Google Drive/satellite-images"),
    "model2", "safe")

# Download one satellite image per sampled safe square
for i in safe_squares_download:
    GetImage(coord=i,SaveLoc=myloc)

# Extracting Satellite Images for Model 3

We take a random sample of 10,000 accidents (one per grid square) from the merged London dataset to obtain the structured and image data with a danger classification for model 3. The dataset already contains the features from the population and traffic datasets that will be the primary features for the structured-data part of the mixed-data model.

In [17]:
# Grid square label of every row in the London accident dataset
danger_squares_london = london_accidents_merged['grid_square']

In [18]:
# Convert to a plain list and preview the first five labels
danger_squares_london = [square for square in danger_squares_london]
danger_squares_london[0:5]

['51.4865,-0.1715',
 '51.495,-0.1735',
 '51.5185,-0.211',
 '51.517,-0.2095',
 '51.493,-0.1945']

In [26]:
#pd.Series(danger_squares_london).to_csv('danger_squares_london.csv', index=False)

In [19]:
# Draw 20000 accidents at random (fixed seed), then keep only one accident per grid square;
# the displayed value is the number of rows that remain
sample_20000_raw = london_accidents_merged.sample(n=20000, replace=False, random_state=42)
sample_20000_london = sample_20000_raw.drop_duplicates(subset=['grid_square'])
sample_20000_london.shape[0]

16218

In [20]:
# Now take a random sample from above to get the final sample of 10000 accidents
# (sample_20000_london holds one accident per grid square, so every sampled square is distinct)
danger_london_accidents_sample = sample_20000_london.sample(n=10000, random_state=0, replace=False)
# Guard the one-accident-per-square invariant that the image download relies on
assert danger_london_accidents_sample['grid_square'].is_unique
print(len(danger_london_accidents_sample))
danger_london_accidents_sample.head()

10000


Unnamed: 0,Accident_Index,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,1st_Road_Class,Road_Type,...,motorbike_aadf,car_aadf,bus_aadf,light_goods_vehicle_aadf,heavy_goods_vehicle_aadf,Road,RCat,lat_4dp,long_4dp,grid_square
56476,2014450020376,-0.415736,51.338105,Slight,2,3,2014-11-29,Saturday,A,Dual carriageway,...,1048.4,62371.6,184.6,11286.0,2170.6,A3,TA,51.338,-0.4155,"51.338,-0.4155"
35882,201401KD50368,0.212878,51.592871,Slight,2,1,2014-07-26,Saturday,A,Roundabout,...,160.6,16451.0,363.4,1797.8,483.6,A118,PA,51.593,0.213,"51.593,0.213"
16414,201301TX20055,-0.404606,51.477806,Slight,2,1,2013-02-09,Saturday,A,Dual carriageway,...,294.0,29316.2,706.4,3331.6,911.0,A4,PA,51.478,-0.4045,"51.478,-0.4045"
124923,2017010036052,-0.161815,51.466753,Slight,2,1,2017-04-10,Monday,Unclassified,Single carriageway,...,954.0,12590.6,95.4,2325.0,341.4,A3220,PA,51.467,-0.162,"51.467,-0.162"
73930,201501SX20346,-0.231378,51.570964,Slight,2,1,2015-04-14,Tuesday,A,Roundabout,...,220.5,9388.0,389.0,2554.5,383.0,A5,PA,51.571,-0.2315,"51.571,-0.2315"


In [21]:
# List the columns available in the sampled accident dataframe
danger_london_accidents_sample.columns

Index(['Accident_Index', 'Longitude', 'Latitude', 'Accident_Severity',
       'Number_of_Vehicles', 'Number_of_Casualties', 'Date', 'Day_of_Week',
       '1st_Road_Class', 'Road_Type', 'Speed_limit', 'Junction_Detail',
       '2nd_Road_Class', 'Light_Conditions', 'Weather_Conditions',
       'Road_Surface_Conditions', 'Urban_or_Rural_Area', 'Hour',
       'Two_Hour_Groupings', 'Time_of_Day', 'Was_Daylight', 'Was_Bad_Weather',
       'Was_Road_Dry', 'log_Number_of_Casualties', 'log_Number_of_Vehicles',
       'LSOA', 'population_per_hectare', 'bicycle_aadf', 'motorbike_aadf',
       'car_aadf', 'bus_aadf', 'light_goods_vehicle_aadf',
       'heavy_goods_vehicle_aadf', 'Road', 'RCat', 'lat_4dp', 'long_4dp',
       'grid_square'],
      dtype='object')

In [22]:
# Create model 3 dataset just consisting of traffic and population features
# Keep location features to get safe grid squares later
model3_columns = ['Latitude', 'Longitude', 'population_per_hectare', 'bicycle_aadf',
                  'motorbike_aadf', 'car_aadf', 'bus_aadf', 'light_goods_vehicle_aadf',
                  'heavy_goods_vehicle_aadf', 'lat_4dp', 'long_4dp', 'grid_square']
# .copy() makes this an independent dataframe, so adding or dropping columns on it later does
# not raise SettingWithCopyWarning (warnings are silenced at the top of this notebook)
model3_danger_dataset = danger_london_accidents_sample[model3_columns].copy()

In [23]:
# Combine traffic features for motor vehicles like we did in model 1 notebook
to_sum = ['motorbike_aadf', 'car_aadf', 'bus_aadf', 'light_goods_vehicle_aadf', 'heavy_goods_vehicle_aadf']
# Build a new dataframe (total column appended last, individual columns dropped) rather than
# mutating in place, which avoids assigning into a slice of the sampled accident dataframe
model3_danger_dataset = (
    model3_danger_dataset
    .assign(motor_vehicle_aadf=model3_danger_dataset[to_sum].sum(axis=1))
    .drop(to_sum, axis=1)
)

In [24]:
# Preview the first five rows after combining the motor vehicle counts
model3_danger_dataset.head(5)

Unnamed: 0,Latitude,Longitude,population_per_hectare,bicycle_aadf,lat_4dp,long_4dp,grid_square,motor_vehicle_aadf
56476,51.338105,-0.415736,3.8,2.6,51.338,-0.4155,"51.338,-0.4155",77061.2
35882,51.592871,0.212878,59.6,146.6,51.593,0.213,"51.593,0.213",19256.4
16414,51.477806,-0.404606,78.8,112.8,51.478,-0.4045,"51.478,-0.4045",34559.2
124923,51.466753,-0.161815,142.2,1333.6,51.467,-0.162,"51.467,-0.162",16306.4
73930,51.570964,-0.231378,23.6,95.0,51.571,-0.2315,"51.571,-0.2315",12935.0


In [25]:
# Keep the final feature columns, rename the coordinate columns for merging with the safe
# dataset, and add a `safe` column of all 0's because this is the danger dataset
model3_danger_dataset = (
    model3_danger_dataset[['grid_square', 'Latitude', 'Longitude', 'population_per_hectare',
                           'bicycle_aadf', 'motor_vehicle_aadf']]
    .rename(columns={'Latitude': 'latitude', 'Longitude': 'longitude'})
    .assign(safe=0)
)

In [26]:
# Preview the first five rows of the final danger dataset
model3_danger_dataset.head(5)

Unnamed: 0,grid_square,latitude,longitude,population_per_hectare,bicycle_aadf,motor_vehicle_aadf,safe
56476,"51.338,-0.4155",51.338105,-0.415736,3.8,2.6,77061.2,0
35882,"51.593,0.213",51.592871,0.212878,59.6,146.6,19256.4,0
16414,"51.478,-0.4045",51.477806,-0.404606,78.8,112.8,34559.2,0
124923,"51.467,-0.162",51.466753,-0.161815,142.2,1333.6,16306.4,0
73930,"51.571,-0.2315",51.570964,-0.231378,23.6,95.0,12935.0,0


In [44]:
# Write the danger dataset to CSV (row index included) for use in the model 3 notebook
model3_danger_csv = 'model3_danger_dataset.csv'
model3_danger_dataset.to_csv(model3_danger_csv)

In [27]:
# Creating a list of danger squares.
# drop_duplicates() returns a new Series, so model3_danger_dataset's own grid_square column
# is never modified in place
model3_danger_squares = model3_danger_dataset['grid_square'].drop_duplicates()

In [28]:
# Number of distinct danger squares in the model 3 dataset
len(model3_danger_squares)

10000

In [29]:
# Preview the first five danger square labels
model3_danger_squares.head(5)

56476     51.338,-0.4155
35882       51.593,0.213
16414     51.478,-0.4045
124923     51.467,-0.162
73930     51.571,-0.2315
Name: grid_square, dtype: object

In [30]:
# Convert to a plain list and preview the first five labels
model3_danger_squares = [square for square in model3_danger_squares]
model3_danger_squares[0:5]

['51.338,-0.4155',
 '51.593,0.213',
 '51.478,-0.4045',
 '51.467,-0.162',
 '51.571,-0.2315']

In [31]:
# Write the danger square labels to CSV without the index column
model3_danger_squares_series = pd.Series(model3_danger_squares)
model3_danger_squares_series.to_csv('danger_squares_10000_list.csv', index=False)

In [51]:
# scrape danger square images for model 3
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so the credential is
# never stored in the notebook (a key that was previously committed should be revoked)
key = "&key=" + os.environ["GOOGLE_MAPS_API_KEY"]
# Image root can be overridden with SATELLITE_IMAGE_ROOT; the default is the original local folder
myloc = os.path.join(
    os.environ.get("SATELLITE_IMAGE_ROOT", r"/Users/sabatinochen/Google Drive/satellite-images"),
    "model3", "danger")

# Download one satellite image per danger square
for i in model3_danger_squares:
    GetImage(coord=i,SaveLoc=myloc)