In [1]:
# Dependencies
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import requests
import json
import gmaps
from config import g_key
gmaps.configure(api_key=g_key)

In [12]:
# load the filtered shark incident spreadsheet (an Excel .xls file, hence read_excel)
filename = "Resource/Shark Data WA 2016-2021 (filtered).xls"
shark_data = pd.read_excel(filename)
# preview the first rows
shark_data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2021.07.19,19-Jul-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Rottnest Island,Surfing,male,M,...,N,12h30,"White shark, 3m","B. Myatt & S. De Marchi, GSAF",2021.07.19-Rottnest.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.07.19,2021.07.19,6668
1,2021.07.05,05-Jul-2021,2021.0,Unprovoked,AUSTRALIA,New South Wales,Crescent Head,Surfing,Joe Hoffman,M,...,N,16h30,,"S. De Marchi, M. Michaelson, B. Myatt, GSAF an...",2021.07.05-Hoffman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.07.05,2021.07.05,6661
2,2021.06.23,23-Jun-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Gum Tree Bay,Surfing,Alex Dodds,M,...,N,14h00,,"B. Myatt, S. De Marchi & S. Curatololo-Wageman...",2021.06.23-Dodds.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.06.23,2021.06.23,6655
3,2021.06.11,11-Jun-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Five Fingers Reef,Snorkeling,Jackson Bartlett,M,...,N,11h00,"Bronze whaler shark, 2 m","B. Myatt, GSAF",2021.06.11-Bartlett.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.06.11,2021.06.11,6649
4,2021.05.21,21-May-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Quondong Beach,Spearfishing,Brett Highlands,M,...,N,11h20,,"B. Myatt, GSAF",2021.05.21-Highlands.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.05.21,2021.05.21,6646


In [13]:
# summarise column dtypes and non-null counts to see where values are missing
shark_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             163 non-null    object 
 1   Date                    163 non-null    object 
 2   Year                    162 non-null    float64
 3   Type                    163 non-null    object 
 4   Country                 163 non-null    object 
 5   Area                    163 non-null    object 
 6   Location                161 non-null    object 
 7   Activity                156 non-null    object 
 8   Name                    160 non-null    object 
 9   Sex                     158 non-null    object 
 10  Age                     118 non-null    object 
 11  Injury                  162 non-null    object 
 12  Fatal (Y/N)             155 non-null    object 
 13  Time                    137 non-null    object 
 14  Species                 106 non-null    ob

In [15]:
# list the distinct Area values to check for spelling / spacing variants
shark_data['Area'].unique()

array(['Western Australia', 'New South Wales', 'Queensland',
       'South Australia', 'Victoria', 'Tasmania', 'Northern Territory ',
       'New South Wales ', 'Westerm Australia'], dtype=object)

In [16]:
# clean up State names: strip stray leading/trailing whitespace first (this
# covers 'New South Wales ' and 'Northern Territory '), then fix the one
# misspelling that stripping cannot repair
shark_data['Area'] = (
    shark_data['Area']
    .str.strip()
    .replace({'Westerm Australia': 'Western Australia'})
)

In [17]:
# drop the Name and Investigator/Source columns before further processing
columns_to_drop = ['Name', 'Investigator or Source']
shark_df = shark_data.drop(columns=columns_to_drop)


In [18]:
# number of distinct Location values (missing values are not counted by default)
shark_df['Location'].nunique()

148

In [19]:
# match location to lat and long coordinates

# create columns to hold data; NaN (rather than "") keeps both columns numeric
# and marks the rows that have not been geocoded
shark_df['lat'] = np.nan
shark_df['lng'] = np.nan

# Build URL using the Google Maps API
base_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"

params = {"key": g_key, "inputtype": "textquery", "fields":"geometry"}

# search text -> (lat, lng), or None when the API returned no usable result,
# so a place name that appears in several rows is only requested once
geocode_cache = {}

for index, row in shark_df.iterrows():
    # a row without a Location would otherwise be sent as the literal text "nan"
    if pd.isna(row["Location"]):
        print(f"No location recorded for Index {index}... skipping.")
        continue

    # get extra parameters
    search_text = f'{row["Location"]}, {row["Area"]}'

    if search_text not in geocode_cache:
        params['input'] = search_text

        # Run request
        print(f"Retrieving Results for Index {index}: {row['Location']}.")
        try:
            response = requests.get(base_url, params=params, timeout=10)
            results = response.json()
            place = results['candidates'][0]['geometry']['location']
            # read both values before storing so a row is never half-filled
            geocode_cache[search_text] = (place['lat'], place['lng'])
        except (KeyError, IndexError):
            print("Missing field/result... skipping.")
            geocode_cache[search_text] = None
        except (requests.RequestException, ValueError) as error:
            # network failure or non-JSON reply: report it and keep going; the
            # failure is not cached, so a later duplicate row can retry
            print(f"Request failed ({error})... skipping.")
            continue

    # Extract lat/lng
    coordinates = geocode_cache[search_text]
    if coordinates is not None:
        shark_df.loc[index, 'lat'] = coordinates[0]
        shark_df.loc[index, 'lng'] = coordinates[1]

    


Retrieving Results for Index 0: Rottnest Island.
Retrieving Results for Index 1: Crescent Head.
Retrieving Results for Index 2: Gum Tree Bay.
Retrieving Results for Index 3: Five Fingers Reef.
Retrieving Results for Index 4: Quondong Beach.
Retrieving Results for Index 5: Turncurry Beach.
Retrieving Results for Index 6: Karratha.
Retrieving Results for Index 7: Near Coral Bay.
Retrieving Results for Index 8: Yallingup.
Retrieving Results for Index 9: Waterford West.
Retrieving Results for Index 10: Gleneg.
Retrieving Results for Index 11: Melaleuca Beach, Cowaramup Bay, Gracetown, .
Retrieving Results for Index 12: Lake Macquarie.
Retrieving Results for Index 13: Port MacDonnell.
Retrieving Results for Index 14: Blackwall Reach.
Retrieving Results for Index 15: 13th Beach.
Retrieving Results for Index 16: Cable Beach.
Retrieving Results for Index 17: D’Estrees Bay near Kangaroo Island.
Retrieving Results for Index 18: Cable Beach.
Retrieving Results for Index 19: Town Beach, Port Macqu

In [20]:
# preview the frame, including the new lat / lng columns
shark_df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,...,Time,Species,pdf,href formula,href,Case Number.1,Case Number.2,original order,lat,lng
0,2021.07.19,19-Jul-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Rottnest Island,Surfing,M,,...,12h30,"White shark, 3m",2021.07.19-Rottnest.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.07.19,2021.07.19,6668,-32.0064,115.507
1,2021.07.05,05-Jul-2021,2021.0,Unprovoked,AUSTRALIA,New South Wales,Crescent Head,Surfing,M,25.0,...,16h30,,2021.07.05-Hoffman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.07.05,2021.07.05,6661,-31.177,152.911
2,2021.06.23,23-Jun-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Gum Tree Bay,Surfing,M,25.0,...,14h00,,2021.06.23-Dodds.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.06.23,2021.06.23,6655,-29.7844,114.962
3,2021.06.11,11-Jun-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Five Fingers Reef,Snorkeling,M,10.0,...,11h00,"Bronze whaler shark, 2 m",2021.06.11-Bartlett.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.06.11,2021.06.11,6649,-23.1797,113.766
4,2021.05.21,21-May-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Quondong Beach,Spearfishing,M,48.0,...,11h20,,2021.05.21-Highlands.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.05.21,2021.05.21,6646,-17.5804,122.157


In [21]:
# find records where location is not given
no_loc = shark_df.loc[shark_df['Location'].isna()]
print(no_loc)

# none of the records printed above is in WA (check the Area column of the
# printout), so dropping them will not affect the study too much.
# .copy() makes the result an independent frame, so later column assignments
# do not write to a slice of the old one (SettingWithCopyWarning).
shark_df = shark_df.loc[shark_df['Location'].notna()].copy()





      Case Number                  Date    Year      Type    Country  \
68   2018.12.15.b           15-Dec-2018  2018.0  Provoked  AUSTRALIA   
129  2017.01.08.R  Reported 08-Jan-2017     NaN   Invalid  AUSTRALIA   

                Area Location      Activity Sex  Age  ...   Time  \
68   New South Wales      NaN           NaN    M  48  ...  11h00   
129       Queensland      NaN  Spearfishing    M  35  ...    NaN   

            Species                            pdf  \
68   Wobbegong shark      2018.12.15-Wobbegong.pdf   
129       Bull shark  2017.01.08.R-KerryDaniel.pdf   

                                          href formula  \
68   http://sharkattackfile.net/spreadsheets/pdf_di...   
129  http://sharkattackfile.net/spreadsheets/pdf_di...   

                                                  href Case Number.1  \
68   http://sharkattackfile.net/spreadsheets/pdf_di...  2018.12.15.b   
129  http://sharkattackfile.net/spreadsheets/pdf_di...  2017.01.08.R   

    Case Number.2 origi

In [22]:
# cast the geocoded coordinates to a numeric dtype
shark_df = shark_df.assign(
    lat=pd.to_numeric(shark_df['lat']),
    lng=pd.to_numeric(shark_df['lng']),
)

In [23]:
# tally the missing values in every column
empty = shark_df.isnull()
empty.sum(axis=0)

Case Number        0
Date               0
Year               0
Type               0
Country            0
Area               0
Location           0
Activity           6
Sex                5
Age               45
Injury             1
Fatal (Y/N)        7
Time              25
Species           57
pdf                0
href formula       0
href               0
Case Number.1      0
Case Number.2      0
original order     0
lat                3
lng                3
dtype: int64

In [24]:
# rows for which the geocoding step left the latitude empty
shark_df_empty_loc = shark_df[shark_df['lat'].isnull()]
shark_df_empty_loc

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,...,Time,Species,pdf,href formula,href,Case Number.1,Case Number.2,original order,lat,lng
21,2020.10.09,09-Oct-2020,2020.0,Unprovoked,AUSTRALIA,Western Australia,Kelp Beds Beach (Kelpies),Surfing,M,52.0,...,10h45,4m shark,2020.10.09-Sharpe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2020.10.09,2020.10.09,6586,,
131,2016.12.24,24-Dec-2016,2016.0,Unprovoked,AUSTRALIA,Western Australia,"Bundegi Sanctuary Zone, Ningaloo",Snorkeling,F,,...,Morning,a small shark,2016.12.24-Ningaloo.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.12.24,2016.12.24,6130,,
147,2016.07.20,20-Jul-2016,2016.0,Provoked,AUSTRALIA,Queensland,"20 k off The Spit, off the Gold Coast",Fishing,M,31.0,...,After noon,"reef shark, 1m",2016.07.20-Burck.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.07.20,2016.07.20,6079,,


In [25]:
# these rows are important to the findings, so will need to find location data for them

# Build URL using the Google Maps API

base_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"

params = {"key": g_key, "inputtype": "textquery", "fields":"geometry"}

# case number -> hand-picked search text to use for that record
manual_queries = {
    # Sam's Creek Area, Point Samson WA
    "2017.09.10.b": "Point Samson WA",
    # bundegi sanctuary zone
    "2016.12.24": "bundegi sanctuary zone",
    # The Spit Gold Coast, Seaworld Drive, Main Beach QLD
    "2016.07.20": "The Spit Gold Coast, Seaworld Drive, Main Beach QLD",
    # Kelp Beds surf break, Wylie Bay, Esperance
    "2020.10.09": "Wylie Bay, Esperance",
}


def find_place_coordinates(query):
    """Look up `query` with the Find Place endpoint.

    Parameters
    ----------
    query : str
        Free-text place description sent as the ``input`` parameter.

    Returns
    -------
    tuple or None
        ``(lat, lng)`` of the first candidate, or ``None`` when the response
        contains no candidates.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    # build a fresh parameter dict so the shared `params` is not mutated
    response = requests.get(base_url, params={**params, "input": query}, timeout=10)
    response.raise_for_status()
    candidates = response.json().get('candidates', [])
    if not candidates:
        return None
    place = candidates[0]['geometry']['location']
    return place['lat'], place['lng']


for case_number, query in manual_queries.items():
    coordinates = find_place_coordinates(query)
    if coordinates is None:
        # report instead of raising IndexError, so the remaining cases still run
        print(f"No result for case {case_number} ({query!r}); coordinates left unchanged.")
        continue
    case_rows = shark_df['Case Number'] == case_number
    shark_df.loc[case_rows, 'lat'] = coordinates[0]
    shark_df.loc[case_rows, 'lng'] = coordinates[1]




In [27]:
# export the geocoded data to CSV in the working directory
# NOTE(review): the DataFrame index is written as well (to_csv default), so a
# reader gets an extra unnamed first column unless it passes index_col=0 — confirm this is intended
shark_df.to_csv("shark_geo.csv")