# Project Shark


## Source Data

The main sources of data required for this project were available on the internet as downloadable Excel files.


## Clean Data

Some data cleaning took place in Excel, e.g. deleting unwanted columns and filtering rows by date or location.

Further data cleaning was undertaken in Python; examples are shown below.

In [None]:
# Load the pre-filtered shark incident workbook (an Excel .xls file)
filename = "Resource/Shark Data WA 2016-2021 (filtered).xls"
shark_data = pd.read_excel(io=filename)

In [None]:
# Correct known bad spellings of the state name: one typo and one entry
# carrying a trailing space.
area_corrections = {
    'Westerm Australia': 'Western Australia',
    'New South Wales ': 'New South Wales',
}
shark_data['Area'] = shark_data['Area'].replace(area_corrections)

In [None]:
# Keep only rows whose Year is 2020 or earlier (this drops the 2021 data)
up_to_2020 = shark_df['Year'] <= 2020
shark_df = shark_df[up_to_2020]

# Discard the columns that are not used in the analysis
unwanted_columns = [
    'pdf',
    'href formula',
    'href',
    'Case Number.1',
    'Case Number.2',
    'original order',
]
shark_df = shark_df.drop(columns=unwanted_columns)

In [None]:
# Restrict the data to rows whose Area is Western Australia
is_wa = shark_df['Area'] == "Western Australia"
shark_wa = shark_df[is_wa]


## Manipulate Data

Some examples are shown below.

In [None]:
# Split into regions based on latitude
def _region_for_latitude(lat):
    """Return "Northern", "Metro" or "Southern" for a latitude value."""
    # The two hard-coded latitudes are the cut-offs between the three bands;
    # anything that is not above the lower cut-off falls through to "Southern".
    if lat > -31.5471024260016:
        return "Northern"
    if lat > -32.6343558722623:
        return "Metro"
    return "Southern"


shark_wa_region["region"] = shark_wa_region.lat.apply(_region_for_latitude)

In [None]:
# Match each location to lat and long coordinates, using the Google Maps
# Places API ("find place from text" endpoint).

# Create columns to hold the coordinates. Rows that cannot be geocoded keep
# the empty-string placeholder.
shark_df['lat'] = ""
shark_df['lng'] = ""

# Build URL using the Google Maps API
base_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"

# Parameters shared by every request; the per-row search text is added in the loop
params = {"key": g_key, "inputtype": "textquery", "fields": "geometry"}

for index, row in shark_df.iterrows():
    # Search text for this row is "<Location>, <Area>"
    params['input'] = f'{row["Location"]}, {row["Area"]}'

    # Run request
    print(f"Retrieving Results for Index {index}: {row['Location']}.")
    try:
        # The timeout stops a single stalled connection from hanging the
        # whole loop (requests waits indefinitely when none is given).
        response = requests.get(base_url, params=params, timeout=10)
        results = response.json()
    except (requests.exceptions.RequestException, ValueError) as err:
        # Network failure, timeout, or a body that is not valid JSON:
        # skip this row and carry on with the rest of the data.
        print(f"Request failed ({err})... skipping.")
        continue

    # Extract lat/lng from the first candidate. Both values are read before
    # either column is written, so a row is never left half-filled.
    try:
        location = results['candidates'][0]['geometry']['location']
        found_lat, found_lng = location['lat'], location['lng']
    except (KeyError, IndexError):
        print("Missing field/result... skipping.")
        continue

    shark_df.loc[index, 'lat'] = found_lat
    shark_df.loc[index, 'lng'] = found_lng


In [None]:
# Weight each incident by fatality: 6 for fatal, 1 for not fatal
def _fatality_weight(fatal_flag):
    """Return 6 when the flag equals 'Y', otherwise 1."""
    if fatal_flag == 'Y':
        return 6
    return 1


shark_df['fatal_weight'] = shark_df['Fatal (Y/N)'].apply(_fatality_weight)

In [None]:
# Group the data into categories (here: year and region) and count the
# 'Case Number' entries in each group
by_year_and_region = shark_wa_region.groupby(['Year', 'region'])
result = by_year_and_region['Case Number'].count()

## Generate Output

Examples are shown below.

In [None]:
# Bar chart by year, grouped by region: unstacking moves the region level
# into columns, so each year gets one bar per region
counts_by_region = result.unstack()
counts_by_region.plot.bar()
plt.title("Shark attacks by region, WA")
plt.xlabel("")
# Keep the x tick labels horizontal
plt.xticks(rotation=0)

In [18]:
# Heat map (gmaps) of shark attack locations, weighted by fatality

# Figure layout: size, border and padding of the map widget
figure_layout = dict(
    width='800px',
    height='600px',
    border='1px solid black',
    padding='1px',
)

# Coordinates of each incident, and the fatality weight used for each point
locations = shark_df[['lat', 'lng']]
map_weight = shark_df['fatal_weight']

# Base map
fig = gmaps.figure(map_type='HYBRID', layout=figure_layout)

# Heat layer built from the coordinates and weights, then attached to the map
heat_layer = gmaps.heatmap_layer(
    locations,
    weights=map_weight,
    dissipating=False,
    max_intensity=10,
    point_radius=1,
    opacity=0.4,
)
fig.add_layer(heat_layer)

# Display figure
fig

Figure(layout=FigureLayout(border='1px solid black', height='600px', padding='1px', width='800px'))

In [None]:
# Stacked bar chart of attacks per year (WA plus other states) with labels
fig, ax = plt.subplots()
ax.set_title("Shark attacks by year", size=16)

# Series reused several times below
years = attack_by_year.index
wa_counts = attack_by_year['WA']
totals = attack_by_year['Total']

# WA forms the bottom segment; the other states are stacked on top of it
p1 = ax.bar(years, wa_counts, label='WA', color="slategray", width=0.7)
p2 = ax.bar(
    years,
    attack_by_year['Other'],
    bottom=wa_counts,
    label='Other States',
    color="lightsteelblue",
    width=0.7,
)

ax.legend()

# The y axis is hidden because every value is printed on the chart itself;
# the extra headroom leaves space for the totals above the bars.
ax.yaxis.set_visible(False)
ax.set_ylim(0, totals.max() + 6)

# Total for each year, drawn just above the top of the stack
y_offset = 2
for year, total in totals.items():
    ax.text(year, total + y_offset, total, ha='center', size=14)

# Segment values go inside the bars, hence the negative offset
y_offset = -2
for patch in ax.patches:
    # Horizontal centre of the rectangle: left edge plus half its width
    label_x = patch.get_x() + patch.get_width() / 2
    # Top edge of the rectangle, shifted down by the offset
    label_y = patch.get_height() + patch.get_y() + y_offset
    ax.text(
        label_x,
        label_y,
        round(patch.get_height()),
        ha='center',
        color='w',
        size=12,
    )

plt.savefig('Output/plot_attack_by_year.png', dpi=300, bbox_inches='tight')

In [None]:
# Waffle chart comparing pedestrian car accidents with shark attacks

from pywaffle import Waffle
import matplotlib.pyplot as plt

# Hard-coded counts for the two categories being compared
incidence = {'pedestrians hit by cars': 380, 'shark attacks': 3}

# Legend placement and styling
legend_options = {
    'bbox_to_anchor': (1.55, 0.15),
    'fontsize': 10,
    'frameon': True,
    'framealpha': 1.0,
}

fig = plt.figure(
    FigureClass=Waffle,
    figsize=(12, 5),
    values=incidence,
    icons=['car', 'swimmer'],
    icon_legend=True,
    legend=legend_options,
    rows=20,
)
plt.title("Number of pedestrians involved in car accidents, compared to shark attacks \n Perth metro region, 2016 - 2020")