In [1]:
# imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import scipy.stats as st
from scipy.stats import linregress
import json

In [2]:
# base URL of the records endpoint for the "global-shark-attack" dataset;
# limit=100 is part of the query string, and later requests append &offset=...
base_url = 'https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/global-shark-attack/records?limit=100'

In [3]:
# request base_url once; the cell displays the returned Response object
requests.get(base_url)

<Response [200]>

In [4]:
# read the total number of records so the paging loop knows where to stop
count_response = requests.get(base_url, timeout=30)  # never wait forever on a stalled connection
count_response.raise_for_status()  # surface HTTP errors here instead of as a KeyError below
total_count = count_response.json()['total_count']

In [5]:
# list that the paging loop extends with the 'results' of every request
all_of_the_data = []

In [None]:
# page through the dataset 100 records at a time and collect every record
# empty the list first so re-running this cell does not duplicate records
all_of_the_data.clear()
for currentOffset in range(0, total_count, 100):
  page_response = requests.get(f'{base_url}&offset={currentOffset}', timeout=30)
  page_response.raise_for_status()  # stop on an HTTP error instead of a KeyError on 'results'
  cur_data = page_response.json()['results']
  all_of_the_data.extend(cur_data)  # extend adds each record individually, keeping the list flat
  time.sleep(0.5)  # short pause between requests

In [None]:
# check number of data points total in list of offsets
len(all_of_the_data)

In [None]:
# create a dataframe from offset list
all_of_the_data_df = pd.DataFrame(all_of_the_data)
all_of_the_data_df

In [None]:
# Changed all None to nan and then dropped all nans
new_data_df = all_of_the_data_df.replace(to_replace='None', value=np.nan).dropna()
new_data_df

In [None]:
# checked data types
new_data_df.dtypes

In [None]:
# change age from object to number
new_data_df['age'] = pd.to_numeric(all_of_the_data_df['age'], errors='coerce')
new_data_df

In [None]:
# change age from number to integer
new_data_df['age'] = new_data_df['age'].astype('Int64')
new_data_df

In [None]:
# deleted unnecessary column
del new_data_df['investigator_or_source']

In [None]:
# deleted unnecessary column
del new_data_df['pdf']

In [None]:
# deleted unnecessary column
del new_data_df['href_formula']

In [None]:
# deleted unnecessary column
del new_data_df['href']

In [None]:
# deleted unnecessary column
del new_data_df['case_number']

In [None]:
# deleted unnecessary column
del new_data_df['case_number0']

In [None]:
# deleted unnecessary column
del new_data_df['original_order']

In [None]:
# reprint dataframe with removed columns
new_data_df

In [None]:
# split text to columns on species and rename columns 
fixed_data_df = new_data_df.join(
     new_data_df['species'].str.split(',', expand=True).rename(
         columns={0:'Species_1', 1:'Species_2', 2:'Species_3',3:'Species_4'}
     )
 )

In [None]:
fixed_data_df

In [None]:
# remove unnecessary column
del fixed_data_df['species']

In [None]:
fixed_data_df

In [None]:
# remove unnecesary column
del fixed_data_df['time']

In [None]:
fixed_data_df

In [None]:
# check species_1 column to see what unique values exist
species= fixed_data_df['Species_1'].unique()
species

In [None]:
# look at value counts of species 1 to determine which values need to be cleaned/removed
fixed_data_df['Species_1'].value_counts().head(25)

In [None]:
# clean species_1
fixed_data_df['Species_1'] = fixed_data_df['Species_1'].replace({"4' shark": 'Unknown',"4' to 5' shark": 'Unknown',"1.8 m [6'] shark":'Unknown',
"6' shark":'Unknown',"Invalid":'Unknown',"3' shark": 'Unknown'})

In [None]:
# create a list of sharks
top_sharks = fixed_data_df['Species_1'].value_counts().head(10).keys().to_list()

In [None]:
# make an empty dataframe and create df list
top_shark_df = fixed_data_df.head(0).copy()
dfs = []

In [None]:
# create list of top shark dataframes
for shark in top_sharks:
    dfs.append(fixed_data_df.loc[fixed_data_df['Species_1'] == shark])

In [None]:
# combine dataframes
for df in dfs:
   top_shark_df= pd.concat([top_shark_df,df])

In [None]:
# view top 20 shark species
top_shark_df['Species_1'].value_counts().head(20)

In [None]:
# filter to white sharks only
fixed_data_df['Species_1'].loc[fixed_data_df['Species_1'] =='White shark' ]

In [None]:
# create list of top sharks
fixed_data_df['Species_1'].value_counts().head().keys().to_list()

In [None]:
# reprint dataframe
fixed_data_df

In [None]:
# drop rows whose date cannot be parsed as YYYY-MM-DD. Filtering on the value
# replaces a hard-coded row position, which would point at a different record
# as soon as the API returns rows in another order or the dataset changes.
# NOTE(review): assumes the "incorrect date" is one that fails to parse - confirm.
parsed_dates = pd.to_datetime(fixed_data_df['date'], format='%Y-%m-%d', errors='coerce')
clean_data_df = fixed_data_df.loc[parsed_dates.notna()].copy()  # copy so later column assignments are safe
clean_data_df

In [None]:
#change date data type from object to date/time format
clean_data_df['date']=pd.to_datetime(clean_data_df['date'], format='%Y-%m-%d')

In [None]:
# verify datatypes to make sure date/time format is reflected
clean_data_df.dtypes

In [None]:
# reprint dataframe
clean_data_df

In [None]:
# look at count of species in list again
clean_data_df['Species_1'].value_counts().head(200)

In [None]:
# remove unnecessary column
del clean_data_df['Species_2']

In [None]:
# remove unnecessary column
del clean_data_df['Species_3']

In [None]:
# remove unnecessary column
del clean_data_df['Species_4']

In [None]:
# reprint dataframe
clean_data_df

In [None]:
# review tail of dataframe
clean_data_df['Species_1'].value_counts().tail(10)

In [None]:
# create white shark dataframe
white_shark_df = clean_data_df.loc[clean_data_df['Species_1'] =='White shark' ]
white_shark_df

In [None]:
# create tiger shark dataframe
tiger_shark_df = clean_data_df.loc[clean_data_df['Species_1'] =='Tiger shark' ]
tiger_shark_df

In [None]:
# create unknown shark dataframe
unknown_shark_df = clean_data_df.loc[clean_data_df['Species_1'] =='Unknown' ]
unknown_shark_df

In [None]:
# create bull shark dataframe
bull_shark_df = clean_data_df.loc[clean_data_df['Species_1'] =='Bull shark' ]
bull_shark_df

In [None]:
# create bronze whaler shark dataframe
bronze_whaler_shark_df = clean_data_df.loc[clean_data_df['Species_1'] =='Bronze whaler' ]
bronze_whaler_shark_df

In [None]:
# create blacktip shark dataframe
blacktip_shark_df = clean_data_df.loc[clean_data_df['Species_1'] =='Blacktip shark' ]
blacktip_shark_df

In [None]:
# create raggedtooh shark dataframe
raggedtooth_shark_df = clean_data_df.loc[clean_data_df['Species_1'] =='Raggedtooth shark' ]
raggedtooth_shark_df

In [None]:
#create nurse shark dataframe
nurse_shark_df = clean_data_df.loc[clean_data_df['Species_1'] =='Nurse shark' ]
nurse_shark_df

In [None]:
# combine different shark dataframes together
shark_clean_data_df = pd.concat([nurse_shark_df, raggedtooth_shark_df, blacktip_shark_df, bronze_whaler_shark_df,bull_shark_df, unknown_shark_df, 
tiger_shark_df,white_shark_df], ignore_index=True)

In [None]:
# print new clean shark dataframe
shark_clean_data_df

In [None]:
activity_df = shark_clean_data_df[['activity']]

In [None]:
activity_df

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Free diving for abalone (surfacing)": 'Spearfishing',
"Free diving / spearfishing, from paddleboard & floating on the surface": 'Spearfishing',
"Free diving / spearfishing (resting on the surface)":'Spearfishing',
"Free diving / spearfishing":'Spearfishing'})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Free diving with seals ": 'Free Diving',
"Free diving or wading back to shore": 'Free Diving',
"Free diving for abalone diving but at surface)": 'Free Diving',
"Free diving (but on surface)":'Free Diving',
"Windsurfing, but sitting on his board":'Windsurfing'})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Abalone diving using Hookah (near calving whales)": 'Other',
"Diving into water ": 'Free Diving',
"Picking opihi ": 'Other',
"Paddling rescue ski":'Other',
"14 m prawn trawler New Venture capsized  & sank in heavy seas Three people in the water":'Other'})

In [None]:
activity_df .value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Rolled off raft": 'Other',
"Riding floatation device": 'Other',
"Bathing in knee-deep water": 'Other',
"Dog paddling or standing":'Other',
"Playing in the water":'Other'})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Diving, but on the surface when bitten by the shark": 'Free Diving',
"Rowing": 'Other',
"Dress diving, filming shark & pulling it through the water for a motion picture scene": 'Other',
"Fishing - 'tag & release'":'Fishing',
"Spearfishing / free diving ":'Spearfishing'})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Scallop diving (using surface-supplied air & a POD)": 'Free Diving',
"Sailing on catamaran & fell into the water": 'Other',
"SUP Foil boarding": 'Other',
"Fishing / standing in waist deep water":'Fishing',
"Spearfishing, dived to pick up a float line":'Spearfishing'})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Diving into water": 'Free Diving',
"Scallop diving on hookah": 'Free Diving',
"Sailing": 'Other',
"Fishing for sharks":'Fishing',
"Spearfishing, holding mesh bag with speared fish":'Spearfishing'})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Diving for trochus": 'Free Diving',
"Scuba diving & spearfishing": 'Spearfishing',
"Diving from the lugger San, operated by the Protector of the Aborigines": 'Free Diving',
"Fishing from a kayak":'Fishing',
"Diving for Abalone":'Free Diving'})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Spearfishing, pulled shark’s tail": 'Spearfishing',
"Spearfishing, had fish on his spear": 'Spearfishing',
"Splashing": 'Free Diving',
"Dived overboard to check propeller of US Navy motor torpedo boat":'Free Diving',
"Standing / Snorkeling":'Snorkeling'})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Spearfishing / free diving": 'Spearfishing',
"Spearfishing with Scuba gear": 'Spearfishing',
"Standing in waist-deep water, helping his father tend a gill net containing dead fish": 'Other',
"Surf fishing in waist-deep water ":'Fishing',
"Kite Boarding":'Other'})

In [None]:
activity_df.value_counts().tail(60)

In [None]:
# map the remaining long-tail activity labels onto the broad categories.
# Body-boarding variants go straight to "Body Boarding", the spelling every
# other mapping cell uses, so the category is not split in two by letter case.
activity_df = activity_df.replace({
"Cleaning hull of ship": 'Other',
"Surf fishing in waist-deep water": 'Fishing',
"Diving for abalone (Hookah, submerged)": 'Free Diving',
"Diving for abalone (Scuba)":'Free Diving',
"Diving for abaone (Scuba, but at surface)":'Free Diving',
"Swimming / Body Surfing":"Swimming",
"Swimming near pier":"Swimming",
"Body-boarding":"Body Boarding",
"Swimming breast stoke":"Swimming",
"Casting a net":"Other",
"Swimming /  boogie boarding":"Swimming",
"Surfing with dolphins":"Surfing",
"Surfing, paddling seawards":"Surfing",
"Surfing, lying on surfboard":"Surfing",
"Surfing, fell off surfboard & stepped on the shark.":"Surfing",
"Swimming or Snorkeling":"Swimming",
"Swimming underwater from crayfish cage to a fishing bait":"Swimming",
"Swimming with dolphins":"Swimming",
"Swimming with other crew near wharf":"Swimming",
"Body surfing / treading water":"Body Boarding",
"Swimming, naked":"Swimming",
"Swimming, poaching perlemoen":"Swimming",
"Swimming. Passer-by, Len Bedford, heard him shriek , saw shark leap from the water & swimmer disappeared":"Swimming",
"Teasing a shark":"Other",
"Thrashing the water / imitating the shark victim from Jaws":"Other",
"Body surfing & treading water":"Body Boarding",
"Body boarding or surfing":"Body Boarding",
"Wading / fishing & carrying a bag of fish":"Other",
"Wading in school of baitfish":"Other",
"Body Surfing":"Body Boarding",
"Walking on reef":"Other",
"Water-skiing":"Other",
"Surfing, but treading water":"Surfing",
"Killing a shark":"Other",
"Surfing on air mattress":"Other",
"Cleaning fish":"Other",
"Kite surfing":"Other",
"Kiteboarding":"Other",
"Lifesaving drill":"Other",
"Lifesaving exhibition":"Other",
"Lying on surfboard":"Other",
"Lying prone in 2' of water":"Other",
"Night bathing":"Other",
"Kayak Fishing":"Fishing",
"Hand feeding sharks":"Other",
"Paddling":"Other",
"Paddling on surfboard":"Other",
"Picking opihi":"Other",
"Free diving, Spearfishing":"Spearfishing",
"Free diving with seals":"Free Diving",
"Free diving & spearfishing (ascending)":"Spearfishing",
"Free diving & spearfishing":"Spearfishing",
"Floating on his back":"Other",
"Floating in inflatable pool ring": "Other",
"Scuba diving (submerged riding a scooter)":"Free Diving",
"Shark fishing": "Fishing",
"Fishing with hand net in 2' of water": "Fishing",
"Spearfishing (Free diving)": "Spearfishing"
})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({
"Scuba diving":"Free Diving",
"Diving":"Free Diving",
"Body surfing":"Body Boarding",
"Treading water":"Swimming",
"Free diving":"Free Diving",
"Boogie boarding":"Body Boarding",
"Surf-skiing":"Other",
"Surf skiing":"Other",
"Walking":"Other",
"Free diving for abalone":"Free Diving",
"Body Boarding":"Body Boarding",
"Windsurfing":"Windsurfing",
"Paddle boarding":"Other",
"Stand-Up Paddleboarding":"Other",
"Swimming / snorkeling":"Swimming",
"Spearfishing / Scuba diving (at surface)":"Spearfishing",
"Swimming from the New Venture":"Swimming",
"Swimming, ducking for shells in water 0.9 m deep":"Swimming",
"Bathing":"Other",
"Spearfishing / Scuba diving":"Spearfishing",
"Spearfishing (free diving)":"Spearfishing",
"Scuba diving (but on surface)":"Free Diving",
"Scuba Diving":"Free Diving",
"Playing in the surf":"Other",
"Playing":"Other",
"Kite Surfing":"Other",
"Kayaking / Fishing":"Kayaking",
"Floating":"Other",
"Diving for abalone (Hookah)":"Free Diving",
"Diving for abalone":"Free Diving",
"Boogie Boarding":"Body Boarding",
"SUP":"Other",          
"Collecting fish by lamplight in gully":"Fishing"               
})

In [None]:
activity_df.value_counts()

In [None]:
activity_df = activity_df = activity_df.replace({"Body boarding":"Body Boarding"})

In [None]:
activity_df.value_counts()

In [None]:
# activity categories to chart, in display order
activity_list = ['Surfing','Swimming','Spearfishing','Other','Free Diving','Body Boarding','Fishing','Snorkeling','Standing',
'Wading','Kayaking','Windsurfing']
# count the attacks per category from the cleaned data instead of hard-coding
# numbers copied from an earlier output, which go stale whenever the data changes;
# a category with no rows is charted as 0
activity_counts = activity_df['activity'].value_counts()
attack_number_list = activity_counts.reindex(activity_list, fill_value=0).tolist()

In [None]:
# bar chart of the number of attacks per activity, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.bar(activity_list, attack_number_list)
ax.set_xticks(activity_list)
plt.setp(ax.get_xticklabels(), rotation="vertical")  # vertical labels so the category names do not overlap
ax.set_xlabel("Activity")
ax.set_ylabel("Number of Shark Attacks")
ax.set_title("Number of Shark Attacks by Activity")
fig.tight_layout()

In [None]:
activity_type_df = shark_clean_data_df[['type','activity']]
activity_type_df = activity_type_df.replace(to_replace='Watercraft', value=np.nan).dropna()
activity_type_df.value_counts()

In [None]:
activity_type_df = activity_type_df.replace(to_replace='Invalid', value=np.nan).dropna()
activity_type_df

In [None]:
# Surfing deep dive: share of each attack type among rows whose activity is exactly 'Surfing'
revised_activity_type_df = activity_type_df.set_index(['activity'])
# the list selector keeps the result a Series even when only one row matches
value_counts_df = revised_activity_type_df.loc[['Surfing'], 'type'].value_counts()
# label each wedge with the type it actually counts; hard-coded labels would be
# wrong if the order differed and would fail if a third type appeared
index = value_counts_df.index.tolist()
ax = value_counts_df.plot(kind='pie', labels=index, autopct="%1.1f%%", startangle=200)
# clear the y-label after plotting: the pie plot sets its own y-label when it
# draws, so clearing it beforehand would not stick
ax.set_ylabel("")
ax.set_title('Surfing Deep Dive: Provoked v. Unprovoked')
ax.axis("equal")
ax.legend()

In [None]:
# count how often each activity label occurs in activity_df
value_counts = activity_df.value_counts()
# wrap the counts in a dataframe with a single "Number of Attacks" column
value_counts_df = pd.DataFrame({"Number of Attacks":value_counts})