## Step 1: Importing

#### Import all necessary Python libraries, read in data

In [109]:
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline


# Load the raw shark-attack CSV (path is relative to the notebook's working directory).
shark_raw = pd.read_csv(filepath_or_buffer='data/Shark Attack Data.csv')


In [110]:
# Preview the first five rows of the raw data.
shark_raw.head(n=5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2017.06.11,11-Jun-17,2017.0,Unprovoked,AUSTRALIA,Western Australia,"Point Casuarina, Bunbury",Body boarding,Paul Goff,M,...,N,08h30,"White shark, 4 m","WA Today, 6/11/2017",2017.06.11-Goff.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.06.11,2017.06.11,6095.0
1,2017.06.10.b,10-Jun-17,2017.0,Unprovoked,AUSTRALIA,Victoria,"Flinders, Mornington Penisula",Surfing,female,F,...,N,15h45,7 gill shark,,2017.06.10.b-Flinders.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.06.10.b,2017.06.10.b,6094.0
2,2017.06.10.a,10-Jun-17,2017.0,Unprovoked,USA,Florida,"Ponce Inlet, Volusia County",Surfing,Bryan Brock,M,...,N,10h00,,"Daytona Beach News-Journal, 6/10/2017",2017.06.10.a-Brock.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.06.10.a,2017.06.10.a,6093.0
3,2017.06.07.R,Reported 07-Jun-2017,2017.0,Unprovoked,UNITED KINGDOM,South Devon,Bantham Beach,Surfing,Rich Thomson,M,...,N,,"3m shark, probably a smooth hound","C. Moore, GSAF",2017.06.07.R-Thomson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.06.07.R,2017.06.07.R,6092.0
4,2017.06.04,04-Jun-17,2017.0,Unprovoked,USA,Florida,"Middle Sambo Reef off Boca Chica, Monroe County",Spearfishing,Parker Simpson,M,...,N,,8' shark,"Nine News, 6/7/2017",2017.06.04-Simpson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.06.04,2017.06.04,6091.0


## Step 2: Preparing

#### Remove unneeded variables

In [111]:
# Drop columns that are not used in the rest of the notebook; the result is a
# new frame, so shark_raw itself is left untouched.
shark_col_clean = shark_raw.drop(
    columns=[
        'Case Number',
        'Name',
        'Investigator or Source',
        'pdf',
        'href formula',
        'href',
        'Case Number.1',
        'Case Number.2',
        'original order',
    ]
)

#### Rename columns whose names contain spaces or special characters

In [112]:
# Give the awkwardly named columns short identifiers, modifying the frame in place.
# NOTE(review): the 'Species ' key deliberately includes a trailing space --
# presumably that is how the header appears in the CSV; confirm before "fixing" it.
shark_col_clean.rename(
    columns={
        'Fatal (Y/N)': 'Fatal',
        'Species ': 'Shark',
    },
    inplace=True,
)

#### Filter to only United States attacks

In [157]:
# Keep only the rows recorded for the USA, then discard the now-constant
# Country column.
shark_usa = (
    shark_col_clean
    .loc[shark_col_clean['Country'] == 'USA']
    .drop(columns=['Country'])
)
shark_usa.head(n=5)

Unnamed: 0,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal,Time,Shark
2,10-Jun-17,2017.0,Unprovoked,Florida,"Ponce Inlet, Volusia County",Surfing,M,19.0,Laceration to left foot,N,10h00,
4,04-Jun-17,2017.0,Unprovoked,Florida,"Middle Sambo Reef off Boca Chica, Monroe County",Spearfishing,M,,Laceration to shin,N,,8' shark
6,30-May-17,2017.0,Provoked,South Carolina,"Awendaw, Charleston County",Touching a shark,F,20.0,Right hand bitten by hooked shark PROVOKED INC...,N,,3' shark
7,28-May-17,2017.0,Unprovoked,Florida,Off Jupiter,Feeding sharks,M,,Lacerations to right arm,N,Morning,Tiger shark
12,03-May-17,2017.0,Invalid,California,"Sunset Beach, Orange County",Surfing,F,18.0,"Laceration to thigh, likely caused by surfboar...",N,14h30,Shark involvement highly doubtful


#### Identify missing data

In [158]:
# Count the missing values in each column of the USA subset.
print(shark_usa.isna().sum())

Date          0
Year          0
Type          2
Area          4
Location     47
Activity    129
Sex         103
Age         696
Injury        5
Fatal         4
Time        818
Shark       941
dtype: int64


In [160]:
# A missing shark species is acceptable, so label it 'Unknown' instead of
# dropping the row. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on the `shark_usa.Shark` accessor is a chained in-place
# update that is not guaranteed to reach the parent frame (it does not under
# pandas Copy-on-Write), which would silently drop every row without a species
# in the dropna() below.
shark_usa['Shark'] = shark_usa['Shark'].fillna('Unknown')

# Every remaining column must be present: drop rows with any other NaN.
shark_usa1 = shark_usa.dropna()

shark_usa1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1091 entries, 2 to 5763
Data columns (total 12 columns):
Date        1091 non-null object
Year        1091 non-null float64
Type        1091 non-null object
Area        1091 non-null object
Location    1091 non-null object
Activity    1091 non-null object
Sex         1091 non-null object
Age         1091 non-null object
Injury      1091 non-null object
Fatal       1091 non-null object
Time        1091 non-null object
Shark       1091 non-null object
dtypes: float64(1), object(11)
memory usage: 110.8+ KB


In [161]:
# Tally the distinct values of the Fatal flag to spot anything other than Y/N.
shark_usa1.Fatal.value_counts()

N          1026
Y            64
UNKNOWN       1
Name: Fatal, dtype: int64

In [162]:
# Retain only rows whose Fatal flag is exactly 'N' or 'Y'.
shark_usa2 = shark_usa1[shark_usa1['Fatal'].isin(['N', 'Y'])]

shark_usa2.Fatal.value_counts()

N    1026
Y      64
Name: Fatal, dtype: int64

In [201]:
# Tally the attack types present after the Fatal filter.
shark_usa2.Type.value_counts()

Unprovoked      980
Provoked         54
Invalid          46
Sea Disaster      6
Boat              4
Name: Type, dtype: int64

In [202]:
# Drop the 'Invalid' incidents. The mask is built from shark_usa2 itself: the
# previous version built it from shark_usa1, whose index does not match
# shark_usa2, so pandas had to re-align the boolean key. The explicit .copy()
# makes shark_usa3 an independent frame, so the column updates here and in
# later cells are not writes into a filtered slice.
shark_usa3 = shark_usa2[shark_usa2['Type'] != 'Invalid'].copy()

# Fold the 'Sea Disaster' and 'Boat' types into 'Unprovoked'. The result is
# assigned back rather than using replace(inplace=True) on the column
# accessor, which is a chained in-place update and not guaranteed to modify
# the frame.
shark_usa3['Type'] = shark_usa3['Type'].replace(['Sea Disaster', 'Boat'], 'Unprovoked')

shark_usa3['Type'].value_counts()

  """Entry point for launching an IPython kernel.


Unprovoked    990
Provoked       54
Name: Type, dtype: int64

In [203]:
# Preview the first five rows of the cleaned frame.
shark_usa3.head(n=5)

Unnamed: 0,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal,Time,Shark
2,10-Jun-17,2017.0,Unprovoked,Florida,"Ponce Inlet, Volusia County",Surfing,M,19,Laceration to left foot,N,10h00,Unknown
15,29-Apr-17,2017.0,Unprovoked,South Carolina,"Folly Beach, Charleston County",Surfing,F,33,Left foot bitten,N,11h00,Unknown
23,14-Apr-17,2017.0,Unprovoked,Hawaii,"Kekaha Beach, Kauai",Surfing,M,28,Lower right leg severely injured,N,09h00,"Tiger shark, 12'"
24,13-Apr-17,2017.0,Unprovoked,Florida,"Hanna Park, Jacksonville, Duval County",Surfing,M,17,Lacerations to right foot,N,13h30,Unknown
27,11-Apr-17,2017.0,Unprovoked,Florida,"Ormond Beach, Volusia County",Surfing,F,35,Calf bitten,N,16h00,Unknown


In [204]:
# Derive a coarse Activity_category from the free-text Activity column.
#
# Each rule is (category, keywords). A row receives the category when its
# Activity contains any of the keywords, ignoring case. Rules are applied in
# order and a later match overwrites an earlier one, so the ordering below is
# significant (e.g. an activity containing both 'surf' and 'board' ends up as
# 'Boarding'). Rows that match no keyword are left as NaN.
activity_rules = [
    ('Surfing', ['surf']),
    ('Swimming', ['swim', 'wading', 'float', 'stand']),
    ('Boarding', ['paddle', 'boogie', 'body', 'board']),
    ('Snorkeling/Diving', ['snork', 'scuba', 'diving']),
    ('Fishing', ['fishing']),
]

# shark_usa3 was produced by boolean-filtering another frame; take an explicit
# copy so that adding a column below is not a write into a filtered slice.
shark_usa3 = shark_usa3.copy()

for category, keywords in activity_rules:
    for keyword in keywords:
        keyword_mask = shark_usa3['Activity'].str.contains(keyword, case=False)
        shark_usa3.loc[keyword_mask, 'Activity_category'] = category

shark_usa3.head(40)

Unnamed: 0,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal,Time,Shark,Activity_category
2,10-Jun-17,2017.0,Unprovoked,Florida,"Ponce Inlet, Volusia County",Surfing,M,19,Laceration to left foot,N,10h00,Unknown,Surfing
15,29-Apr-17,2017.0,Unprovoked,South Carolina,"Folly Beach, Charleston County",Surfing,F,33,Left foot bitten,N,11h00,Unknown,Surfing
23,14-Apr-17,2017.0,Unprovoked,Hawaii,"Kekaha Beach, Kauai",Surfing,M,28,Lower right leg severely injured,N,09h00,"Tiger shark, 12'",Surfing
24,13-Apr-17,2017.0,Unprovoked,Florida,"Hanna Park, Jacksonville, Duval County",Surfing,M,17,Lacerations to right foot,N,13h30,Unknown,Surfing
27,11-Apr-17,2017.0,Unprovoked,Florida,"Ormond Beach, Volusia County",Surfing,F,35,Calf bitten,N,16h00,Unknown,Surfing
28,10-Apr-17,2017.0,Unprovoked,Florida,"Melbourne Beach, Brevard County",Paddle boarding,F,10,Laceration to calf,N,17h45,Unknown,Boarding
29,10-Apr-17,2017.0,Unprovoked,Florida,"Melbourne Beach, Brevard County",Swimming,F,21,Minor injury to hand,N,17h00,Unknown,Swimming
31,05-Apr-17,2017.0,Unprovoked,Florida,"New Smyrna Beach, Volusia County",Swimming,F,51,"Thigh nipped, minor injury",N,13h00,Unknown,Swimming
33,02-Apr-17,2017.0,Unprovoked,Florida,"Destin, Okaloosa County",Swimming,F,17,Abrasions to lower left leg & puncture wounds ...,N,15h00,5' shark,Swimming
34,27-Mar-17,2017.0,Unprovoked,Florida,"New Smyrna Beach, Volusia County",Surfing,M,58,Minor injury to left foot,N,10h00,Unknown,Surfing
