In [13]:
#always importing these libraries, regardless of whether they get used in each notebook, for continuity
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [14]:
#loading the raw shark attack csv (read with the unicode_escape encoding so the file can be parsed)
raw_csv_path = 'data/attacks.csv'
attack = pd.read_csv(raw_csv_path, encoding='unicode_escape')

#trimming trailing whitespace from every column header
attack.columns = [column_name.rstrip() for column_name in attack.columns]

#lifting the row display cap so dataframes are shown in full
pd.set_option("display.max_rows", None)

In [15]:
#columns that do not feed into any of my hypotheses
unwanted_data = [
    'pdf',
    'href',
    'href formula',
    'original order',
    'Case Number.1',
    'Case Number.2',
    'Unnamed: 22',
    'Unnamed: 23',
    'Year',
    'Type',
    'Date',
    'Investigator or Source',
    'Injury',
    'Name',
]

#dropping those columns, then discarding every row that still has a missing value
attack = attack.drop(unwanted_data, axis=1).dropna()

In [16]:
#tallying the species values to spot outliers / inconsistent formatting
shark_counts = attack["Species"].value_counts()

#restricting the data to the three species I want to study
valid_species = ['White shark', 'Tiger shark', 'Bull shark']
is_valid_species = attack['Species'].isin(valid_species)
attack = attack[is_valid_species]

In [17]:
#setting the index as the case number
#(plain reassignment instead of inplace=True: same result, and it keeps the step chainable)
attack = attack.set_index('Case Number')

#establishing the unique values for case numbers to check for formatting errors or irrelevant data
case_values = attack.index.value_counts()

#removing rows whose case number starts with 'ND' (treated here as incorrectly formatted dates)
#.copy() turns the filtered result into an independent dataframe, so the column
#assignments made on `attack` in later cells do not raise SettingWithCopyWarning
attack = attack.loc[~attack.index.str.startswith('ND')].copy()


In [18]:
#establishing all the unique time values to clean up this column
time_values = attack['Time'].value_counts()

#reformatting descriptive / irregular entries to the 00h00 format
#(keys must match the raw values exactly, including the trailing spaces and quotes)
#NOTE(review): 'Afternoon' maps to 12h00, which is earlier than 'Early afternoon' (14h00) - confirm this is intended
attack['Time'] = attack['Time'].replace({
    'Afternoon': '12h00',
    'Morning': '09h00',
    'Early afternoon': '14h00',
    '08h00 / 09h30 ': '08h30',
    '>06h45': '06h45',
    'Shortly before 12h00': '12h00',
    'Late afternoon': '16h00',
    '"Just before 11h00"': '11h00',
    'Early morning ': '07h00',
})

#replacing h with : to create h:m format
#only an 'h' sitting between two digits is swapped, so any leftover word that contains
#an 'h' is not mangled; regex=True is spelled out because the default of this argument
#differs between pandas versions
attack['Time'] = attack['Time'].str.replace(r'(?<=\d)h(?=\d)', ':', regex=True)


In [19]:
#tallying attacks per country to decide how to split them geographically
country_values = attack['Country'].value_counts()

#labelling every row with a hemisphere: the listed countries are Northern, everything else is Southern
northern_countries = ['USA', 'CROATIA', 'CUBA', 'MEXICO', 'IRAQ', 'BAHAMAS', 'MALDIVES', 'COSTA RICA', 'ITALY']
is_northern = attack['Country'].isin(northern_countries)
attack['Hemisphere'] = np.where(is_northern, 'Northern', 'Southern')

#pulling the hemisphere-related columns into their own dataframe
hemisphere_columns = ['Country', 'Hemisphere', 'Fatal (Y/N)']
hem_attack = attack[hemisphere_columns]

In [25]:
#previewing the by-year view of the data
#(built directly from `attack` because `year_attack` is only assigned in a cell further
# down the notebook, so a bare `year_attack` here raises NameError on Restart & Run All;
# .head(10) keeps the preview short instead of printing every row)
attack.assign(year=attack.index.str[:4].astype(int))[['year', 'Fatal (Y/N)', 'Country']].head(10)


Unnamed: 0_level_0,year,Fatal (Y/N),Country
Case Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018.06.25,2018,N,USA
2018.06.03.a,2018,Y,BRAZIL
2018.04.28.b,2018,N,COSTA RICA
2018.04.23,2018,N,MALDIVES
2018.04.03,2018,N,SOUTH AFRICA
2017.12.31,2017,N,USA
2017.11.04,2017,Y,CUBA
2017.06.02,2017,N,BAHAMAS
2017.04.17.a,2017,Y,AUSTRALIA
2016.12.01,2016,N,AUSTRALIA


In [20]:
#deriving the year from the first four characters of the case number index
case_year_prefix = attack.index.str.slice(0, 4)
attack['year'] = case_year_prefix.astype(int)

#collecting the columns needed to study shark attacks by year in their own dataframe
year_columns = ['year', 'Fatal (Y/N)', 'Country']
year_attack = attack[year_columns]


In [21]:
#making a new dataframe to look at shark attacks in the us
#(single .loc selection plus .copy() gives an independent dataframe, so adding the
# coast column below does not raise SettingWithCopyWarning)
us_attacks = attack.loc[attack['Country'] == 'USA', ['Area', 'Fatal (Y/N)']].copy()

#looking at unique values of US area column
#(stored in a variable like the other exploration results, because a bare expression
# in the middle of a cell is evaluated and then discarded)
us_areas = us_attacks['Area'].unique()

#splitting the shark attacks into west/east coast
#every area that is not listed in west_coast is labelled East
west_coast = ['California', 'Oregon', 'Hawaii']
us_attacks['West/East'] = 'East'
us_attacks.loc[us_attacks['Area'].isin(west_coast), 'West/East'] = 'West'



In [22]:
# write each cleaned dataframe to its own csv in the data folder (index column included)
csv_exports = {
    "data/attack_clean.csv": attack,
    "data/attack_by_year.csv": year_attack,
    "data/us_attacks.csv": us_attacks,
    "data/attack_by_hemisphere.csv": hem_attack,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=True)