In [2]:
#always going to import these libraries, regardless of whether they get used in each notebook, for continuity
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load the raw shark-attack records from the data folder.
# An explicit single-byte encoding is used (presumably because the file is not
# valid UTF-8). Latin-1 applies the same byte-to-character mapping that the
# 'unicode_escape' codec does, but without that codec's extra step of
# interpreting backslash sequences (e.g. "\n", "\x41") found inside the data,
# which can silently alter field contents or raise on malformed escapes.
attack = pd.read_csv('data/attacks.csv', encoding='latin-1')

# Trim trailing whitespace from header names so columns can be referenced by
# their plain names in the cells below.
attack.columns = attack.columns.str.rstrip()

# Notebook-wide display option: do not truncate rows when rendering frames/series.
pd.set_option("display.max_rows", None)

In [4]:
# Columns that play no part in the hypotheses being explored.
unwanted_data = [
    'pdf',
    'href',
    'href formula',
    'original order',
    'Case Number.1',
    'Case Number.2',
    'Unnamed: 22',
    'Unnamed: 23',
    'Year',
    'Type',
    'Date',
    'Investigator or Source',
    'Injury',
    'Name',
]

# Discard those columns, then keep only the rows that have a value in every
# remaining column.
attack = attack.drop(columns=unwanted_data).dropna()

In [5]:
# Frequency of every recorded species label, kept for spotting outliers and
# inconsistently formatted entries.
shark_counts = attack["Species"].value_counts()

# Restrict the data to the species of interest. Matching is exact, so only
# rows whose label is precisely one of these strings are retained.
valid_species = ['White shark', 'Tiger shark', 'Bull shark']
is_valid_species = attack['Species'].isin(valid_species)
attack = attack[is_valid_species]

In [6]:
# Use the case number as the row index.
attack = attack.set_index('Case Number')

# Frequency of each case number, kept for inspecting the index values.
case_values = attack.index.value_counts()

# Drop records whose case number begins with 'ND', i.e. entries that do not
# carry a properly formatted date (presumably "no date" -- confirm against the data).
has_nd_prefix = attack.index.str.startswith('ND')
attack = attack.loc[~has_nd_prefix]


In [7]:
# Frequency of every recorded time value, kept for reviewing what still needs cleaning.
time_values = attack['Time'].value_counts()

# Free-text descriptions mapped to a representative clock time in 00h00 form.
# Keys must match the raw cell contents exactly, including the trailing
# spaces and embedded quotes present in some of them.
time_label_to_clock = {
    'Afternoon': '12h00',
    'Morning': '09h00',
    'Early afternoon': '14h00',
    '08h00 / 09h30 ': '08h30',
    '>06h45': '06h45',
    'Shortly before 12h00': '12h00',
    'Late afternoon': '16h00',
    '"Just before 11h00"': '11h00',
    'Early morning ': '07h00',
}

# Apply the mapping, then turn the 00h00 notation into 00:00.
# Only an "h" that sits between two digits is replaced: a blanket
# replace of every "h" would also rewrite letters inside any textual value
# that the mapping above does not cover (e.g. "Night" -> "Nig:t").
# `regex=True` is passed explicitly because the default differs between
# pandas versions.
attack['Time'] = (
    attack['Time']
    .replace(time_label_to_clock)
    .str.replace(r'(\d)h(\d)', r'\1:\2', regex=True)
)


In [8]:
# Frequency of each country, kept for deciding how to split the data geographically.
country_values = attack['Country'].value_counts()

# Countries treated as lying in the northern hemisphere.
northern_countries = [
    'USA',
    'CROATIA',
    'CUBA',
    'MEXICO',
    'IRAQ',
    'BAHAMAS',
    'MALDIVES',
    'COSTA RICA',
    'ITALY',
]
in_northern_list = attack['Country'].isin(northern_countries)

# Every row starts as 'Southern'; rows whose country is in the list above are
# then relabelled 'Northern'.
# NOTE(review): any country missing from the list is labelled 'Southern' by
# default -- re-check `country_values` if the input data changes.
attack['Hemisphere'] = 'Southern'
attack.loc[in_northern_list, 'Hemisphere'] = 'Northern'


In [9]:
# Write the cleaned frame to the data folder; the index (case number) is
# written out as the first column.
clean_csv_path = "data/attack_clean.csv"
attack.to_csv(clean_csv_path, index=True)


In [10]:
# Preview the cleaned frame. A bounded preview is shown because
# display.max_rows is set to None earlier in the notebook, so a bare `attack`
# would render every row into the saved output.
attack.head(10)

Unnamed: 0_level_0,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species,Hemisphere
Case Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018.06.25,USA,California,"Oceanside, San Diego County",Paddling,F,57,N,18:00,White shark,Northern
2018.06.03.a,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,M,18,Y,16:00,Tiger shark,Southern
2018.04.28.b,COSTA RICA,Cocos Island,Manuelita,Scuba diving,M,30,N,10:40,Tiger shark,Northern
2018.04.23,MALDIVES,Alifu Alifu Atoll,Madoogali,Fishing,M,32,N,21:50,Tiger shark,Northern
2018.04.03,SOUTH AFRICA,Eastern Cape Province,St. Francis Bay,Surfing,M,19,N,15:00,White shark,Southern
2017.12.31,USA,Hawaii,"Hultin's Beach, Oahu",Surfing,F,54,N,18:00,Tiger shark,Northern
2017.11.04,CUBA,Holquin Province,Guardalavaca Beach,Night bathing,M,22,Y,23:30,Tiger shark,Northern
2017.06.02,BAHAMAS,New Providence,Athol Island,Snorkeling,F,32,N,12:00,Tiger shark,Northern
2017.04.17.a,AUSTRALIA,Western Australia,Kelpies near Wylie Bay,Surfing,F,17,Y,16:00,White shark,Southern
2016.12.01,AUSTRALIA,New South Wales,Booti Booti National Park,Surfing,M,65,N,09:00,White shark,Southern
