In [1]:
# importing pandas and numpy

import pandas as pd
import numpy as np

In [2]:
#export file

sharks = pd.read_csv('sharkoriginal.csv',engine="python")

#to have an overview of the data size

sharks.shape

(5992, 24)

In [3]:
#overview of the data

sharks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [4]:
#index information

sharks.index

RangeIndex(start=0, stop=5992, step=1)

In [5]:
#size of dataframe

sharks.size

143808

In [6]:
#number of rows

len(sharks)

5992

In [7]:
#creating a copy

csharks=sharks.copy()
csharks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [8]:
#check and show only columns with nulls 

null_columns=csharks.isnull().sum()
null_columns[null_columns > 0]



Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [9]:
# checked for irrelevant columns
#since there are 5992 rows, I assumed if 90% of the data has value null, it´s not worth to be analysed - so 5392
# it was proven possible to observe no more columns would drop, but I also decided to prove it empirically 

drop_cols = list(null_columns[null_columns > 5392].index)
csharks = csharks.drop(drop_cols, axis=1)

In [10]:
# observing the data, it seems Case number columns are the same
#confirming case number 1 column and case number 2 columns are the same through boolean map

print(csharks['Case Number.1']==csharks['Case Number.2'])

#count the rows to match the boolean map length, to confirm the results

print(sharks.shape[0])

#once confirmed, dropping one of the columns

csharks.drop(['Case Number.2'], axis=1, inplace=True)

0        True
1        True
2        True
3        True
4       False
        ...  
5987     True
5988     True
5989     True
5990     True
5991     True
Length: 5992, dtype: bool
5992


In [11]:
#if there are spaces in the columns, they won´t appear anymore and the columns can be filtered correctly

csharks.columns = csharks.columns.str.replace(' ', '')

In [12]:
# substituted the nulls by NA (as Non Available) for descriptive data


csharks[['Country', 'Area', 'Location','Activity','Name','Sex','Age','Injury','Time','Species','href']] = csharks[['Country', 'Area', 'Location','Activity','Name','Sex','Age','Injury','Time','Species','href']].fillna("NA")


In [13]:
#checking data for potential new index 

print(csharks['originalorder'].nunique())

print(csharks['CaseNumber'].nunique())

# another way to do it

print(csharks['originalorder'].is_unique)

print(csharks['CaseNumber'].is_unique)

#although it doesn´t seem unique, I assumed it was close to the total number and assumed original order as index to test


5988
5976
False
False


In [14]:
# change index to original order

csharks.set_index(['originalorder'], inplace=True)

In [15]:
#not in order, sort by the new index

csharks=csharks.sort_values(by='originalorder')

In [16]:
#after checking for irrelevant data given the analysis goals and not because of nulls or repetitions,deleting more columns
# deleted pdf and ref since both can be treaceable through case number
# deleted also name since we have gender and age
#deleted also CaseNumber.1 since there was still CaseNumber
#deleted also the exact location, since would take a long time to clean the data and the information about the area is already relevant
#left date and time because the goal is to check for seasonality

irrelevant_columns=['Name','pdf','hrefformula', 'href', 'CaseNumber.1','Location']
csharks=csharks.drop(irrelevant_columns, axis=1)

In [17]:
# checked for outliers, years in this case

descriptivecsharks = csharks.describe().transpose()

print(descriptivecsharks)

descriptivecsharks['IQR'] = descriptivecsharks['75%'] - descriptivecsharks['25%']
descriptivecsharks

outlier25=descriptivecsharks['25%']-(1.5*descriptivecsharks['IQR'])

outlier75=descriptivecsharks['75%']+(1.5*descriptivecsharks['IQR'])

print(outlier25)
print(outlier75)



       count         mean         std  min     25%     50%     75%     max
Year  5992.0  1925.204606  286.473712  0.0  1942.0  1975.0  2003.0  2016.0
Year    1850.5
dtype: float64
Year    2094.5
dtype: float64


In [18]:
#there are outliers if outlier<Year 1850 and outlier> 2094,5

csharks['Year'] = np.where(csharks.Year > 1850, csharks.Year, "Not relevant")

csharks

Unnamed: 0_level_0,CaseNumber,Date,Year,Type,Country,Area,Activity,Sex,Age,Injury,Fatal(Y/N),Time,Species,InvestigatororSource
originalorder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,ND.0001,1845-1853,Not relevant,Unprovoked,CEYLON (SRI LANKA),Eastern Province,Swimming,M,15,"FATAL. ""Shark bit him in half, carrying away t...",Y,,,S.W. Baker
3,ND.0002,1883-1889,Not relevant,Unprovoked,PANAMA,,,M,,FATAL,Y,,,"The Sun, 10/20/1938"
4,ND.0003,1900-1905,Not relevant,Unprovoked,USA,North Carolina,Swimming,M,,FATAL,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF"
5,ND.0004,Before 1903,Not relevant,Unprovoked,AUSTRALIA,Western Australia,Pearl diving,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, pp. 233-234"
6,ND.0005,Before 1903,Not relevant,Unprovoked,AUSTRALIA,Western Australia,Diving,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, p. 234"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5989,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Surfing,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016"
5990,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Surfing,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016"
5991,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,Surfing,M,43,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016"
5992,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,Surfing,M,36,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016"


In [19]:
#eliminating the Not relevant years

csharks.drop(csharks[csharks['Year'] == 'Not relevant'].index, inplace=True)

#confirm they are removed 
csharks.Year.value_counts()

#setting year as numeric

csharks['Year']=pd.to_numeric(csharks['Year']).astype(int)

#confirm it worked

csharks.dtypes


CaseNumber              object
Date                    object
Year                     int32
Type                    object
Country                 object
Area                    object
Activity                object
Sex                     object
Age                     object
Injury                  object
Fatal(Y/N)              object
Time                    object
Species                 object
InvestigatororSource    object
dtype: object

In [20]:
# hypothetically, i got an indication that only the last 20 years should matter
#i created a case my head of data decided that only the last 20 years would matter
# couldn´t substitute the rows with loc method, left for discussion

# csharks.loc[csharks['Year']<=2000]

csharks.drop(csharks[csharks['Year']<=2000].index, inplace=True)


In [21]:
#confirm no duplicates left

before=len(csharks)

csharks.drop_duplicates(inplace=True)

after = len(csharks)
print('Number of duplicate records dropped: ', str(before - after))



Number of duplicate records dropped:  0


In [22]:
#number of times words that contain Hawaii show up=True

csharks['Area'].str.contains(r'Hawaii').value_counts()

False    1644
True      114
Name: Area, dtype: int64

In [23]:
csharks.Area.value_counts()

Florida                                                461
New South Wales                                        134
Hawaii                                                 114
California                                              98
Western Australia                                       74
                                                      ... 
San Carlos                                               1
Kochi Prefecture                                         1
Cabo San Lucas                                           1
Telyakovsky Bay, Khasan,  Primorsky Krai (Far East)      1
Santiago de Cuba Province                                1
Name: Area, Length: 234, dtype: int64

In [24]:
#Group Area with similar keywords, in this case found a pattern with Hawaii

csharks['Area'][csharks.Area.str.contains(r'Hawaii')] = 'Hawaii'

#To confirm that Hawaii count equals the number of Area names containing Hawaii 

csharks.Area.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Area'][csharks.Area.str.contains(r'Hawaii')] = 'Hawaii'


Florida                                                461
New South Wales                                        134
Hawaii                                                 114
California                                              98
Western Australia                                       74
                                                      ... 
San Carlos                                               1
Kochi Prefecture                                         1
Cabo San Lucas                                           1
Telyakovsky Bay, Khasan,  Primorsky Krai (Far East)      1
Santiago de Cuba Province                                1
Name: Area, Length: 234, dtype: int64

In [25]:
# saw many locations with number/without proper string that seemed irrelevant. 
#it seems they were dropped in the previous steps but proved it empirically
#decided locations with numbers will be considered not relevant

csharks['Area'][csharks.Area.str.contains(r'[0-9]')] = 'Not relevant'

#dropping not relevant locations

csharks.drop(csharks[csharks['Area'] == 'Not relevant'].index, inplace=True)

#the areas starting with between are also not very precise, decided to drop them also

csharks['Area'][csharks.Area.str.contains(r'Between')] = 'Not precised'

csharks.drop(csharks[csharks['Area'] == 'Not precised'].index, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Area'][csharks.Area.str.contains(r'[0-9]')] = 'Not relevant'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Area'][csharks.Area.str.contains(r'Between')] = 'Not precised'


In [26]:
#improve area columns visuals

csharks.Area = (csharks.Area
                            .str.replace('?', '')
                            .str.replace('&', 'and')
                            .str.lower()
                            .str.title())

In [27]:
#organise countries

csharks.Country = (csharks.Country
                            .str.replace('?', '')
                            .str.replace('&', 'and')
                            .str.lower()
                            .str.title()
                            .str.replace('Usa','USA'))

#check the frequency of attacks by country

country=csharks.Country.value_counts(normalize=True)

#question to discuss: categorise countries as "High Danger Country, Medium, Low"

In [28]:
#thought of eliminating the age values with letters but it would reduce the data considerably - decided not to

number=csharks['Age'].str.isnumeric()

number.value_counts()



True     1290
False     463
Name: Age, dtype: int64

In [29]:
#Clean up age


csharks['Age'][csharks.Age.str.contains(r'[A-Za-z]')] = 'NA'
csharks['Age'][csharks.Age.str.contains(r'\&')] = 'More than one person'
csharks['Age'][csharks.Age.str.contains(r'\!?/,')] = 'NA'

csharks.Age.value_counts()





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Age'][csharks.Age.str.contains(r'[A-Za-z]')] = 'NA'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Age'][csharks.Age.str.contains(r'\&')] = 'More than one person'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Age'][csharks.Age.str.contains(r'\!?/,')] = 'NA'


NA    458
15     50
20     50
16     44
19     42
     ... 
74      1
6½      1
86      1
75      1
5       1
Name: Age, Length: 78, dtype: int64

In [30]:
#DISCUSSION
#decided that more columns should be have a different type. changing age now
# changing age like this didn´t work. the idea was to create age categories afterwards
#left to discuss

#csharks.drop(csharks[csharks['Age'] == 'NA'].index, inplace=True)
#csharks['Age']=pd.to_numeric(csharks['Age'], errors='ignore')
#csharks['Age']=csharks['Age'].astype(int, errors='ignore')

In [31]:
#clean up the type
#eliminate invalids

csharks.drop(csharks[csharks['Type'] == 'Invalid'].index, inplace=True)

# Boat and boating are the same

csharks.Type = (csharks.Type.str.replace('Boating', 'Boat'))

#Created new category, because we want to protect the sharks and it´s not their fault boat or sea disasters!

csharks['shark fault?']=np.where(csharks['Type']=='Provoked','Y','N')


In [32]:
#DISCUSSION
#Date clean up. Again, had some issues. 
#The idea was to categorize afterwards and check for seasonality/warm waters correlation
#left it for discussion

#csharks['Date']=pd.to_datetime(csharks['Date'],errors='ignore')
#csharks['Month'] = pd.DatetimeIndex(csharks['Date']).month
#csharks.dtypes

In [33]:
#basic clean up injury

csharks.Injury = (csharks.Injury
                            .str.replace('?', '')
                            .str.replace('&', 'and')
                            .str.lower())

In [34]:
#basic clean up activity

csharks.Activity = (csharks.Activity
                            .str.replace('?', '')
                            .str.replace('&', 'and')
                            .str.lower()
                            .str.title())

In [35]:
#clean up the Gender

#DISCUSSION

#Why doesnt it work with the loc?
#csharks.loc[csharks.Sex.str.startswith('F')] = 'Female'
#csharks.loc[csharks['Sex'].str.startswith('M')] = 'Male'

csharks['Sex'][csharks.Sex.str.startswith('F')]='Female'
csharks['Sex'][csharks.Sex.str.startswith('M')]='Male'

# filtering bad results all to NA

csharks['Sex'][(csharks['Sex']=='N') |
               (csharks['Sex']=='lli') |
              (csharks['Sex']== '.')]='NA'

csharks.Sex.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Sex'][csharks.Sex.str.startswith('F')]='Female'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Sex'][csharks.Sex.str.startswith('M')]='Male'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Sex'][(csharks['Sex']=='N') |


Male      1284
Female     254
NA          72
Name: Sex, dtype: int64

In [36]:
#organise Species data

csharks.Species=csharks.Species.str.lower()

#if non fatal and species not written:

# csharks.loc[(csharks['Species']=='na') & (csharks['Fatal(Y/N)']!='NA')]

#too much data would be deleted and it would be bias due to personal inclination to save the sharks
#didn´t apply the approach
#assumed I was told to focus my analysis on the most well known shark species with attacks registered

# check for categories

csharks['Species'][csharks.Species.str.contains(r'zambesi')] = 'Zambesi Shark'
csharks['Species'][csharks.Species.str.contains(r'white')] = 'White Shark'
csharks['Species'][csharks.Species.str.contains(r'tiger')] = 'Tiger Shark'
csharks['Species'][csharks.Species.str.contains(r'blacktip')] = 'Blacktip Shark'
csharks['Species'][csharks.Species.str.contains(r'blue')] = 'Blue Shark'
csharks['Species'][csharks.Species.str.contains(r'bronze')] = 'Bronze Whaler Shark'
csharks['Species'][csharks.Species.str.contains(r'raggedtooth')] = 'Raggedtooth Shark'
csharks['Species'][csharks.Species.str.contains(r'nurse')] = 'Nurse Shark'
csharks['Species'][csharks.Species.str.contains(r'bull')] = 'Bull Shark'
csharks['Species'][csharks.Species.str.contains(r'caribbean')] = 'Caribbean Reef Shark'
csharks['Species'][csharks.Species.str.contains(r'make')] = 'Mako Shark'
csharks['Species'][csharks.Species.str.contains(r'grey')] = 'Grey Shark'
csharks['Species'][csharks.Species.str.contains(r'hammerhead')] = 'Hammerhead Shark'
csharks['Species'][csharks.Species.str.contains(r'leopard')] = 'Leopard Shark'
csharks['Species'][csharks.Species.str.contains(r'lemon')] = 'Lemon Shark'


csharks.Species.value_counts()


#tried to leave only the species that matters, it didnt work. left it for discussion

"""ccsharks=csharks.loc(csharks[(csharks['Species'] == 'Zambesi Shark') |
                     (csharks['Species'] == 'White Shark') |
                    (csharks['Species'] == 'Tiger Shark') |
                     (csharks['Species'] == 'Blacktip Shark') |
                    (csharks['Species'] == 'Blue Shark') |
                     (csharks['Species'] == 'Bronze Whaler Shark') |
                    (csharks['Species'] == 'Raggedtooth Shark') |
                     (csharks['Species'] == 'Nurse Shark') |
                    (csharks['Species'] == 'Bull Shark') |
                     (csharks['Species'] == 'Caribbean Reef Shark') |
                   (csharks['Species'] == 'Mako Shark') |
                     (csharks['Species'] == 'Grey Shark') |
                   (csharks['Species'] == 'Hammerhead Shark') |
                     (csharks['Species'] == 'Leopard Shark') |
                    (csharks['Species'] == 'Lemon Shark')], axis=1) """


"""
csharks['Species'][(csharks[(csharks['Species'] != 'Zambesi Shark') |
                     (csharks['Species'] != 'White Shark') |
                    (csharks['Species'] != 'Tiger Shark') |
                     (csharks['Species'] != 'Blacktip Shark') |
                    (csharks['Species'] != 'Blue Shark') |
                     (csharks['Species'] != 'Bronze Whaler Shark') |
                    (csharks['Species'] != 'Raggedtooth Shark') |
                     (csharks['Species'] != 'Nurse Shark') |
                    (csharks['Species'] != 'Bull Shark') |
                     (csharks['Species'] != 'Caribbean Reef Shark') |
                   (csharks['Species'] != 'Mako Shark') |
                     (csharks['Species'] != 'Grey Shark') |
                   (csharks['Species'] != 'Hammerhead Shark') |
                     (csharks['Species'] != 'Leopard Shark') |
                    (csharks['Species'] != 'Lemon Shark')] = "NA" """

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Species'][csharks.Species.str.contains(r'zambesi')] = 'Zambesi Shark'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Species'][csharks.Species.str.contains(r'white')] = 'White Shark'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  csharks['Species'][csharks.Species.str.contains(r'tiger')] = 'Tiger Shark'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stab

'\ncsharks[\'Species\'][(csharks[(csharks[\'Species\'] != \'Zambesi Shark\') |\n                     (csharks[\'Species\'] != \'White Shark\') |\n                    (csharks[\'Species\'] != \'Tiger Shark\') |\n                     (csharks[\'Species\'] != \'Blacktip Shark\') |\n                    (csharks[\'Species\'] != \'Blue Shark\') |\n                     (csharks[\'Species\'] != \'Bronze Whaler Shark\') |\n                    (csharks[\'Species\'] != \'Raggedtooth Shark\') |\n                     (csharks[\'Species\'] != \'Nurse Shark\') |\n                    (csharks[\'Species\'] != \'Bull Shark\') |\n                     (csharks[\'Species\'] != \'Caribbean Reef Shark\') |\n                   (csharks[\'Species\'] != \'Mako Shark\') |\n                     (csharks[\'Species\'] != \'Grey Shark\') |\n                   (csharks[\'Species\'] != \'Hammerhead Shark\') |\n                     (csharks[\'Species\'] != \'Leopard Shark\') |\n                    (csharks[\'Species\']

In [37]:
#demographic/geographic information
# to compare some demographics and fatal or non fatal injuries

melted = pd.melt(csharks, id_vars=['Area','Sex','Age','Fatal(Y/N)'], 
                 value_vars=['Injury'])
melted


Unnamed: 0,Area,Sex,Age,Fatal(Y/N),variable,value
0,North Island,Male,,N,Injury,"no injury, kayak bitten"
1,California,Male,,N,Injury,"foot bruised, board dinged"
2,South Australia,,,N,Injury,no injury to occupants
3,Holquin Province,Female,55,N,Injury,left arm bitten
4,New South Wales,Male,35,N,Injury,"no injury, shark ramme d and bit kayak"
...,...,...,...,...,...,...
1605,Victoria,Male,,N,Injury,no injury: knocked off board by shark
1606,Victoria,Male,,N,Injury,struck by fin on chest and leg
1607,Florida,Male,43,N,Injury,lacerations to lower leg
1608,Florida,Male,36,N,Injury,lacerations to hands


In [38]:
#split last columns and drop second one, connected to date again, irrelevant

csharks.InvestigatororSource=csharks.InvestigatororSource.str.split(pat=',',n=1,expand=True)


In [39]:
#reorder columns

order_list=['CaseNumber','Species','Injury', 'Fatal(Y/N)','shark fault?','Sex', 'Age','Activity','Country', 'Area', 'Year','Date','Time','Type','InvestigatororSource']

csharks=csharks[order_list]

csharks

Unnamed: 0_level_0,CaseNumber,Species,Injury,Fatal(Y/N),shark fault?,Sex,Age,Activity,Country,Area,Year,Date,Time,Type,InvestigatororSource
originalorder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4236,2001.01.06,White Shark,"no injury, kayak bitten",N,No,Male,,Kayaking,New Zealand,North Island,2001,06-Jan-01,,Boat,C. Duffy
4237,2001.01.09,"1.8 m to 2.4 m [6' to 8'] ""black finned shark""","foot bruised, board dinged",N,No,Male,,Surfing,USA,California,2001,09-Jan-01,,Unprovoked,M. Sanders
4238,2001.01.21,White Shark,no injury to occupants,N,No,,,Fishing,Australia,South Australia,2001,21-Jan-01,,Boat,Northern Territory News
4239,2001.01.24,2 m [6.75'] shark,left arm bitten,N,No,Female,55,Swimming,Cuba,Holquin Province,2001,24-Jan-01,,Unprovoked,Trip Advisor
4240,2001.01.24.R,5 m shark,"no injury, shark ramme d and bit kayak",N,No,Male,35,Kayaking,Australia,New South Wales,2001,Reported 24-Jan-2001,,Boat,Daily Telegraph 1/24/2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5989,2016.09.15,2 m shark,no injury: knocked off board by shark,N,No,Male,,Surfing,Australia,Victoria,2016,16-Sep-16,,Unprovoked,The Age
5990,2016.09.17,na,struck by fin on chest and leg,N,No,Male,,Surfing,Australia,Victoria,2016,17-Sep-16,,Unprovoked,The Age
5991,2016.09.18.a,na,lacerations to lower leg,N,No,Male,43,Surfing,USA,Florida,2016,18-Sep-16,10h43,Unprovoked,Orlando Sentinel
5992,2016.09.18.b,na,lacerations to hands,N,No,Male,36,Surfing,USA,Florida,2016,18-Sep-16,11h00,Unprovoked,Orlando Sentinel


In [40]:
#rename columns

csharks=csharks.rename(columns={'CaseNumber':'Case Number','Injury':'Type of Injury','Fatal(Y/N)':'Fatal:Yes or No','InvestigatororSource':'Investigation','shark fault?':'Unprovoked attack'})


In [41]:
#decided the index chosen was not relevant and reset to zero

csharks.reset_index(drop=True, inplace=True)

csharks

Unnamed: 0,Case Number,Species,Type of Injury,Fatal:Yes or No,Unprovoked attack,Sex,Age,Activity,Country,Area,Year,Date,Time,Type,Investigation
0,2001.01.06,White Shark,"no injury, kayak bitten",N,No,Male,,Kayaking,New Zealand,North Island,2001,06-Jan-01,,Boat,C. Duffy
1,2001.01.09,"1.8 m to 2.4 m [6' to 8'] ""black finned shark""","foot bruised, board dinged",N,No,Male,,Surfing,USA,California,2001,09-Jan-01,,Unprovoked,M. Sanders
2,2001.01.21,White Shark,no injury to occupants,N,No,,,Fishing,Australia,South Australia,2001,21-Jan-01,,Boat,Northern Territory News
3,2001.01.24,2 m [6.75'] shark,left arm bitten,N,No,Female,55,Swimming,Cuba,Holquin Province,2001,24-Jan-01,,Unprovoked,Trip Advisor
4,2001.01.24.R,5 m shark,"no injury, shark ramme d and bit kayak",N,No,Male,35,Kayaking,Australia,New South Wales,2001,Reported 24-Jan-2001,,Boat,Daily Telegraph 1/24/2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1605,2016.09.15,2 m shark,no injury: knocked off board by shark,N,No,Male,,Surfing,Australia,Victoria,2016,16-Sep-16,,Unprovoked,The Age
1606,2016.09.17,na,struck by fin on chest and leg,N,No,Male,,Surfing,Australia,Victoria,2016,17-Sep-16,,Unprovoked,The Age
1607,2016.09.18.a,na,lacerations to lower leg,N,No,Male,43,Surfing,USA,Florida,2016,18-Sep-16,10h43,Unprovoked,Orlando Sentinel
1608,2016.09.18.b,na,lacerations to hands,N,No,Male,36,Surfing,USA,Florida,2016,18-Sep-16,11h00,Unprovoked,Orlando Sentinel


In [42]:
#QUESTIONS TO DISCUSS

In [43]:
#next steps:
#check seasonality through date
#cluster ages
#create high danger zones
