In [1]:
#importing libraries, the data and shaping the latter as a dataframe
import pandas as pd
import numpy as np
import sys
import re
attacks = pd.read_csv("ignoredstuff/attacks.csv",encoding = "ISO-8859-1")

# A. Initial exploration and general basic cleaning

In [2]:
#df's shape
attacks.shape

(25723, 24)

In [3]:
#df's columns
columns_attacks = attacks.columns
columns_attacks

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [4]:
#a quick view...
attacks.sample(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
1403,2007.02.00,Feb-2007,2007.0,Unprovoked,NEW CALEDONIA,Loyalty Islands,Ouvéa,Spearfishing,male,M,...,,"Les Nouvelles Caledoniennes,",2007.02.00-Ouvea.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2007.02.00,2007.02.00,4900.0,,
22993,,,,,,,,,,,...,,,,,,,,,,
24940,,,,,,,,,,,...,,,,,,,,,,
9529,,,,,,,,,,,...,,,,,,,,,,
21792,,,,,,,,,,,...,,,,,,,,,,
23869,,,,,,,,,,,...,,,,,,,,,,
25181,,,,,,,,,,,...,,,,,,,,,,
18877,,,,,,,,,,,...,,,,,,,,,,
13734,,,,,,,,,,,...,,,,,,,,,,
3588,1966.01.25,25-Jan-1966,1966.0,Sea Disaster,INDONESIA,North Sumatra,Near Belawan,wreck of the State Oil Company ship Permina,,,...,,Sydney Daily Telegraph,1966.01.25-Permina.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1966.01.25,1966.01.25,2715.0,,


In [5]:
#elemental informatio
attacks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [6]:
#looking for NaNs
attacks.isna().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

In [7]:
#given the high number of NaNs it will be better to work with them before worrying about descriptive statistics...

In [8]:
#deleting all the rows exclusively composed by null values
attacks_1_1 = attacks.dropna(how="all")

In [9]:
#check again the number of NaNs
attacks_1_1.isna().sum()
#17020 rows contained only null values and were thus deleted


Case Number                  1
Date                      2401
Year                      2403
Type                      2405
Country                   2451
Area                      2856
Location                  2941
Activity                  2945
Name                      2611
Sex                       2966
Age                       5232
Injury                    2429
Fatal (Y/N)               2940
Time                      5755
Species                   5239
Investigator or Source    2418
pdf                       2401
href formula              2402
href                      2401
Case Number.1             2401
Case Number.2             2401
original order            2394
Unnamed: 22               8702
Unnamed: 23               8701
dtype: int64

In [10]:
#checking the shape of our new "cleaner" df
attacks_1_1.shape

(8703, 24)

In [11]:
#comparing the info provided by .shape and the sum of null given by .isna().sum()
#the columns "Unnamed: 22" & "Unnamed: 23" will be deleted as the contain basically null values
#which are useless and create a gap of such a mgnitude that is simply unfeasible to fill 

attacks_1_2 = attacks_1_1.drop(["Unnamed: 22" , "Unnamed: 23"], axis=1)


In [12]:
#our new df's columns
columns_attacks_1_2 = attacks_1_2.columns
columns_attacks_1_2

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [13]:
#taking a look at a sample
attacks_1_2.sample(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
1065,2009.12.16,16-Dec-2009,2009.0,Unprovoked,NEW ZEALAND,South Island,Clark Island,Swimming to shore from capsized kayak,Maurice Bede Philips,M,...,Y,,"White shark, 2.8 to 3 m","R.D. Weeks, GSAF",2009.12.16-Philips.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2009.12.16,2009.12.16,5238.0
2641,1990.04.06,06-Apr-1990,1990.0,Unprovoked,AUSTRALIA,Queensland,Fingal Beach,Surfing,Tony Patton,M,...,N,,3 m shark,"Herald, 4/10/1990, p.4",1990.04.06-TonyPatton.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1990.04.06,1990.04.06,3662.0
2589,1991.07.18,18-Jul-1991,1991.0,Unprovoked,USA,Florida,150 miles from Crystal River,Swimming from makeshift raft to life vest aft...,Larry Myers,M,...,N,,1.8 m [6'] blacktip shark,"Orlando Sentinel, 7/23/1991, p.A1",1991.07.18-Myers.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1991.07.18,1991.07.18,3714.0
5809,1881.08.12,12-Aug-1881,1881.0,Unprovoked,USA,Rhode Island,Providence,Swimming,Jerry Lowney,M,...,N,,,Providence Journal 8/15/1881,1881.08.12-Lowney.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1881.08.12,1881.08.12,494.0
8564,0,,,,,,,,,,...,,,,,,,,,,
3041,1981.02.07,07-Feb-1981,1981.0,Unprovoked,AUSTRALIA,Western Australia,"Parker's Point, Rottnest Island",,Filmer,,...,UNKNOWN,,,"T. Peake, GSAF",1981.02.07-NV-Filmer.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1981.02.07,1981.02.07,3262.0
8063,0,,,,,,,,,,...,,,,,,,,,,
1220,2008.08.16,16-Aug-2008,2008.0,Unprovoked,USA,US Virgin Islands,Buck Island,Treading water,Elizabeth Riggs,F,...,N,Dusk,8' bull shark or Caribbean reef shark,"M. Levne, GSAF",2008.08.16-Riggs.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2008.08.16,2008.08.16,5083.0
8667,0,,,,,,,,,,...,,,,,,,,,,
4171,1958.01.09,09-Jan-1958,1958.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Scotburgh,Swimming,Derek Garth Prinsloo,M,...,Y,07h30,White shark,"M. Levine, GSAF",1958.01.09-Prinsloo.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1958.01.09,1958.01.09,2132.0


# B. The Hypotheses to test

It has been decided that the hypotheses to test are the following:

- Shark attacks' fatality rate has fallen over time
- Shark attacks' fatality rate is negatively correlated with the level of development of the place where the attack takes place

# C. Reshaping and cleaning the dataset according to hypotheses to test

The formulated hypotheses imply that fatality will be our main variable of interest. Besides that, included variable which are to be inspected are those related to time and location. These are respectively "Date", "Year" "Time" plus those related to Case Number on the one hand, and "Area" "Location" and "Country" on the other.

In [14]:
#first of all, all the variable which are not of interest will be discarded
columns_to_discard = ['Type', 'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Time','Species ', 'Investigator or Source', 
                      'pdf', 'href formula', 'href','original order']

attacks_1_3= attacks_1_2.drop(columns_to_discard, axis=1)


In [15]:
#taking a look at this new refined df
print(attacks_1_3.shape)
attacks_1_3.sample(10)

(8703, 9)


Unnamed: 0,Case Number,Date,Year,Country,Area,Location,Fatal (Y/N),Case Number.1,Case Number.2
923,2011.05.21.a,21-May-2011,2011.0,NEW CALEDONIA,North Province,Kendec,Y,2011.05.21.a,2011.05.21.a
8279,0,,,,,,,,
5164,1925.09.03,03-Sep-1925,1925.0,ENGLAND,Isle of Wight,Off Shanklin,N,1925.09.03,1925.09.03
962,2010.12.01.b,01-Dec-2010,2010.0,EGYPT,South Sinai Peninsula,"Ras Nasrani, Sharm el-Sheikh",N,2010.12.01.b,2010.12.01.b
4628,1945.00.00.c,1945,1945.0,CUBA,Havana Province,Cojimar,Y,1945.00.00.c,1945.00.00.c
1894,2001.10.07,07-Oct-2001,2001.0,SOUTH AFRICA,KwaZulu-Natal,Salt Rock,N,2001.10.07,2001.10.07
559,2014.05.06,06-May-2014,2014.0,USA,South Carolina,"Coligny Beach, Hilton Head, Beaufort County",N,2014.05.06,2014.05.06
8136,0,,,,,,,,
6541,0,,,,,,,,
2526,1992.11.05.a,05-Nov-1992,1992.0,USA,Hawaii,"Kea'au Beach Park, O'ahu",Y,1992.11.05.a,1992.11.05.a


In [16]:
#resetting the df's index
attacks_1_3_1 = attacks_1_3.reset_index()
display(attacks_1_3_1) 

Unnamed: 0,index,Case Number,Date,Year,Country,Area,Location,Fatal (Y/N),Case Number.1,Case Number.2
0,0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N,2018.06.25,2018.06.25
1,1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N,2018.06.18,2018.06.18
2,2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N,2018.06.09,2018.06.09
3,3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N,2018.06.08,2018.06.08
4,4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...
8698,8698,0,,,,,,,,
8699,8699,0,,,,,,,,
8700,8700,0,,,,,,,,
8701,8701,0,,,,,,,,


In [17]:
#now lets delete all rows where the variable "Fatal (Y/N)" takes a null value, for that a specific function has been defined
# .isna().sum() is employed to check if rows including null values for the specified column have been indeed removed.
from src import row_nan_out

attacks_1_3_2 = row_nan_out(attacks_1_3_1,"Fatal (Y/N)")
print(attacks_1_3_2.isna().sum())


Se encontaron valores nulos a eliminar en los datos.
index              0
Case Number        1
Date               0
Year               1
Country           45
Area             404
Location         483
Fatal (Y/N)        0
Case Number.1      0
Case Number.2      0
dtype: int64


In [18]:
#resetting the index and removing redundant past indices
attacks_1_3_3 = attacks_1_3_2.reset_index()
attacks_1_3_4 = attacks_1_3_3.drop(["index", "level_0"], axis=1)

### C.1 Fatality rate across time

In [19]:
#the space in the column name "Case Number" is removed here asit caused problem further down the code
attacks_1_3_4 = attacks_1_3_4.rename(columns={"Case Number" : "CaseNumber"})

In [20]:
attacks_1_3_5 = attacks_1_3_4.copy()

#Comparing Case Number.1 and Case Number.2 as they seem to be largely the same...
#The rate of disagreement between the two is 0,002%, case number.2 is dropped as a consequence
#the column created for the comparisson is also deleted

attacks_1_3_5['Cases_Check'] = np.where((attacks_1_3_5["Case Number.1"] == attacks_1_3_5["Case Number.2"]), True , False)
attacks_1_3_5['Cases_Check'].value_counts()
attacks_1_3_5 = attacks_1_3_5.drop(["Case Number.2", "Cases_Check"], axis=1)

#the same comparisson is performed for Case Number.1 and Case Number
#with a rate of disagreement of 0,0036% Case Number.1 is dropped from the dataframe too
#the column created for the comparisson is also deleted again

attacks_1_3_5['Cases_Check'] = np.where((attacks_1_3_5["Case Number.1"] == attacks_1_3_5["CaseNumber"]), True , False)
attacks_1_3_5['Cases_Check'].value_counts()
attacks_1_3_6 = attacks_1_3_5.drop(["Case Number.1", "Cases_Check"], axis=1)

display(attacks_1_3_6)

Unnamed: 0,CaseNumber,Date,Year,Country,Area,Location,Fatal (Y/N)
0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N
1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N
2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N
3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N
4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N
...,...,...,...,...,...,...,...
5758,ND.0005,Before 1903,0.0,AUSTRALIA,Western Australia,Roebuck Bay,Y
5759,ND.0004,Before 1903,0.0,AUSTRALIA,Western Australia,,Y
5760,ND.0003,1900-1905,0.0,USA,North Carolina,Ocracoke Inlet,Y
5761,ND.0002,1883-1889,0.0,PANAMA,,"Panama Bay 8ºN, 79ºW",Y


In [21]:
#The function first_year is applied to extract the substring containing year information 
#for values in columns Date & CaseNumber

from src import first_year
attacks_1_3_6["Case_Year"] = attacks_1_3_6.CaseNumber.apply(first_year)
attacks_1_3_6["Date_Year"] = attacks_1_3_6.Date.apply(first_year)
display(attacks_1_3_6)


Unnamed: 0,CaseNumber,Date,Year,Country,Area,Location,Fatal (Y/N),Case_Year,Date_Year
0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N,2018,2018
1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N,2018,2018
2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N,2018,2018
3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N,2018,2018
4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N,2018,2018
...,...,...,...,...,...,...,...,...,...
5758,ND.0005,Before 1903,0.0,AUSTRALIA,Western Australia,Roebuck Bay,Y,0005,1903
5759,ND.0004,Before 1903,0.0,AUSTRALIA,Western Australia,,Y,0004,1903
5760,ND.0003,1900-1905,0.0,USA,North Carolina,Ocracoke Inlet,Y,0003,1900
5761,ND.0002,1883-1889,0.0,PANAMA,,"Panama Bay 8ºN, 79ºW",Y,0002,1883


In [23]:
#analyzing descriptive statistics for Year Variables in order to decide which range of years to focus on
#with at least 75% of all year observations in the sample for the three variables which inform about years taking place
#after 1945, this will be selected as starting point for the sample. Besides this, 1945 is also an interesting point 
#for historical and economic reasons.

attacks_1_3_6 = attacks_1_3_6.astype({"Case_Year": float})
attacks_1_3_6 = attacks_1_3_6.astype({"Date_Year": float})
print(attacks_1_3_6["Date_Year"].describe())
print(attacks_1_3_6["Case_Year"].describe())
print(attacks_1_3_6["Year"].describe())


count    5744.000000
mean     1967.455084
std        48.007509
min      1543.000000
25%      1945.000000
50%      1979.000000
75%      2005.000000
max      2018.000000
Name: Date_Year, dtype: float64
count    5762.000000
mean     1926.352135
std       281.706297
min         0.000000
25%      1942.000000
50%      1978.000000
75%      2005.000000
max      2018.000000
Name: Case_Year, dtype: float64
count    5762.000000
mean     1924.575842
std       292.236459
min         0.000000
25%      1942.000000
50%      1978.000000
75%      2005.000000
max      2018.000000
Name: Year, dtype: float64


Unnamed: 0,CaseNumber,Date,Year,Country,Area,Location,Fatal (Y/N),Case_Year,Date_Year
0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N,2018.0,2018.0
1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N,2018.0,2018.0
2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N,2018.0,2018.0
3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N,2018.0,2018.0
4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N,2018.0,2018.0
...,...,...,...,...,...,...,...,...,...
5758,ND.0005,Before 1903,0.0,AUSTRALIA,Western Australia,Roebuck Bay,Y,5.0,1903.0
5759,ND.0004,Before 1903,0.0,AUSTRALIA,Western Australia,,Y,4.0,1903.0
5760,ND.0003,1900-1905,0.0,USA,North Carolina,Ocracoke Inlet,Y,3.0,1900.0
5761,ND.0002,1883-1889,0.0,PANAMA,,"Panama Bay 8ºN, 79ºW",Y,2.0,1883.0


### Z. Ideas, experiemntos e información variada (En Español)

In [None]:
#df version index and description:

#attacks -> Raw data
#attacks_1_1 -> Rows where all values are Nulls are deletes
#attacks_1_2 -> Columns "Unnamed: 22" & "Unnamed: 23" deleted
#attacks_1_3 -> Columns which are not of interest to the hypotheses test are discarded
    #attacks_1_3_1 -> equal to attacks_1_3 but with resetted index column
    #attacks_1_3_2 -> rows for which the values of column "Fatal (Y/N)" is null are removed
    #attacks_1_3_3 -> bridge version between 1_3_2 & 1_3_4
    #attacks_1_3_4 -> version 1_3_2 with resetted index
    #attacks_1_3_5 -> version 1_3_4 but with modified a column name
    #attacks_1_3_6 -> version 1_3_5 without columns Case Number.1 & Case Number.2 as they have been identified as redundant
 

In [None]:
#nombres "ocupados"

#columns_to_discard

In [None]:
#sites of interest

#https://www.sharkattackfile.net/species.htm
#https://es.wikipedia.org/wiki/Selachimorpha
#https://es.wikipedia.org/wiki/Ataque_de_tibur%C3%B3n