In [1]:
#load the libraries used throughout the notebook
import pandas as pd
import numpy as np

#read the raw shark-attack csv into a dataframe
#the file is decoded as ISO-8859-1 (presumably because it is not valid utf-8 - confirm)
attacks = pd.read_csv(
    "ignoredstuff/attacks.csv",
    encoding="ISO-8859-1",
)

# A. Initial exploration and general basic cleaning

In [2]:
#dimensions of the raw dataframe as a (rows, columns) tuple
attacks.shape

(25723, 24)

In [3]:
#keep the raw dataframe's column labels in a variable and display them
columns_attacks = attacks.columns
columns_attacks

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [4]:
#a quick view of 10 randomly chosen rows...
#random_state pins the draw so that re-running the notebook shows the same rows
attacks.sample(10, random_state=42)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
17359,,,,,,,,,,,...,,,,,,,,,,
7502,0,,,,,,,,,,...,,,,,,,,,,
23519,,,,,,,,,,,...,,,,,,,,,,
13958,,,,,,,,,,,...,,,,,,,,,,
15542,,,,,,,,,,,...,,,,,,,,,,
9878,,,,,,,,,,,...,,,,,,,,,,
24754,,,,,,,,,,,...,,,,,,,,,,
3015,1981.08.24.b,24-Aug-1981,1981.0,Provoked,USA,Florida,Gulf Island National Seashore Park,Spearfishing,Ted Best,M,...,"1.8 m [6'] shark, species identity questionable","A. MacCormick, pp.82-83, citing the New Orlean...",1981.08.24.b-Best.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1981.08.24.b,1981.08.24.b,3288.0,,
596,2013.12.02,02-Dec-2013,2013.0,Unprovoked,USA,Hawaii,"Between Makena & Molokini, Maui",Kayaking / Fishing,Patrick Briney,M,...,,"C. Sugidono, The Maui News, 12/2/2013",2013.12.02-Briney.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2013.12.02,2013.12.02,5707.0,,
8701,0,,,,,,,,,,...,,,,,,,,,,


In [5]:
#basic information: index range, per-column non-null counts and dtypes
attacks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [6]:
#count the null (NaN) values in each column of the raw dataframe
attacks.isna().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

In [7]:
#given the high number of NaNs it is better to deal with them first, before worrying about descriptive statistics...

In [8]:
#drop every row in which all columns are null
#NOTE(review): rows whose only non-null value is a placeholder 'Case Number' (e.g. "0")
#are not fully null, so they are kept by this step - confirm this is intended
attacks_1_1 = attacks.dropna(axis="index", how="all")

In [9]:
#check the number of NaNs per column again, now on the filtered dataframe
attacks_1_1.isna().sum()
#17020 rows (25723 before vs. 8703 after) contained only null values and were thus deleted


Case Number                  1
Date                      2401
Year                      2403
Type                      2405
Country                   2451
Area                      2856
Location                  2941
Activity                  2945
Name                      2611
Sex                       2966
Age                       5232
Injury                    2429
Fatal (Y/N)               2940
Time                      5755
Species                   5239
Investigator or Source    2418
pdf                       2401
href formula              2402
href                      2401
Case Number.1             2401
Case Number.2             2401
original order            2394
Unnamed: 22               8702
Unnamed: 23               8701
dtype: int64

In [10]:
#shape of the new, "cleaner" dataframe (fully-null rows removed)
attacks_1_1.shape

(8703, 24)

In [11]:
#comparing the row count given by .shape with the null counts given by .isna().sum()
#shows that "Unnamed: 22" and "Unnamed: 23" are null in all but one or two rows;
#they carry no usable information and a gap of that magnitude cannot feasibly be filled,
#so both columns are dropped

attacks_1_2 = attacks_1_1.drop(columns=["Unnamed: 22", "Unnamed: 23"])


In [12]:
#column labels left after dropping the two "Unnamed" columns, stored and displayed
columns_attacks_1_2 = attacks_1_2.columns
columns_attacks_1_2

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [13]:
#taking a look at a sample of 10 randomly chosen rows
#random_state pins the draw so that re-running the notebook shows the same rows
attacks_1_2.sample(10, random_state=42)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
1449,2006.07.31.R,31-Jul-2006,2006.0,Provoked,USA,Kentucky,"Newport Aquarium, Newport",Touching sharks,12 people,,...,N,,small catsharks,"Cincinatti News, 7/31/2006",2006.07.31.R-NewportAquarium.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2006.07.31.R,2006.07.31.R,4854.0
2673,1989.08.06.R,Reported 06-Aug-1989,1989.0,Provoked,AUSTRALIA,Queensland,Moreton Bay,Fishing for sharks,Vic Hislop,M,...,N,,3.5 m white shark,"Sunday Mail, 8/6/1989",1989.08.06.R-Hislop.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1989.08.06.R,1989.08.06.R,3630.0
7478,0,,,,,,,,,,...,,,,,,,,,,
2600,1991.05.19,19-May-1991,1991.0,Unprovoked,SOUTH AFRICA,Western Cape Province,Gordons Bay,Scuba diving,Coen Marais,M,...,N,13h00,"3.5 m [11.5'] female white shark named ""Notchfin""","M. Levine, GSAF",1991.05.19-Marais-Jordaan.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1991.05.19,1991.05.19,3703.0
3017,1981.08.10.b,10-Aug-1981,1981.0,Unprovoked,USA,Florida,"Marathon, Monroe County",Diving,Mike Barber,M,...,N,,small nurse shark,"E. Pace, FSAF",1981.08.10.b-Barber.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1981.08.10.b,1981.08.10.b,3286.0
6379,0,,,,,,,,,,...,,,,,,,,,,
1499,2006.02.23,23-Feb-2006,2006.0,Invalid,USA,Hawaii,"Makena, Maui",Free diving,Anthony Moore,M,...,,Late afternoon,Invalid,http://www.kesq.com,2006.02.23-Moore.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2006.02.23,2006.02.23,4804.0
671,2013.05.27.b,27-May-2013,2013.0,Unprovoked,USA,Hawaii,"Halewia, Oahu",Diving,Tali Ena,M,...,N,,Galapagos shark,"A. Brenneka, Shark Attack Survivors",2013.05.27.b-Ena.pdf,pdf-directory/2013.05.27.b-Ena.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2013.05.27.b,2013.05.27.b,5632.0
4637,1944.10.25.b,25-Oct-1944,1944.0,Sea Disaster,PHILIPPINES,Off Samar Island in the Gulf of Leyte,,USS Johnston DD 557 sunk on 10/24/1944 in the ...,"William Clinton Carter, Jr. & 2 other men",M,...,Y,,,"Abiline Reporter News, 12/29/1944",1944.10.25.b-Carter.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1944.10.25.b,1944.10.25.b,1666.0
8621,0,,,,,,,,,,...,,,,,,,,,,


# B. The Hypotheses to test

It has been decided that the hypotheses to test are the following:

- Shark attacks' fatality rate has fallen over time
- Shark attacks' fatality rate is negatively correlated with the level of development of the place where the attack takes place

# C. Reshaping and cleaning the dataset according to Hypotheses to test

The formulated hypotheses imply that fatality will be our main variable of interest. Besides that, the variables to be inspected are those related to time and location: "Date", "Year", "Time" and the Case Number columns on the one hand, and "Area", "Location" and "Country" on the other.

In [14]:
#keep only the variables relevant to the hypotheses: every column listed below is dropped
#NOTE(review): 'Time' is discarded here although the narrative above lists it among
#the variables of interest - confirm which of the two is intended
columns_to_discard = [
    'Type',
    'Activity',
    'Name',
    'Sex ',
    'Age',
    'Injury',
    'Time',
    'Species ',
    'Investigator or Source',
    'pdf',
    'href formula',
    'href',
    'original order',
]

attacks_1_3 = attacks_1_2.drop(columns=columns_to_discard)


In [15]:
#taking a look at this new refined df: its shape first, then 10 randomly chosen rows
#random_state pins the draw so that re-running the notebook shows the same rows
print(attacks_1_3.shape)
attacks_1_3.sample(10, random_state=42)

(8703, 9)


Unnamed: 0,Case Number,Date,Year,Country,Area,Location,Fatal (Y/N),Case Number.1,Case Number.2
6281,ND.0021,No date,0.0,SOUTH AFRICA,KwaZulu-Natal,Durban,Y,ND.0021,ND.0021
832,2011.12.23,23-Dec-2011,2011.0,USA,Florida,New Smyrna Beach,N,2011.12.23,2011.12.23
4256,1956.03.25,25-Mar-1956,1956.0,ITALY,Ligurian Sea,"Genova, S. Nazaro, Punta Vagno",UNKNOWN,1956.03.25,1956.03.25
2087,1999.11.23,23-Nov-1999,1999.0,USA,Hawaii,"Big Island off Kona Village Resort, North Kona",N,1999.11.23,1999.11.23
1763,2003.06.30,30-Jun-2003,2003.0,USA,Florida,"New Smyrna Beach, Volusia County",N,2003.06.30,2003.06.30
1842,2002.07.10,10-Jul-2002,2002.0,USA,Florida,"Ponce Inlet, New Smyrna Beach, Volusia County",N,2002.07.10,2002.07.10
869,2011.09.11.a,11-Sep-2011,2011.0,PAPUA NEW GUINEA,Central Province,"Hula, near Port Moresby",N,2011.09.11.a,2011.09.11.a
5852,1877.12.12,12-Dec-1877,1877.0,AUSTRALIA,New South Wales,Near Sydney,Y,1877.12.12,1877.12.12
3645,1965.00.00.d,May-Jun-1965,1965.0,RED SEA,East of the Gulf of Aqaba,,N,1965.00.00.d,1965.00.00.d
7516,0,,,,,,,,


In [16]:
#resetting the df's index so that the rows are numbered consecutively again
#reset_index() is called without drop=True, so the previous index is kept as a new "index" column
attacks_1_3_1 = attacks_1_3.reset_index()
display(attacks_1_3_1)

Unnamed: 0,index,Case Number,Date,Year,Country,Area,Location,Fatal (Y/N),Case Number.1,Case Number.2
0,0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N,2018.06.25,2018.06.25
1,1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N,2018.06.18,2018.06.18
2,2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N,2018.06.09,2018.06.09
3,3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N,2018.06.08,2018.06.08
4,4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...
8698,8698,0,,,,,,,,
8699,8699,0,,,,,,,,
8700,8700,0,,,,,,,,
8701,8701,0,,,,,,,,


### C.1 Fatality rate across time

### Z. Ideas, experimentos e información variada (En Español)

In [None]:
#df version index and description:

#attacks -> Raw data
#attacks_1_1 -> Rows where all values are null are deleted
#attacks_1_2 -> Columns "Unnamed: 22" & "Unnamed: 23" deleted
#attacks_1_3 -> Columns which are not of interest to the hypothesis tests are discarded
    #attacks_1_3_1 -> equal to attacks_1_3 but with a reset index (the old index is kept as an "index" column)

In [None]:
#nombres "ocupados"

#columns_to_discard

In [None]:
#sites of interest

#https://www.sharkattackfile.net/species.htm
#https://es.wikipedia.org/wiki/Selachimorpha
#https://es.wikipedia.org/wiki/Ataque_de_tibur%C3%B3n

In [59]:
#NOTE(review): disabled draft of row_nan_out, kept for reference only
#(a later cell hints at importing it from src instead - confirm which version is current)
#NOTE(review): as written it drops rows with inplace=True, so the dataframe passed in is modified too,
#and x_nans[j] / x.drop(j) use positions as labels, which presumably only works on a 0..n-1 index - confirm
#def row_nan_out(x,y):
#    #this function eliminates all rows of a dataframe for which a given column's value is null
#    #argument "x" is the dataframe and "y" the name of the reference column
#    #if the function does not find null values in the specified column it returns the dataframe unchanged
#    x_nans = x[y].isna()
#    
#    for i in x_nans:
#        if i == True:
#            print("Se encontaron valores nulos a eliminar en los datos.")
#            for j in range(0, len(x_nans)):
#                if x_nans[j] == True:
#                    x.drop(j, axis=0, inplace=True)
#            return x
#
#        else:
#            pass
#    return x

In [58]:
#test = row_nan_out(attacks_1_3_1,"Fatal (Y/N)")
#
#print(test)

      index Case Number         Date    Year             Country  \
0         0  2018.06.25  25-Jun-2018  2018.0                 USA   
1         1  2018.06.18  18-Jun-2018  2018.0                 USA   
2         2  2018.06.09  09-Jun-2018  2018.0                 USA   
3         3  2018.06.08  08-Jun-2018  2018.0           AUSTRALIA   
4         4  2018.06.04  04-Jun-2018  2018.0              MEXICO   
...     ...         ...          ...     ...                 ...   
6297   6297     ND.0005  Before 1903     0.0           AUSTRALIA   
6298   6298     ND.0004  Before 1903     0.0           AUSTRALIA   
6299   6299     ND.0003    1900-1905     0.0                 USA   
6300   6300     ND.0002    1883-1889     0.0              PANAMA   
6301   6301     ND.0001    1845-1853     0.0  CEYLON (SRI LANKA)   

                   Area                             Location Fatal (Y/N)  \
0            California          Oceanside, San Diego County           N   
1               Georgia       S

In [40]:
#from src import row_nan_out

In [49]:
#x_nans = attacks_1_3_1["Fatal (Y/N)"].isna()
#x_nans
#x_nans.reset_index()

Unnamed: 0,index,Fatal (Y/N)
0,0,False
1,1,False
2,2,False
3,3,False
4,4,False
...,...,...
5758,6297,False
5759,6298,False
5760,6299,False
5761,6300,False
