In [3]:
#importing libraries, the data and shaping the latter as a dataframe
import pandas as pd
import numpy as np
attacks = pd.read_csv("ignoredstuff/attacks.csv",encoding = "ISO-8859-1")

# A. Initial exploration and general basic cleaning

In [4]:
#df's shape
attacks.shape

(25723, 24)

In [5]:
#df's columns
columns_attacks = attacks.columns
columns_attacks

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [6]:
#a quick view...
attacks.sample(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
3052,1980.11.17,17-Nov-1980,1980.0,Invalid,SOUTH AFRICA,KwaZulu-Natal,Mnini,Swimming,Baba Sibaya,M,...,Shark involvement prior to death was not confi...,"W.O. Hutt; G. Charter, B. Davis, Natal Sharks...",1980.11.17-BabaSibaya.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1980.11.17,1980.11.17,3251.0,,
14541,,,,,,,,,,,...,,,,,,,,,,
20907,,,,,,,,,,,...,,,,,,,,,,
16792,,,,,,,,,,,...,,,,,,,,,,
6056,1845.00.00.R,Reported 1845,1845.0,Unprovoked,SIERRA LEONE,,,,"""The Queen's Chaplain""",M,...,,Journal of an African Cruiser,1845.00.00.R-SierraLeone.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1845.00.00.R,1845.00.00.R,247.0,,
16214,,,,,,,,,,,...,,,,,,,,,,
19679,,,,,,,,,,,...,,,,,,,,,,
15762,,,,,,,,,,,...,,,,,,,,,,
13861,,,,,,,,,,,...,,,,,,,,,,
7452,0,,,,,,,,,,...,,,,,,,,,,


In [7]:
#elemental informatio
attacks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [8]:
#looking for NaNs
attacks.isna().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

In [9]:
#given the high number of NaNs it will be better to work with them before worrying about descriptive statistics...

In [10]:
#deleting all the rows exclusively composed by null values
attacks_1_1 = attacks.dropna(how="all")

In [11]:
#check again the number of NaNs
attacks_1_1.isna().sum()
#17020 rows contained only null values and were thus deleted


Case Number                  1
Date                      2401
Year                      2403
Type                      2405
Country                   2451
Area                      2856
Location                  2941
Activity                  2945
Name                      2611
Sex                       2966
Age                       5232
Injury                    2429
Fatal (Y/N)               2940
Time                      5755
Species                   5239
Investigator or Source    2418
pdf                       2401
href formula              2402
href                      2401
Case Number.1             2401
Case Number.2             2401
original order            2394
Unnamed: 22               8702
Unnamed: 23               8701
dtype: int64

In [12]:
#checking the shape of our new "cleaner" df
attacks_1_1.shape

(8703, 24)

In [13]:
#comparing the info provided by .shape and the sum of null given by .isna().sum()
#the columns "Unnamed: 22" & "Unnamed: 23" will be deleted as the contain basically null values
#which are useless and create a gap of such a mgnitude that is simply unfeasible to fill 

attacks_1_2 = attacks_1_1.drop(["Unnamed: 22" , "Unnamed: 23"], axis=1)


In [14]:
#our new df's columns
columns_attacks_1_2 = attacks_1_2.columns
columns_attacks_1_2

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [15]:
#taking a look at a sample
attacks_1_2.sample(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
4998,1932.01.11,11-Jan-1932,1932.0,Boating,AUSTRALIA,Victoria,Frankston,Fishing,boat,,...,UNKNOWN,,Grey nurse shark,G.P. Whitley citing Herald (Melbourne) 1/12/1932,1932.01.11-boatFrankston.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1932.01.11,1932.01.11,1305.0
5830,1879.11.10,10-Nov-1879,1879.0,Unprovoked,AUSTRALIA,New South Wales,Clarence Heads,Swimming,Goddard,M,...,N,,,Maitland Mercury & Hunter River General Advert...,1879.11.10-Goddard.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1879.11.10,1879.11.10,473.0
6994,0,,,,,,,,,,...,,,,,,,,,,
4834,1937.10.27.b,27-Oct-1937,1937.0,Unprovoked,AUSTRALIA,Queensland,"Kirra Beach, Coolangatta",Swimming,Jack Brinkley,M,...,Y,17h30,Tiger shark,"J. Green; V.M. Coppleson, pp.51, 81-84 & 234; ...",1937.10.27.b-Brinkley.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1937.10.27.b,1937.10.27.b,1469.0
7117,0,,,,,,,,,,...,,,,,,,,,,
4890,1936.01.05,05-Jan-1936,1936.0,Invalid,AUSTRALIA,Queensland,"Main Beach, Southport",Surfing,Kevin Canavan,M,...,,,Shark involvement prior to death was not confi...,"Canberra Times, 1/7/1936; V.M. Coppleson, p.92-93",1936.01.05-Canavan.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1936.01.05,1936.01.05,1413.0
14,2018.05.00,May 2018,2018.0,Provoked,AUSTRALIA,Westerm Australia,Dugong Bay,Feeding sharks,Melisa Brunning,F,...,N,,"Tawny nurse shark, 2m","Perth Now, 6/30/2018",2018.05.00-Brunning.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.00,2018.05.00,6289.0
2727,1988.07.00,Jul-1988,1988.0,Unprovoked,BAHAMAS,,,Spearfishing,Peter Albury,M,...,N,,,"E. Pace, GSAF",1988.07.00-Albury.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1988.07.00,1988.07.00,3576.0
5076,1929.04.04,04-Apr-1929,1929.0,Unprovoked,AUSTRALIA,Torres Strait,Badu Island,Swimming between boats,"Ned Luffman, a Torres Strait Islander",M,...,Y,,,"V.M. Coppleson (1958), p.242",1929.04.04-Luffman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1929.04.04,1929.04.04,1227.0
8319,0,,,,,,,,,,...,,,,,,,,,,


# B. The Hypotheses to test

It has been decided that the hypotheses to test are the following:

- Shark attacks' fatality rate has fallen over time
- Shark attacks' fatality rate is negatively correlated with the level of development of the place where the attack takes place

# C. Reshaping and cleaning the dataset according to Hypotheses to test

The formulated hypotheses imply that fatality will be our main variable of interest. Besides that, included variable which are to be inspected are those related to time and location. These are respectively "Date", "Year" "Time" plus those related to Case Number on the one hand, and "Area" "Location" and "Country" on the other.

In [18]:
#first of all, all the variable which are not of interest will be discarded
columns_to_discard = ['Type', 'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Time','Species ', 'Investigator or Source', 
                      'pdf', 'href formula', 'href','original order']

attacks_1_3= attacks_1_2.drop(columns_to_discard, axis=1)


In [19]:
#taking a look at this new refined df
print(attacks_1_3.shape)
attacks_1_3.sample(10)

(8703, 9)


Unnamed: 0,Case Number,Date,Year,Country,Area,Location,Fatal (Y/N),Case Number.1,Case Number.2
3799,1962.08.23,23-Aug-1962,1962.0,SOUTH AFRICA,Western Cape Province,Robbesteen,N,1962.08.23,1962.08.23
7495,0,,,,,,,,
6535,0,,,,,,,,
4700,1942.11.00.a,Nov-1942,1942.0,,Off South American coast,,Y,1942.11.00.a,1942.11.00.a
605,2013.10.31,31-Oct-2013,2013.0,USA,Hawaii,"Kanaha Beach, Maui",N,2013.10.31,2013.10.31
2641,1990.04.06,06-Apr-1990,1990.0,AUSTRALIA,Queensland,Fingal Beach,N,1990.04.06,1990.04.06
1504,2006.02.01.b,01-Feb-2006,2006.0,TONGA,Vava'u,Tuanuku,Y,2006.02.01.b,2006.02.01.b
1567,2005.07.17.b,17-Jul-2005,2005.0,LIBERIA,,,,2005.07.17.b,2005.07.17.b
88,2017.09.10.b,10-Sep-2017,2017.0,AUSTRALIA,Westerm Australia,Sam's Creek area,N,2017.09.10.b,2017.09.10.b
7991,0,,,,,,,,


In [46]:
#resetting the df's index
attacks_1_3_1 = attacks_1_3.reset_index()
display(attacks_1_3_1)

Unnamed: 0,index,Case Number,Date,Year,Country,Area,Location,Fatal (Y/N),Case Number.1,Case Number.2
0,0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N,2018.06.25,2018.06.25
1,1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N,2018.06.18,2018.06.18
2,2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N,2018.06.09,2018.06.09
3,3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N,2018.06.08,2018.06.08
4,4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...
8698,8698,0,,,,,,,,
8699,8699,0,,,,,,,,
8700,8700,0,,,,,,,,
8701,8701,0,,,,,,,,


In [39]:
x_nans = attacks_1_3_1["Fatal (Y/N)"].isna()
#print(x_nans)
#x_nans[8698]


0       False
1       False
2       False
3       False
4       False
        ...  
8698     True
8699     True
8700     True
8701     True
8702     True
Name: Fatal (Y/N), Length: 8703, dtype: bool


### C.1 Fatality rate across time

### Z. Ideas, experiemntos e información variada (En Español)

In [None]:
#df version index and description:

#attacks -> Raw data
#attacks_1_1 -> Rows where all values are Nulls are deletes
#attacks_1_2 -> Columns "Unnamed: 22" & "Unnamed: 23" deleted
#attacks_1_3 -> Columns which are not of interest to the hypotheses test are discarded
    #attacks_1_3_1 -> equal to attacks_1_3 but with resetted index column

In [None]:
#nombres "ocupados"

#columns_to_discard

In [None]:
#sites of interest

#https://www.sharkattackfile.net/species.htm
#https://es.wikipedia.org/wiki/Selachimorpha
#https://es.wikipedia.org/wiki/Ataque_de_tibur%C3%B3n

In [64]:
def row_nan_out(x,y):
    #this function eliminate all rows of a dataframe for which a given column's value is null
    #argument "x" is supposed to capture the df's name and "y" the name of the refference column
    
    x_nans = x[y].isna()
    
    if "True" in x_nans:
        print("Se encontaron valores nulos a eliminar en los datos.")
        for i in range(0, len(x_nans)):
            if x_nans[i] == True:
                x.drop(i, axis=0, inplace=True)
        return x

    else:
        print("No se encontraron valores nulos en los datos proporcionados.")
        return x
    
    

In [65]:
test = row_nan_out(attacks_1_3_1,"Fatal (Y/N)")



No se encontraron valores nulos en los datos proporcionados.
