# Shark Attack Cleaner

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Load the raw attack records.
# 'ANSI' is only a Windows alias for the locale code page (mbcs) and raises
# LookupError on other platforms; cp1252 is the portable name of the Western
# Windows code page.
df = pd.read_csv('attacks.csv', sep=',', encoding='cp1252')

In [3]:
# Show every column when a DataFrame is rendered (no truncation with "...").
pd.options.display.max_columns = None

In [4]:
# Column dtypes and non-null counts, followed by a preview of the first rows.
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


#### Limpando colunas inúteis (nulas e com dados repetidos de outras colunas)

In [46]:
# Drop the empty columns and the ones that repeat information: keep only the
# fields used by the fatality analysis and shorten 'Fatal (Y/N)' to 'Fatal'.
# The 'Sex ' entry of the rename map matches none of the selected columns.
df_fatal = (
    df[['Date', 'Year', 'Type', 'Country', 'Activity', 'Injury', 'Fatal (Y/N)']]
    .rename(columns={'Fatal (Y/N)': 'Fatal', 'Sex ': 'Sex'})
)
df_fatal.head(1)

Unnamed: 0,Date,Year,Type,Country,Activity,Injury,Fatal
0,25-Jun-2018,2018.0,Boating,USA,Paddling,"No injury to occupant, outrigger canoe and pad...",N


##### Limpando linhas totalmente nulas

In [47]:

# Index labels of the rows in which every selected column is null.
linenan = df_fatal.index[df_fatal.isna().all(axis=1)]
# Remove those rows and check the remaining non-null counts.
df_fatal = df_fatal.drop(index=linenan)
df_fatal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6302 entries, 0 to 6301
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      6302 non-null   object 
 1   Year      6300 non-null   float64
 2   Type      6298 non-null   object 
 3   Country   6252 non-null   object 
 4   Activity  5758 non-null   object 
 5   Injury    6274 non-null   object 
 6   Fatal     5763 non-null   object 
dtypes: float64(1), object(6)
memory usage: 393.9+ KB


## FATAL? 

### Limpando a coluna 'Fatal (Y/N)'

In [48]:
# List the distinct raw values of the column before cleaning and standardizing.
df_fatal['Fatal'].unique()

array(['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y'],
      dtype=object)

In [49]:
def limp_fatal(row):
    '''Standardize the fatality flag of one row to 1, 0 or np.nan.

    Parameters
    ----------
    row : mapping with the keys 'Fatal' and 'Injury' (e.g. a DataFrame row)

    Returns
    -------
    1      if the attack was fatal
    0      if the attack was not fatal
    np.nan if there is no usable information

    When 'Fatal' is missing (a float, i.e. NaN) or 'UNKNOWN', the result is
    inferred from the 'Injury' text: 1 if it contains "fatal" (case
    insensitive), 0 otherwise, np.nan if 'Injury' itself is missing.
    Otherwise 'n' / 'y' (any case, surrounding blanks ignored) map to 0 / 1,
    and the stray values '2017' and 'M' are counted as non-fatal.
    '''
    fatal = row['Fatal']

    if isinstance(fatal, float) or fatal == 'UNKNOWN':
        injury = row['Injury']
        # 'Injury' may be a real NaN or the 'NaN' placeholder string; neither
        # carries information (and a float has no .lower()).
        if not isinstance(injury, str) or injury == 'NaN':
            return np.nan
        # NOTE(review): a plain substring match also flags texts such as
        # "non-fatal" as fatal -- confirm against the data.
        return 1 if 'fatal' in injury.lower() else 0

    flag = fatal.strip().lower()
    if flag == 'n' or fatal in ('2017', 'M'):
        return 0
    if flag == 'y':
        return 1
    # Unrecognized flag: report it as missing explicitly instead of falling
    # through with an implicit None.
    return np.nan

###### A coluna 'Injury' contém dados ausentes da coluna 'Fatal'
Padronizando os dados de 'Injury' para que os nulos sejam strings

In [50]:
# Replace missing 'Injury' values with the placeholder string 'NaN'.
df_fatal['Injury'] = df_fatal['Injury'].fillna('NaN')

##### Rodando a função de limpeza de 'Fatal'
`limp_fatal()`

In [51]:
# Replace the raw flag with the standardized 1 / 0 / NaN value, row by row.
# Plain column assignment swaps in the new column with its own numeric dtype;
# `df.loc[:, col] = ...` writes into the existing object column in place on
# pandas >= 2.0 and would leave the values as objects.
df_fatal['Fatal'] = df_fatal.apply(limp_fatal, axis=1)

In [52]:
# Cleaned fatality series and how often each value occurs.
ind_fatal = df_fatal.loc[:, 'Fatal']
ind_fatal.value_counts()


0.0    4834
1.0    1441
Name: Fatal, dtype: int64

In [53]:
# Normalize country names: keep only word characters, spaces and '/', then
# trim. Trimming happens after the substitution so that removing a trailing
# symbol (e.g. "USA ?") cannot leave a dangling blank behind.
# The pattern is a raw string: '\w' in a normal string is an invalid escape.
# Missing / non-string entries stay NaN.
df_fatal['Country'] = (
    df_fatal['Country']
    .str.replace(r'[^\w /]', '', regex=True)
    .str.strip()
)

In [54]:
# Per-country summary of the fatality flag: number of records with a known
# flag (count), number of fatal attacks (sum) and fatality rate (mean),
# ordered by record count.
# Only 'Fatal' is aggregated (the grouping key is not selected as a value
# column), and the aggregations are given by name rather than via the builtin
# `sum`, whose use in `agg` is deprecated.
fat_country = (
    df_fatal
    .groupby('Country')[['Fatal']]
    .agg(['count', 'sum', 'mean'])
    .sort_values(by=[('Fatal', 'count')], ascending=False)
)
fat_country.head(50)

Unnamed: 0_level_0,Fatal,Fatal,Fatal
Unnamed: 0_level_1,count,sum,mean
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
USA,2224,198.0,0.089029
AUSTRALIA,1332,295.0,0.221471
SOUTH AFRICA,578,114.0,0.197232
PAPUA NEW GUINEA,134,56.0,0.41791
NEW ZEALAND,126,25.0,0.198413
BRAZIL,110,38.0,0.345455
BAHAMAS,109,10.0,0.091743
MEXICO,90,44.0,0.488889
ITALY,71,15.0,0.211268
FIJI,62,22.0,0.354839


In [63]:
# Boolean masks for rows whose 'Year' is 0 and rows whose 'Year' is missing.
ano_0 = df_fatal['Year'].eq(0)
ano_na = df_fatal['Year'].isna()
# Inspect the raw 'Date' text.
df_fatal.loc[:, 'Date']

0       25-Jun-2018
1       18-Jun-2018
2       09-Jun-2018
3       08-Jun-2018
4       04-Jun-2018
           ...     
6297    Before 1903
6298    Before 1903
6299      1900-1905
6300      1883-1889
6301      1845-1853
Name: Date, Length: 6302, dtype: object

In [107]:
# Rebuild 'Year' from the first 4-digit run found in the 'Date' text
# (raw-string pattern: '\d' in a normal string is an invalid escape).
ano = df_fatal.loc[:, 'Date'].str.extract(r'(\d{4})')
df_fatal['Year'] = ano[0].astype(float)

# Keep only the records from 1950 onwards; rows without a year compare False
# and are dropped as well.
dropano = df_fatal['Year'] >= 1950
# .copy() makes the filtered frame independent, so the assignment below does
# not raise SettingWithCopyWarning.
df_fatal = df_fatal.loc[dropano].copy()

# Plain column assignment so the integer dtype is actually applied
# (`df.loc[:, col] = ...` keeps the existing float dtype on pandas >= 2.0).
df_fatal['Year'] = df_fatal['Year'].astype(int)
df_fatal


Unnamed: 0,Date,Year,Type,Country,Activity,Injury,Fatal
0,25-Jun-2018,2018,Boating,USA,Paddling,"No injury to occupant, outrigger canoe and pad...",0.0
1,18-Jun-2018,2018,Unprovoked,USA,Standing,Minor injury to left thigh,0.0
2,09-Jun-2018,2018,Invalid,USA,Surfing,Injury to left lower leg from surfboard skeg,0.0
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,Surfing,Minor injury to lower leg,0.0
4,04-Jun-2018,2018,Provoked,MEXICO,Free diving,Lacerations to leg & hand shark PROVOKED INCIDENT,0.0
...,...,...,...,...,...,...,...
6254,Before 1957,1957,Provoked,CUBA,"Shark fishing, knocked overboard","FATAL, hip bitten PROVOKED INCIDENT",1.0
6255,Before 1956,1956,Unprovoked,MARSHALL ISLANDS,Swimming,Buttocks bitten,0.0
6256,Before 1956,1956,Unprovoked,KIRIBATI,Diving,No injury,0.0
6257,Before Mar-1956,1956,Unprovoked,NORTH PACIFIC OCEAN,"Fishing, wading with string of fish",Survived,0.0


In [113]:
# Records whose incident type is 'Boating'.
df_fatal[df_fatal['Type'].eq('Boating')]

Unnamed: 0,Date,Year,Type,Country,Activity,Injury,Fatal
0,25-Jun-2018,2018,Boating,USA,Paddling,"No injury to occupant, outrigger canoe and pad...",0.0
86,Sep-2017,2017,Boating,AUSTRALIA,Fishing,"sharks rammed boats, no injury to occupants",0.0
105,01-Aug-2017,2017,Boating,USA,Kayaking / Fishing,"No injury, bow of kayak bitten",0.0
116,20-Jul-2017,2017,Boating,USA,Kayaking,"No injury, kayak bitten",0.0
120,11-Jul-2017,2017,Boating,USA,Kayaking,"No injury, kayak bitten",0.0
...,...,...,...,...,...,...,...
4413,06-Apr-1952,1952,Boating,AUSTRALIA,Fishing for white sharks,No injury to fisherman Alf Dean & other occupa...,0.0
4460,12-Nov-1950,1950,Boating,AUSTRALIA,Paddling a canoe,"No injury to occupants, shark holed canoet",0.0
4465,Reported 27-Jul-1950,1950,Boating,AUSTRALIA,Fishing,No injury to occupants,0.0
4475,01-May-1950,1950,Boating,ITALY,Fishing on a boat,No injury,0.0
