In [1]:
#!pip install spacy
#!pip install spacy en_core_web_sm

#import nltk
#nltk.download('punkt')

import pandas as pd
import numpy as np
import re
#import spacy
#from spacy import displacy

pd.set_option('display.max_columns', None)  # me muestre todas las columnas
pd.set_option('display.max_colwidth', 100)  #me muestre más caracteres por columna.

import warnings
warnings.filterwarnings('ignore')

# para pintar
import pylab as plt
import seaborn as sns

# para que salga el grafico
%matplotlib inline

In [2]:
data = pd.read_csv('attacks.csv', encoding= "ISO-8859-1") 

In [3]:
data.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [4]:
data[data['Unnamed: 23'] == 'Teramo']           

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
4415,1952.03.30,30-Mar-1952,1952.0,Unprovoked,NETHERLANDS ANTILLES,Curacao,,Went to aid of child being menaced by the shark,A.J. Eggink,M,,"Buttock bitten, tissue removed",N,,"Bull shark, 2.7 m [9'] was captured & dragged on the sand where tissue taken from Eggink was fou...","J. Randall, p.352 in Sharks & Survival; H.D. Baldridge, p.172",1952.03.30-Eggink.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1952.03.30-Eggink.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1952.03.30-Eggink.pdf,1952.03.30,1952.03.30,1888.0,,Teramo


In [5]:
data[data['Unnamed: 23'] == 'change filename']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
5840,1878.09.14.R,Reported 14-Sep-1878,1878.0,Provoked,USA,Connecticut,"Branford, New Haven County",Fishing,Captain Pattison,M,,Leg bitten by netted shark PROVOKED INCIDENT,N,,,"St. Joseph Herald, 9/14/1878",1878.09.14.R-Pattison.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1878.09.14.R-Pattison.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1878.09.14.R-Pattison.pdf,1878.09.14.R,1878.09.14.R,463.0,,change filename


In [6]:
data['Unnamed: 23'].value_counts()

Teramo             1
change filename    1
Name: Unnamed: 23, dtype: int64

In [7]:
data['Case Number'].value_counts()

0               2400
1920.00.00.b       2
1966.12.26         2
2014.08.02         2
1990.05.10         2
                ... 
1999.09.05         1
1999.09.10         1
1999.09.16         1
1999.09.18         1
xx                 1
Name: Case Number, Length: 6287, dtype: int64

# Glosario de columnas y tipos iniciales

0. Case Number: La fecha coincide con el case number. OBJETC
1. Date: La fecha del ataque. OBJECT
2. Year: El año del ataque. FLOAT64
3. Type: Tipo de ataque (provocado, no provocado, desastre marino, etc). OBJECT
4. Country: País del ataque. OBJECT
5. Area: Area del país. OBJECT
6. Location: Localización concreta dentro del area. OBJECT
7. Activity: La actividad que se estaba realizando durante el ataque. OBJECT
8. Name: Nombre de la víctima. OBJECT
9. Sex : Género de la víctima. OBJECT
10. Age: Edad de la víctima. OBJECT
11. Injury: Tipo de lesión provocada. OBJECT
12. Fatal (Y/N): Muerte o no de la victima. OBJECT
13. Time: Hora del ataque. OBJECT
14. Species : Especie del tiburón del ataque. OBJECT
15. Investigator or Source: Investigador del ataque + organización a la que pertenece. OBJECT
16. pdf: Imagino que un pdf del informe del ataque por víctima. OBJECT
17. href formula: Enlace al informe en pdf del apartado anterior. OBJECT
18. href: Parece una columna con lo mismo que lo anterior pero tiene menos filas. OBJECT
19. Case Number.1: Columna igual que Case Number pero con dos filas menos. OBJECT
20. Case Number.2: Columna igual que Case Number pero con una fila menos. OBJECT
21. Original Order: Parece un id del caso, a priori mayor cuanto más reciente. FLOAT64
22. Unnamed: 22: No sé qué significa, todo NaNs salvo el 1478 "stopped here". OBJECT
23. Unnamed: 23: No sé qué significa, todo NaNs salvo el 4415 "Teramo" y el 5840 "change filename". OBJECT

# Restricciones:
- No se pueden eliminar columnas.
- Deben quedar al menos 1500 filas.

# Primeras consideraciones

- Como no podemos eliminar columnas, el % de nulos por fila no es tan relevante.
- Sí podemos quitar todas las filas en las que todos los valores sean nulos. 
- Sí podemos quitar todas las filas duplicadas.
- Arreglar los nombres de las columnas para trabajar mejor con ellas.


In [8]:
data_ori_shape = data.shape

data_ori = data.copy()

data.shape

(25723, 24)

In [9]:
data = data.dropna(how='all')

In [10]:
data.shape, data_ori.shape

((8703, 24), (25723, 24))

In [11]:
data= data.drop_duplicates()

In [12]:
data.shape

(6311, 24)

Tras hacerlo, las dimensiones pasan a ser: (6311, 24)

Ahora voy a cambiar los nombres de las columnas para homogeneizarlas, por si acaso.

In [13]:
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [14]:
data.columns = [e.replace(' ', '_') for e in data.columns]

data.columns = [e.replace('.', '_') for e in data.columns]

data.columns = [e.replace(':', '') for e in data.columns]

In [15]:
data.rename(columns={'Species_': 'Species', 'Sex_': 'Sex', 'Fatal_(Y/N)': 'Fatal'}, inplace=True)

In [16]:
num_unique_values = data.Type.nunique()

En la celda anterior he descubierto que las últimas 9 filas solo tienen un valor no nulo (original_order), así que las elimino.

In [17]:
num_unique_values

8

In [18]:
data['href'].value_counts()

http://sharkattackfile.net/spreadsheets/pdf_directory/w014.01.25-Grant.pdf                 4
http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.02.b-Vandenberg.pdf          3
http://sharkattackfile.net/spreadsheets/pdf_directory/1923.00.00.a-NJ fisherman.pdf        2
http://sharkattackfile.net/spreadsheets/pdf_directory/1934.12.23.a-b-Inman.pdf             2
http://sharkattackfile.net/spreadsheets/pdf_directory/1929.03.04.a-b.Roads-Aldridge.pdf    2
                                                                                          ..
http://sharkattackfile.net/spreadsheets/pdf_directory/1999.09.10-Warnock.pdf               1
http://sharkattackfile.net/spreadsheets/pdf_directory/1999.09.16-Ferguson.pdf              1
http://sharkattackfile.net/spreadsheets/pdf_directory/1999.09.18-Wallersheim.pdf           1
http://sharkattackfile.net/spreadsheets/pdf_directory/1999.09.24-boat.pdf                  1
http://sharkattackfile.net/spreadsheets/pdf_directoryND-0001-Ceylon.pd

In [19]:
print(data.index.is_unique)

True


In [20]:
data['Year'].iloc[15:]

15       2018.0
16       2018.0
17       2018.0
18       2018.0
19       2018.0
          ...  
6306        NaN
6307        NaN
6308        NaN
6309        NaN
25722       NaN
Name: Year, Length: 6296, dtype: float64

In [21]:
len(data['Species'].unique())

1550

In [22]:
data['Year']

0        2018.0
1        2018.0
2        2018.0
3        2018.0
4        2018.0
          ...  
6306        NaN
6307        NaN
6308        NaN
6309        NaN
25722       NaN
Name: Year, Length: 6311, dtype: float64

In [23]:
print(data.iloc[-10:])

      Case_Number       Date  Year        Type             Country  \
6301      ND.0001  1845-1853   0.0  Unprovoked  CEYLON (SRI LANKA)   
6302            0        NaN   NaN         NaN                 NaN   
6303            0        NaN   NaN         NaN                 NaN   
6304            0        NaN   NaN         NaN                 NaN   
6305            0        NaN   NaN         NaN                 NaN   
6306            0        NaN   NaN         NaN                 NaN   
6307            0        NaN   NaN         NaN                 NaN   
6308            0        NaN   NaN         NaN                 NaN   
6309            0        NaN   NaN         NaN                 NaN   
25722          xx        NaN   NaN         NaN                 NaN   

                   Area                             Location  Activity  Name  \
6301   Eastern Province  Below the English fort, Trincomalee  Swimming  male   
6302                NaN                                  NaN       Na

In [24]:
print(data['Case_Number_2'].is_monotonic_increasing)

False


In [25]:
data['Date'].value_counts()

1957                          11
1942                           9
1956                           8
1958                           7
1950                           7
                              ..
01-Dec-1994                    1
Reported      10-Dec-1994      1
11-Dec-1994                    1
13-Dec-1994                    1
1845-1853                      1
Name: Date, Length: 5433, dtype: int64

In [26]:
mask = data.isnull()            # Me devuelve una mascara booleana de toda la tabla.

In [27]:
null_values_count = mask.T.sum()  # Me devuelve el nº de NaN que hay por fila.

In [28]:
null_values_count

0         2
1         3
2         3
3         4
4         4
         ..
6306     22
6307     22
6308     22
6309     23
25722    23
Length: 6311, dtype: int64

In [29]:
nan_por_linea_100 = (null_values_count/len(data.T)) *100

In [30]:
nan_por_linea_100.min()

8.333333333333332

In [31]:
mask2 = nan_por_linea_100 > 50 

In [32]:
mask2

0        False
1        False
2        False
3        False
4        False
         ...  
6306      True
6307      True
6308      True
6309      True
25722     True
Length: 6311, dtype: bool

In [33]:
data = data[~mask2]

In [34]:
data.shape

(6302, 24)

In [35]:
data.isnull().T.sum()

0       2
1       3
2       3
3       4
4       4
       ..
6297    5
6298    6
6299    5
6300    7
6301    4
Length: 6302, dtype: int64

In [36]:
'''Utilizo la función to_datetime para convertir los valores de la columna 'Date' a objetos de fecha, y utilizo
el parámetro "errors='coerce'" para que los valores que no se pueden convertir se reemplacen con NaT (Not a Time).
'''
#data['Date'] = pd.to_datetime(data['Date'], errors='coerce')

'Utilizo la función to_datetime para convertir los valores de la columna \'Date\' a objetos de fecha, y utilizo\nel parámetro "errors=\'coerce\'" para que los valores que no se pueden convertir se reemplacen con NaT (Not a Time).\n'

In [37]:
print(data['Date'].dtypes)

object


In [38]:
data['Year'].is_monotonic_decreasing

False

In [39]:
data.head()

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and paddle damaged",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.04-Ramos.pdf,2018.06.04,2018.06.04,6299.0,,


In [40]:
data['Date'].iloc[-150:-100]

6152                        1749
6153                        1755
6154                        1748
6155                 17-Dec-1742
6156        Reported 06-Apr-1738
6157                        1733
6158                        1723
6159                   June 1721
6160                 26-Mar-1703
6161                       1700s
6162                       1700s
6163                       1700s
6164    Late 1600s Reported 1728
6165               Reported 1638
6166               Reported 1637
6167               Reported 1617
6168                        1642
6169                        1595
6170    Letter dated 10-Jan-1580
6171                        1555
6172                    Ca. 1554
6173                    Ca. 1543
6174              Circa 500 A.D.
6175                    77  A.D.
6176                  Ca. 5 A.D.
6177                Ca. 214 B.C.
6178               Ca. 336.B.C..
6179                    493 B.C.
6180                Ca. 725 B.C.
6181                 Before 1939
6182      

In [41]:
data = data.sort_values(by='Year',ascending=False)
data = data.reset_index(drop=True)


In [42]:
data.iloc[-150:-100]

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
6152,1748.00.00,1748,1748.0,Unprovoked,PANAMA,Las Perlas archipelago,Taboga & Isla del Rey,Pearl diving,African slaves,M,,FATAL,Y,,,"J. Castro, et al",1748.00.00.R-LasPerlas.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1748.00.00.R-LasPerlas.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1748.00.00.R-LasPerlas.pdf,1748.00.00,1748.00.00,149.0,,
6153,1742.12.17,17-Dec-1742,1742.0,Unprovoked,,,Carlisle Bay,Swimming,2 impressed seamen,M,,FATAL,Y,,,"C. Moore, GSAF",1742.12.17-AdviceSeamen.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1742.12.17-AdviceSeamen.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1742.12.17-AdviceSeamen.pdf,1742.12.17,1742.12.17,148.0,,
6154,1738.04.06.R,Reported 06-Apr-1738,1738.0,Unprovoked,ITALY,Sicily,Strait of Messina,Swimming,male,M,,FATAL,Y,,,"C. Moore, GSAF",1738.04.06.R-Messina.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1738.04.06.R-Messina.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1738.04.06.R-Messina.pdf,1738.04.06.R,1738.04.06.R,147.0,,
6155,1733.00.00,1733,1733.0,Invalid,ICELAND,Bardestrand,Talkknefiord,,,,,"Partial hominid remains recovered from shark, probable drowning and scavenging",,,Shark involvement prior to death unconfirmed,E. Olafsen,1733.00.00-Iceland.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1733.00.00-Iceland.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1733.00.00-Iceland.pdf,1733.00.00,1733.00.00,146.0,,
6156,1723.00.00,1723,1723.0,Unprovoked,ROATAN,,,,Philip Ashton,M,,Struck on thigh,,,,"C.Moore, GSAF",1730.00.00-Ashton.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1730.00.00-Ashton.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1730.00.00-Ashton.pdf,1723.00.00,1723.00.00,145.0,,
6157,1721.06.00,June 1721,1721.0,Unprovoked,ITALY,Sardinia,"Ponte della Maddelena,",Swimming,male,M,,"FATAL, partial remains recovered from sharks gut",Y,,"White shark, 1600-lb female",F. Ricciardi; A. De Maddalena.,1721.06.00-Maddalena.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1721.06.00-Maddalena.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1721.06.00-Maddalena.pdf,1721.06.00,1721.06.00,144.0,,
6158,1703.03.26,26-Mar-1703,1703.0,Unprovoked,BARBADOS,Southwest coast,Carlisle Bay,Swimming,"Samuel Jennings, a deserter from the British frigate Milford",M,19,"Hand and foot severely bitten, surgically amputated",N,Night,,"W.R.Cutter, Vol.1, p.252",1703.03.26-Jennings.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1703.03.26-Jennings.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1703.03.26-Jennings.pdf,1703.03.26,1703.03.26,143.0,,
6159,1700.00.00.c,1700s,1700.0,Unprovoked,FRANCE,,Nice,,child,M,,FATAL,Y,,,"A. De Maddalena, citing Cazeils (1998)",1700.00.00.c-Nice.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1700.00.00.c-Nice.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1700.00.00.c-Nice.pdf,1700.00.00.c,1700.00.00.c,142.0,,
6160,1700.00.00.b,1700s,1700.0,Unprovoked,FRANCE,Côte d'Azur,Antibes,Bathing,seaman,M,,Leg severed,N,,White shark,"A. De Maddalena, citing Cazeils (1998)",1700.00.00.b-Antibes.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1700.00.00.b-Antibes.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1700.00.00.b-Antibes.pdf,1700.00.00.b,1700.00.00.b,141.0,,
6161,1700.00.00.a,1700s,1700.0,Unprovoked,BARBADOS,,,Bathing,seaman from the York,M,,FATAL,Y,,,"Tioga Eagle, 10.26/ 1842",1700.00.00.a-Barbados.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1700.00.00.a-Barbados.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1700.00.00.a-Barbados.pdf,1700.00.00.a,1700.00.00.a,140.0,,


In [43]:
data['Year'].loc[:6171].is_monotonic_decreasing

True

In [44]:
data = data.loc[:6171]

In [45]:
print(data['Date'].iloc[-100:-80])

6072              14-Apr-1839
6073                1839/1840
6074                 Ca. 1839
6075              17-Jan-1837
6076                 Ca. 1837
6077                 Jul-1837
6078              09-Sep-1837
6079                 1836.00.
6080             1836.07.26.R
6081     Reported 21-Feb-1835
6082              06-Jan-1835
6083     Reported 15-Jul-1834
6084              04-Jun-1832
6085     Reported 23-Jan-1832
6086              24-Aug-1831
6087    Reported 22- Jan-1831
6088     Reported 22-Apr-1830
6089              30-Apr-1830
6090     Reported 02-Jul-1830
6091              26-Jul-1830
Name: Date, dtype: object


In [46]:

data['Date'] = pd.to_datetime(data['Date'], errors='coerce').dt.month


In [47]:
print(data['Date'].iloc[-100:-80])

6072    4.0
6073    NaN
6074    NaN
6075    1.0
6076    NaN
6077    7.0
6078    9.0
6079    NaN
6080    NaN
6081    NaN
6082    1.0
6083    NaN
6084    6.0
6085    NaN
6086    8.0
6087    NaN
6088    NaN
6089    4.0
6090    NaN
6091    7.0
Name: Date, dtype: float64


In [48]:
data['Date'].isna().value_counts()

False    5458
True      714
Name: Date, dtype: int64

In [49]:
data['Date'].fillna(data['Date'].mode()[0], inplace=True)

In [50]:
print(data['Date'].iloc[-100:-90])

6072    4.0
6073    1.0
6074    1.0
6075    1.0
6076    1.0
6077    7.0
6078    9.0
6079    1.0
6080    1.0
6081    1.0
Name: Date, dtype: float64


In [51]:
data.rename(columns={'Date': 'Month'}, inplace=True)

In [52]:
data['Month'].loc[4537]

1.0

In [53]:
data['Month'] = data['Month'].astype(int)

In [54]:
data['Case_Number'].iloc[1490:1545]

1490      2006.09.02
1491    2006.07.31.R
1492    2006.08.29.b
1493      2006.07.12
1494      2006.07.13
1495      2006.07.17
1496    2006.07.17.R
1497      2006.07.23
1498      2006.07.25
1499      2006.07.28
1500    2006.07.29.a
1501    2006.07.29.b
1502    2006.07.31.a
1503      2006.07.10
1504    2006.08.00.a
1505    2006.08.00.b
1506      2006.08.13
1507      2006.08.15
1508    2006.08.20.a
1509    2006.08.20.b
1510    2006.08.22.a
1511    2006.08.22.b
1512      2006.08.27
1513    2006.08.29.a
1514      2005.05.25
1515      2005.06.04
1516      2005.06.02
1517      2005.05.28
1518      2005.05.27
1519      2005.05.02
1520      2005.05.15
1521      2005.05.14
1522      2005.05.03
1523    2005.05.02.R
1524      2005.06.07
1525      2005.06.05
1526      2005.06.22
1527      2005.06.13
1528      2005.06.16
1529      2005.06.18
1530      2005.06.21
1531    2005.04.17.R
1532      2005.06.25
1533      2005.07.01
1534      2005.07.13
1535      2005.07.15
1536    2005.07.15.R
1537    2005.

In [55]:
data['Case_Number'] = data['Case_Number'].str.replace(r'[a-zA-Z]', '')

In [56]:
data['Case_Number'] = pd.to_datetime(data['Case_Number'], errors='coerce').dt.day

In [57]:
import random

days = []

for i in data.index:
    
    if pd.isnull(data['Case_Number'].iloc[i]):
        
        days.append(random.randint(1,28))
    else:
        
        days.append(data['Case_Number'].iloc[i])
        
data['Case_Number'] = days

In [58]:
data.at[5403, 'Case_Number']

24.0

In [59]:
data['Case_Number'].iloc[5403]

24.0

In [60]:
data.rename(columns={'Case_Number': 'Day'}, inplace=True)

In [61]:
data['Day'] = data['Day'].astype(int)

In [62]:
data.original_order.iloc[-10:]

6162    139.0
6163    135.0
6164    138.0
6165    137.0
6166    136.0
6167    134.0
6168    133.0
6169    132.0
6170    131.0
6171    130.0
Name: original_order, dtype: float64

In [63]:
data['original_order'] = data['original_order'].astype(int)

In [64]:
data['Year'] = data['Year'].astype(int)

In [65]:
data.head()

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
0,25,6,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and paddle damaged",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,2018.06.25,2018.06.25,6303,,
1,23,2,2018,Unprovoked,AUSTRALIA,New South Wales,"Little Congwong Beach, La Perouse",Swimming,Anna Shurapey,F,55.0,Laceratons to right leg & foot,N,"19h00, Dusk","Juvenile white shark, 2.7 to 3.2 m","B. Myatt, GSAF",2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,2018.02.23,2018.02.23,6263,,
2,15,4,2018,Unprovoked,AUSTRALIA,Western Australia,"Cobblestones, Margaret River Area",Surfing,Alejandro Travaglini,M,37.0,Lacerations to legs,N,08h00,,"B.Myatt, GSAF",2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,2018.04.15.a,2018.04.15.a,6274,,
3,14,4,2018,Unprovoked,BAHAMAS,New Providence,Nirvana Beach,Surfing,Bruce Rowan,M,,No Injury. Shark swam away with the surf board,N,09h30,Tiger shark,"Tribune242,",2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,2018.04.14,2018.04.14,6273,,
4,10,1,2018,Invalid,BRAZIL,Alagoas,"Praia de Sauaçuhy, Maceió",Fishing,Josias Paz,M,56.0,Injury to ankle from marine animal trapped in weir PROVOKED INCIDENT.,N,,Shark involvement not confirmed,"K. McMurray, TrackingSharks.com",2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,2018.04.10.R,2018.04.10.R,6272,,


In [66]:
data.Type.value_counts()

Unprovoked      4483
Provoked         567
Invalid          544
Sea Disaster     233
Boating          203
Boat             135
Questionable       2
Boatomg            1
Name: Type, dtype: int64

In [67]:
data['Type'] = data['Type'].replace({'Boatomg': 'Boat'})

In [68]:
data['Type'] = data['Type'].replace({'Boat': 'Boating'})

In [69]:
data.Type.value_counts()

Unprovoked      4483
Provoked         567
Invalid          544
Boating          339
Sea Disaster     233
Questionable       2
Name: Type, dtype: int64

In [70]:
data.Type.dtype

dtype('O')

In [71]:
data[data['Type'] == 'Boatomg']

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23


In [72]:
data[data['Type'] == 'Questionable']

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
47,25,4,2018,Questionable,AUSTRALIA,New South Wales,Lennox Head,Surfing,Matthew Lee,M,,No injury,N,07h00,Questionable,"B. Myatt, GSAF",2018.04.25.b-Lee.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.25.b-Lee.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.25.b-Lee.pdf,2018.04.25.b,2018.04.25.b,6283,,
51,9,5,2018,Questionable,AUSTRALIA,New South Wales,"Sharpes Beach, Ballina",Surfing,male,M,,"No injury, surfboard damaged",N,10h30,Shark involvement not confirmed,"B. Myatt, GSAF",2018.05.09-SharpesBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.05.09-SharpesBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.05.09-SharpesBeach.pdf,2018.05.09,2018.05.09,6287,,


In [73]:
data = data.drop(data[data['Type'] == 'Questionable'].index)

In [74]:
data = data.reset_index(drop=True)

In [75]:
data.Country.unique()

array(['USA', 'AUSTRALIA', 'BAHAMAS', 'BRAZIL', 'NEW CALEDONIA',
       'SOUTH AFRICA', 'ECUADOR', 'THAILAND', 'ENGLAND', 'MEXICO',
       'MALDIVES', 'COSTA RICA', 'UNITED ARAB EMIRATES',
       'ST HELENA, British overseas territory', 'REUNION', 'NEW ZEALAND',
       'UNITED KINGDOM', 'FRENCH POLYNESIA', 'SPAIN', 'COMOROS',
       'INDONESIA', 'PHILIPPINES', 'MAURITIUS', 'LIBYA', nan, 'CUBA',
       'SAMOA', 'MALAYSIA', 'EGYPT', 'SOLOMON ISLANDS', 'JAPAN',
       'COLUMBIA', 'CAPE VERDE', 'CAYMAN ISLANDS', 'DOMINICAN REPUBLIC',
       'Fiji', 'CHINA', 'PUERTO RICO', 'ATLANTIC OCEAN', 'ITALY',
       'MOZAMBIQUE', 'ARUBA', 'FIJI', 'FRANCE', 'ST. MARTIN',
       'TRINIDAD & TOBAGO', 'PAPUA NEW GUINEA', 'GREECE',
       'PALESTINIAN TERRITORIES', 'JAMAICA', 'TAIWAN', 'DIEGO GARCIA',
       'BELIZE', 'SEYCHELLES', 'GUAM', 'ISRAEL', 'KIRIBATI', 'CHILE',
       'SAUDI ARABIA', 'CROATIA', 'NIGERIA', 'TONGA', 'CANADA',
       'SCOTLAND', 'TURKS & CAICOS', 'UNITED ARAB EMIRATES (UAE)',
      

In [76]:
data['Country'] = data['Country'].replace({'Fiji': 'FIJI', 'ST HELENA, British overseas territory': 'UNITED KINGDOM OVERSEAS', 'DIEGO GARCIA': 'UNITED KINGDOM OVERSEAS', 'Sierra Leone': 'SIERRA LEONE', 'Seychelles': 'SEYCHELLES',
                                 'EGYPT / ISRAEL': 'EGYPT', 'PACIFIC OCEAN ': 'PACIFIC OCEAN', 'BRITISH ISLES': 'UNITED KINGDOM', 'ENGLAND': 'UNITED KINGDOM', 'ST. MAARTIN': 'ST MARTIN', 'ST. MARTIN': 'ST MARTIN',
                                'NORTH ATLANTIC OCEAN ': 'NORTH ATLANTIC OCEAN', 'FEDERATED STATES OF MICRONESIA': 'MICRONESIA', 'BRITISH WEST INDIES': 'UNITED KINGDOM OVERSEAS', 'RED SEA / INDIAN OCEAN': 'RED SEA', 'ANDAMAN / NICOBAR ISLANDAS': 'BAY OF BENGAL',
                                 'SUDAN?': 'SUDAN', 'THE BALKANS': 'SLOVENIA', 'IRAN / IRAQ': 'IRAN', ' PHILIPPINES': 'PHILIPPINES', 'SOLOMON ISLANDS / VANUATU': 'VANUATU', 'ITALY / CROATIA': 'CROATIA', 'YEMEN ': 'YEMEN', 'REUNION': 'REUNION ISLAND',
                                'EGYPT ': 'EGYPT', 'BRITISH NEW GUINEA': 'UNITED KINGDOM OVERSEAS', 'OCEAN': 'PACIFIC OCEAN', 'INDIAN OCEAN?': 'INDIAN OCEAN', 'EQUATORIAL GUINEA / CAMEROON': 'CAMEROON', 'Coast of AFRICA': 'ATLANTIC OCEAN', 'Between PORTUGAL & INDIA': 'INDIAN OCEAN',
                                'TURKS & CAICOS': 'UNITED KINGDOM OVERSEAS', 'TRINIDAD & TOBAGO': 'TOBAGO', 'UNITED ARAB EMIRATES (UAE)': 'UNITED ARAB EMIRATES', 'BRITISH VIRGIN ISLANDS': 'UNITED KINGDOM OVERSEAS', ' TONGA': 'TONGA', 'MEXICO ': 'MEXICO', 'NICARAGUA ': 'NICARAGUA',
                                 'MID-PACIFC OCEAN': 'MID PACIFIC OCEAN'})
                                  

In [77]:
data['Country'].fillna(value='unknown', inplace=True) 

In [78]:
data.Country.isna().unique()

array([False])

In [79]:
data['Type'].fillna(value='Invalid', inplace=True)

In [80]:
data = data.drop(6047)

data = data.reset_index(drop=True)

In [81]:
data.head()

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
0,25,6,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and paddle damaged",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,2018.06.25,2018.06.25,6303,,
1,23,2,2018,Unprovoked,AUSTRALIA,New South Wales,"Little Congwong Beach, La Perouse",Swimming,Anna Shurapey,F,55.0,Laceratons to right leg & foot,N,"19h00, Dusk","Juvenile white shark, 2.7 to 3.2 m","B. Myatt, GSAF",2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,2018.02.23,2018.02.23,6263,,
2,15,4,2018,Unprovoked,AUSTRALIA,Western Australia,"Cobblestones, Margaret River Area",Surfing,Alejandro Travaglini,M,37.0,Lacerations to legs,N,08h00,,"B.Myatt, GSAF",2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,2018.04.15.a,2018.04.15.a,6274,,
3,14,4,2018,Unprovoked,BAHAMAS,New Providence,Nirvana Beach,Surfing,Bruce Rowan,M,,No Injury. Shark swam away with the surf board,N,09h30,Tiger shark,"Tribune242,",2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,2018.04.14,2018.04.14,6273,,
4,10,1,2018,Invalid,BRAZIL,Alagoas,"Praia de Sauaçuhy, Maceió",Fishing,Josias Paz,M,56.0,Injury to ankle from marine animal trapped in weir PROVOKED INCIDENT.,N,,Shark involvement not confirmed,"K. McMurray, TrackingSharks.com",2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,2018.04.10.R,2018.04.10.R,6272,,


In [82]:
data.Location.unique()

array(['Oceanside, San Diego County',
       'Little Congwong Beach, La Perouse ',
       'Cobblestones, Margaret River Area', ..., 'Hooghly River mouth',
       'Ganges Delta', 'River Cochin'], dtype=object)

In [83]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6169 entries, 0 to 6168
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Day                     6169 non-null   int32 
 1   Month                   6169 non-null   int32 
 2   Year                    6169 non-null   int32 
 3   Type                    6169 non-null   object
 4   Country                 6169 non-null   object
 5   Area                    5742 non-null   object
 6   Location                5667 non-null   object
 7   Activity                5645 non-null   object
 8   Name                    5964 non-null   object
 9   Sex                     5612 non-null   object
 10  Age                     3457 non-null   object
 11  Injury                  6142 non-null   object
 12  Fatal                   5632 non-null   object
 13  Time                    2939 non-null   object
 14  Species                 3426 non-null   object
 15  Inve

In [84]:
len(data[(data.Area.isna() == True) | (data.Location.isna() == True)])

712

In [85]:
len(data[(data.Area.isna() == True) & (data.Location.isna() == True)])

217

In [86]:
doble_nan = (data.Area.isna() == True) & (data.Location.isna() == True)

data.loc[doble_nan, ["Area","Location"]] = data.loc[doble_nan, ["Area","Location"]].fillna("unknown")

In [87]:
data.head()

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
0,25,6,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and paddle damaged",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,2018.06.25,2018.06.25,6303,,
1,23,2,2018,Unprovoked,AUSTRALIA,New South Wales,"Little Congwong Beach, La Perouse",Swimming,Anna Shurapey,F,55.0,Laceratons to right leg & foot,N,"19h00, Dusk","Juvenile white shark, 2.7 to 3.2 m","B. Myatt, GSAF",2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,2018.02.23,2018.02.23,6263,,
2,15,4,2018,Unprovoked,AUSTRALIA,Western Australia,"Cobblestones, Margaret River Area",Surfing,Alejandro Travaglini,M,37.0,Lacerations to legs,N,08h00,,"B.Myatt, GSAF",2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,2018.04.15.a,2018.04.15.a,6274,,
3,14,4,2018,Unprovoked,BAHAMAS,New Providence,Nirvana Beach,Surfing,Bruce Rowan,M,,No Injury. Shark swam away with the surf board,N,09h30,Tiger shark,"Tribune242,",2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,2018.04.14,2018.04.14,6273,,
4,10,1,2018,Invalid,BRAZIL,Alagoas,"Praia de Sauaçuhy, Maceió",Fishing,Josias Paz,M,56.0,Injury to ankle from marine animal trapped in weir PROVOKED INCIDENT.,N,,Shark involvement not confirmed,"K. McMurray, TrackingSharks.com",2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,2018.04.10.R,2018.04.10.R,6272,,


In [88]:
data.Area.fillna(data.Location, inplace=True)

In [89]:
data.Location.fillna(data.Area, inplace=True)

In [90]:
data.Activity.value_counts().head(60)

Surfing                           968
Swimming                          853
Fishing                           423
Spearfishing                      332
Bathing                           159
Wading                            147
Diving                            115
Standing                           97
Snorkeling                         88
Scuba diving                       75
Body boarding                      61
Body surfing                       49
Swimming                           47
Kayaking                           33
Fell overboard                     32
Treading water                     32
Boogie boarding                    29
Pearl diving                       28
Free diving                        27
Windsurfing                        19
Walking                            17
Boogie Boarding                    16
Shark fishing                      15
Floating                           14
Fishing                            13
Rowing                             12
Surf fishing

In [91]:
data['Activity'] = np.where(data['Activity'].str.contains('urf'), 'Surfing', data['Activity'])

In [92]:
data['Activity'] = np.where(data['Activity'].str.contains('ishin'), 'Fishing', data['Activity'])

In [93]:
data['Activity'] = np.where(data['Activity'].str.contains('iving'), 'Diving', data['Activity'])

In [94]:
data['Activity'] = np.where(data['Activity'].str.contains('wim'), 'Swimming', data['Activity'])

In [95]:
data['Activity'] = np.where(data['Activity'].str.contains('ath'), 'Bathing', data['Activity'])

In [96]:
data['Activity'] = np.where(data['Activity'].str.contains('oard'), 'Boarding', data['Activity'])

In [97]:
data['Activity'] = np.where(data['Activity'].str.contains('ayak'), 'Boating', data['Activity'])

In [98]:
data['Activity'] = np.where(data['Activity'].str.contains('Boat'), 'Boating', data['Activity'])

In [99]:
data['Activity'] = np.where(data['Activity'].str.contains('boat'), 'Boating', data['Activity'])

In [100]:
data['Activity'] = np.where(data['Activity'].str.contains('Walking'), 'Wading', data['Activity'])

In [101]:
data['Activity'] = np.where(data['Activity'].str.contains('Standing'), 'Standing', data['Activity'])

In [102]:
data['Activity'] = np.where(data['Activity'].str.contains('loating'), 'Swimming', data['Activity'])

In [103]:
data['Activity'] = np.where(data['Activity'].str.contains('Canoe'), 'Boating', data['Activity'])

In [104]:
data['Activity'] = np.where(data['Activity'].str.contains('Sail'), 'Boating', data['Activity'])

In [105]:
data['Activity'].fillna(value='unknown', inplace=True)

In [106]:
excepts = {'Surfing':'Surfing', 'Fishing':'Fishing', 'Swimming':'Swimming', 'Diving':'Diving', 'Boarding':'Boarding', 
                'Bathing':'Bathing', 'Wading':'Wading', 'Boating':'Boating', 'Standing':'Standing', 'Snorkeling':'Snorkeling'}

data.loc[~data['Activity'].isin(excepts), 'Activity'] = 'unknown'


In [107]:
data['Name'].fillna(value='unknown', inplace=True)

In [108]:
data.Sex.value_counts()

M      4982
F       624
M         2
N         2
lli       1
.         1
Name: Sex, dtype: int64

In [109]:
data[data['Sex'] == 'N']

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
4933,11,7,1934,Boating,AUSTRALIA,New South Wales,Cronulla,Fishing,"18' boat, occupants William & Leslie Newton",N,,"No injury to occupants Sharks continually followed the dinghy, and one smashed its rudder",N,,"Blue pointer, 11'","G.P. Whitley, ref: Daily Telegraph, 7/11/1934 & Sydney Morning Herald 7/12/1934",1934.07.11-Newton-boat-Australia.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1934.07.11-Newton-boat-Australia.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1934.07.11-Newton-boat-Australia.pdf,1934.07.11,1934.07.11,1365,,
6126,18,1,1801,Provoked,unknown,unknown,unknown,Standing,Stephen Pettigew,N,,"FATAL, PROVOKED INCIDENT",Y,,12' shark,"The Evening Post, 12/18/1801",1801.12.18.R-Pettigrew.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1801.12.18.R-Pettigrew.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1801.12.18.R-Pettigrew.pdf,1801.12.18.R,1801.12.18.R,172,,


In [110]:
data['Sex'] = data['Sex'].replace({'M ': 'M', 'lli': 'M', 'N': 'M', '.': 'unknown'})
data['Sex'].fillna(value='unknown', inplace=True)

In [111]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6169 entries, 0 to 6168
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Day                     6169 non-null   int32 
 1   Month                   6169 non-null   int32 
 2   Year                    6169 non-null   int32 
 3   Type                    6169 non-null   object
 4   Country                 6169 non-null   object
 5   Area                    6169 non-null   object
 6   Location                6169 non-null   object
 7   Activity                6169 non-null   object
 8   Name                    6169 non-null   object
 9   Sex                     6169 non-null   object
 10  Age                     3457 non-null   object
 11  Injury                  6142 non-null   object
 12  Fatal                   5632 non-null   object
 13  Time                    2939 non-null   object
 14  Species                 3426 non-null   object
 15  Inve

In [112]:
len(data[data.Age.isna()==True])

2712

In [113]:
data.Age.value_counts().head(60)

17      154
18      150
19      141
20      140
15      137
16      136
21      118
22      117
25      107
24      106
14      101
13       94
26       83
28       80
23       80
29       78
27       77
30       76
12       73
32       69
35       68
40       56
10       56
31       52
34       50
38       48
33       44
43       43
37       42
36       42
39       38
42       38
41       38
11       37
52       35
9        35
50       32
45       32
47       30
44       29
48       28
49       28
8        28
46       27
55       23
7        22
51       20
54       17
60       16
57       16
58       15
53       13
61       13
6        13
59       11
56       10
69       10
63        9
Teen      9
62        8
Name: Age, dtype: int64

In [114]:
data['Age'] = data['Age'].str.replace(' ', '')

In [115]:
data['Age'] = data['Age'].str.replace('[^0-9]', '')

In [116]:
data['Age'] = data['Age'].astype(str)   # Convierto los valores de 'Age' a str para poder aplicar la función

def no_age_unknown(x):
    
    if len(x) > 2:
        
        return 'unknown'
    
    return x

data['Age'] = data['Age'].apply(no_age_unknown)

In [117]:
data['Age'].replace("", "unknown", inplace=True)

In [118]:
data.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6169 entries, 0 to 6168
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Day                     6169 non-null   int32 
 1   Month                   6169 non-null   int32 
 2   Year                    6169 non-null   int32 
 3   Type                    6169 non-null   object
 4   Country                 6169 non-null   object
 5   Area                    6169 non-null   object
 6   Location                6169 non-null   object
 7   Activity                6169 non-null   object
 8   Name                    6169 non-null   object
 9   Sex                     6169 non-null   object
 10  Age                     6169 non-null   object
 11  Injury                  6142 non-null   object
 12  Fatal                   5632 non-null   object
 13  Time                    2939 non-null   object
 14  Species                 3426 non-null   object
 15  Inve

In [119]:
data.Injury.value_counts().tail(60)

Left inner thigh                                                                                                          1
Sharks prevented recovery of remains                                                                                      1
Minor laceration & 3 punctures to right foot                                                                              1
No injury, flung off board                                                                                                1
No injury to occupant; shark bit propeller                                                                                1
No injury, wetsuit punctured                                                                                              1
Fingers bitten PROVOKED INCIDENT                                                                                          1
Puncture wounds on knee                                                                                                   1
Minor in

In [120]:
data['Injury'] = np.where(data['Injury'].str.contains('FATAL'), 'FATAL', data['Injury'])

In [121]:
data['Injury'] = np.where(data['Injury'].str.contains('atal'), 'FATAL', data['Injury'])

In [122]:
data['Injury'].fillna(value='unknown', inplace=True)

In [123]:
data[data.Fatal.isna()==True]

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
14,15,4,2018,Unprovoked,BRAZIL,Pernambuco,Piedade,Swimming,Pablo de Melo,M,34,"Multiple severe injuries to arms and leg, leg subsequently surgically amputated",,,,"Globo, 4/16/2018",2018.04.15.c-deMelo.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.c-deMelo.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.c-deMelo.pdf,2018.04.15.c,2018.04.15.c,6276,,
56,3,5,2017,Invalid,USA,California,"Sunset Beach, Orange County",Surfing,Sophia Raab,F,18,"Laceration to thigh, likely caused by surfboard fin",,14h30,Shark involvement highly doubtful,"R. Collier, GSAF",2017.05.03-Raab.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.05.03-Raab.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.05.03-Raab.pdf,2017.05.03,2017.05.03,6158,,
62,26,4,2017,Invalid,USA,Florida,Florida,unknown,Molly Cavelli,F,unknown,Alleged laceration to left ankle,,,No shark invovlement - it ws a publicity stunt,"The Sun, 5/6/2017",2017.05.26-Cavelli.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.05.26-Cavelli.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.05.26-Cavelli.pdf,2017.04.26,2017.04.26,6152,,
64,20,4,2017,Invalid,USA,South Carolina,Georgetown County,Swimming,male,M,unknown,Laceration & puncture wounds to left foot,,08h50,Shark involvement not confirmed,"C. Creswell, GSAF",2017.04.20-PawleysIsland.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.04.20-PawleysIsland.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.04.20-PawleysIsland.pdf,2017.04.20,2017.04.20,6150,,
79,29,6,2017,Invalid,USA,South Carolina,Wrightsville Beach,Fishing,male,M,unknown,"Arm injured by hook, not by a shark",,19h30,No shark invovlement,"C. Creswell, GSAF",2017.06.29-Wrightsville.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.29-Wrightsville.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.29-Wrightsville.pdf,2017.06.29,2017.06.29,6178,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6121,16,9,1805,Invalid,USA,New York,"Sag Harbor, Suffolk County",Surfing,unknown,M,unknown,human remains (male) found in sharks gut,,,Shark involvement prior to death unconfirmed,S.L. Mitchill (1814),1805.09.00-NY.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1805.09.00-NY.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1805.09.00-NY.pdf,1805.09.00,1805.09.00,177,,
6142,21,1,1767,Invalid,FRANCE,Côte d'Azur,St. Tropez,Bathing,Samuel Matthews,M,unknown,Lacerations to arm & leg,,,Description of shark does not ring true,,1767.00.00-Matthews.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1767.00.00-Matthews.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1767.00.00-Matthews.pdf,1767.00.00,1767.00.00,156,,
6152,19,1,1733,Invalid,ICELAND,Bardestrand,Talkknefiord,Surfing,unknown,unknown,unknown,"Partial hominid remains recovered from shark, probable drowning and scavenging",,,Shark involvement prior to death unconfirmed,E. Olafsen,1733.00.00-Iceland.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1733.00.00-Iceland.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1733.00.00-Iceland.pdf,1733.00.00,1733.00.00,146,,
6153,3,1,1723,Unprovoked,ROATAN,unknown,unknown,Surfing,Philip Ashton,M,unknown,Struck on thigh,,,,"C.Moore, GSAF",1730.00.00-Ashton.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1730.00.00-Ashton.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1730.00.00-Ashton.pdf,1723.00.00,1723.00.00,145,,


In [124]:
data[data['Fatal'] == 'UNKNOWN']

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
701,8,9,2013,Unprovoked,USA,South Carolina,"St. Helena Island, Beaufort County",Surfing,female,F,unknown,No details,UNKNOWN,,,"WIS-TV, 9/9/2013",2013.09.08-St-Helena.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2013.09.08-St-Helena.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2013.09.08-St-Helena.pdf,2013.09.08,2013.09.08,5679,,
1203,20,4,2008,Unprovoked,AUSTRALIA,New South Wales,Crescent Head,Surfing,Jamie Adlington,M,unknown,FATAL,UNKNOWN,,"Tiger shark, 2.3m","T. Peake, GSAF",2008.04.20.a-Adlington.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2008.04.20.a-Adlington.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2008.04.20.a-Adlington.pdf,2008.04.20.a,2008.04.20.a,5033,,
2208,7,6,1997,Unprovoked,BRAZIL,Rio de Janeiro,"Copacabana, Rio de Janeiro",Bathing,José Luiz Lipiani,M,unknown,FATAL,UNKNOWN,,,"Globo, 6/9/1997",1997.06.07-NV-Lipiani.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1997.06.07-NV-Lipiani.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1997.06.07-NV-Lipiani.pdf,1997.06.07,1997.06.07,4053,,
2227,21,2,1997,Unprovoked,USA,Hawaii,"Sunset Beach, O'ahu",Surfing,Gersome Perreno,M,unknown,No details,UNKNOWN,,,G. Balazs,1997.02.21-NV-Perreno.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1997.02.21-NV-Perreno.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1997.02.21-NV-Perreno.pdf,1997.02.21,1997.02.21,4043,,
2269,28,4,1996,Unprovoked,USA,Hawaii,"La'ie Point, O'ahu",Surfing,Wayne Leong,M,unknown,No details,UNKNOWN,,,G. Balazs,1996.04.28.b-Leong.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1996.04.28.b-Leong.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1996.04.28.b-Leong.pdf,1996.04.28.b,1996.04.28.b,3996,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6162,20,1,1637,Unprovoked,INDIA,West Bengal,Hooghly River mouth,Wading,Hindu pilgrims,unknown,unknown,FATAL,UNKNOWN,,,"H. Edwards, p.31, citing Sebastian Manrique",1637.00.00.R-Manrique.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1637.00.00.R-Manrique.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1637.00.00-Manrique.pdf,1637.00.00.R,1637.00.00.R,137,,
6163,22,1,1617,Unprovoked,INDIA,West Bengal,Ganges Delta,Surfing,Indian people,unknown,unknown,FATAL,UNKNOWN,,,"H. Edwards, p.31, citing Samuel Purchas",1617.00.00-Purchas.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1617.00.00-Purchas.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1617.00.00-Purchas.pdf,1617.00.00.R,1617.00.00.R,136,,
6164,15,1,1595,Unprovoked,INDIA,Kerala,River Cochin,unknown,male,M,unknown,"Leg severed mid-thigh, hand severed, arm above elbow and part of buttocks. Not known if he survived",UNKNOWN,,,The Voyage of John Huyghen van Linschoten,1595.00.00-Cochin.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1595.00.00-Cochin.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1595.00.00-Cochin.pdf,1595.00.00,1595.00.00,134,,
6166,6,1,1555,Unprovoked,unknown,unknown,unknown,Swimming,male,M,unknown,FATAL,UNKNOWN,,,Olaus Magnus,1555.00.00 - Olaus Magnus.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1555.00.00 - Olaus Magnus.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1555.00.00 - Olaus Magnus.pdf,1555.00.00,1555.00.00,132,,


In [125]:
data['Fatal'] = data['Fatal'].replace({' N': 'N', 'M': 'N', '2017': 'N', 'N ': 'N', 'y': 'Y'})

In [126]:
data[(data['Injury'] == 'FATAL') & (data['Fatal'] != 'Y')]

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
116,12,4,2017,Invalid,SOUTH AFRICA,KwaZulu-Natal,Protea Banks,Diving,Leopold Mairhuber,M,68,FATAL,,,Shark involvement prior to death not confirmed,"E. Ritter, GSAF",2017.04.12.a-Mairhuber.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.04.12.a-Mairhuber.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.04.12.a-Mairhuber.pdf,2017.04.12.a,2017.04.12.a,6144,,
126,1,10,2017,Invalid,SOUTH AFRICA,Western Cape Province,Dyer Island,Diving,Bradley Fick,M,31,FATAL,,,Death may have been due to drowning,"All Africa, 10/11/2017",2017.10.01-Fick.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.10.01-Fick.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.10.01-Fick.pdf,2017.10.01,2017.10.01,6227,,
134,15,9,2017,Invalid,SOUTH AFRICA,Western Cape Province,Hawston,Diving,Wayon Love,M,25,FATAL,,Afternoon,,"Ground Up, 9/20/2017",2017.09.15.b-Love.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.09.15.b-Love.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2017.09.15.b-Love.pdf,2017.09.15.b,2017.09.15.b,6219,,
372,29,3,2015,Invalid,ITALY,Sardinia,Sardinia,Diving,Eugenio Masala,M,43,FATAL,,,Shark involvement not cofirmed,"A. de Maddalena, GSAF",2015.03.29-Masala.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.03.29-Masala.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.03.29-Masala.pdf,2015.03.29,2015.03.29,5866,,
943,28,8,2011,Invalid,AUSTRALIA,Queensland,Fantome Island,Swimming,Rooster,M,48,FATAL,,19h30,Shark involvement prior to death not confirmed,"Courier Pigeon, 8/30/2011",2011.08.28-Roosteer.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2011.08.28-Roosteer.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2011.08.28-Roosteer.pdf,2011.08.28.b,2011.08.28.b,5429,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6161,4,1,1638,Unprovoked,unknown,unknown,unknown,Surfing,sailors,M,unknown,FATAL,UNKNOWN,,,Sir Thomas Herbert,1638.00.00.R-Herbert,http://sharkattackfile.net/spreadsheets/pdf_directory/1638.00.00.R-Herbert,http://sharkattackfile.net/spreadsheets/pdf_directory/1638.00.00.R-Herbert,1638.00.00.R,1638.00.00.R,138,,
6162,20,1,1637,Unprovoked,INDIA,West Bengal,Hooghly River mouth,Wading,Hindu pilgrims,unknown,unknown,FATAL,UNKNOWN,,,"H. Edwards, p.31, citing Sebastian Manrique",1637.00.00.R-Manrique.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1637.00.00.R-Manrique.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1637.00.00-Manrique.pdf,1637.00.00.R,1637.00.00.R,137,,
6163,22,1,1617,Unprovoked,INDIA,West Bengal,Ganges Delta,Surfing,Indian people,unknown,unknown,FATAL,UNKNOWN,,,"H. Edwards, p.31, citing Samuel Purchas",1617.00.00-Purchas.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1617.00.00-Purchas.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1617.00.00-Purchas.pdf,1617.00.00.R,1617.00.00.R,136,,
6166,6,1,1555,Unprovoked,unknown,unknown,unknown,Swimming,male,M,unknown,FATAL,UNKNOWN,,,Olaus Magnus,1555.00.00 - Olaus Magnus.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1555.00.00 - Olaus Magnus.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1555.00.00 - Olaus Magnus.pdf,1555.00.00,1555.00.00,132,,


In [127]:
data[(data['Injury'] != 'FATAL') & (data['Fatal'] == 'Y')]

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
1105,8,2,2009,Sea Disaster,USA,Puerto Rico,Quebradillas,unknown,occupant of a Cessna 206,M,unknown,It is probable that all 5 passengers died on impact. The body of one was scavenged by a shark,Y,,,"C. Ekstander, GSAF",2009.02.08-PuertoRicoAirCrash.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2009.02.08-PuertoRicoAirCrash.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2009.02.08-PuertoRicoAirCrash.pdf,2009.02.08,2009.02.08,5145,,
1165,13,9,2009,Provoked,BRAZIL,Pernambuco,"Piedade, Recife",Surfing,Maurício da Silva Monteiro,M,34,Cause of death was drowning; his remains were scavenged by sharks,Y,,,"C. Ekstander, GSAF",2009.09.13-Monteiro.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2009.09.13-Monteiro.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2009.09.13-Monteiro.pdf,2009.09.13,2009.09.13,5213,,
1321,4,5,2007,Sea Disaster,UNITED KINGDOM OVERSEAS,Providenciales,Providenciales,unknown,Haitian refugees perished when their boat capsized in choppy seas,unknown,unknown,Some of the bodies recovered had been bitten by sharks,Y,,,CNN,2007.05.04-HaitianRefugees.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2007.05.04-HaitianRefugees.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2007.05.04-HaitianRefugees.pdf,2007.05.04,2007.05.04,4919,,
1479,25,11,2006,Sea Disaster,PHILIPPINES,Surigao del Norte,"Off Bilisan Point, Hinatuarn Island",unknown,Sinking of the m.v.Leonida,unknown,unknown,15 perished but shark involvement prior to death was not confirmed,Y,14h20,,"Manila Bulletin Online, 11/27/2006",2006.11.25-Leonida.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2006.11.25-Leonida.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2006.11.25-Leonida.pdf,2006.11.25,2006.11.25,4886,,
1621,13,4,2004,Invalid,TONGA,Nuku'alofa,30 nautical miles offshore,Fishing,male 1,M,unknown,"He was was bitten on the arm by small sharks & died, but it was not clear if he died as result o...",Y,,Questionable Incident,"New Zealand Herald, 4/15/2004",2004.04.13.a-Tonga.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2004.04.13.a-Tonga.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2004.04.13.a-Tonga.pdf,2004.04.13.a,2004.04.13.a,4624,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5979,27,3,1860,Sea Disaster,COOK ISLANDS,Mangaia Island,Mangaia Island,unknown,a Cook's Islander,M,unknown,Probable drowning,Y,,,"Brisbane Courier, 8/1/1866",1860.03.27-Clark'sIslander.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1860.03.27-Clark'sIslander.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1860.03.27-Clark'sIslander.pdf,1860.03.27,1860.03.27,318,,
5994,21,1,1856,Unprovoked,UNITED KINGDOM,Isle of Wight,Colwell Bay,Swimming,male,M,unknown,Survived,Y,,,"C. Moore, GSAF",1856.06.21.R-Isle-of-Wight.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1856.06.21.R-Isle-of-Wight.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1856.06.21.R-Isle-of-Wight.pdf,1856.06.21.R,1856.06.21.R,306,,
6096,11,1,1827,Unprovoked,EGYPT,Alexandria,Alexandria,Surfing,Two men,M,unknown,Remains of the men were recovered from a +17-foot shark,Y,,,"C. Moore, GSAF",1827.00.00-Alexandria.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1827.00.00-Alexandria.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1827.00.00-Alexandria.pdf,1827.00.00,1827.00.00,203,,
6134,26,1,1785,Unprovoked,UNITED KINGDOM,Sussex,Brighton,Surfing,unknown,M,unknown,Human remains recovered from shark,Y,,Tiger shark?,"C. Moore, GSAF",1785.09.26.R-Brighton.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1785.09.26.R-Brighton.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1785.09.26.R-Brighton.pdf,1785.09.26.R,1785.09.26.R,164,,


In [128]:
data['Fatal'] = np.where(data['Injury'] == 'FATAL', 'Y', data['Fatal'])

In [129]:
data.loc[data['Fatal'] == 'Y', 'Injury'] = 'FATAL'

In [130]:
data['Injury'] = np.where(data['Fatal'] == 'UNKNOWN', 'unknown', data['Injury'])

In [131]:
data['Fatal'].replace("UNKNOWN", "unknown", inplace=True)

In [132]:
data.Fatal.value_counts()

N          4230
Y          1410
unknown      51
Name: Fatal, dtype: int64

In [133]:
data.Injury.value_counts().head(20)

FATAL                        1410
Survived                       94
Foot bitten                    83
No injury                      77
Leg bitten                     71
unknown                        52
Left foot bitten               50
Right foot bitten              39
No injury, board bitten        31
Hand bitten                    28
Thigh bitten                   27
Minor injury                   21
Foot lacerated                 21
Lacerations to foot            20
Calf bitten                    20
Right leg bitten               20
Arm bitten                     19
Lacerations to right foot      18
Lacerations to left foot       18
Right calf bitten              16
Name: Injury, dtype: int64

In [134]:
len(data.loc[(data['Injury'] == 'unknown') & (data['Fatal'] == 'unknown')])

51

In [135]:
data.loc[(data['Injury'] == 'unknown') & (data['Fatal'] != 'unknown')]

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
4814,21,1,1938,Unprovoked,FIJI,Viti Levu,Singatoka River,Wading,male,M,unknown,unknown,N,,,"Time Magazine, 3/21/1938",1938.03.21.R-FijianMethodist.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1938.03.21.R-FijianMethodist.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1938.03.21.R-FijianMethodist.pdf,1938.03.21.R,1938.03.21.R,1483,,


In [136]:
data['Injury'] = np.where(data['Fatal'] == 'N', 'FATAL', data['Injury'])

In [137]:
data['Fatal'].fillna('unknown', inplace=True)

In [138]:
data.Time.value_counts().tail(60)

19h05                                   1
16h12                                   1
17h51                                   1
11h41                                   1
10h45-11h15                             1
15h52                                   1
19h00, Dusk                             1
14h21                                   1
11h25                                   1
10h00 -- 11h00                          1
09h00 -10h00                            1
20h45 (Sunset)                          1
11h56                                   1
08h40                                   1
02h30                                   1
Just after 12h00                        1
Shortly after midnight                  1
Early Morning                           1
11h53                                   1
Shortly before 13h00                    1
12h34                                   1
8:04 pm                                 1
12h46                                   1
Late morning                      

In [139]:
data['Time'] = data['Time'].str.replace(' ', '')

In [140]:
def clean_time(x):
    
    x = x.replace('h', ':').replace('am', '').strip()
    
    if ':' not in x:
        
        if 'unchtime' in x:
            return '12:00'
        elif 'fternoon' in x:
            return '16:00'
        elif 'idnight' in x:
            return '23:59'
        elif 'orning' in x:
            return '09:00'
        elif 'usk' in x:
            return '19:00'
        elif 'vening' in x:
            return '17:00'
        elif 'ight' in x:
            return '21:00'
        elif 'idday' in x:
            return '12:00'
        else:
            return 'unknown'
    else:
        return x

In [141]:
data['Time'] = data['Time'].astype(str)   # Convierto los valores de 'Age' a str para poder aplicar la función

data['Time'] = data['Time'].apply(clean_time)

In [142]:
data['Time'] = data['Time'].str.replace('[^0-9:]', '')  

In [143]:
data['Time'] = data['Time'].replace({':': '', '10:4511:15': '11:00', '07:0008:00': '07:30', '18:1518:30': '18:22', '06:0008:': '07:00',
                                 '17:0017:40': '17:20', ':13:00': '13:00', '14:3015:30': '15:00', '09:0010:00': '09:30', '13:345': '13:34', '9:00': '09:00',
                                '05:0008:00': '06:30', '17:0018:00': '17:30', '10:3013:30': '12:00', '06:0007:00': '06:30', '11:01:': '11:01',
                                 ':03:10': '03:10', '11:0012:00': '11:30', '18:1521:30': '20:00', '10:0014:00': '12:00', '12:0014:00': '13:00', '08:0009:30': '08:45', '09:3015:30': '12:30', '12:4513:45': '13:15',
                                '03:4504:00': '03:52', '15:0015:45': '15:22', '09:3010:00': '09:45', '16:3018:00': '17:15', '8:04': '08:04', '10:0011:00': '10:30', '2:': '02:00',
                                '::': '', '14:0015:00': '14:30', '09:0009:30': '09:15', '06:0007:20': '06:40', ':12:00': '12:00', '11:0011:30': '11:15', '19:0020:00': '19:30',
                                    '11:115': '11:15'})

In [144]:
mask5 = data['Time'] != ''

data.loc[mask5, 'Time'] = pd.to_datetime(data.loc[mask5, 'Time'], format='%H:%M').dt.time

In [145]:
data['Time'].replace("", "unknown", inplace=True)

In [146]:
data.Species.value_counts().head(60)

White shark                                           162
Shark involvement prior to death was not confirmed    105
Invalid                                               101
Shark involvement not confirmed                        87
Tiger shark                                            70
Shark involvement prior to death unconfirmed           68
Bull shark                                             47
4' shark                                               40
6' shark                                               39
Questionable incident                                  35
1.8 m [6'] shark                                       33
Questionable                                           32
1.5 m [5'] shark                                       32
3' shark                                               26
1.2 m [4'] shark                                       26
5' shark                                               26
2 m shark                                              25
4' to 5' shark

In [147]:
data.Species.isnull().sum()

2743

In [148]:
data.Species.fillna('unknown', inplace=True)

In [149]:
data['Species'] = np.where(data['Species'].str.contains('hite'), 'White shark', data['Species'])

data['Species'] = np.where(data['Species'].str.contains('iger'), 'Tiger shark', data['Species'])

data['Species'] = np.where(data['Species'].str.contains('ull'), 'Bull shark', data['Species'])

data['Species'] = np.where(data['Species'].str.contains('lue'), 'Blue shark', data['Species'])

data['Species'] = np.where(data['Species'].str.contains('urse'), 'Nurse shark', data['Species'])

data['Species'] = np.where(data['Species'].str.contains('ako'), 'Mako shark', data['Species'])

data['Species'] = np.where(data['Species'].str.contains('hammer'), 'Hammerhead shark', data['Species'])

In [150]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6169 entries, 0 to 6168
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Day                     6169 non-null   int32 
 1   Month                   6169 non-null   int32 
 2   Year                    6169 non-null   int32 
 3   Type                    6169 non-null   object
 4   Country                 6169 non-null   object
 5   Area                    6169 non-null   object
 6   Location                6169 non-null   object
 7   Activity                6169 non-null   object
 8   Name                    6169 non-null   object
 9   Sex                     6169 non-null   object
 10  Age                     6169 non-null   object
 11  Injury                  6169 non-null   object
 12  Fatal                   6169 non-null   object
 13  Time                    6169 non-null   object
 14  Species                 6169 non-null   object
 15  Inve

In [151]:
data.Investigator_or_Source.value_counts().head(60)

C. Moore, GSAF                                                                                        101
C. Creswell, GSAF                                                                                      92
S. Petersohn, GSAF                                                                                     82
R. Collier                                                                                             55
R. Collier, GSAF                                                                                       48
T. Peake, GSAF                                                                                         48
M. Levine, GSAF                                                                                        45
A. Gifford, GSAF                                                                                       28
C. Moore. GSAF                                                                                         27
B. Myatt, GSAF                                

In [152]:
data['Investigator_or_Source'].fillna('unknown', inplace=True)

In [153]:
data.pdf.value_counts().tail(60)

1999.03.05-Knutson.pdf                1
1999.07.29-Capri.pdf                  1
1999.02.26.R-BirdDog.pdf              1
1999.02.23-ScottsHead.pdf             1
1999.03.18.a-Davoodabai.pdf           1
1999.02.03-KennyBurns.pdf             1
1999.01.07-NZ-inflatable.pdf          1
1999.01.03.R-Turcotte.pdf             1
1999.01.03-ReunionIsland.pdf          1
1999.00.00.b-Lootz.pdf                1
1999.00.00.a-NV-SandridgeBeach.pdf    1
2000.09.15-Smith.pdf                  1
2000.09.16.a-NV-IsleOfPalms.pdf       1
2000.03.30-Rojcevic.pdf               1
2000.10.06.b-White.pdf                1
2000.11.10-Skeie.pdf                  1
2000.11.06.b-Avery.pdf                1
2000.11.06.a-KenCrew.pdf              1
2000.11.04-Stewman.pdf                1
2000.10.29-Kelly.pdf                  1
2000.10.20-Licamele.pdf               1
2000.10.18-Musselwhite.pdf            1
2000.10.14-Payne.pdf                  1
2000.10.09-Kraskiecwicz.pdf           1
2000.10.06.a-Holley.pdf               1


In [154]:
data.loc[data['href_formula'].isnull()]

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number_1,Case_Number_2,original_order,Unnamed_22,Unnamed_23
3214,19,1,1975,Unprovoked,AUSTRALIA,South Australia,Coffin Bay,Surfing,David Barrowman,M,17,FATAL,Y,unknown,unknown,"J. West; Adelaide Advertiser, 1/20/1975; P. Kemp, GSAF",1975.01.19-Barrowman.pdf,,http://sharkattackfile.net/spreadsheets/pdf_directory/1975.01.19-Barrowman.pdf,1975.01.19,1975.01.19,3059,,


In [155]:
data.href_formula.iloc[3214] = data.href.iloc[3214]

In [156]:
data.href_formula.iloc[3214]

'http://sharkattackfile.net/spreadsheets/pdf_directory/1975.01.19-Barrowman.pdf'

In [157]:
data.iloc[3214]

Day                                                                                                   19
Month                                                                                                  1
Year                                                                                                1975
Type                                                                                          Unprovoked
Country                                                                                        AUSTRALIA
Area                                                                                     South Australia
Location                                                                                      Coffin Bay
Activity                                                                                         Surfing
Name                                                                                     David Barrowman
Sex                                                    

In [158]:
data.rename(columns={'Case_Number_1': 'Case_Number', 'Case_Number_2': 'Date', 'Unnamed_22': 'no_data_1', 'Unnamed_23': 'no_data_2'}, inplace=True)

In [159]:
len(data)

6169

In [160]:
data['Case_Number'] = [len(data) - i for i in range(len(data))]

In [161]:
data.head(60)

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number,Date,original_order,no_data_1,no_data_2
0,25,6,2018,Boating,USA,California,"Oceanside, San Diego County",unknown,Julie Wolfe,F,57,FATAL,N,18:00:00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,6169,2018.06.25,6303,,
1,23,2,2018,Unprovoked,AUSTRALIA,New South Wales,"Little Congwong Beach, La Perouse",Swimming,Anna Shurapey,F,55,FATAL,N,19:00:00,White shark,"B. Myatt, GSAF",2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,6168,2018.02.23,6263,,
2,15,4,2018,Unprovoked,AUSTRALIA,Western Australia,"Cobblestones, Margaret River Area",Surfing,Alejandro Travaglini,M,37,FATAL,N,08:00:00,unknown,"B.Myatt, GSAF",2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,6167,2018.04.15.a,6274,,
3,14,4,2018,Unprovoked,BAHAMAS,New Providence,Nirvana Beach,Surfing,Bruce Rowan,M,unknown,FATAL,N,09:30:00,Tiger shark,"Tribune242,",2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,6166,2018.04.14,6273,,
4,10,1,2018,Invalid,BRAZIL,Alagoas,"Praia de Sauaçuhy, Maceió",Fishing,Josias Paz,M,56,FATAL,N,unknown,Shark involvement not confirmed,"K. McMurray, TrackingSharks.com",2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,6165,2018.04.10.R,6272,,
5,9,4,2018,Unprovoked,NEW CALEDONIA,"Magenta Beach, Noumea","Magenta Beach, Noumea",Surfing,unknown,unknown,unknown,FATAL,N,17:00:00,2 m shark,"Les Nouvelles Caledoniennes, 4/10/2018",2018.04.09-Magenta.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.09-Magenta.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.09-Magenta.pdf,6164,2018.04.09,6271,,
6,5,4,2018,Unprovoked,BAHAMAS,Bimini,Bimini,Swimming,Shane McConnell,M,12,FATAL,N,18:00:00,Bull shark,"K. McMurray, TrackingSharks.com",2018.04.05-McConnell.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.05-McConnell.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.05-McConnell.pdf,6163,2018.04.05,6270,,
7,3,4,2018,Unprovoked,SOUTH AFRICA,Eastern Cape Province,St. Francis Bay,Surfing,Ross Spowart,M,19,FATAL,N,15:00:00,White shark,"K. McMurray, TrackingSharks.com",2018.04.03-StFrancisBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.03-StFrancisBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.03-StFrancisBay.pdf,6162,2018.04.03,6269,,
8,31,3,2018,Unprovoked,USA,Hawaii,Kukio Beach,Boarding,male,M,25,FATAL,N,09:30:00,Tiger shark,"Khon2, 3/31/2018",2018.03.31-Hawaii.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.03.31-Hawaii.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.03.31-Hawaii.pdf,6161,2018.03.31,6268,,
9,14,3,2018,Unprovoked,AUSTRALIA,Western Australia,Waterman's Bay,Wading,Luke Guy & Finn Bald,M,10,FATAL,N,17:35:00,Wobbegong shark,"B. Myatt, GSAF",2018.03.14-WatermansBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.03.14-WatermansBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.03.14-WatermansBay.pdf,6160,2018.03.14,6267,,


In [162]:
data.no_data_1 = 0

In [163]:
data['Date'] = data['Year'].astype(str) + '-' + data['Month'].astype(str) + '-' + data['Day'].astype(str)

In [164]:
data.no_data_2 = 0

In [165]:
data.shape

(6169, 24)

In [170]:
((data == 'unknown').sum(axis=1) > 11).unique()

array([False])

In [171]:
data.head()

Unnamed: 0,Day,Month,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator_or_Source,pdf,href_formula,href,Case_Number,Date,original_order,no_data_1,no_data_2
0,25,6,2018,Boating,USA,California,"Oceanside, San Diego County",unknown,Julie Wolfe,F,57,FATAL,N,18:00:00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf,6169,2018-6-25,6303,0,0
1,23,2,2018,Unprovoked,AUSTRALIA,New South Wales,"Little Congwong Beach, La Perouse",Swimming,Anna Shurapey,F,55,FATAL,N,19:00:00,White shark,"B. Myatt, GSAF",2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.02.23-Shurapey.pdf,6168,2018-2-23,6263,0,0
2,15,4,2018,Unprovoked,AUSTRALIA,Western Australia,"Cobblestones, Margaret River Area",Surfing,Alejandro Travaglini,M,37,FATAL,N,08:00:00,unknown,"B.Myatt, GSAF",2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.15.a-Travaglini.pdf,6167,2018-4-15,6274,0,0
3,14,4,2018,Unprovoked,BAHAMAS,New Providence,Nirvana Beach,Surfing,Bruce Rowan,M,unknown,FATAL,N,09:30:00,Tiger shark,"Tribune242,",2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.14-Rowan.pff,6166,2018-4-14,6273,0,0
4,10,1,2018,Invalid,BRAZIL,Alagoas,"Praia de Sauaçuhy, Maceió",Fishing,Josias Paz,M,56,FATAL,N,unknown,Shark involvement not confirmed,"K. McMurray, TrackingSharks.com",2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2018.04.10.R-Paz.pdf,6165,2018-1-10,6272,0,0


In [173]:
data['Time'].value_counts()

unknown     3372
16:00:00     330
09:00:00     187
11:00:00     130
12:00:00     125
            ... 
13:42:00       1
13:19:00       1
12:39:00       1
17:46:00       1
01:50:00       1
Name: Time, Length: 250, dtype: int64