# Data cleaning: Shark attack

## Exploración de la info

In [1]:
import pandas as pd
import numpy as np
import re
import datetime

In [2]:
sharks= pd.read_csv('attacks.csv', encoding='latin-1')
sharks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
Case Number               8702 non-null object
Date                      6302 non-null object
Year                      6300 non-null float64
Type                      6298 non-null object
Country                   6252 non-null object
Area                      5847 non-null object
Location                  5762 non-null object
Activity                  5758 non-null object
Name                      6092 non-null object
Sex                       5737 non-null object
Age                       3471 non-null object
Injury                    6274 non-null object
Fatal (Y/N)               5763 non-null object
Time                      2948 non-null object
Species                   3464 non-null object
Investigator or Source    6285 non-null object
pdf                       6302 non-null object
href formula              6301 non-null object
href                      6302 non-null obje

In [3]:
sharks.head(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,


In [4]:
col_names=[]
for column in sharks:
    col_names.append(column)
print(col_names)

['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href', 'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23']


## Revisando NULOS

Se tienen **25723** filas y **24** columnas

In [5]:
sharks.isnull().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

In [6]:
print(sharks['Unnamed: 22'].unique())
print(sharks['Unnamed: 23'].unique())

[nan 'stopped here']
[nan 'Teramo' 'change filename']


**ACCIÓN:** La mayoría de los valores en las columnas 'Unnamed: 22' y 'Unnamed: 23' son nulos (y los tres valores que no lo son  son no nos aportan nada) por lo tanto borrare las dos columnas

In [7]:
sharks=sharks.drop(['Unnamed: 22','Unnamed: 23'], axis=1)

### Filas en donde todos los valores son NaN

In [8]:
idx = sharks.index[sharks.isnull().all(1)]
nans = sharks.iloc[idx]
nans

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
8702,,,,,,,,,,,...,,,,,,,,,,
8703,,,,,,,,,,,...,,,,,,,,,,
8704,,,,,,,,,,,...,,,,,,,,,,
8705,,,,,,,,,,,...,,,,,,,,,,
8706,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25717,,,,,,,,,,,...,,,,,,,,,,
25718,,,,,,,,,,,...,,,,,,,,,,
25719,,,,,,,,,,,...,,,,,,,,,,
25720,,,,,,,,,,,...,,,,,,,,,,


In [9]:
nans_index=nans.index.tolist()
nans_index

[8702,
 8703,
 8704,
 8705,
 8706,
 8707,
 8708,
 8709,
 8710,
 8711,
 8712,
 8713,
 8714,
 8715,
 8716,
 8717,
 8718,
 8719,
 8720,
 8721,
 8722,
 8723,
 8724,
 8725,
 8726,
 8727,
 8728,
 8729,
 8730,
 8731,
 8732,
 8733,
 8734,
 8735,
 8736,
 8737,
 8738,
 8739,
 8740,
 8741,
 8742,
 8743,
 8744,
 8745,
 8746,
 8747,
 8748,
 8749,
 8750,
 8751,
 8752,
 8753,
 8754,
 8755,
 8756,
 8757,
 8758,
 8759,
 8760,
 8761,
 8762,
 8763,
 8764,
 8765,
 8766,
 8767,
 8768,
 8769,
 8770,
 8771,
 8772,
 8773,
 8774,
 8775,
 8776,
 8777,
 8778,
 8779,
 8780,
 8781,
 8782,
 8783,
 8784,
 8785,
 8786,
 8787,
 8788,
 8789,
 8790,
 8791,
 8792,
 8793,
 8794,
 8795,
 8796,
 8797,
 8798,
 8799,
 8800,
 8801,
 8802,
 8803,
 8804,
 8805,
 8806,
 8807,
 8808,
 8809,
 8810,
 8811,
 8812,
 8813,
 8814,
 8815,
 8816,
 8817,
 8818,
 8819,
 8820,
 8821,
 8822,
 8823,
 8824,
 8825,
 8826,
 8827,
 8828,
 8829,
 8830,
 8831,
 8832,
 8833,
 8834,
 8835,
 8836,
 8837,
 8838,
 8839,
 8840,
 8841,
 8842,
 8843,
 8844,

**ACCIÓN:** Borrar todas las filas en donde todos sus valores son NaN

In [10]:
for i in nans_index:
    sharks=sharks.drop(i, axis=0)

In [11]:
sharks.reset_index(inplace= True)

In [12]:
sharks.isnull().sum()

index                        0
Case Number                  1
Date                      2401
Year                      2403
Type                      2405
Country                   2451
Area                      2856
Location                  2941
Activity                  2945
Name                      2611
Sex                       2966
Age                       5232
Injury                    2429
Fatal (Y/N)               2940
Time                      5755
Species                   5239
Investigator or Source    2418
pdf                       2401
href formula              2402
href                      2401
Case Number.1             2401
Case Number.2             2401
original order            2394
dtype: int64

In [13]:
sharks.info() #Se redujeron las filas de 25723 a 8703

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8703 entries, 0 to 8702
Data columns (total 23 columns):
index                     8703 non-null int64
Case Number               8702 non-null object
Date                      6302 non-null object
Year                      6300 non-null float64
Type                      6298 non-null object
Country                   6252 non-null object
Area                      5847 non-null object
Location                  5762 non-null object
Activity                  5758 non-null object
Name                      6092 non-null object
Sex                       5737 non-null object
Age                       3471 non-null object
Injury                    6274 non-null object
Fatal (Y/N)               5763 non-null object
Time                      2948 non-null object
Species                   3464 non-null object
Investigator or Source    6285 non-null object
pdf                       6302 non-null object
href formula              6301 non-null object


## Revisando columna por columna

### Columna 'Case Number'

In [14]:
sharks['Case Number'].unique() #El numero de caso es la fecha

array(['2018.06.25', '2018.06.18', '2018.06.09', ..., 'ND.0001', '0',
       'xx'], dtype=object)

In [15]:
sharks['Case Number'][:60]

0       2018.06.25
1       2018.06.18
2       2018.06.09
3       2018.06.08
4       2018.06.04
5     2018.06.03.b
6     2018.06.03.a
7       2018.05.27
8     2018.05.26.b
9     2018.05.26.a
10      2018.05.24
11      2018.05.21
12    2018.05.13.b
13    2018.05.13.a
14      2018.05.00
15      2018.05.12
16      2018.05.09
17    2018.04.30.R
18    2018.04.28.b
19    2018.04.28.a
20    2018.04.25.b
21    2018.04.25.a
22      2018.04.24
23      2018.04.23
24      2018.04.22
25      2018.04.19
26    2018.04.15.d
27    2018.04.15.c
28    2018.04.15.b
29    2018.04.15.a
30      2018.04.14
31    2018.04.10.R
32      2018.04.09
33      2018.04.05
34      2018.04.03
35      2018.03.31
36      2018.03.14
37    2018.03.09.b
38    2018.03.09.a
39      2018.02.24
40      2018.02.23
41      2018.02.17
42      2018.02.15
43      2018.02.14
44      2018.02.11
45      2018.02.03
46      2018.02.01
47      2018.01.28
48      2018.01.21
49      2018.01.14
50      2018.01.13
51      2018.01.12
52      2018

In [158]:
sharks[sharks['Case Number']=='2018.04.30.R'] #R es de Reported

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
17,2018.04.30.R,Reported 30-Apr-2018,2018.0,Unprovoked,THAILAND,Hua Hin,Sai Noi Beach,Swimming,female,M,...,N,,,"K. McMurray, TrackingSharks.com",2018.04.30.R-Thailand.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.04.30.R,2018.04.30.R,6286.0


**LOS ['Case Number']=='0' SON FILAS VACIAS**

In [16]:
sharks[sharks['Case Number']=='0']

Unnamed: 0,index,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
6302,6302,0,,,,,,,,,...,,,,,,,,,,6304.0
6303,6303,0,,,,,,,,,...,,,,,,,,,,6305.0
6304,6304,0,,,,,,,,,...,,,,,,,,,,6306.0
6305,6305,0,,,,,,,,,...,,,,,,,,,,6307.0
6306,6306,0,,,,,,,,,...,,,,,,,,,,6308.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8697,8697,0,,,,,,,,,...,,,,,,,,,,
8698,8698,0,,,,,,,,,...,,,,,,,,,,
8699,8699,0,,,,,,,,,...,,,,,,,,,,
8700,8700,0,,,,,,,,,...,,,,,,,,,,


In [35]:
sharks_pruebas= sharks.copy()

In [33]:
sharks_pruebas= sharks[sharks['Case Number']=='0']

In [34]:
sharks_pruebas.count()#HAY 2400 FILAS CON CASE NUMBER 0 y solo 7 valores en 'original order'

index                     0
Case Number               0
Date                      0
Year                      0
Type                      0
Country                   0
Area                      0
Location                  0
Activity                  0
Name                      0
Sex                       0
Age                       0
Injury                    0
Fatal (Y/N)               0
Time                      0
Species                   0
Investigator or Source    0
pdf                       0
href formula              0
href                      0
Case Number.1             0
Case Number.2             0
original order            0
dtype: int64

**ACCIÓN:** Todos las filas con 'Case Number'= 0 estan vacías excepto por algunos datos de la columna 'original order' por lo que se eliminarán esas filas

In [24]:
nan_indexcs= sharks_pruebas[sharks_pruebas['Case Number']=='0'].index.tolist()

In [25]:
for i in nan_indexcs:
    sharks_pruebas=sharks_pruebas.drop(i, axis=0)

In [26]:
sharks_pruebas.reset_index(inplace= True)

In [36]:
sharks_pruebas.info() #Se redujeron las filas de 8703 a 6303 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6303 entries, 0 to 8702
Data columns (total 23 columns):
index                     6303 non-null int64
Case Number               6302 non-null object
Date                      6302 non-null object
Year                      6300 non-null float64
Type                      6298 non-null object
Country                   6252 non-null object
Area                      5847 non-null object
Location                  5762 non-null object
Activity                  5758 non-null object
Name                      6092 non-null object
Sex                       5737 non-null object
Age                       3471 non-null object
Injury                    6274 non-null object
Fatal (Y/N)               5763 non-null object
Time                      2948 non-null object
Species                   3464 non-null object
Investigator or Source    6285 non-null object
pdf                       6302 non-null object
href formula              6301 non-null object


In [37]:
sharks.count() #YA NO HAY TANTOS NULOS

index                     6303
Case Number               6302
Date                      6302
Year                      6300
Type                      6298
Country                   6252
Area                      5847
Location                  5762
Activity                  5758
Name                      6092
Sex                       5737
Age                       3471
Injury                    6274
Fatal (Y/N)               5763
Time                      2948
Species                   3464
Investigator or Source    6285
pdf                       6302
href formula              6301
href                      6302
Case Number.1             6302
Case Number.2             6302
original order            6302
dtype: int64

## Columna 'Date'

In [38]:
sharks['Date'].unique()

array(['25-Jun-2018', '18-Jun-2018', '09-Jun-2018', ..., '1883-1889',
       '1845-1853', nan], dtype=object)

In [39]:
sharks[sharks['Date']=='1883-1889']

Unnamed: 0,index,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
6300,6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,...,Y,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0


In [40]:
try:
    sharks['Date'] = pd.to_datetime(sharks['Date'], format = '%d-%b-%Y')
except:
    pass

### Buscando valores que no coincidan con el formato '%d-%b-%Y'

In [46]:
shark_pruebas= sharks.copy()

In [47]:
sharks_pruebas.reset_index(inplace= True)

In [48]:
shark_pruebas_indices_incorrectos=[]
shark_pruebas_indices_correctos=[]
for i in range(len(shark_pruebas)):
    try:
        x = re.search(r'^\d{1,2}\-\w{3}\-\d{4}$',shark_pruebas['Date'][i])
        print(x.group())
    except AttributeError:
        shark_pruebas_indices_incorrectos.append(i)

25-Jun-2018
18-Jun-2018
09-Jun-2018
08-Jun-2018
04-Jun-2018
03-Jun-2018
03-Jun-2018
27-May-2018
26-May-2018
26-May-2018
24-May-2018
21-May-2018
13-May-2018
13-May-2018
12-May-2018
09-May-2018
28-Apr-2018
28-Apr-2018
25-Apr-2018
25-Apr-2018
24-Apr-2018
23-Apr-2018
22-Apr-2018
19-Apr-2018
15-Apr-2018
15-Apr-2018
15-Apr-2018
15-Apr-2018
14-Apr-2018
09-Apr-2018
05-Apr-2018
03-Apr-2018
31-Mar-2018
14-Mar-2018
9-Mar-2018
9-Mar-2018
24-Feb-2018
23-Feb-2018
18-Feb-2018
15-Feb-2018
14-Feb-2018
11-Feb-2018
03-Feb-2018
01-Feb-2018
28-Jan-2018
21-Jan-2018
14-Jan-2018
13-Jan-2018
12-Jan-2018
05-Jan-2018
31-Dec-2017
30-Dec-2017
21-Dec-2017
09-Dec-2017
30-Nov-2017
30-Nov-2017
24-Nov-2017
18-Nov-2017
13-Nov-2017
04-Nov-2017
28-Oct-2017
26-Oct-2017
23-Oct-2017
23-Oct-2017
23-Oct-2017
22-Oct-2017
21-Oct-2017
18-Oct-2017
09-Oct-2017
05-Oct-2017
01-Oct-2017
25-Sep-2017
25-Sep-2017
24-Sep-2017
24-Sep-2017
20-Sep-2017
16-Sep-2017
16-Sep-2017
15-Sep-2017
15-Sep-2017
13-Sep-2017
10-Sep-2017
10-Sep-2017
03-Sep

13-Apr-2006
11-Apr-2006
09-Apr-2006
09-Apr-2006
03-Apr-2006
23-Mar-2006
22-Mar-2006
18-Mar-2006
15-Mar-2006
27-Feb-2006
23-Feb-2006
13-Feb-2006
12-Feb-2006
12-Feb-2006
08-Feb-2006
01-Feb-2006
01-Feb-2006
25-Jan-2006
23-Jan-2006
18-Jan-2006
15-Jan-2006
11-Jan-2006
07-Jan-2006
04-Jan-2006
01-Jan-2006
24-Dec-2005
21-Dec-2005
20-Dec-2005
11-Dec-2005
27-Nov-2005
25-Nov-2005
25-Nov-2005
25-Nov-2005
21-Nov-2005
20-Nov-2005
15-Nov-2005
12-Nov-2005
02-Nov-2005
02-Nov-2005
29-Oct-2005
25-Oct-2005
22-Oct-2005
21-Oct-2005
19-Oct-2005
15-Oct-2005
13-Oct-2005
11-Oct-2005
06-Oct-2005
03-Oct-2005
01-Oct-2005
24-Sep-2005
23-Sep-2005
22-Sep-2005
20-Sep-2005
11-Sep-2005
07-Sep-2005
05-Sep-2005
04-Sep-2005
02-Sep-2005
02-Sep-2005
24-Aug-2005
24-Aug-2005
22-Aug-2005
21-Aug-2005
19-Aug-2005
14-Aug-2005
12-Aug-2005
6-Aug-2005
1-Aug-2005
27-Jul-2005
23-Jul-2005
22-Jul-2005
17-Jul-2005
17-Jul-2005
15-Jul-2005
13-Jul-2005
1-Jul-2005
27-Jun-2005
25-Jun-2005
22-Jun-2005
20-Jun-2005
18-Jun-2005
16-Jun-2005
13-Jun-

16-Apr-1991
16-Apr-1991
03-Apr-1991
03-Mar-1991
24-Feb-1991
12-Feb-1991
19-Jan-1991
09-Jan-1991
26-Dec-1990
28-Nov-1990
03-Nov-1990
01-Nov-1990
30-Oct-1990
30-Oct-1990
27-Oct-1990
25-Oct-1990
20-Oct-1990
15-Oct-1990
12-Oct-1990
15-Sep-1990
08-Sep-1990
05-Sep-1990
30-Aug-1990
28-Aug-1990
19-Aug-1990
19-Aug-1990
22-Jul-1990
08-Jul-1990
24-Jun-1990
23-Jun-1990
13-May-1990
10-May-1990
10-May-1990
06-May-1990
14-Apr-1990
09-Apr-1990
08-Apr-1990
07-Apr-1990
06-Apr-1990
01-Apr-1990
05-Mar-1990
17-Feb-1990
05-Feb-1990
12-Jan-1990
19-Dec-1989
02-Dec-1989
22-Nov-1989
18-Nov-1989
12-Nov-1989
02-Nov-1989
22-Oct-1989
14-Oct-1989
11-Oct-1989
08-Oct-1989
01-Oct-1989
17-Sep-1989
13-Sep-1989
10-Sep-1989
09-Sep-1989
09-Sep-1989
03-Sep-1989
29-Aug-1989
22-Aug-1989
22-Aug-1989
13-Aug-1989
09-Aug-1989
27-Jul-1989
20-Jul-1989
19-Jul-1989
14-Jul-1989
07-Jul-1989
29-Jun-1989
17-Jun-1989
06-Jun-1989
05-Jun-1989
03-Jun-1989
23-Apr-1989
12-Apr-1989
09-Apr-1989
03-Apr-1989
09-Mar-1989
04-Mar-1989
19-Feb-1989
15-F

11-Jun-1962
11-Jun-1962
11-Jun-1962
10-Jun-1962
10-Jun-1962
07-Jun-1962
04-Jun-1962
03-Jun-1962
29-May-1962
12-May-1962
20-Apr-1962
09-Apr-1962
07-Apr-1962
05-Apr-1962
25-Mar-1962
25-Mar-1962
24-Mar-1962
23-Feb-1962
18-Feb-1962
15-Feb-1962
07-Feb-1962
05-Feb-1962
04-Feb-1962
02-Feb-1962
27-Jan-1962
26-Jan-1962
21-Jan-1962
18-Jan-1962
16-Jan-1962
15-Jan-1962
14-Jan-1962
14-Jan-1962
14-Jan-1962
11-Jan-1962
11-Jan-1962
11-Jan-1962
10-Jan-1962
08-Jan-1962
07-Jan-1962
07-Jan-1962
06-Jan-1962
02-Jan-1962
01-Jan-1962
28-Dec-1961
28-Dec-1961
28-Dec-1961
27-Dec-1961
19-Dec-1961
18-Dec-1961
18-Dec-1961
13-Dec-1961
14-Nov-1961
17-Oct-1961
09-Oct-1961
26-Sep-1961
24-Sep-1961
24-Sep-1961
23-Sep-1961
07-Sep-1961
06-Sep-1961
20-Aug-1961
16-Aug-1961
04-Aug-1961
02-Aug-1961
01-Aug-1961
29-Jul-1961
16-Jul-1961
07-Jul-1961
24-Jun-1961
18-Jun-1961
02-Jun-1961
01-Jun-1961
21-May-1961
17-May-1961
15-May-1961
07-May-1961
30-Apr-1961
25-Apr-1961
21-Apr-1961
17-Apr-1961
16-Apr-1961
16-Apr-1961
14-Apr-1961
09-A

29-Jun-1920
27-Jun-1920
08-Mar-1920
03-Feb-1920
15-Jan-1920
07-Dec-1919
18-Nov-1919
12-Sep-1919
10-Aug-1919
29-May-1919
06-Apr-1919
16-Mar-1919
17-Jan-1919
15-Jan-1919
09-Jan-1919
05-Jan-1919
19-Sep-1918
22-Mar-1918
15-Dec-1917
21-Sep-1917
09-Sep-1917
18-Jul-1917
15-Jul-1917
03-Jun-1917
31-May-1917
30-Dec-1916
08-Dec-1916
08-Dec-1916
15-Nov-1916
10-Nov-1916
09-Nov-1916
11-Oct-1916
26-Jul-1916
13-Jul-1916
13-Jul-1916
12-Jul-1916
12-Jul-1916
12-Jul-1916
11-Jul-1916
08-Jul-1916
07-Jul-1916
06-Jul-1916
01-Jul-1916
30-Jun-1916
23-Jun-1916
03-Apr-1916
19-Mar-1916
10-Nov-1915
08-Nov-1914
03-Aug-1915
29-Mar-1915
06-Feb-1915
13-Jan-1915
01-Jan-1915
17-Oct-1914
09-Sep-1914
07-Jul-1914
13-Jun-1914
10-Jun-1914
31-May-1914
14-May-1914
03-Mar-1914
27-Nov-1913
21-Nov-1913
21-Sep-1913
03-Sep-1913
26-Aug-1913
21-May-1913
02-May-1913
27-Mar-1913
30-Aug-1912
23-Jul-1912
04-May-1912
18-Mar-1912
22-Feb-1912
19-Feb-1912
03-Feb-1912
26-Jan-1912
06-Jan-1912
01-Jan-1912
08-Nov-1911
26-Oct-1911
25-Oct-1911
23-S

KeyError: 6302

### Fechas en formato incorrecto

In [50]:
%pprint
shark_pruebas_indices_incorrectos #370 filas con formato de fecha incorrecto

Pretty printing has been turned OFF


[14, 17, 31, 59, 62, 65, 86, 90, 110, 122, 131, 132, 136, 143, 171, 187, 226, 248, 249, 301, 304, 306, 317, 370, 405, 423, 469, 471, 475, 498, 504, 512, 525, 538, 542, 556, 558, 565, 581, 588, 589, 598, 619, 620, 621, 622, 623, 625, 626, 627, 628, 629, 630, 635, 636, 637, 639, 640, 642, 643, 644, 645, 647, 648, 651, 653, 668, 688, 692, 708, 719, 733, 767, 771, 802, 822, 830, 844, 847, 848, 849, 877, 878, 885, 888, 911, 913, 926, 934, 940, 942, 948, 955, 961, 966, 971, 973, 987, 989, 990, 1024, 1035, 1036, 1041, 1078, 1084, 1102, 1105, 1119, 1131, 1147, 1149, 1161, 1162, 1170, 1174, 1177, 1187, 1210, 1224, 1239, 1243, 1251, 1271, 1275, 1277, 1288, 1292, 1295, 1298, 1299, 1314, 1357, 1358, 1366, 1376, 1380, 1385, 1389, 1397, 1400, 1403, 1425, 1426, 1429, 1447, 1448, 1456, 1462, 1465, 1466, 1485, 1493, 1506, 1519, 1520, 1525, 1528, 1544, 1569, 1602, 1626, 1648, 1655, 1656, 1704, 1709, 1719, 1724, 1748, 1753, 1760, 1761, 1762, 1771, 1777, 1814, 1838, 1849, 1850, 1860, 1938, 1951, 1967, 197

In [932]:
shark_pruebas_indices_incorrectos[0]

14

In [933]:
shark_pruebas['Date'][2452]

'Feb-1994'

In [934]:
len(shark_pruebas_indices_incorrectos)

708

In [51]:
shark_pruebas_indices_incorrectos_espacio=[]
shark_pruebas_indices_incorrectos_otros=[]
for i in range(len(sharks)):
    try:
        x = re.search(r'^\s.+',sharks['Date'][i])
        print(x.group())
        shark_pruebas_indices_incorrectos_espacio.append(i)
    except AttributeError:
        shark_pruebas_indices_incorrectos_otros.append(i)

 19-Feb-2016
 25-Sep-2013
 21-Sep-2013
 21-Sep-2013
 14-Sep-2013
 12-Sep-2013
 07-Sep-2013
 07-Sep-2013
 02-Sep-2013
 01-Sep-2013
 01-Sep-2013
 01-Sep-2013
 25-Aug-2013
 25-Aug-2013
 18-Aug-2013
 14-Aug-2013
 13-Aug-2013
 05-Aug-2013
  31-Jul-2013
  30-Jul-2013
  29-Jul-2013
  29-Jul-2013
    22-Jul-2013
  29-Oct-2011
  29-Oct-2011
 13-Sep-2010
 04-Sep-2010
 27-Mar-2010
    10-Jan-2009
 19-Jul-2004 Reported to have happened  "on the weekend"
 13-Jan-1999
 05-Nov-1997
 2-Jul-1997
 19-Aug-1993
  24-Mar-1990
 15-Feb-1988
  05-Oct-1985
 11-Jul-1982
  25-Jun-1982
 01-Dec-1979
    16-Jan-1970
 Jan-1970
  Reported 31-Jul-1958
 08-Jul-1958
 22-Jun-1956

1951.12.15.R
 18-Nov-1948
    15-Jun-1937
  21-Jun-1934
 24-Aug-1916
  03-Feb-1914
  16-Feb-1910
   21-Sep-1908
  10-Jan-1903
  28-Jan-1900
  02-Jun-1899
 Jul-1898
 11-Jan-1896
 08-Aug-1890
  19-Jul-1889
 22-Sep-1879
 07-Apr-1877
 11-Mar-1877
  28-Jan-1877


KeyError: 6302

#### Buscando que empiecen por espacio

In [52]:
shark_pruebas_indices_incorrectos_espacio=[]
shark_pruebas_indices_incorrectos_otros=[]
for i in shark_pruebas_indices_incorrectos:
    try:
        x = re.search(r'^\s.+',shark_pruebas['Date'][i])
        print(x.group())
        shark_pruebas_indices_incorrectos_espacio.append(i)
    except AttributeError:
        shark_pruebas_indices_incorrectos_otros.append(i)

 19-Feb-2016
 25-Sep-2013
 21-Sep-2013
 21-Sep-2013
 14-Sep-2013
 12-Sep-2013
 07-Sep-2013
 07-Sep-2013
 02-Sep-2013
 01-Sep-2013
 01-Sep-2013
 01-Sep-2013
 25-Aug-2013
 25-Aug-2013
 18-Aug-2013
 14-Aug-2013
 13-Aug-2013
 05-Aug-2013
  31-Jul-2013
  30-Jul-2013
  29-Jul-2013
  29-Jul-2013
    22-Jul-2013
  29-Oct-2011
  29-Oct-2011
 13-Sep-2010
 04-Sep-2010
 27-Mar-2010
    10-Jan-2009
 19-Jul-2004 Reported to have happened  "on the weekend"
 13-Jan-1999
 05-Nov-1997
 2-Jul-1997
 19-Aug-1993
  24-Mar-1990
 15-Feb-1988
  05-Oct-1985
 11-Jul-1982
  25-Jun-1982
 01-Dec-1979
    16-Jan-1970
 Jan-1970
  Reported 31-Jul-1958
 08-Jul-1958
 22-Jun-1956

1951.12.15.R
 18-Nov-1948
    15-Jun-1937
  21-Jun-1934
 24-Aug-1916
  03-Feb-1914
  16-Feb-1910
   21-Sep-1908
  10-Jan-1903
  28-Jan-1900
  02-Jun-1899
 Jul-1898
 11-Jan-1896
 08-Aug-1890
  19-Jul-1889
 22-Sep-1879
 07-Apr-1877
 11-Mar-1877
  28-Jan-1877


In [53]:
print(shark_pruebas_indices_incorrectos_otros)

[14, 17, 31, 59, 62, 65, 86, 90, 110, 122, 131, 132, 136, 143, 171, 187, 226, 248, 249, 301, 306, 317, 370, 405, 423, 469, 471, 475, 498, 504, 512, 525, 538, 542, 556, 558, 565, 581, 588, 589, 598, 642, 653, 668, 688, 692, 708, 719, 733, 767, 771, 802, 822, 830, 844, 848, 877, 878, 885, 888, 911, 913, 926, 934, 940, 942, 948, 955, 961, 966, 971, 973, 989, 1024, 1036, 1041, 1078, 1084, 1102, 1105, 1119, 1131, 1147, 1149, 1161, 1162, 1170, 1177, 1187, 1210, 1224, 1239, 1243, 1251, 1271, 1275, 1277, 1288, 1292, 1295, 1298, 1299, 1314, 1357, 1358, 1366, 1376, 1380, 1385, 1389, 1397, 1400, 1403, 1425, 1426, 1429, 1447, 1448, 1456, 1462, 1465, 1466, 1485, 1493, 1506, 1519, 1520, 1525, 1528, 1544, 1569, 1602, 1626, 1648, 1656, 1704, 1709, 1719, 1724, 1748, 1753, 1760, 1761, 1762, 1771, 1777, 1814, 1838, 1849, 1850, 1860, 1938, 1951, 1967, 1973, 1977, 1987, 2017, 2020, 2027, 2044, 2053, 2068, 2070, 2076, 2078, 2089, 2091, 2092, 2100, 2128, 2141, 2143, 2144, 2147, 2164, 2166, 2170, 2196, 2201, 

In [54]:
print(shark_pruebas_indices_incorrectos_espacio)

[304, 619, 620, 621, 622, 623, 625, 626, 627, 628, 629, 630, 635, 636, 637, 639, 640, 643, 644, 645, 647, 648, 651, 847, 849, 987, 990, 1035, 1174, 1655, 2139, 2216, 2246, 2479, 2643, 2747, 2841, 2975, 2981, 3085, 3404, 3407, 4145, 4149, 4251, 4425, 4535, 4849, 4939, 5300, 5345, 5406, 5430, 5524, 5561, 5577, 5593, 5624, 5700, 5717, 5831, 5855, 5858, 5860]


In [55]:
shark_pruebas['Date'][304].replace(' ', '')

'19-Feb-2016'

In [56]:
for i in shark_pruebas_indices_incorrectos_espacio:
    shark_pruebas['Date'][i]=shark_pruebas['Date'][i].replace(' ','')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [57]:
shark_pruebas_indices_incorrectos_espacio=[]
shark_pruebas_indices_incorrectos_otros=[]
for i in shark_pruebas_indices_incorrectos:
    try:
        x = re.search(r'^\s.+',shark_pruebas['Date'][i])
        print(x.group())
        shark_pruebas_indices_incorrectos_espacio.append(i)
    except AttributeError:
        shark_pruebas_indices_incorrectos_otros.append(i)


1951.12.15.R


In [58]:
shark_pruebas['Date'][4425] ##Volver luego

'\n1951.12.15.R'

In [59]:
 shark_pruebas_indices_incorrectos_otros=shark_pruebas_indices_incorrectos_otros.copy()

#### Buscando que empiecen por letras

In [170]:
shark_pruebas_str= shark_pruebas.copy()

In [130]:
shark_pruebas_str['Date'][3890]

'Jul-1961'

In [175]:
shark_pruebas_str.drop('index',axis=1, inplace=True)

In [173]:
shark_pruebas_str.reset_index(inplace=True)

In [176]:
shark_pruebas_str

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,...,Y,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0
6301,ND.0001,1845-1853,0.0,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,male,M,...,Y,,,S.W. Baker,ND-0001-Ceylon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0001,ND.0001,2.0


In [177]:
shark_pruebas_indices_incorrectos_letras=[]
shark_pruebas_indices_incorrectos_otros2=[]
for i in shark_pruebas_indices_incorrectos_otros:
    try:
        x = re.search(r'^\w.+',shark_pruebas_str['Date'][i])
        print(x.group())
        shark_pruebas_indices_incorrectos_letras.append(i)
    except AttributeError:
        shark_pruebas_indices_incorrectos_otros2.append(i)

May 2018
Reported 30-Apr-2018
Reported 10-Apr-2018
Reported 25-Nov-2017
Reported 13-Nov-2017
Reported 31-Oct-2017
Sep-2017
Reported 06-Sep-2017
Reported 26-Jul-2017
Reported 07-Jul-2017
2017.06.05
Reported 14-Jun-2017
Reported 07-Jun-2017
Reported 06-May-2017
Reported 09-Mar-2017
Reported 08-Jan-2017
Sep-2016
Reported  14-Jul-2016
Reported 08-Jul-2016
Reported 03-Mar-2016
19-Feb-2016
Reported 10-Feb-2016
Reported 11-Jan-2016
Sep-2015
Reported 25-Jun-2015
20-May2015
Reported 23-Dec-2014
Reported 03-Dec-2014
Reported 17-Nov-2014
Reported 12-Sep-2014
Sep-2014
Reported 25-Aug-2014
Aug-2014
Reported 27-Jun-2014
Reported 17-Jun-2014
13-May2014
Reported 10-May-2014
Reported 12-Apr-2014
Reported 17-Feb-2014
2014
2014
29-Nov2013
25-Sep-2013
21-Sep-2013
21-Sep-2013
14-Sep-2013
12-Sep-2013
07-Sep-2013
07-Sep-2013
02-Sep-2013
01-Sep-2013
01-Sep-2013
01-Sep-2013
25-Aug-2013
25-Aug-2013
18-Aug-2013
14-Aug-2013
13-Aug-2013
Reported 08-Aug-2013
05-Aug-2013
31-Jul-2013
30-Jul-2013
29-Jul-2013
29-Jul-20

1555
Ca. 1554
Ca. 1543
Circa 500 A.D.
77  A.D.
Ca. 5 A.D.
Ca. 214 B.C.
Ca. 336.B.C..
493 B.C.
Ca. 725 B.C.
Before 1939
1990 or 1991
Before 2016
Before Oct-2009
Before 1934
Before 1934
2009?
Before 1930
1880-1899
Before 1909
Before 2012
Before 1916
Between   1951-1963
Before 1908
Before 1900
Before 1876
Before 2012
Before 2011
Before 2011
Before 2009
Beforer 1994
Before 1963
1896-1913
Before 1936
Before 08-Jun-1912
Before 2012
Before 1911
Before 1901
No date, late 1960s
Before 2006
Before 2003
Before 2004
Before 1962
1950s
No date, Before 1963
2003?
No date
No date
Before Feb-1998
No date, Before May-1996
No date, Before Mar-1995
Before 1996
No date, Before Aug-1989
No date, Before Aug-1987
No date, Before 1987
No date, Before  1975
No date, Before 1975
No date, Before 1969
No date, Before 3-Jan-1967
No date, Before 1963
No date, Before 8-May-1965
No date, Before 1963
No date, Before 1902
No date, Before 1902
No date, Before 1963
No date, After August 1926 and before 1936
No date, Befor

In [178]:
shark_pruebas_indices_incorrectos_letras

[14, 17, 31, 59, 62, 65, 86, 90, 110, 122, 131, 132, 136, 143, 171, 187, 226, 248, 249, 301, 304, 306, 317, 370, 405, 423, 469, 471, 475, 498, 504, 512, 525, 538, 542, 556, 558, 565, 581, 588, 589, 598, 619, 620, 621, 622, 623, 625, 626, 627, 628, 629, 630, 635, 636, 637, 639, 640, 642, 643, 644, 645, 647, 648, 651, 653, 668, 688, 692, 708, 719, 733, 767, 771, 802, 822, 830, 844, 847, 848, 849, 877, 878, 885, 888, 911, 913, 926, 934, 940, 942, 948, 955, 961, 966, 971, 973, 987, 989, 990, 1024, 1035, 1036, 1041, 1078, 1084, 1102, 1105, 1119, 1131, 1147, 1149, 1161, 1162, 1170, 1174, 1177, 1187, 1210, 1224, 1239, 1243, 1251, 1271, 1275, 1277, 1288, 1292, 1295, 1298, 1299, 1314, 1357, 1358, 1366, 1376, 1380, 1385, 1389, 1397, 1400, 1403, 1425, 1426, 1429, 1447, 1448, 1456, 1462, 1465, 1466, 1485, 1493, 1506, 1519, 1520, 1525, 1528, 1544, 1569, 1602, 1626, 1648, 1655, 1656, 1704, 1709, 1719, 1724, 1748, 1753, 1760, 1761, 1762, 1771, 1777, 1814, 1838, 1849, 1850, 1860, 1938, 1951, 1967, 197

#### Remplazando Reported

In [179]:
for i in shark_pruebas_indices_incorrectos_letras:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Reported ','')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [180]:
for i in shark_pruebas_indices_incorrectos_letras:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Before ','')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [162]:
shark_pruebas_str['Date'][17]

'30-Apr-2018'

In [181]:
shark_p_indices_incorrectos_str=[]
shark_p_indices_incorrectos_otros=[]
for i in shark_pruebas_indices_incorrectos:
    try:
        x = re.search(r'^\w+\s',shark_pruebas_str['Date'][i])
        print(x.group())
        shark_p_indices_incorrectos_str.append(i)
    except AttributeError:
        shark_p_indices_incorrectos_otros.append(i)

May 
December 
Late 
Fall 
November 
Early 
July 
July 
Late 
Early 
Early 
13 
1998 
Early 
Last 
Late 
May 
Fall 
Between 
May 
July 
Mid 
June 
Late 
May 
Summer 
Late 
Early 
Summer 
1980s 
1980s 
05 
May 
May 
Early 
Summer 
Late 
Late 
Winter 
13 
Early 
Mid 
Summer 
May 
May 
Summer 
Early 
May 
Early 
Late 
May 
Early 
Late 
Between 
Late 
Summer 
Circa 
May 
May 
May 
May 
1954 
1954 
Between 
1950 
Summer 
Summer 
Summer 
Between 
Some 
Summer 
Fall 
Fall 
Winter 
Summer 
Woirld 
May 
Late 
Some 
Summer 
May 
to 
Late 
Summer 
Mid 
Summer 
Late 
Early 
1899 
1898 
Summer 
Reportd 
1890 
October 
Summer 
Reprted 
June 
Circa 
Early 
1868 
Circa 
Circa 
Sep 
1853 
1853 
in 
Ca 
October 
May 
June 
Late 
Letter 
Circa 
77 
493 
1990 
Between 
Beforer 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
Said 
1940 
1940 
1940 
World 
World 
A 
No 
Early 
Between 
No 
No 
No 
1920 
No 
Circa 


#### Remplazando meses

In [182]:
for i in shark_p_indices_incorrectos_str:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('May ','01-May-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [165]:
shark_pruebas_str['Date'][14] ## Volver a espacios

'01-May-2018'

In [183]:
for i in shark_p_indices_incorrectos_str:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Late ','25-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [184]:
shark_pruebas_str['Date'][719]

'December 2012'

In [185]:
shark_pruebas_str['Date'][1239]

'25-Jul-2008'

In [186]:
shark_pruebas_str['Date'][1761]

'25-Jul-2003'

In [187]:
for i in shark_p_indices_incorrectos_str:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Late ','28-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [188]:
shark_pruebas_str['Date'][1761]

'25-Jul-2003'

In [189]:
shark_pruebas_str['Date'][1298]

'Fall 2008'

In [190]:
for i in shark_p_indices_incorrectos_str:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Fall ','01-Oct-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [191]:
shark_pruebas_str['Date'][1298]

'01-Oct-2008'

In [193]:
shark_pruebas_str['Date'][1314]

'01-Nov-2011'

In [192]:
for i in shark_p_indices_incorrectos_str:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('November ','01-Nov-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [89]:
shark_pruebas_str['Date'][1314]

'01-Nov-2011'

In [965]:
shark_pruebas_str['Date'][1448]

'05-Aug-2006'

In [966]:
shark_pruebas_str['Date'][2017]

'05-Sep-2000'

In [967]:
shark_pruebas_str['Date'][2053]

'05-Jun-2000'

In [194]:
for i in shark_p_indices_incorrectos_str:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Early ','05-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [968]:
shark_pruebas_str['Date'][2053]

'05-Jun-2000'

In [969]:
shark_pruebas_str['Date'][1465] 

'01-Jul-2006'

In [970]:
shark_pruebas_str['Date'][1466] 

'01-Jul-2006'

In [195]:
for i in shark_p_indices_incorrectos_str:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('July ','01-Jul-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [197]:
shark_pruebas_str['Date'][1466] 

'01-Jul-2006'

In [198]:
x= [6012, 6013, 6045, 6107, 6170, 6175, 6182, 6193, 6201, 6209, 6215, 6217, 6218, 6220, 6221, 6223, 6224, 6225, 6226, 6227, 6228, 6229, 6230, 6231, 6232, 6233, 6234, 6235, 6236, 6237, 6262, 6266, 6267, 6268, 6271, 6272, 6274, 6275, 6278, 6279, 6280, 6281, 6282, 6289]

In [199]:
shark_p_indices_incorrectos_str2=[]
shark_p_indices_incorrectos_otros2=[]
for i in x:
    try:
        x = re.search(r'^\w+\s',shark_pruebas_str['Date'][i])
        print(x.group())
        shark_p_indices_incorrectos_str2.append(i)
    except TypeError: 
        shark_p_indices_incorrectos_otros2.append(i)

1853 
1853 
in 
Ca 
Letter 
77 
1990 
Between 
Beforer 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
No 
Said 
1940 
1940 
1940 
World 
World 
A 
No 
Between 
No 
No 
No 
1920 
No 


In [200]:
shark_pruebas_str['Date'][2089]  ## Regresar luego a espacios

'13 -Nov-1999'

In [205]:
shark_pruebas_str['Date'][2209]

'01-Jul-1998'

In [202]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('1998 ','01-Jul-1998')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [204]:
shark_pruebas_str['Date'][2209]=shark_pruebas_str['Date'][2209].replace('1998 ','01-Jul-1998')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [206]:
shark_pruebas_str['Date'][2459]=shark_pruebas_str['Date'][2459].replace('Last incident of 1994 in Hong Kong','01-Jul-1994')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [207]:
shark_pruebas_str['Date'][2459]

'01-Jul-1994'

In [208]:
shark_pruebas_str['Date'][2514]

'Between 01-May-& Nov-1993'

In [209]:
shark_pruebas_str['Date'][2514]=shark_pruebas_str['Date'][2514].replace(
    'Between 01-May & Nov-1993','01-Nov-1993')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [112]:
shark_pruebas_str['Date'][2514]

'01-Nov-1993'

In [210]:
shark_pruebas_str['Date'][4061]=shark_pruebas_str['Date'][4061].replace(
    'Between 10 and 12-Sep-1959','12-Sep-1959')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [114]:
shark_pruebas_str['Date'][4061]

'12-Sep-1959'

In [212]:
shark_pruebas_str['Date'][2853]

'01-Jul-1986'

In [211]:
shark_pruebas_str['Date'][2853]=shark_pruebas_str['Date'][2853].replace(
    'Mid Jul-1985 or mid Jul-1986','01-Jul-1986')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [213]:
shark_pruebas_str['Date'][2939]=shark_pruebas_str['Date'][2939].replace('June ','01-Jun-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [224]:
shark_pruebas_str['Date'][2939]

'01-Jun-1983'

In [223]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('June ','01-Jun-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [229]:
shark_pruebas_str['Date'][3047]

'Summer of 1981'

In [226]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Summer of ','01-Jun-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [228]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Summer ','01-Jun-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [230]:
shark_pruebas_str['Date'][3079]

'Summer 1980'

In [231]:
shark_pruebas_str['Date'][3444]

'Winter 1969'

In [234]:
shark_pruebas_str['Date'][4721]

'Winter 1942'

In [233]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Winter ','01-Dic-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [998]:
shark_pruebas_str['Date'][3564]

'15-Aug-1966'

In [999]:
shark_pruebas_str['Date'][5543]

'15-Oct-1901'

In [1000]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Mid ','15-')

In [1001]:
shark_pruebas_str['Date'][4180]

'15-Jul-1958'

In [1002]:
shark_pruebas_str['Date'][6292]

'15-Jul-1862'

In [1003]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Circa ','15-Jul-')

In [1004]:
shark_pruebas_str['Date'][4357] ##Quizás duplicado, revisar despues

'15-Feb-1954'

In [235]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace(
        '1954 (same day as  1954.00.00.f)','15-Feb-1954')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [236]:
shark_pruebas_str['Date'][4439]=shark_pruebas_str['Date'][4439].replace(
        'Between 01-Aug-1951 & 08-Aug-1951','08-Aug-1951')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [237]:
shark_pruebas_str['Date'][4439]

'08-Aug-1951'

In [240]:
shark_pruebas_str['Date'][4631]

'22-Dec-1944'

In [239]:
shark_pruebas_str['Date'][4631]=shark_pruebas_str['Date'][4631].replace(
        'Between 18 & 22-Dec 1944','22-Dec-1944')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [246]:
shark_pruebas_str['Date'][4488]

'01-Jul-1950'

In [242]:
shark_pruebas_str['Date'][4488]=shark_pruebas_str['Date'][4488].replace(
        '1950 - 1951','01-Jul-1950')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [245]:
shark_pruebas_str['Date'][4659]

'01-Nov-1944'

In [244]:
shark_pruebas_str['Date'][4659]=shark_pruebas_str['Date'][4659].replace(
        'Some time between Apr & Nov-1944','01-Nov-1944')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [1014]:
shark_pruebas_str['Date'][4802]

'01-May-1944'

In [247]:
shark_p_indices_incorrectos_str2

[6012, 6013, 6045, 6107, 6170, 6175, 6182, 6193, 6201, 6209, 6215, 6217, 6218, 6220, 6221, 6223, 6224, 6225, 6226, 6227, 6228, 6229, 6230, 6231, 6232, 6233, 6234, 6235, 6236, 6237, 6262, 6266, 6267, 6268, 6271, 6272, 6274, 6275, 6278, 6279, 6280, 6281, 6282, 6289]

In [248]:
shark_pruebas_str['Date'][6011]=shark_pruebas_str['Date'][6011].replace(
        'Sep or ','01-')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [249]:
for i in shark_p_indices_incorrectos_str2:
    shark_pruebas_str['Date'][i]=shark_pruebas_str['Date'][i].replace('Reprted ','')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [250]:
shark_pruebas_str['Date'][5707]

'1890 '

In [251]:
len(shark_pruebas_str)

6303

In [252]:
shark_pruebas_str['Date'].unique()

array(['25-Jun-2018', '18-Jun-2018', '09-Jun-2018', ..., '1883-1889',
       '1845-1853', nan], dtype=object)

In [1023]:
shark_pruebas_str.loc[shark_pruebas_str['Date'] == 'xx']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order


In [253]:
try:
    shark_pruebas_str['Date'] = pd.to_datetime(sharks['Date'], format = '%d-%b-%Y')
except:
    pass

#### Remplazando caracteres

In [255]:
prox_indice= shark_pruebas_indices_incorrectos_otros2.copy()

In [256]:
prox_indice

[5538, 6022, 6260, 6261]

In [257]:
shark_pruebas_str['Date'][6219]=shark_pruebas_str['Date'][6219].replace('Feb','01-Feb')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [258]:
shark_pruebas_str['Date'][6219]

'01-Feb-1998'

In [259]:
shark_pruebas_str['Date'].unique()

array(['25-Jun-2018', '18-Jun-2018', '09-Jun-2018', ..., '1883-1889',
       '1845-1853', nan], dtype=object)

In [261]:
shark_pruebas_str.drop([:-50],axis=0,inplace= True)

SyntaxError: invalid syntax (<ipython-input-261-da0c5b042c4c>, line 1)

In [262]:
shark_pruebas_str[0:50]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
5,2018.06.03.b,03-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,...,N,,,"Daily Telegraph, 6/4/2018",2018.06.03.b-FlatRock.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.b,2018.06.03.b,6298.0
6,2018.06.03.a,03-Jun-2018,2018.0,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,...,Y,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",2018.06.03.a-daSilva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.a,2018.06.03.a,6297.0
7,2018.05.27,27-May-2018,2018.0,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,male,M,...,N,,"Lemon shark, 3'","K. McMurray, TrackingSharks.com",2018.05.27-Ponce.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.27,2018.05.27,6296.0
8,2018.05.26.b,26-May-2018,2018.0,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Walking,Cody High,M,...,N,17h00,"Bull shark, 6'","K.McMurray, TrackingSharks.com",2018.05.26.b-High.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.b,2018.05.26.b,6295.0
9,2018.05.26.a,26-May-2018,2018.0,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",Standing,male,M,...,N,14h00,,"K. McMurray, Tracking Sharks.com",2018.05.26.a-DaytonaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.a,2018.05.26.a,6294.0


## Columna 'Year'

In [263]:
shark_attack= shark_pruebas_str.copy()

In [264]:
shark_attack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6303 entries, 0 to 6302
Data columns (total 22 columns):
Case Number               6302 non-null object
Date                      6302 non-null object
Year                      6300 non-null float64
Type                      6298 non-null object
Country                   6252 non-null object
Area                      5847 non-null object
Location                  5762 non-null object
Activity                  5758 non-null object
Name                      6092 non-null object
Sex                       5737 non-null object
Age                       3471 non-null object
Injury                    6274 non-null object
Fatal (Y/N)               5763 non-null object
Time                      2948 non-null object
Species                   3464 non-null object
Investigator or Source    6285 non-null object
pdf                       6302 non-null object
href formula              6301 non-null object
href                      6302 non-null object

In [265]:
type(shark_attack['Year'][0])

<class 'numpy.float64'>

In [266]:
try:
    for i in range(len(shark_attack)):
        shark_attack['Year'][i]=shark_attack['Date'][i][-4:]
except:
    pass


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


### Reseteando el index

In [267]:
shark_attack.reset_index(inplace= True)

In [268]:
shark_attack['Year'][-100:]

6203    1913
6204    1936
6205    1912
6206    2012
6207    1911
        ... 
6298    1903
6299    1905
6300    1889
6301    1853
6302     NaN
Name: Year, Length: 100, dtype: object

In [269]:
shark_attack[shark_attack['Year']==0] #82 valores en 0 

Unnamed: 0,index,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order


## Columna 'Type'

In [270]:
shark_attack['Type'].unique()

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', nan, 'Boat', 'Boatomg'], dtype=object)

In [271]:
idx = shark_attack.index[shark_attack['Type'].isnull()] #Solo son 4 valores nulos, se les asiganará el valor más común
nans = shark_attack.iloc[idx]
nans

Unnamed: 0,index,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
85,85,2017.09.15.a,15-Sep-2017,2017.0,,SAMOA,Upolu Island,Nofoalii,Fishing,male,...,N,Night,,"Samoa Observer, 9/16/2017",2017.09.15.a-Samoa.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.09.15.a,2017.09.15.a,6218.0
382,382,2015.07.27,27-Jul-2015,2015.0,,AUSTRALIA,Victoria,Tyrendarra Beach,Surfing,male,...,,,,,2015.07.27-Victoria.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2015.07.27,2015.07.27,5921.0
4867,4867,1936.09.11.R,11-Sep-1936,1936.0,,VIETNAM,,Saigon,Wreck of a sampam,8 crew,...,Y,,,"Lansing State Journal, 9/11/1936",1936.09.11-Saigon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1936.09.11.R,1936.09.11.R,1436.0
5705,5705,1890.03.03.R,03-Mar-1890,1890.0,,CEYLON,,,Diving,a pearl diver,...,Y,,,"The Guardian, 3/3/1890",1890.03.03.R-Ceylon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1890.03.03.R,1890.03.03.R,598.0
6302,6302,xx,,,,,,,,,...,,,,,,,,,,


In [272]:
print(shark_attack['Type'][shark_attack['Type']== 'Boating'].count())

203


In [273]:
print(shark_attack['Type'][shark_attack['Type']== 'Unprovoked'].count()) ##Tipo más común

4595


In [274]:
print(shark_attack['Type'][shark_attack['Type']== 'Invalid'].count())

547


In [275]:
print(shark_attack['Type'][shark_attack['Type']== 'Provoked'].count())

574


In [277]:
index_corregir=[85,382,4867,5705]

In [278]:
for i in index_corregir:
    shark_attack['Type']== shark_attack['Type'].fillna('Unprovoked', inplace= True)
    

In [279]:
shark_attack['Type'].isnull().sum()

0

**ACCIÓN:** Se considera 'Boatomg' un error y se remplazará por 'Boat'

In [280]:
for i in range(len(shark_attack)):
    shark_attack['Type']== shark_attack['Type'].replace('Boatomg','Boat', inplace= True)

### Columna Country

In [281]:
sharks=shark_attack.copy()

In [296]:
sharks['Country'].unique()

array(['USA', 'AUSTRALIA', 'MEXICO', 'BRAZIL', 'ENGLAND', 'SOUTH AFRICA',
       'THAILAND', 'COSTA RICA', 'MALDIVES', 'BAHAMAS', 'NEW CALEDONIA',
       'ECUADOR', 'MALAYSIA', 'LIBYA', nan, 'CUBA', 'MAURITIUS',
       'NEW ZEALAND', 'SPAIN', 'SAMOA', 'SOLOMON ISLANDS', 'JAPAN',
       'EGYPT', 'ST HELENA, British overseas territory', 'COMOROS',
       'REUNION', 'FRENCH POLYNESIA', 'UNITED KINGDOM',
       'UNITED ARAB EMIRATES', 'PHILIPPINES', 'INDONESIA', 'CHINA',
       'COLUMBIA', 'CAPE VERDE', 'Fiji', 'DOMINICAN REPUBLIC',
       'CAYMAN ISLANDS', 'ARUBA', 'MOZAMBIQUE', 'FIJI', 'PUERTO RICO',
       'ITALY', 'ATLANTIC OCEAN', 'GREECE', 'ST. MARTIN', 'FRANCE',
       'PAPUA NEW GUINEA', 'TRINIDAD & TOBAGO', 'KIRIBATI', 'ISRAEL',
       'DIEGO GARCIA', 'TAIWAN', 'JAMAICA', 'PALESTINIAN TERRITORIES',
       'GUAM', 'SEYCHELLES', 'BELIZE', 'NIGERIA', 'TONGA', 'SCOTLAND',
       'CANADA', 'CROATIA', 'SAUDI ARABIA', 'CHILE', 'ANTIGUA', 'KENYA',
       'RUSSIA', 'TURKS & CAICOS', 'UNITE

In [295]:
for i in range(len(sharks)):
    sharks['Country']=sharks['Country'].replace('RED SEA?','RED SEA')

In [297]:
sharks['Country'].astype('str')

0                      USA
1                      USA
2                      USA
3                AUSTRALIA
4                   MEXICO
               ...        
6298             AUSTRALIA
6299                   USA
6300                PANAMA
6301    CEYLON (SRI LANKA)
6302                   nan
Name: Country, Length: 6303, dtype: object

**ACCIÓN:** Eliminar caracteres como / y ? y se pasarn todos los valores a mayúsculas

In [298]:
type(sharks['Country'][2])

<class 'str'>

In [317]:
sharks['Country'].isnull().sum()

0

In [314]:
sharks['Country'].fillna(' ', inplace= True)

In [322]:
sharks

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6298,ND.0004,1903,1903,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0
6299,ND.0003,1900-1905,1905,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0
6300,ND.0002,1883-1889,1889,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,...,Y,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0
6301,ND.0001,1845-1853,1853,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,male,M,...,Y,,,S.W. Baker,ND-0001-Ceylon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0001,ND.0001,2.0


In [311]:
sharks.drop(['level_0','index'], axis=1, inplace= True)

In [324]:
nans=[]
for i in range(len(sharks)):
    if sharks['Country'][i]== ' ':
        nans.append(i)

In [325]:
nans

[62, 525, 2956, 3378, 3387, 3388, 3395, 3399, 3425, 3605, 3661, 4018, 4231, 4266, 4498, 4639, 4700, 4712, 4726, 4729, 4911, 5020, 5060, 5425, 5586, 5587, 5612, 5636, 5742, 5770, 5808, 5810, 5839, 5885, 5920, 5996, 6001, 6092, 6119, 6131, 6133, 6137, 6155, 6165, 6171, 6175, 6177, 6199, 6206, 6259, 6302]

**ACCIÓN:** Se considera que 'Country' es una columna importante (por que de ella dependen otras 3) por lo tanto se eliminaran los valores nulos

In [327]:
for i in nans:
    sharks=sharks.drop(i,axis=0)

In [328]:
sharks.reset_index(inplace= True)

In [330]:
sharks.drop('index', axis=1)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6247,ND.0005,1903,1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,Y,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0
6248,ND.0004,1903,1903,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0
6249,ND.0003,1900-1905,1905,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0
6250,ND.0002,1883-1889,1889,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,...,Y,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0


In [331]:
sharks['Country'].isnull().sum()

0

In [332]:
nans=sharks[sharks['Area'].isnull()== True].index.to_list()

**ACCIÓN:** Se remplazarán los valores nulos por el 'Country'

In [333]:
for i in nans:
    sharks['Area'][i]=sharks['Country'][i].title()        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [334]:
sharks['Area'].unique()

array(['California', 'Georgia', 'Hawaii', 'New South Wales', 'Colima',
       'Pernambuco', 'Florida', 'Queensland', 'South Carolina',
       'Cornwall', 'Westerm Australia', 'Eastern Cape Province',
       'Hua Hin', 'Cocos Island', 'Western Australia',
       'Alifu Alifu Atoll', 'Western Cape Province', 'New Providence',
       'Alagoas', 'New Caledonia', 'Bahamas', 'Victoria', 'KwaZulu-Natal',
       'Boi Island, Victoria', 'Galapagos Islands', 'Fernando de Noronha',
       'Sepang', 'Libya', 'Holquin Province', 'Pamplemousses ',
       'South Australia', 'North Island', 'New York', 'Canary Islands',
       ' Upolu Island', 'Solomon Islands', 'Shizuoka Prefecture', 'Texas',
       'Castellón', 'Massachusetts', 'Red Sea Protectorate',
       'New Providence District', '40 miles off Grand Bahama Island',
       'Ascension Island', 'New Jersey', 'Majorca', 'Washington',
       'Tabasco', 'Anjouan', 'Reunion', 'Ibiza Island', 'Marquesas',
       'South Devon', 'New Providence ', 'Sharj

In [335]:
sharks['Area'].astype('str')

0              California
1                 Georgia
2                  Hawaii
3         New South Wales
4                  Colima
              ...        
6247    Western Australia
6248    Western Australia
6249       North Carolina
6250               Panama
6251     Eastern Province
Name: Area, Length: 6252, dtype: object

### Columna 'Location'

**ACCIÓN:** Se remplazarán los valores nulos por el 'Area'

In [336]:
shark_attack= sharks.copy()

In [337]:
nans=shark_attack[shark_attack['Location'].isnull()== True].index.to_list()

In [338]:
for i in nans:
    shark_attack['Location'][i]=shark_attack['Area'][i]  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [339]:
shark_attack['Location'].isnull().sum()

0

In [340]:
shark_attack['Location'].unique()

array(['Oceanside, San Diego County', 'St. Simon Island, Glynn County',
       'Habush, Oahu', ..., 'Ocracoke Inlet', 'Panama Bay 8ºN, 79ºW',
       'Below the English fort, Trincomalee'], dtype=object)

In [341]:
shark_attack['Location'][6202]=shark_attack['Area'][6202]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Columna 'Activity'

In [342]:
sharks= shark_attack.copy()

In [343]:
nans=sharks[sharks['Activity'].isnull()== True].index.to_list()
nans

[47, 107, 111, 113, 153, 181, 184, 244, 254, 288, 293, 295, 299, 367, 372, 393, 413, 423, 461, 464, 474, 506, 515, 574, 622, 626, 627, 658, 671, 683, 686, 703, 719, 727, 730, 759, 800, 827, 856, 876, 880, 938, 991, 1051, 1052, 1086, 1088, 1128, 1160, 1168, 1188, 1217, 1222, 1236, 1268, 1391, 1432, 1439, 1454, 1458, 1480, 1483, 1526, 1558, 1567, 1591, 1597, 1604, 1626, 1669, 1681, 1708, 1763, 1775, 1846, 1847, 1880, 1895, 1924, 1927, 1928, 1929, 1935, 1941, 1945, 1953, 2006, 2026, 2043, 2053, 2069, 2114, 2159, 2194, 2210, 2258, 2259, 2277, 2293, 2294, 2305, 2313, 2314, 2321, 2324, 2326, 2328, 2330, 2366, 2386, 2388, 2389, 2402, 2407, 2422, 2437, 2438, 2473, 2495, 2500, 2512, 2568, 2569, 2570, 2592, 2609, 2648, 2682, 2685, 2716, 2717, 2740, 2745, 2758, 2760, 2776, 2784, 2804, 2806, 2812, 2814, 2831, 2859, 2865, 2898, 2901, 2904, 2912, 2935, 2944, 2948, 2951, 2963, 2964, 2974, 2978, 2980, 2997, 3022, 3038, 3046, 3088, 3096, 3101, 3105, 3119, 3123, 3127, 3145, 3146, 3153, 3154, 3156, 3159,

**ACCIÓN**: Se sustituiran los nulos por 'Unknown'

In [344]:
for i in nans:
    sharks['Activity'][i]='Unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [345]:
sharks['Activity'].unique() #\w+.ing()

array(['Paddling', 'Standing', 'Surfing', ...,
       'Crew swimming alongside their anchored ship',
       '4 men were bathing', 'Wreck of  large double sailing canoe'],
      dtype=object)

In [346]:
sharks['Activity'].astype('str')

0           Paddling
1           Standing
2            Surfing
3            Surfing
4        Free diving
            ...     
6247          Diving
6248    Pearl diving
6249        Swimming
6250         Unknown
6251        Swimming
Name: Activity, Length: 6252, dtype: object

In [353]:
sharks.drop('index', axis=1, inplace= True)

In [355]:
x= re.match('(\w+.ing)',sharks['Activity'][6183])
y=str(x.group())
print(type(y))
print(y)

<class 'str'>
Diving


In [349]:
y

'Diving'

In [358]:
sharks[-50:]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Activity short
6202,ND.0054,1958,1958,Unprovoked,INDONESIA,Riau Province,Riau Province,Swimming near anchored ship,a ship's engineer,M,...,,,"C.H. Townsend, p. 172; V.M. Coppleson, p.258",ND-0054-NatunaIslands.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0054,ND.0054,52.0,Swimming
6203,ND.0053,1958,1958,Unprovoked,INDIA,Maharashtra,"Malwan, near Ratnagiri",Unknown,male,M,...,,,"V.M. Coppleson (1958), p.261",ND-0053-India.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0053,ND.0053,51.0,Unknown
6204,ND.0052,1957,1957,Unprovoked,NICARAGUA,Lake Nicaragua (fresh water),A village north of San Carlos,Lashing logs together when he fell into the water,an Indian,M,...,,"Bull shark caught, leg recovered & buried besi...","F. Poli, pp.150-153",ND-0052-NicaraguanIndian.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0052,ND.0052,50.0,Lashing
6205,ND.0051,1957,1957,Provoked,CUBA,Havana Province,Cojimar,"Shark fishing, knocked overboard",Sandrillio,M,...,,,"F. Poli, pp.75, 81-83",ND-0051-Sandrillio.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0051,ND.0051,49.0,"Shark fishing, knocked overboard"
6206,ND.0049,1956,1956,Unprovoked,MARSHALL ISLANDS,Bikini Atoll,Bikini Atoll,Swimming,male,M,...,,,J.E. Lasch,ND-0049-BikiniAtoll.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0049,ND.0049,48.0,Swimming
6207,ND.0048,1956,1956,Unprovoked,KIRIBATI,Phoenix Islands,Canton Island,Diving,Dusty Rhodes,M,...,,,"J. Oetzel, Skin Diver Magazine, March 1956, p.19",ND-0048-DustyRhodes.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0048,ND.0048,47.0,Diving
6208,ND.0047,Mar-1956,1956,Unprovoked,NORTH PACIFIC OCEAN,North Pacific Ocean,Wake Island,"Fishing, wading with string of fish",male,M,...,,,"J. Oetzel, Skin Diver Magazine, March 1956, p.19",ND-0047-WakeIsland.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0047,ND.0047,46.0,Fishing
6209,ND.0046,1952,1952,Unprovoked,KIRIBATI,Gilbert Islands,Nonouti,Unknown,Gilbertese fisherman,M,...,,,"Grimble, pp. 142-143",ND-0046-GilberteseFisherman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0046,ND.0046,45.0,Unknown
6210,ND.0043,"""During the war"" 1943-1945",1945,Unprovoked,SOLOMON ISLANDS,New Georgia,"Munda Island, Roviana Lagoon",Floating on his back,American male,M,...,,,"W. Chapman (1949) Fishing in Troubled Waters, ...",ND-0043-American-male.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0043,ND.0043,43.0,Floating
6211,ND.0042,"""Before the war""","war""",Unprovoked,AUSTRALIA,Torres Strait,Thursday Island?,Free diving,Mortakee,M,...,,,Press clipping dated 6/28/1950,ND-0042-Mortakee.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0042,ND.0042,42.0,Free diving


**ACCIÓN:** Se creará otra columna para la actividad abreviada

In [356]:
sharks['Activity short']=sharks['Activity']

In [357]:
for i in range(len(sharks)):
    try:
        x= re.match('(\w+.ing)',sharks['Activity'][i])
        y=str(x.group()).title()
        sharks['Activity short'][i]=y
    except:
        sharks['Activity short'][i]= sharks['Activity'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


### Columna 'Time'

In [384]:
shark_pruebas= sharks.copy()

In [388]:
shark_pruebas['Time'].unique()

array(['18:00', '14:00  -15:00', '07:45', 'nan', 'Late afternoon',
       '17:00', '14:00', 'Morning', '15:00', '08:15', '11:00', '10:30',
       '10:40', '16:50', '07:00', '09:30', 'Afternoon', '21:50', '09:40',
       '08:00', '17:35', '15:30', '07:30', '19:00, Dusk', 'Nig:t',
       '16:00', '15:01', '12:00', '13:45', '23:30', '09:00', '14:30',
       '18:30', '12:30', '16:30', '18:45', '06:00', '10:00', '10:44',
       '13:19', 'Midday', '13:30', '10:45', '11:20', '11:45', '19:30',
       '08:30', '15:45', 'S:ortly before 12:00', '17:34', '17:10',
       '11:15', '08:50', '17:45', '13:00', '10:20', '13:20', '02:00',
       '09:50', '11:30', '17:30', '9:00', '10:43', 'After noon', '15:15',
       '15:40', '19:05', '1300', '14:30 / 15:30', '22:00', '16:20',
       '14:34', '15:25', '14:55', '17:46', 'Morning ', '15:49', '19:00',
       'Midnig:t', '09:30 / 10:00', '10:15', '18:15', '04:00', '14:50',
       '13:50', '19:20', '10:25', '10:45-11:15', '16:45', '15:52',
       '06:15', '1

In [361]:
shark_pruebas['Time'][0]=shark_pruebas['Time'][0].replace('h',':')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [364]:
shark_pruebas['Time'].isnull().sum()

3311

In [365]:
shark_pruebas['Time']=shark_pruebas['Time'].fillna('Unknown')

In [386]:
shark_pruebas['Time']=shark_pruebas['Time'].astype('str')

In [387]:
for i in range(len(shark_pruebas)):
    shark_pruebas['Time'][i]=shark_pruebas['Time'][i].replace('h',':')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [389]:
for i in range(len(shark_pruebas)):
    shark_pruebas['Time'][i]=re.sub('[^:\d]', '',shark_pruebas['Time'][i] )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [390]:
shark_pruebas['Time'][3]

''

In [391]:
x=re.search(r'\d{2}:\d{2}','14:00')
print(x.group())

14:00


In [392]:
for i in range(len(shark_pruebas)):
    if len(shark_pruebas['Time'][i])>5:
        shark_pruebas['Time'][i]=shark_pruebas['Time'][i][0:5]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
### iNDICES QUE NO CUMPLEN CON EL FORMATO 00:00

In [393]:
x=re.match(r'[^\d{2}:\d{2}]',shark_pruebas['Time'][3])
print(x.group())
    

AttributeError: 'NoneType' object has no attribute 'group'

In [428]:
wrong_indexes=[]
for i in range(len(shark_pruebas)):
    try:
        x=re.match('(\d{2}:\d{2}|Unknown)',shark_pruebas['Time'][i]) 
        x.group()
    except: 
            wrong_indexes.append(i)

In [429]:
for i in wrong_indexes:
    print(shark_pruebas['Time'][i])

In [430]:
wrong_indexes

[]

In [427]:
shark_pruebas['Time'][3]

'12:00'

In [426]:
for i in wrong_indexes:
    shark_pruebas['Time'][i]=shark_pruebas['Time'][i].replace('', '12:00')
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [417]:
for i in wrong_indexes:
    shark_pruebas['Time'][i]=shark_pruebas['Time'][i].replace('2Unknown', 'Unknown')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Columna 'Name'

In [478]:
sharks= shark_pruebas.copy()

In [481]:
type(sharks['Name'][0])

<class 'str'>

In [486]:
for i in range(len(sharks)):
    print(sharks['Name'][i])

Julie Wolfe
Adyson Mcneely 
John Denges
Male
Gustavo Ramos 
Chris 
Jose Ernesto Da Silva 
Male
Cody High
Male
Male
Trey De Boer
Jei Turrell
Max Berryman
Melisa Brunning
Male
Male
Female
Male
Adam Murray
Matthew Lee
Rob Bruce
Nathan Burch
Ahmed Rasheed
Male
Dylan Mcwilliams
Werner Danielsen
Pablo De Melo
Jason Longrass
Alejandro Travaglini
Bruce Rowan
Josias Paz
nan
Shane Mcconnell
Ross Spowart
Male
Luke Guy & Finn Bald
Rob Crossland
Lachie Brown
Elton Polly
Anna Shurapey 
Brett Palmer
Adam Hoare
Sharna Babd
Rosalida Souza
Andrew Phipps Newman
Lucas Martin
Male
Male
Mathew Vickers
Callum Stewart
Ricardo Ferrari Bulhoes 
Justin 
Marjorie Mariano
Natalie Jones
Sairolharun
Sarah Illig-Carroll
__ Jimenez
Rohina Bhandari
31 Migrants
Grigor Azatian
Kaia Anderson
Charlie Fry
Jesús Cabrera González 
Jack Tolley
Male
Male
Male
Jason Hartl
Susan Peteka
Sarah Williams
David Lomas
Carlos Iribam
Mitch Milan
Female
Bradley Fick
Justin White
Catherine Vissers
Michah Behrend
Male
Richard Inniss
Male
M

nan
Peter Edmonds
Joey Giangrasso
Jacques Peens
Mark Lemelin
Teresa Holloway
Male
Male
Male
Thomas Larkin
Markus Groh
Apia Hauta
Harold Bradner
Fiona Casey
Unidentified
nan
Wayne Symington
Jarryd Tinson 
Matthew O'Neill
Inflatable Rescue Boat. Occupants: Lauren Johnson &. Kris O'Neill
Jordan Marsden
Johnny Silva
Chase Edwards
Sam Judd
Wayne Francis Johanning
Ben Morcom
Josh Edwards
Scott Wright
Valentino Ramirez
Olivia Hislop
Malvis Cristino De Souza
Male
Craig Evans
Andrew Smith
Joseph Fox
Jeffrey Nolan
Lee Mellin
Patrick Walsh & Paul Damgaard 
Aaron Finley
Linda Whitehurst
Adam Wood
Sam Bendall
E.H.
Andrew Sinagra
nan
Stéphanie Belliard 
Leslie Gano
Sue Snyder
Kristina Aleksandrova
Nikki Cuomo
nan
Tyler Robertson
Corey Howell
Jack Calogero
Jessica Riley
Brandon Chapman
Carolyn Griffin
Colette Wilson
Rory Corr
Female
Dominco Iaciofano
Joshua Sumait
Todd Endris
Joseph Coursey
Taylor Smith
Bruce Pennington
Male
Chris Olstad
Male
Male
Andrea Lynch
Female
Matthew Barton
Noah Green
Chase C

Tony Donoghue
Charles Heitor Barbosa Pires
Sylvia Lanner
Guy Oudin
Male
Zane Smith
Dan Cartamil
Navid Davoudabai
Blaise Wouanena
Mr. Spain
Jonathan Allen
Robyne Knutson
28' Sport Fishing Boat, Bird Dog
Male
Kenny Burns
Rajkumar Mansaram
Evan Ridge
nan
Donna Turcotte
nan
Healy Lootz
Male
Andy Thompson
Megan O'Leary
Greg Harrison
Douw Van Der Merwe
nan
James Willie Tellasmon
Larry Foor
Male
Dale Inskeep
Tadashi Kodama
Claudio Roberto Florencio De Freitas
Jessica Stephens
Liam Victor
Jarod Ruszkowski 
Júlio César De Barros Correia
nan
Mike Duncan
Kai Haire
Jade Blackstock
Grant Rielly
Danny Hoopes
A.D
J. Howington
30' Cabin Cruiser Owned By Stefano Catalani
Jonathan Kathrein
Male
Kevin Paffrath
Robert Parcus
Kobus Goosen
Ross Taylor
Christian Lombard
Rodrigo Rocha Menezes
Michael Rinto
Clark Thomas (Father / Rescuer)
Darren James
Mark Thomas
Doug Chesser
Anton Devos
Brian Catarra
Danny Bravier
Jamieharrington
Marc Jucker
Jan-Henrick Opperman
Jack Mounteer
Janelle Dickinson
Roger Moore
Nea

Philip Light
Mike Degruy
Colin Wrankmore
6 M Skiboat, Occupants: P.A. Reeder & Crew
The June, Occupants Bunny Pendelbury And Crew Of 6
Nicky Alberts
Laurence Evans
nan
Sharon Wolfe Cranston
Vanuatuweekly Hebdomadaire
Harold Corbett
Raymond Brockway
Kim Pearce
Boat, Occupant: Danie Schoeman
Male
Andre Hartman
Philip Horley
George Walter
6 M Skiboat, Occupants: Alex Mamacos, Noel Glintenkamp, Tony Mountifield & Dillon Alexandra
Glenn Friedman
Michael Muradian
Steve Posey
Willie White
Dan Baen, Jr.
5 M Skiboat Graanjan, Occupants: Rudy Van Graan, Jan De Waal Lombard
Gordon Gibbs
Gary Jones
Ruskin Vest
Dr. Rolf Johan Lund
Verdon Harrison
Victor Beaver
John Hayes
Paul Howard
Male
Jean Blanchet
Mexican Male
Graham Archall
Jay Worrell
Geoffrey Kirkam Spence
Albert Van Ryseen
Al Brenneka
Jimmy Jackson
Boat, Occupant: Danie Schoeman
7 M Skiboat Alrehmah Iii, Occupants: Adolph Schlechter & 3 Friends
William Kennedy
Marshall Flanagan
A Small Boat
Darryl Richardson
nan
Michaelkarras, Jr.
Ricky Kar

F. L. Fernando
R. Nauth
An Infant
Boat, Occupant: Portuondo
William Bolster
Jim Kline
Semesa Vasu
Mr. Paniry
Male,From Laluoro
Mr. Falah
Male
Patterson (John) Nikuniko
Titus Tiso
P. Allen
Richard Mckenzie
Graham Smith
Bogana Sabati
K
nan
Joel Healy, Jr.
2 Males
Goffredo Lombardo
Kara Benagi, From Hula
Native
Jose Luis Nufize Lago
Lyle Davis
Douglas Clarke
Maximilliaan Roual Van Dam
Richard Kirby
Leslie Nye
Boat:Occupants: Nazzareno Zammit & Emmanuel
Tsira Native
Native Boy
Jose Alengo
Bansie Koide
Jack Smedley
Eleanor Nelson
Eric Rawls
Kila
Manuel Pereira
nan
Multiple Boats Including B.J. C. Brunt
Russ Shearman
nan
nan
Ian Nolan
John Patrick Wishart
Theo Brown
Barry Keith Antonini
Brian Hamilton
Stephen Conedo
Lesterburton
Ken Howell
Native Boy
Margoulis
Josh Vaughan
Girl
Anonymous
Male
Lamoman
Yacht Even
Boat, Occupants: P.D. Neilly & Charlton Anderson
Niu Bodu
Male
Noel Cross
R.C. Olsen
Robert C. Hightower
Philip C. Diez
Eric Vaughters
Male
Otamatsu H. Yoshii
Dale Strand
Carla Podzum

Herbert Webster
nan
A Native
Charles A. Burke
Augusto Casellato
Boat, Occupants:Andrew Peterson & Peter Jergerson
Norman Piexotto
Sebastian Llopis Puges
William J. Goins
Thomas (Or Tony) Madison
Mrs. Leonard Carlsmith
Primrose Whyte
Mr. Bennett
Male
Boy
Bosun Of The Ship
Woman
Mr. Daniels
Simeon (Samuel) Ettelton
LawyerS Secretary
American Lawyer
Pascual Gurran
Mr. S. Page 
Mrs. Walter H. Kahrs
nan
American University Student
Jack Dagworthy
Jack Canning
A Papuan
Frank Chorie
Male
Fijian Boy
Ofelia Rivas
Male
Professor Winslow
Huri-Huri
Fred White
Lewis Kornahrens
2 Fishermen
18' Boat, Occupants Richard Gunther & Donald Cavanaugh
Robert Martin
Ernest Conroy
Claremont L. Staden
Noel Knight
Male
Boat Owned By Ricardo Laneiro
Nita Derritt
Frederick Dullroy
Johannes Karlschultz
Male
Charles Brown
Leo Wohill
James Elton
Selim And Dea Opre,Koepang Islanders
Male
Male
15' Boat
Amano, A Japanese Diver
Charles Blair
Aboriginal Male
J. Rigby
Boat, Occupants; Carl Sjoistrom & 2 Other Crew
Percy E

Seaman
Seaman From The York
Crew Member Of The Nieuwstadt
Hindu Pilgrims
Indian People
Antony Van Corlear
Male
Male
Males (Wearing Armor)
Indian Slave
Male
Male
A Candidate For Initiation
Males
Males
Joe Folsom
Conway Plough &Dr. Jonathan Higgs
Hamisi Njenga
Male
nan
Maciello
Rick Donnis
Male
Lassie
Lieutenant Hexton
Ken O'Keefe
Male
Martha Hatagouei
Males
Male
Kahlifeh
nan
Female
Danniell Washington
C.M
Passenger & Crew
A Diver From Kalymnos
August Eichmann
Kai-Tawaro
Male
Male
4 French Divers
Ed Snyder
Male
4.8-Metre Skiboat, Occupants: Rod Salm & 4 Friends
Male, A Mental Patient
Male
Male
C.D. Dollar
Paul Menta
nan
Albert Raiti
Female, A Hae Nyeo
Male
Erik Bjurstrom
Female
A Chief
Mr. Jabar-Kaaby
Carl Bruster
Dan Hogan
Jill Reed
Sinsa
Charles Fleming
Woman
Val Valentine
Male
Male, A Ship Carpenter
Male
Ted Luck
nan
Dalton Baldwin
Les Bishop
Aristede
Male
Male
Male
Male, A Sponge Diver
Girl
Ross Doe
Fijian Girl
Male
Horton Chase
John Fenton
A Ship'S Engineer
Male
An Indian
Sandrillio

In [483]:
for i in range(len(sharks)):
    try:
        sharks['Name'][i]=sharks['Name'][i].replace('  ', '')
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [484]:
for i in range(len(sharks)):
    try:
        sharks['Name'][i]=re.sub(r'^\s', '', sharks['Name'][i])
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [485]:
for i in range(len(sharks)):
    try:
        sharks['Name'][i]=sharks['Name'][i].title()
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [494]:
sharks['Name'].fillna('Unknown', inplace= True)

In [495]:
sharks['Name'].isnull().sum()

0

### Columna 'Sex'

In [496]:
shark_pruebas=sharks.copy()

In [500]:
shark_pruebas.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Activity short'],
      dtype='object')

In [517]:
shark_pruebas.rename(columns={'Sex ':'Sex'}, inplace= True)

In [541]:
shark_pruebas['Sex'].isnull().sum()

0

In [524]:
nans=shark_pruebas[shark_pruebas['Sex'].isnull()== True].index.to_list()

In [525]:
nans

[32, 59, 85, 123, 153, 178, 217, 225, 242, 296, 325, 445, 462, 464, 465, 474, 506, 512, 527, 558, 563, 587, 699, 738, 745, 751, 769, 803, 816, 818, 837, 847, 868, 909, 954, 1065, 1162, 1168, 1172, 1175, 1176, 1177, 1184, 1249, 1275, 1294, 1364, 1382, 1387, 1392, 1403, 1415, 1424, 1447, 1454, 1475, 1515, 1517, 1520, 1523, 1526, 1558, 1567, 1586, 1591, 1614, 1624, 1626, 1635, 1638, 1646, 1653, 1675, 1706, 1708, 1720, 1741, 1768, 1775, 1827, 1846, 1847, 1848, 1849, 1885, 1890, 1926, 1930, 1935, 1961, 1971, 1972, 1977, 1982, 1993, 2006, 2015, 2025, 2042, 2046, 2052, 2053, 2064, 2065, 2096, 2133, 2138, 2140, 2158, 2166, 2194, 2206, 2219, 2223, 2244, 2264, 2271, 2277, 2293, 2314, 2315, 2316, 2321, 2322, 2353, 2383, 2386, 2394, 2404, 2407, 2413, 2414, 2422, 2438, 2450, 2454, 2467, 2470, 2476, 2478, 2480, 2483, 2500, 2503, 2512, 2513, 2514, 2528, 2548, 2549, 2558, 2569, 2599, 2600, 2631, 2662, 2685, 2704, 2745, 2748, 2754, 2756, 2764, 2772, 2784, 2786, 2797, 2804, 2806, 2807, 2808, 2809, 2812,

In [540]:
shark_pruebas['Sex'][59]

'M'

**ACCIÓN:** Se llenaran los nans mitad con F y mitad con M

In [537]:
import random

In [539]:
g=['F','M']
for i in nans:
    shark_pruebas['Sex'][i]=random.choice(g)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Column 'Species'

In [545]:
sharks= shark_pruebas.copy()

In [553]:
sharks['Species '].rename(columns={'Species ': 'Species'} ,inplace= True)

0           White shark
1                   NaN
2                   NaN
3             2 m shark
4       Tiger shark, 3m
             ...       
6247                NaN
6248                NaN
6249                NaN
6250                NaN
6251                NaN
Length: 6252, dtype: object

In [555]:
sharks.rename(columns={'Species ':'Species'}, inplace= True)

In [557]:
sharks.fillna('Shark', inplace= True)

In [566]:
sharks['Species'][2]

'Shark'

In [567]:
x= re.match(r'(\w+.shark|Shark)', sharks['Species'][2])
y=str(x.group())
print(y)

Shark


In [568]:
for i in range(len(sharks)):
    try:
        x=re.match(r'(\w+.shark|Shark)',sharks['Species'][i])
        y=str(x.group())
        sharks['Species'][i]= y
    except:
        pass
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [571]:
sharks[:50]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Activity short
0,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,18:00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,Paddling
1,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson Mcneely,F,...,14:00,Shark,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,Standing
2,2018.06.09,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,07:45,Shark,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,Surfing
3,2018.06.08,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,Male,M,...,12:00,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,Surfing
4,2018.06.04,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,12:00,Tiger shark,A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,Free diving
5,2018.06.03.b,03-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,...,12:00,Shark,"Daily Telegraph, 6/4/2018",2018.06.03.b-FlatRock.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.b,2018.06.03.b,6298.0,Kite surfing
6,2018.06.03.a,03-Jun-2018,2018,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto Da Silva,M,...,12:00,Tiger shark,"Diario de Pernambuco, 6/4/2018",2018.06.03.a-daSilva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.a,2018.06.03.a,6297.0,Swimming
7,2018.05.27,27-May-2018,2018,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,Male,M,...,12:00,Lemon shark,"K. McMurray, TrackingSharks.com",2018.05.27-Ponce.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.27,2018.05.27,6296.0,Fishing
8,2018.05.26.b,26-May-2018,2018,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Walking,Cody High,M,...,17:00,Bull shark,"K.McMurray, TrackingSharks.com",2018.05.26.b-High.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.b,2018.05.26.b,6295.0,Walking
9,2018.05.26.a,26-May-2018,2018,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",Standing,Male,M,...,14:00,Shark,"K. McMurray, Tracking Sharks.com",2018.05.26.a-DaytonaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.a,2018.05.26.a,6294.0,Standing


### Columna 'Case Number1' y 'Case Number2'

In [573]:
shark= sharks.copy()

In [575]:
igual=0
dif=0
for i in range(len(shark)):
    if shark['Case Number.1'][i]==shark['Case Number.2'][i]:
        igual+=1
    else:
        dif+=1

In [578]:
print(len(sharks))
print(igual)
print(dif)


6252
6232
20


**ACCIÓN:** Las columnas son en su mayoría lo mismo por lo que se borarará shark['Case Number.2']

In [580]:
shark.drop('Case Number.2', axis= 1, inplace= True)

In [581]:
shark

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,original order,Activity short
0,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18:00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,6303.0,Paddling
1,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson Mcneely,F,...,N,14:00,Shark,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,6302.0,Standing
2,2018.06.09,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07:45,Shark,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,6301.0,Surfing
3,2018.06.08,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,Male,M,...,N,12:00,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,6300.0,Surfing
4,2018.06.04,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,12:00,Tiger shark,A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,6299.0,Free diving
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6247,ND.0005,1903,1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,Male,M,...,Y,12:00,Shark,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,6.0,Diving
6248,ND.0004,1903,1903,Unprovoked,AUSTRALIA,Western Australia,Western Australia,Pearl diving,Ahmun,M,...,Y,12:00,Shark,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,5.0,Pearl diving
6249,ND.0003,1900-1905,1905,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard Personnel,M,...,Y,12:00,Shark,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,4.0,Swimming
6250,ND.0002,1883-1889,1889,Unprovoked,PANAMA,Panama,"Panama Bay 8ºN, 79ºW",Unknown,Jules Patterson,M,...,Y,12:00,Shark,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,3.0,Unknown


### Columna 'Fatal'

In [583]:
sharks= shark.copy()

In [592]:
sharks['Fatal (Y/N)'].unique()

array(['N', 'Y', 'Unknown'], dtype=object)

In [587]:
for i in range(len(sharks)):
    sharks['Fatal (Y/N)'][i]=sharks['Fatal (Y/N)'][i].upper()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [589]:
for i in range(len(sharks)):
    sharks['Fatal (Y/N)'][i]=sharks['Fatal (Y/N)'][i].replace(' ', '')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [591]:
for i in range(len(sharks)):
    if sharks['Fatal (Y/N)'][i] not in ['Y','N']:
        sharks['Fatal (Y/N)'][i]= 'Unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [594]:
sharks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6252 entries, 0 to 6251
Data columns (total 22 columns):
Case Number               6252 non-null object
Date                      6252 non-null object
Year                      6252 non-null object
Type                      6252 non-null object
Country                   6252 non-null object
Area                      6252 non-null object
Location                  6252 non-null object
Activity                  6252 non-null object
Name                      6252 non-null object
Sex                       6252 non-null object
Age                       6252 non-null object
Injury                    6252 non-null object
Fatal (Y/N)               6252 non-null object
Time                      6252 non-null object
Species                   6252 non-null object
Investigator or Source    6252 non-null object
pdf                       6252 non-null object
href formula              6252 non-null object
href                      6252 non-null object


### Columna 'Age'

In [616]:
shark= sharks.copy()

In [621]:
shark['Age'].unique()

array(['57', '11', '48', '', '18', '52', '15', '12', '32', '10', '21',
       '34', '30', '60', '33', '29', '54', '41', '37', '56', '19', '25',
       '69', '38', '55', '35', '46', '45', '14', '40', '28', '20', '24',
       '26', '49', '22', '7', '31', '17', '13', '42', '3', '8', '50',
       '16', '82', '73', '68', '51', '39', '58', '47', '61', '65', '36',
       '66', '43', '9', '72', '59', '6', '27', '64', '23', '71', '44',
       '62', '63', '70', '53', '77', '74', '5', '86', '81', '84', '75',
       '87', '67', '91', '1', '78', '2'], dtype=object)

In [617]:
for i in range(len(shark)):
    shark['Age'][i]=re.sub(r'[^\d+]','', shark['Age'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [606]:
x='1314'

In [619]:
x[0:2]

'13'

In [609]:
len(x)

4

In [605]:
 shark['Age'][26]

'54'

In [620]:
for i in range(len(shark)):
    if len(shark['Age'][i])>2:
        shark['Age'][i]=shark['Age'][i][0:2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Columna 'Injury'

In [699]:
sharks= shark.copy()

In [850]:
sharks['Injury'].unique()

array(['NO INJURY', 'MINOR INJURY', 'LEG DAMAGE', 'LACERATIONS', 'FATAL',
       'BITTEN', 'SEVERE DAMAGE',
       'INJURED BY TEETH OF A DEAD PORBEAGLE SHARK HE WAS TOSSING OVERBOARD.',
       'HAND INJURY', 'ANKLE DAMAGE', 'FEET DAMAGE',
       ' THE SHARK & MAN SIMPLY COLLIDED; NEITHER WERE INJURED',
       'ARM DAMAGE', 'PUNCTURE WOUNDS TO LEFT SHOULDER', 'CALF INJURED',
       'ABRASIONS', 'DROWNING', 'FINGERNAIL PULLED OFF ', 'MAJOR INJURY',
       'TORSO NIPPED', 'INJURIES TO LEFT CALF',
       'FACE BRUISED WHEN PARTLY BLIND SHARK COLLIDED WITH HIM',
       'INJURIES TO RIGHT CALF', 'PUNCTURE WOUNDS TO RIGHT CALF',
       'BRUISED RIBS & TAIL BONE, SPEARGUN BROKEN AND WETSUIT CUT',
       'RIGHT HEEL INJURED', 'PUNCTURE WOUND TO FINGER',
       'INJURIES TO HEAD & TORSO', 'BUMPED BY SHARK',
       'BOARD SNAPPED IN TWO',
       'DURING HIS 16-HOUR SWIM TO SHORE, HE WAS CIRCLED BY A SHARK BUT IT DID NOT INJURE HIM',
       'THUMB & FINGER NIPPED', 'A HOAX, NO SHARK INVOLVEMENT',

In [848]:
sharks['Injury'][1990:2050]

1990                                LEG DAMAGE
1991                                     FATAL
1992                                    BITTEN
1993                                 NO INJURY
1994                                LEG DAMAGE
1995                               LACERATIONS
1996                               HAND INJURY
1997                               FEET DAMAGE
1998                               LACERATIONS
1999                               FEET DAMAGE
2000                                ARM DAMAGE
2001                                 NO INJURY
2002                                     FATAL
2003                                     FATAL
2004                               FEET DAMAGE
2005                                     FATAL
2006                                     FATAL
2007                                LEG DAMAGE
2008                   PUNCTURE WOUNDS ON KNEE
2009                               LACERATIONS
2010                              MINOR INJURY
2011         

In [701]:
for i in range(len(sharks)):
    sharks['Injury'][i]=sharks['Injury'][i].upper()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [830]:
tb_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(TAKEN BY)',sharks['Injury'][i])
        x.group()
        tb_inj.append(i)
    except:
        pass

In [835]:
no_inj

[186]

In [833]:
for i in no_inj:
    sharks['Injury'][i]= 'NO INJURY'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [799]:
m_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(MAJOR)',sharks['Injury'][i])
        x.group()
        m_inj.append(i)
    except:
        pass

In [800]:
for i in m_inj:
    sharks['Injury'][i]= 'MAJOR INJURY'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [796]:
la_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(LACERATED)',sharks['Injury'][i])
        x.group()
        la_inj.append(i)
    except:
        pass

In [797]:
for i in la_inj:
    sharks['Injury'][i]= 'LACERATIONS'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [756]:
fatal_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(FATAL)',sharks['Injury'][i])
        x.group()
        fatal_inj.append(i)
    except:
        pass

In [757]:
for i in fatal_inj:
    sharks['Injury'][i]= 'FATAL'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [845]:
t_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(THIGH|THIGHS)',sharks['Injury'][i])
        x.group()
        t_inj.append(i)
    except:
        pass

In [846]:
t_inj

[142, 169, 198, 253, 273, 302, 332, 555, 589, 684, 737, 768, 833, 1113, 1433, 1734, 1742, 1883, 2038, 2057, 2132, 2250, 2306, 2314, 2354, 2504, 2601, 2687, 2776, 2845, 3033, 3056, 3103, 3294, 3509, 3566, 3587, 3595, 3626, 3629, 3630, 4068, 4244, 4316, 4321, 4413, 5524, 6115]

In [847]:
for i in t_inj:
    sharks['Injury'][i]= 'LEG DAMAGE'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [728]:
sharks['Injury'][127]

'LEG DAMAGE'

In [821]:
bu_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(BUMP|BUMPED)',sharks['Injury'][i])
        x.group()
        bu_inj.append(i)
    except:
        pass

In [822]:
bu_inj

[442, 4871, 4886, 4892, 6190, 6222]

In [882]:
sharks['Injury'][4871]

'BUMPED BY SHARK'

In [824]:
for i in bu_inj:
    sharks['Injury'][i]= 'BUMPED BY SHARK'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [750]:
S_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(SERIOUS|SEVERE)',sharks['Injury'][i])
        x.group()
        S_inj.append(i)
    except:
        pass

In [751]:
S_inj

[12, 27, 49, 53, 57, 69, 109, 137, 155, 219, 256, 285, 288, 292, 336, 344, 347, 348, 352, 357, 373, 400, 412, 413, 459, 495, 541, 590, 603, 605, 753, 787, 789, 858, 859, 881, 902, 934, 937, 942, 962, 964, 1021, 1025, 1082, 1108, 1152, 1154, 1155, 1161, 1170, 1218, 1257, 1300, 1317, 1323, 1414, 1444, 1502, 1549, 1585, 1612, 1656, 1679, 1715, 1716, 1725, 1775, 1807, 1819, 1842, 1847, 1871, 1873, 1906, 1934, 2019, 2022, 2049, 2231, 2279, 2387, 2423, 2464, 2467, 2479, 2481, 2524, 2553, 2589, 2591, 2599, 2634, 2640, 2643, 2670, 2778, 2780, 2820, 2835, 2837, 2977, 3070, 3095, 3160, 3235, 3237, 3255, 3257, 3282, 3296, 3368, 3392, 3404, 3405, 3452, 3462, 3464, 3465, 3529, 3572, 3599, 3620, 3640, 3644, 3704, 3715, 3753, 3837, 3848, 3860, 3866, 3887, 3955, 3967, 4126, 4174, 4281, 4329, 4334, 4366, 4367, 4420, 4426, 4430, 4433, 4445, 4478, 4491, 4497, 4512, 4516, 4564, 4567, 4582, 4583, 4588, 4592, 4612, 4633, 4636, 4669, 4687, 4713, 4725, 4798, 4861, 4904, 4930, 4933, 4935, 4952, 4970, 5005, 504

In [752]:
for i in S_inj:
    sharks['Injury'][i]= 'SEVERE DAMAGE'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [883]:
sharks['Injury'][103]

'BITTEN'

In [746]:
x= re.search(r'(BITTEN)',sharks['Injury'][103])
x.group()


'BITTEN'

In [765]:
feet_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(FEET|FOOT)',sharks['Injury'][i])
        x.group()
        feet_inj.append(i)
    except:
        pass

In [766]:
for i in feet_inj:
    sharks['Injury'][i]= 'FEET DAMAGE'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [806]:
not_inj=[]
for i in range(len(sharks)):
    try:
        x= re.search(r'(NOT CONFIRMED)',sharks['Injury'][i])
        x.group()
        not_inj.append(i)
    except:
        pass

In [807]:
for i in not_inj:
    sharks['Injury'][i]= 'NOT CONFIRMED'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Columna 'href'

In [884]:
shark= sharks.copy()

In [885]:
igual=0
dif=0
for i in range(len(shark)):
    if shark['href formula'][i]==shark['href'][i]:
        igual+=1
    else:
        dif+=1

In [888]:
print(igual)
print(dif)

6228
24


In [887]:
igual=0
dif=0
for i in range(len(shark)):
    if shark['Case Number.1'][i]==shark['Case Number'][i]:
        igual+=1
    else:
        dif+=1

**ACCIÓN:** En su mayor parte las columnas mencionadas con iguales por lo que se borrará una

In [889]:
shark.drop(['href formula'], axis=1, inplace= True)

**ACCIÓN:** Tambien las columnas de case number son casi iguales por lo que se eliminará una

In [890]:
shark.drop(['Case Number.1'], axis=1, inplace= True)

## Pasando a csv

In [891]:
shark.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6252 entries, 0 to 6251
Data columns (total 20 columns):
Case Number               6252 non-null object
Date                      6252 non-null object
Year                      6252 non-null object
Type                      6252 non-null object
Country                   6252 non-null object
Area                      6252 non-null object
Location                  6252 non-null object
Activity                  6252 non-null object
Name                      6252 non-null object
Sex                       6252 non-null object
Age                       6252 non-null object
Injury                    6252 non-null object
Fatal (Y/N)               6252 non-null object
Time                      6252 non-null object
Species                   6252 non-null object
Investigator or Source    6252 non-null object
pdf                       6252 non-null object
href                      6252 non-null object
original order            6252 non-null float64

In [892]:
shark.to_csv('shark_attack.csv', sep=',')