In [615]:
import pandas as pd
import numpy as np
import regex as re
from fancyimpute import KNN
# we increase the maximum display number in order to be able to see some columns that we weren't able to see before,
# as though the two columns between 'Sex' and 'Fatal(Y/N)', which are 'Age' and 'Injury', weren't displayed at first.
pd.set_option("display.max_columns", 120)

In [616]:
!ls ../Pandas-project

README.md         attacks.csv       clean.ipynb
analysis.ipynb    attacks_clean.csv


In [617]:
file = '../Pandas-project/attacks.csv'
df = pd.read_csv(file)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 7: invalid continuation byte

In [618]:
# We had to introduce a decoder in order to visualize the dataset, as it was printing an error of 'utf-8' encoding.
# The info from were we obtained the info to solve this problem can be found in this Kaggle post:
# "https://www.kaggle.com/paultimothymooney/how-to-resolve-a-unicodedecodeerror-for-a-csv-file"

import chardet

with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

In [619]:
# We introduce the 'encoding' parameter and there we go, the table of this set is printed!

df = pd.read_csv(file, encoding='Windows-1252')
df.head(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [620]:
# We clean the latest to columns because we ran several .samples() and none of them showed that they actually had
# any useful info.
to_drop = ['Unnamed: 22', 'Unnamed: 23']

df.drop(to_drop, inplace=True, axis=1)

In [621]:
# I was thinking about to delete column 'Case number.2' as I thought they were exactly the same but with different
# names. But after running several samples, they showed a difference in one of them.

In [622]:
# We observe now the two columns 'href formula' and 'href', as they might store the same links and information
# but twice, so that wouldn't be neccessary.
df.head(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0


In [106]:
df['href formula']

0        http://sharkattackfile.net/spreadsheets/pdf_di...
1        http://sharkattackfile.net/spreadsheets/pdf_di...
2        http://sharkattackfile.net/spreadsheets/pdf_di...
3        http://sharkattackfile.net/spreadsheets/pdf_di...
4        http://sharkattackfile.net/spreadsheets/pdf_di...
                               ...                        
25718                                                  NaN
25719                                                  NaN
25720                                                  NaN
25721                                                  NaN
25722                                                  NaN
Name: href formula, Length: 25723, dtype: object

In [107]:
df['href']

0        http://sharkattackfile.net/spreadsheets/pdf_di...
1        http://sharkattackfile.net/spreadsheets/pdf_di...
2        http://sharkattackfile.net/spreadsheets/pdf_di...
3        http://sharkattackfile.net/spreadsheets/pdf_di...
4        http://sharkattackfile.net/spreadsheets/pdf_di...
                               ...                        
25718                                                  NaN
25719                                                  NaN
25720                                                  NaN
25721                                                  NaN
25722                                                  NaN
Name: href, Length: 25723, dtype: object

In [623]:
# It looks as they store the same info and their lenght is exactly the same, so we've decided to delete one of them.
df.drop(['href formula'], inplace=True, axis=1)

In [624]:
df.sample(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,Case Number.1,Case Number.2,original order
7655,0.0,,,,,,,,,,,,,,,,,,,,
11089,,,,,,,,,,,,,,,,,,,,,
20884,,,,,,,,,,,,,,,,,,,,,
8747,,,,,,,,,,,,,,,,,,,,,
14911,,,,,,,,,,,,,,,,,,,,,


In [625]:
# Another thing we've noticed is that the links are completely broken, none of them work and all of them display
# the message: '404 Not Found, The requested URL was not found on this server.' But when opening the .csv with 
# another program, they seem to work just fine.

In [626]:
# There are a lot of "NaN's" values in this dataset, so we will have to deal with them.
df.sample(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,Case Number.1,Case Number.2,original order
17149,,,,,,,,,,,,,,,,,,,,,
19260,,,,,,,,,,,,,,,,,,,,,
17644,,,,,,,,,,,,,,,,,,,,,
12941,,,,,,,,,,,,,,,,,,,,,
24371,,,,,,,,,,,,,,,,,,,,,


In [627]:
# We delete also the "Case Number" column to be able to reduce our dataframe to 6000 columns or so.
df.drop(['Case Number'], inplace=True, axis=1)

In [628]:
# We drop all rows with "NaN" values only.
df.dropna(how="all", inplace=True)

In [629]:
# After printing the top 5 positions of the list, we notice that the last column called 'original order' seems to be
# a float type, also filled with 'NaN' values throughout the complete list, so we need to clean that column.
df.dtypes


Date                       object
Year                      float64
Type                       object
Country                    object
Area                       object
Location                   object
Activity                   object
Name                       object
Sex                        object
Age                        object
Injury                     object
Fatal (Y/N)                object
Time                       object
Species                    object
Investigator or Source     object
pdf                        object
href                       object
Case Number.1              object
Case Number.2              object
original order            float64
dtype: object

In [630]:
df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,Case Number.1,Case Number.2,original order
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6304,,,,,,,,,,,,,,,,,,,,6306.0
6305,,,,,,,,,,,,,,,,,,,,6307.0
6306,,,,,,,,,,,,,,,,,,,,6308.0
6307,,,,,,,,,,,,,,,,,,,,6309.0


In [634]:
df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,Case Number.1,Case Number.2,original order
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6304,,,,,,,,,,,,,,,,,,,,6306.0
6305,,,,,,,,,,,,,,,,,,,,6307.0
6306,,,,,,,,,,,,,,,,,,,,6308.0
6307,,,,,,,,,,,,,,,,,,,,6309.0


In [633]:
df['original order']

0       6303.0
1       6302.0
2       6301.0
3       6300.0
4       6299.0
         ...  
6304    6306.0
6305    6307.0
6306    6308.0
6307    6309.0
6308    6310.0
Name: original order, Length: 6309, dtype: float64

In [423]:
# Now we want to test if two columns are exactly the same or not. Here's a little example:
# We create a random dataframe and put it to the test.
sd = pd.DataFrame({'a': [1,2,3], 'b': [1,2,4]})
sd

Unnamed: 0,a,b
0,1,1
1,2,2
2,3,4


In [431]:
comps = np.where(sd["a"] == sd["b"], True, False)
print(comps)
# As we can see, the test works perfectly fine. The first two columns are the same so the output is "True",
# While the third column is not the same so the output is "False".

[ True  True False]


In [604]:
# Now we do that with our actual dataset. The intention is to test if the columns "Case Number.1" and 
# the other column "Case Number.2" are the same of they contain different info.
comp = np.where(df["Case Number.1"] == df["Case Number.2"], True, False)
print(comp)

[ True  True  True ... False False False]


In [430]:
# We get some "False" outputs, so we need to look closer to see if the result is valid.
df["equal"] = comp
print(df)

      Case Number         Date    Year        Type    Country  \
0      2018.06.25  25-Jun-2018  2018.0     Boating        USA   
1      2018.06.18  18-Jun-2018  2018.0  Unprovoked        USA   
2      2018.06.09  09-Jun-2018  2018.0     Invalid        USA   
3      2018.06.08  08-Jun-2018  2018.0  Unprovoked  AUSTRALIA   
4      2018.06.04  04-Jun-2018  2018.0    Provoked     MEXICO   
...           ...          ...     ...         ...        ...   
25718         NaN          NaN     NaN         NaN        NaN   
25719         NaN          NaN     NaN         NaN        NaN   
25720         NaN          NaN     NaN         NaN        NaN   
25721         NaN          NaN     NaN         NaN        NaN   
25722          xx          NaN     NaN         NaN        NaN   

                  Area                        Location     Activity  \
0           California     Oceanside, San Diego County     Paddling   
1              Georgia  St. Simon Island, Glynn County     Standing   
2     

In [439]:
# As we can see, the "False" output seems to be in the comparison between "NaN" values/data types.
# So we check once again to make sure that this is so.
NaN_1 = np.nan
NaN_2 = np.nan

print(NaN_1 == NaN_2)

False


In [None]:
# So the final conclusion is that the columns "Case Number.1" and "Case Number.2" are probably the same
# We once again proceed to remove the second one because we won't need both of them for the analysis.

In [635]:
df.drop("Case Number.2", inplace=True, axis=1)

In [636]:
# We also need to drop the aforementioned column called "equal" as we only needed it for a check.
df.drop("equal", inplace=True, axis=1)

In [639]:
df.sample(10)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,Case Number.1,original order
582,08-Feb-2014,2014.0,Unprovoked,AUSTRALIA,South Australia,"Goldsmith Beach, Yorke Peninsula",Spearfishing / Free diving,Sam Kellett,M,28.0,FATAL,Y,12h00,,"The Advertiser, 2/9/2014",2014.02.08-Kellett.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2014.02.08,5721.0
5913,01-Jun-1870,1870.0,Unprovoked,INDIA,West Bengal,"Hoogly River, Calcutta",Bathing / standing,"B., ""an Ooryah coolie""",M,40.0,"Right foot & leg bitten, surgically amputated",N,,,"J. Fayrer, M.D.",1870.06.01-Ooryah.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,1870.06.01,390.0
5255,08-Mar-1920,1920.0,Unprovoked,AUSTRALIA,Queensland,"Between Bay Rock & Magnetic Island, Cleveland Bay","Boat capsized, swimming to shore",Alfred Burgess,M,20.0,"Tossed in air by shark, sustained abrasions",N,,,H. Miller (1920); V.M. Coppleson Q.2.(1933); V...,1920.03.08-Burgess.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,1920.03.08,1048.0
4578,20-Apr-1947,1947.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,"Country Club Beach, Durban",Treading water,"Aubrey ""Bill"" Nielsen",M,21.0,Ankle & shin lacerated,N,12h00,"1.5 m, 45-kg shark","A. Nielsen, M. Levine, G. Charter, GSAF",1947.04.20-Neilson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,1947.04.20,1725.0
4707,12-Jul-1942,1942.0,Sea Disaster,ATLANTIC OCEAN,,,The SS Potlach was torpedoed & sunk by the U-1...,John Martin Miller,M,32.0,FATAL Arm bitten,Y,,,"Kingsport Times, 8/6/1942, et al",1942.07.12-Miller.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,1942.07.12,1596.0
1723,01-Nov-2003,2003.0,Unprovoked,AUSTRALIA,New South Wales,"Seal Rocks, north of Newcastle",Standing,male,M,,Minor lacerations to leg & foot,N,Dusk,,"The Sun; 11/2/2003, p.9",2003.11.01-SealRocks.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2003.11.01,4580.0
245,16-Jul-2016,2016.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",,female,F,11.0,Minor injury to toes,N,11h00,,"Orlando Sentinel, 7/21/2016",2016.07.16.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.07.16.a,6058.0
2754,14-Jan-1988,1988.0,Provoked,USA,Louisiana,,Fishing,Chip,M,,Hand bitten by captured shark PROVOKED INCIDENT,N,,Mako shark,"C. Johansson, GSAF",1988.01.14-Chip.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,1988.01.14,3549.0
5532,08-Aug-1902,1902.0,Unprovoked,USA,Hawaii,"Kalihi, O'ahu",Catching crabs,Hawaiian boy,M,,"FATAL, both arms severed",Y,,,G. H. Balazs & A. H. Kam; V.M. Coppleson (1958...,1902.08.08-HawaiianBoy.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,1902.08.08,771.0
1557,22-Aug-2005,2005.0,Invalid,USA,South Carolina,"6th Avenue North, Myrtle Beach, Horry County","Boogie boarding, kicked at object in the water",Nicholas House,M,17.0,"Laceration to knee, possibly by a small black...",,Afternoon,Shark involvement not confirmed,"Clay Creswell, GSAF",2005.08.22-House.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2005.08.22,4746.0


In [640]:
# After running once again several "df.samples()" we choose to substitute the "NaN" values/types for 0 in all the 
# columns that we see we cannot guess a number.
df['Date'].fillna('-', inplace=True)
df['Year'].fillna('-', inplace=True)
df['Type'].fillna('-', inplace=True)
df['Country'].fillna('-', inplace=True)
df['Area'].fillna('-', inplace=True)
df['Location'].fillna('-', inplace=True)
df['Activity'].fillna('-', inplace=True)
df['Name'].fillna('-', inplace=True)
df['Injury'].fillna('-', inplace=True)
df['Time'].fillna('-', inplace=True)
df['Investigator or Source'].fillna('-', inplace=True)
df['pdf'].fillna('-', inplace=True)
df['href'].fillna('-', inplace=True)
df['Case Number.1'].fillna('-', inplace=True)
df['Sex '].fillna('-', inplace=True)
df['Fatal (Y/N)'].fillna('-', inplace=True)
df['Species '].fillna('-', inplace=True)

In [645]:
df.sample(5)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,Case Number.1,original order
1250,07-Jun-2008,2008,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Body surfing,John Vasbinder,M,40,Lacerations & abrasions to right hand,N,-,-,"TC Palm, 6/20/08",2008.06.07-Vasbinder.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2008.06.07,5053.0
2653,18-Nov-1989,1989,Unprovoked,SOUTH AFRICA,Western Cape Province,Melkbosstrand,Free diving & spearfishing,Gerjo Van Niekerk,M,29,FATAL,Y,12h05,White shark,"M. Levine, GSAF",1989.11.18-VanNiekerk.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,1989.11.18,3650.0
3815,11-Jun-1962,1962,Unprovoked,USA,California,San Francisco Bay,Escaping from Alacatraz,John William Anglin,M,32,"FATAL, but shark involvement uncomfirmed. Deat...",Y,Night,-,"San Francisco Chronicle, 5/3/1986",1962.06.11.b-Anglin.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,1962.06.11.b,2488.0
1826,21-Sep-2002,2002,Unprovoked,USA,Oregon,Cape Kiwanda,Boogie boarding or Surfing,Garry Turner,M,24,Ankle lacerated,N,,2.4 m [8'] shark,"R. Collier, GSAF",2002.09.21.b-GarryTurner.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2002.09.21.b,4477.0
6184,Before Oct-2009,0,Unprovoked,PANAMA,Bocas del Toro Province,Red Frog Beach,Swimming/,male,M,20,FATAL,Y,-,-,C. Mendieta & A. Duarte,ND-0151-Panama.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0151,119.0


In [646]:
# Now we are trying to use the "describe()" function in order to see if the "min()" value of the column 'Age' could 
# be equal to '0', so in that case it could be a good idea to turn all those 'NaN' values to '0' too, but since
# the column 'Age' is an object-type of column we cannot see any output when using "describe()" and also we are having
# trouble when trying to change it's type to int64 or float64, so we don't seem to be able to use the "describe()"
# function in any useful way in this case. The info with some algorithms to replace all those "NaN" values and other
# tips or hints can be found in this URL: 'https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc'.
df['Age'].apply(np.int64)
df.describe()

ValueError: cannot convert float NaN to integer

In [647]:
# As we haven't found any other solution to this problem, We will proceed to change those 'NaN' values also for '-' too.
df['Age'].fillna('-', inplace=True)
df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,Case Number.1,original order
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,6303.0
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,-,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,6302.0
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,-,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,6301.0
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,-,Minor injury to lower leg,N,-,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,6300.0
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,-,Lacerations to leg & hand shark PROVOKED INCIDENT,N,-,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6304,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,6306.0
6305,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,6307.0
6306,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,6308.0
6307,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,6309.0


In [648]:
# Now, we proceed to save the cleaned dataframe and then analyze it.
df.to_csv('../Pandas-project/attacks_clean.csv')