# **Cleaning of Shark Attacks Data**

In [241]:
import numpy as np
import pandas as pd
import re
#import src.functions as fc

In [242]:
attacks = pd.read_csv("../attacks.csv", encoding='latin-1')
attacks

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,...,,,,,,,,,,
25719,,,,,,,,,,,...,,,,,,,,,,
25720,,,,,,,,,,,...,,,,,,,,,,
25721,,,,,,,,,,,...,,,,,,,,,,




## **Initial cleaning by analysis of number of nulls**

In [243]:
attacks.isnull().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

In [244]:
# Removing columns containing a large number of nulls
attacks = attacks.drop(columns = ["Unnamed: 22", "Unnamed: 23"])

In [245]:
attacks.isnull().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
dtype: int64

In [246]:
# Still columns with many null values. Analysing nulls by rows 
attacks.isnull().sum(axis=1).value_counts().sort_values(ascending=False)

22    17020
21     2394
1      1516
0      1422
2      1200
3      1196
4       540
5       293
6       102
7        26
20        7
8         7
dtype: int64

In [247]:
# Many rows have 20 or more null values out of 22. Getting their index to remove them.
# The per-row null count is computed once and reused for both the values and the mask.
nulls_per_row = attacks.isnull().sum(axis=1)
morethan19Nulls = nulls_per_row[nulls_per_row >= 20].index
morethan19Nulls

Int64Index([ 6302,  6303,  6304,  6305,  6306,  6307,  6308,  6309,  6310,
             6311,
            ...
            25713, 25714, 25715, 25716, 25717, 25718, 25719, 25720, 25721,
            25722],
           dtype='int64', length=19421)

In [248]:
attacks = attacks.drop(index=morethan19Nulls)
attacks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0


##### **Shape from (25723, 24) to (6302, 22) by removing rows and columns with a large number of null values.**

In [249]:
# Analysing duplicates
attacks.drop_duplicates().shape

(6302, 22)


## **Removing unnecessary columns for my hypothesis**

In [250]:
# Deciding unnecessary columns for my hypothesis to remove them
attacks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [251]:
attacks = attacks.drop(columns = ['Investigator or Source', 'pdf', 'href formula', 'href'])

In [252]:
attacks.drop_duplicates().shape

(6302, 18)


## **Cleaning columns with info about dates**

In [253]:
attacks.iloc[0]

Case Number                                              2018.06.25
Date                                                    25-Jun-2018
Year                                                           2018
Type                                                        Boating
Country                                                         USA
Area                                                     California
Location                                Oceanside, San Diego County
Activity                                                   Paddling
Name                                                    Julie Wolfe
Sex                                                               F
Age                                                              57
Injury            No injury to occupant, outrigger canoe and pad...
Fatal (Y/N)                                                       N
Time                                                          18h00
Species                                         

In [254]:
# Starting with columns Case Number, Case Number.1 and Case Number.2, since their values are very similar
# Counting the rows in which the three columns do not all agree
def count_3(a, b, c):
    """Count the positions at which the three iterables do not all hold equal values.

    The iterables are walked in parallel with ``zip``, so iteration stops at
    the shortest one. A position is counted as soon as any pairwise ``!=``
    comparison between its three values is true.

    Returns:
        int: number of positions with at least one mismatch.
    """
    return sum(
        1
        for first, second, third in zip(a, b, c)
        if first != second or first != third or third != second
    )

count_3(attacks["Case Number"], attacks["Case Number.1"], attacks["Case Number.2"])
#fc.count_3(attacks["Case Number"], attacks["Case Number.1"], attacks["Case Number.2"])

24

##### **The three columns are different 24 times**

In [255]:
# Applying "year" function to clean the three columns and save only the year from values
def year(value):
    """Return the first four characters of ``value`` rendered as text.

    The value is converted with ``str`` first because it may arrive as a
    float rather than a string.
    """
    text = str(value)
    return text[:4]

# Reduce each of the three case-number columns to its leading four characters (the year).
for case_column in ("Case Number", "Case Number.1", "Case Number.2"):
    attacks[case_column] = attacks[case_column].apply(year)


#attacks["Case Number"] = attacks["Case Number"].apply(fc.year)
#attacks["Case Number.1"] = attacks["Case Number.1"].apply(fc.year)
#attacks["Case Number.2"] = attacks["Case Number.2"].apply(fc.year)

In [256]:
count_3(attacks["Case Number"], attacks["Case Number.1"], attacks["Case Number.2"])
#fc.count_3(attacks["Case Number"], attacks["Case Number.1"], attacks["Case Number.2"])

6

##### **Now the three columns differ in only 6 rows. Choosing "Case Number" to continue with the cleaning**
### **Cleaning column "Case Number"**

In [257]:
# Removing columns Case Number.1 and Case Number.2
attacks = attacks.drop(columns = ["Case Number.1","Case Number.2"])

In [258]:
attacks.drop_duplicates().shape

(6302, 16)

In [259]:
attacks["Case Number"]

0       2018
1       2018
2       2018
3       2018
4       2018
        ... 
6297    ND.0
6298    ND.0
6299    ND.0
6300    ND.0
6301    ND.0
Name: Case Number, Length: 6302, dtype: object

In [260]:
# Function that changes the value to "NoData" from values different from 4 digits
def solo_digitos(value):
    """Keep ``value`` only when it is a string made of exactly four digits.

    Args:
        value: candidate year, normally a 4-character string.

    Returns:
        ``value`` unchanged when it consists of exactly four digits,
        otherwise the placeholder ``"NoData"``.
    """
    # fullmatch (not match) so longer strings that merely start with four
    # digits, e.g. "20189", are rejected as well. Non-string inputs such as
    # NaN are treated as missing instead of raising a TypeError.
    if isinstance(value, str) and re.fullmatch(r'\d{4}', value):
        return value
    return "NoData"

attacks["Case Number"] = attacks["Case Number"].apply(solo_digitos)
#attacks["Case Number"] = attacks["Case Number"].apply(fc.solo_digitos)

attacks["Case Number"]

0         2018
1         2018
2         2018
3         2018
4         2018
         ...  
6297    NoData
6298    NoData
6299    NoData
6300    NoData
6301    NoData
Name: Case Number, Length: 6302, dtype: object

### **Cleaning column "Year"**

In [261]:
display(attacks["Year"].dtype)
display(attacks["Year"].iloc[0])

dtype('float64')

2018.0

In [262]:
#function to eliminate values after decimal point and change incorrect values to "NoData"
def tipo(value):
    """Normalise a year value to a 4-digit string, or ``"NoData"``.

    The value is rendered with ``str`` and everything from the first ``"."``
    onwards is discarded (``2018.0`` -> ``"2018"``).

    Args:
        value: year as a float, string or any other object.

    Returns:
        The part before the decimal point when it is exactly four digits,
        otherwise ``"NoData"``.
    """
    whole_part = str(value).split(".")[0]
    # A length check alone would let 4-character non-years through
    # (e.g. str(None) == "None", "-inf"), so require four digits explicitly.
    if re.fullmatch(r"\d{4}", whole_part):
        return whole_part
    return "NoData"
    
attacks["Year"] = attacks["Year"].apply(tipo)
#attacks["Year"] = attacks["Year"].apply(fc.tipo)

In [263]:
display(attacks["Year"].dtype)
display(attacks["Year"].iloc[0])

dtype('O')

'2018'

In [264]:
#how many times the columns are different
def count_2(n, m):
    """Count the positions at which the two iterables hold different values.

    Both iterables are walked in parallel with ``zip``, so iteration stops at
    the shorter one.

    Returns:
        int: number of positions where the paired values compare unequal.
    """
    # Author's note: I would keep this one rather than the one above.
    return sum(1 for left, right in zip(n, m) if left != right)

count_2(attacks["Case Number"], attacks["Year"])
#fc.count_2(attacks["Case Number"], attacks["Year"])

18

##### **Now the two columns "Year" and "Case Number" differ in only 18 rows. Choosing "Case Number" to continue with the cleaning**

In [265]:
# Removing column "Year"
attacks= attacks.drop(columns=["Year"])
attacks.head()

Unnamed: 0,Case Number,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,original order
0,2018,25-Jun-2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,6303.0
1,2018,18-Jun-2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,6302.0
2,2018,09-Jun-2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,6301.0
3,2018,08-Jun-2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,6300.0
4,2018,04-Jun-2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",6299.0



### **Cleaning column "Date"**
##### **This column contains interesting data (dates before 1900)**

In [266]:
attacks["Date"] = attacks["Date"].fillna("NoData")

In [267]:
# function to select the year from the value
def date(value):
    """Extract a 4-digit year (as a string) from a free-text date value.

    Rules, applied in order:
      1. A string of exactly four digits is returned unchanged.
      2. A string starting with a ``YYYY-YYYY`` range returns the integer
         midpoint of the two years.
      3. Otherwise the first run of four digits found anywhere in the
         string is returned.
      4. Anything else (including the ``"NoData"`` placeholder and
         non-string inputs such as NaN) returns ``"NoData"``.

    Args:
        value: raw content of the "Date" column.

    Returns:
        str: a 4-digit year, or ``"NoData"``.
    """
    if not isinstance(value, str):
        # Missing values that were not filled beforehand are treated as no data.
        return "NoData"
    # Require digits, not just a length of 4, so 4-character junk such as
    # "Fall" is not passed through as if it were a year.
    if re.fullmatch(r"\d{4}", value):
        return value
    year_range = re.match(r"(\d{4})-(\d{4})", value)
    if year_range:
        first_year = int(year_range.group(1))
        last_year = int(year_range.group(2))
        return str((first_year + last_year) // 2)
    any_year = re.search(r"\d{4}", value)
    if any_year:
        return any_year.group()
    return "NoData"


    
attacks["Date"] = attacks["Date"].apply(date)
attacks["Date"].value_counts()

#attacks["Date"] = attacks["Date"].apply(fc.date)

2015    143
2017    137
2016    131
2011    129
2014    128
       ... 
1834      1
1791      1
1801      1
1742      1
1792      1
Name: Date, Length: 250, dtype: int64

In [268]:
count_2(attacks["Case Number"], attacks["Date"])
#fc.count_2(attacks["Case Number"], attacks["Date"])

152

##### **"Case Number" and "Date" columns differ in 152 rows (only 2.4% of the values). Choosing "Date".**

In [269]:
# Removing "Case number" column
attacks = attacks.drop(columns=["Case Number"])

In [270]:
attacks.head()

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,original order
0,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,6303.0
1,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,6302.0
2,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,6301.0
3,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,6300.0
4,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",6299.0




## **Cleaning column "Activity" with info about activities**

In [271]:
# Getting info about values in "Activity"
attacks["Activity"].value_counts()

Surfing                                                                                               971
Swimming                                                                                              869
Fishing                                                                                               431
Spearfishing                                                                                          333
Bathing                                                                                               162
                                                                                                     ... 
Wreck of  large double sailing canoe                                                                    1
Ditched plane in the sea & were adrift on a rubber life raft.                                           1
The schooner Elizabeth, bound from Bluefields, Nicaragua to the river port of San Carlos foundered      1
Vehicle plunged over cliff into the water     

In [272]:
attacks["Activity"].isnull().sum()

544

In [273]:
# Data wrangling --> "NoData" for null data 
attacks["Activity"] = attacks["Activity"].fillna("NoData")

In [274]:
# taking a look at the distinct values in "Activity"
# dict.fromkeys de-duplicates in a single pass and keeps first-occurrence order,
# avoiding the quadratic cost of repeated "not in list" scans.
lista = list(dict.fromkeys(attacks["Activity"]))
#print(lista)

In [275]:
# Regex patterns (tried in order) that group sport-related activities as "Water sports"; "overboard" cases map to "Others"
# Pattern -> group label. The "overboard" pattern is inserted first because
# patterns are tried in insertion order and the first match wins.
d = {r".*(o|O)verboard.*": "Others"}
d.update(
    dict.fromkeys(
        (
            r".*(s|S)urf.*",
            r".*(p|P)addl(e|i).*",
            r".*(b|B)oarding.*",
            r".*(s|S)wim.*",
            r".*(d|D)iv(e|i).*",
            r".*(s|S)norkel.*",
            r".*(k|K)ayak.*",
            r".*(c|C)anoe.*",
        ),
        "Water sports",
    )
)


def activ(value):
    """Map an activity description to a group label using the patterns in ``d``.

    The value is converted to text. ``"NoData"`` is returned as is; otherwise
    the label of the first pattern in ``d`` found in the text is returned.
    When no pattern matches, the text itself is returned unchanged.
    """
    text = str(value)
    if text != "NoData":
        for pattern, label in d.items():
            if re.search(pattern, text):
                return label
    return text

attacks["Activity"] = attacks["Activity"].apply(activ)

In [276]:
attacks["Activity"].value_counts().head(10)

Water sports      3271
NoData             544
Fishing            431
Spearfishing       333
Bathing            162
Wading             149
Others             106
Standing            99
Treading water      32
Walking             17
Name: Activity, dtype: int64

In [277]:
# Grouping the rest of activities in others
def activ2(value):
    """Collapse every label except "Water sports" and "NoData" into "Others".

    Returns ``value`` unchanged when it is one of the two kept labels,
    otherwise the string ``"Others"``.
    """
    kept_labels = ("Water sports", "NoData")
    return value if value in kept_labels else "Others"
    
attacks["Activity"] = attacks["Activity"].apply(activ2)
attacks["Activity"].value_counts()

#attacks["Activity"] = attacks["Activity"].apply(fc.activ2)

Water sports    3271
Others          2487
NoData           544
Name: Activity, dtype: int64

In [278]:
# Checking for duplicates
attacks.drop_duplicates().shape

(6302, 14)

In [279]:
attacks.head()

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,original order
0,2018,Boating,USA,California,"Oceanside, San Diego County",Water sports,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,6303.0
1,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Others,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,6302.0
2,2018,Invalid,USA,Hawaii,"Habush, Oahu",Water sports,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,6301.0
3,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Water sports,male,M,,Minor injury to lower leg,N,,2 m shark,6300.0
4,2018,Provoked,MEXICO,Colima,La Ticla,Water sports,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",6299.0



## **Removing unnecessary columns for my hypothesis**

In [280]:
attacks = attacks.drop(columns=['Type', 'Area', 'Location', 'Name','Age',"Sex ", 'Injury','Time', 'Species ',
       'original order', 'Fatal (Y/N)'])

In [281]:
attacks

Unnamed: 0,Date,Country,Activity
0,2018,USA,Water sports
1,2018,USA,Others
2,2018,USA,Water sports
3,2018,AUSTRALIA,Water sports
4,2018,MEXICO,Water sports
...,...,...,...
6297,1903,AUSTRALIA,Water sports
6298,1903,AUSTRALIA,Water sports
6299,1902,USA,Water sports
6300,1886,PANAMA,NoData


## **Removing null Values ("NoData" values)**

In [282]:
filter_attacks = (attacks.Date != "NoData") & (attacks.Activity != "NoData")
attacks = attacks[filter_attacks]

In [283]:
attacks

Unnamed: 0,Date,Country,Activity
0,2018,USA,Water sports
1,2018,USA,Others
2,2018,USA,Water sports
3,2018,AUSTRALIA,Water sports
4,2018,MEXICO,Water sports
...,...,...,...
6296,1906,AUSTRALIA,Water sports
6297,1903,AUSTRALIA,Water sports
6298,1903,AUSTRALIA,Water sports
6299,1902,USA,Water sports


## **Saving clean CSV file**

In [284]:
# Persist the cleaned Date / Country / Activity frame. `index` is left at its pandas default,
# so the row labels are written as the first, unnamed CSV column.
# NOTE(review): the relative "output/" directory presumably has to exist already — confirm before a fresh run.
attacks.to_csv("output/attacks_clean.csv")