In [77]:
import pandas as pd
import src.cleaning_functions as cf
import re
import plotly.express as px
import numpy as np

In [78]:
# Read the attacks CSV from the data folder, decoding it as ISO-8859-1.
df = pd.read_csv(
    "data/attacks.csv",
    encoding="ISO-8859-1",
)

In [79]:
# Peek at five randomly chosen rows of the dataframe.
df.sample(n=5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
20714,,,,,,,,,,,...,,,,,,,,,,
18064,,,,,,,,,,,...,,,,,,,,,,
17780,,,,,,,,,,,...,,,,,,,,,,
7463,0.0,,,,,,,,,,...,,,,,,,,,,
15799,,,,,,,,,,,...,,,,,,,,,,


In [80]:
# Normalise the column labels with my "prepare" helper (see cf.prepare.__doc__) to ease the analysis.
df.columns = cf.prepare(df.columns)

# Remove every row in which all the values are NaN.
df.dropna(axis=0, how="all", inplace=True)

# Look at the first 5 rows after the clean-up
# (not rendered: a cell only displays its last expression).
df.head(n=5)

# Count how many NaN values remain in each column.
df.isna().sum()

case_number                  1
date                      2401
year                      2403
type                      2405
country                   2451
area                      2856
location                  2941
activity                  2945
name                      2611
sex                       2966
age                       5232
injury                    2429
fatal_(y/n)               2940
time                      5755
species                   5239
investigator_or_source    2418
pdf                       2401
href_formula              2402
href                      2401
case_number.1             2401
case_number.2             2401
original_order            2394
unnamed:_22               8702
unnamed:_23               8701
dtype: int64

In [81]:
# "unnamed:_22" is almost entirely NaN, so I list its distinct values before deciding what to do with it.
unnamed22NaN = df["unnamed:_22"].unique()
unnamed22NaN
# Its only real entry is "stopped here", which is not useful, so the column is removed.
df.drop(columns=["unnamed:_22"], inplace=True)


# "unnamed:_23" only has 2 entries that are not NaN, so I inspect it the same way.
unnamed23NaN = df["unnamed:_23"].unique()
unnamed23NaN
# Its entries are 'Teramo' and 'change filename', which are not useful either, so the column is removed.
df.drop(columns=["unnamed:_23"], inplace=True)

# Keep the columns that hold web links in a variable in case they are needed later,
# then remove them from the dataframe.
pdfcol = (df["href_formula"], df["href"])
df.drop(columns=["href_formula", "href"], inplace=True)

In [82]:
#With a better picture of the data, these are the hypotheses I want to test:
#Are surfers the most targeted by sharks?
#Do sharks attack more often in Australia in summer?

#To test those hypotheses, I remove the columns that are not relevant to them.
df.drop(
    columns=[
        "case_number", "year", "type", "area", "location", "name",
        "sex", "age", "injury", "fatal_(y/n)", "time", "species",
        "investigator_or_source", "pdf", "case_number.1", "case_number.2",
        "original_order",
    ],
    inplace=True,
)

#After removing the columns, I also remove the rows that only contain NaN.
df.dropna(axis=0, how="all", inplace=True)

In [83]:
# Convert the values of the "activity" column to strings with my "strdata" helper
# so that the "prepare" helper can be applied to them afterwards.
df["activity"] = cf.strdata(df["activity"])

# Run "prepare" over the "activity" column.
df["activity"] = cf.prepare(df["activity"])

In [84]:
# UNIFY activity column
# My "uniforme" helper collapses the many spellings of an activity into a single label
# so that their occurrences can be counted together: every record whose text contains
# the given fragment is replaced by the given label.
# The (fragment, label) pairs below are applied one after another, in this exact order.

# "surf" becomes "surfing"; "swim", "diving", "bath", "bathing" and "dive" all become
# "swimming", because I consider them kinds of swimming.
for fragment, label in [
    ("surf", "surfing"),
    ("swim", "swimming"),
    ("diving", "swimming"),
    ("swim", "swimming"),
    ("bath", "swimming"),
    ("bathing", "swimming"),
    ("dive", "swimming"),
]:
    actunified = cf.uniforme(df.activity, fragment, label)
    df.activity = actunified

# The same process for other records.
# NOTE(review): this is the only call spelled "cf.uniform" (every other one is "cf.uniforme");
# it is kept as written — confirm that both names exist in cleaning_functions and do the same thing.
actunified = cf.uniform(df.activity, "fish", "fishing")
df.activity = actunified

for fragment, label in [
    ("snorkel", "snorkeling"),
    ("wading", "wading"),
    ("boat", "boat"),
    ("yacht", "boat"),
    ("ship", "boat"),
]:
    actunified = cf.uniforme(df.activity, fragment, label)
    df.activity = actunified


df.activity.value_counts()

swimming                                       1869
surfing                                        1261
fishing                                        1125
nan                                             544
boat                                            172
                                               ... 
pulling_shark_from_the_water                      1
standing_on_sandbar                               1
sinking_of_the_m/v_mindoro_during_a_typhoon       1
picking_up_shark_by_the_tail                      1
wreck_of__large_double_sailing_canoe              1
Name: activity, Length: 558, dtype: int64

In [85]:
#I will use my function "act" to group every minority activity under "other_activities".
def act(col):
    """Map one activity value onto the handful of categories kept for the analysis.

    Parameters
    ----------
    col : object
        A single value of the "activity" column (the function is meant to be
        used with ``Series.apply``, which passes one value at a time).

    Returns
    -------
    str or None
        ``"swimming"``, ``"surfing"``, ``"fishing"`` or ``"boat"`` when ``col``
        is exactly that label, ``"other_activities"`` for any other non-empty
        string, and ``None`` for an empty string or a non-string value
        (``None``, NaN, ...), so that missing data stays missing.
    """
    # The previous version looped over the characters of the value while
    # comparing the whole value, and crashed on None/NaN before reaching its
    # "col == None" branch; a direct check does the same job safely.
    if not isinstance(col, str) or not col:
        return None
    # NOTE(review): the literal string "nan" (visible in the value_counts output
    # of the previous cell) is not treated as missing here, so it is counted as
    # "other_activities" — confirm this is intended.
    if col in ("swimming", "surfing", "fishing", "boat"):
        return col
    return "other_activities"

    
# Replace every activity by its category, one value at a time.
df["activity"] = df["activity"].apply(act)


In [86]:
# Number of records per activity category.
df["activity"].value_counts()

other_activities    1873
swimming            1869
surfing             1261
fishing             1125
boat                 172
Name: activity, dtype: int64

In [12]:
# Convert the values of the "country" column to strings with my "strdata" helper
# so that the "prepare" helper can be applied to them afterwards.
df["country"] = cf.strdata(df["country"])

# Run "prepare" over the "country" column and show the first rows of the result.
df["country"] = cf.prepare(df["country"])
df.head()

Unnamed: 0,date,country,activity
0,25-Jun-2018,usa,
1,18-Jun-2018,usa,
2,09-Jun-2018,usa,surfing
3,08-Jun-2018,australia,surfing
4,04-Jun-2018,mexico,swimming


In [13]:
# Same treatment for the "date" column: first to strings, then through "prepare".
df["date"] = cf.strdata(df["date"])
df["date"] = cf.prepare(df["date"])
def onlymonths(col):
    """Return, for every entry of ``col``, the slice made of its characters 3 to 5.

    ``col`` is any iterable of sliceable values (e.g. date strings); the result
    is a plain list with one slice per entry, in the same order.
    """
    months = []
    for entry in col:
        months.append(entry[3:6])
    return months

def getmonth(col):
    """Return the full month name mentioned in a single date value.

    The value is searched for the three-letter month abbreviations, in
    calendar order, and the first one found anywhere in the text decides the
    result (so "25-jun-2018" gives "june"). The abbreviations are lower case,
    so the text is assumed to be lower-cased already.

    Parameters
    ----------
    col : object
        One value of the "date" column (the function is meant to be used with
        ``Series.apply``, which passes one value at a time).

    Returns
    -------
    str or float
        The month name in lower case, or ``np.nan`` when no abbreviation is
        found or when ``col`` is not a string (None, NaN, ...).
    """
    # The previous version looped over the characters of the value and crashed
    # on non-string input; missing dates now simply map to NaN.
    if not isinstance(col, str):
        return np.nan
    # Checked in this order: when a text mentions two months, the earlier
    # month of the year wins, exactly as in the former if/elif chain.
    # (The old "col[3:6] == abbr" test was implied by "abbr in col".)
    months = (
        ("jan", "january"),
        ("feb", "february"),
        ("mar", "march"),
        ("apr", "april"),
        ("may", "may"),
        ("jun", "june"),
        ("jul", "july"),
        ("aug", "august"),
        ("sep", "september"),
        ("oct", "october"),
        ("nov", "november"),
        ("dec", "december"),
    )
    for abbreviation, month_name in months:
        if abbreviation in col:
            return month_name
    return np.nan

def australia(col):
    """Flag a single country value as Australian.

    Parameters
    ----------
    col : object
        One value of the "country" column (the function is meant to be used
        with ``Series.apply``, which passes one value at a time).

    Returns
    -------
    str or float
        ``"australia"`` when ``col`` is exactly that string, ``np.nan`` for
        anything else, including an empty string and missing values.
    """
    # A plain comparison replaces the former loop over the characters of the
    # value, which raised TypeError on None/NaN and returned None for "".
    return "australia" if col == "australia" else np.nan
# Derive the two helper columns, one value at a time:
# the month name found in each date, and the "australia" flag for each country.
df["month"] = df["date"].apply(getmonth)
df["australia"] = df["country"].apply(australia)

In [14]:
#SHARK ATTACKS IN AUSTRALIA HAPPEN TO SURFERS DURING THE SUMMER
# Work on a separate copy so that df itself keeps its "date" and "country" columns.
dfaus = df.copy()
dfaus.drop(columns=["date", "country"], inplace=True)
dfaus

Unnamed: 0,activity,month,australia
0,,june,
1,,june,
2,surfing,june,
3,surfing,june,australia
4,swimming,june,
...,...,...,...
6297,swimming,,australia
6298,swimming,,australia
6299,swimming,,
6300,,,


In [15]:
#We add a new column called "surfing" that keeps only the "surfing" values of the "activity" column.
def borracol(col):
    """Keep a single activity value only when it is "surfing".

    Parameters
    ----------
    col : object
        One value of the "activity" column (the function is meant to be used
        with ``Series.apply``, which passes one value at a time).

    Returns
    -------
    str or float
        ``"surfing"`` when ``col`` is exactly that string, ``np.nan`` for any
        other value, including missing ones.
    """
    # The previous version fell through its loop for "surfing" and returned
    # None, so the resulting column never contained a single "surfing" entry.
    return "surfing" if col == "surfing" else np.nan

# Build the "surfing" column from the activities, then discard the "activity" column itself.
dfaus["surfing"] = dfaus["activity"].apply(borracol)
dfaus.drop(columns=["activity"], inplace=True)


In [32]:
#I will create a new column called "summer", where "summer" will appear if the "month" column is
#december, january or february (these are the "southern" summer months)
def summer(col):
    """Flag a single month name as belonging to the southern-hemisphere summer.

    Parameters
    ----------
    col : object
        One value of the "month" column (the function is meant to be used
        with ``Series.apply``, which passes one value at a time).

    Returns
    -------
    str or float
        ``"summer"`` when ``col`` is "december", "january" or "february",
        ``np.nan`` for anything else, including missing values.
    """
    # A membership test replaces the former loop over the characters of
    # str(col), which only ever used its first iteration and returned None
    # instead of NaN for an empty string.
    return "summer" if col in ("december", "january", "february") else np.nan

# Flag the summer months, then remove the rows that are left with nothing but NaN.
dfaus["summer"] = dfaus["month"].apply(summer)
dfaus.dropna(axis=0, how="all", inplace=True)


In [41]:
# Histogram of the activity categories in df, with the bars sorted from the most to the least frequent.
fig = px.histogram(df, x="activity")
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [None]:
# Histogram built from dfaus, with the bars sorted from the most to the least frequent.
# NOTE(review): the "activity" column is removed from dfaus in an earlier cell
# (dfaus.drop(["activity"], ...)), so x="activity" no longer names a dfaus column
# and this cell has never been run — confirm which column should be plotted here.
fig  = px.histogram(dfaus, x="activity").update_xaxes(categoryorder="total descending")
fig.show()