In [11]:
# Import data

import pandas as pd
shark_attacks = pd.read_excel('https://www.sharkattackfile.net/spreadsheets/GSAF5.xls')


In [12]:
shark_attacks.drop_duplicates(inplace=True)

In [13]:
# Filter for data containing “Australia”  from “Country” (Florian)
shark_attacks_australia = shark_attacks[shark_attacks['Country'].str.contains("Australia", case=False, na=False)]
print(shark_attacks_australia)



             Date    Year        Type    Country              State  \
0     09-Jan-2024  2024.0  Unprovoked  AUSTRALIA    South Australia   
4     28 Dec-2023  2023.0  Unprovoked  AUSTRALIA    South Australia   
5     25 Dec-2023  2023.0  Unprovoked  AUSTRALIA    New South Wales   
6     24-Dec-2023  2023.0  Unprovoked  AUSTRALIA  Western Australia   
10    08 Dec-2023  2023.0  Unprovoked  AUSTRALIA         Queensland   
...           ...     ...         ...        ...                ...   
6924  Before 1906     0.0  Unprovoked  AUSTRALIA                NaN   
6925  Before 1906     0.0  Unprovoked  AUSTRALIA                NaN   
6926  Before 1906     0.0  Unprovoked  AUSTRALIA    New South Wales   
6927  Before 1903     0.0  Unprovoked  AUSTRALIA  Western Australia   
6928  Before 1903     0.0  Unprovoked  AUSTRALIA  Western Australia   

                     Location       Activity             Name Sex  Age  ...  \
0     Walkers Beach, Elliston        Surfing     Murray Adams   M   

In [14]:
# Extract “Month” from “Date” (Bartek)

def custom_date_parser(date_str):
    try:
        return pd.to_datetime(date_str, format='%d %b-%Y')
    except ValueError:
        return pd.to_datetime(date_str, errors='coerce')
# Apply the custom date parser to the "Date" column in the shark_attacks_australia DataFrame
shark_attacks_australia['Date'] = shark_attacks_australia['Date'].apply(custom_date_parser)
# Extract the month from the "Date" column
shark_attacks_australia['Month'] = shark_attacks_australia['Date'].dt.month
# Display the DataFrame with the extracted month
print(shark_attacks_australia[['Date', 'Month']])



           Date  Month
0    2024-01-09    1.0
4    2023-12-28   12.0
5    2023-12-25   12.0
6    2023-12-24   12.0
10   2023-12-08   12.0
...         ...    ...
6924        NaT    NaN
6925        NaT    NaN
6926        NaT    NaN
6927        NaT    NaN
6928        NaT    NaN

[1478 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shark_attacks_australia['Date'] = shark_attacks_australia['Date'].apply(custom_date_parser)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shark_attacks_australia['Month'] = shark_attacks_australia['Date'].dt.month


In [15]:
# Remove Null Values from "Month"
shark_attacks_australia = shark_attacks_australia.dropna(subset=['Month'])

shark_attacks_australia

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22,Month
0,2024-01-09,2024.0,Unprovoked,AUSTRALIA,South Australia,"Walkers Beach, Elliston",Surfing,Murray Adams,M,64,...,"A. Currie, GSAF",,,,,,,,,1.0
4,2023-12-28,2023.0,Unprovoked,AUSTRALIA,South Australia,Ethel Beach,Surfing,Khai Cowley,M,15,...,"S. DeMarchi, GSAF",,,,,,,,,12.0
5,2023-12-25,2023.0,Unprovoked,AUSTRALIA,New South Wales,Old Bar,Surfing,Julian McLennan,M,16,...,"9 News, 12/26/2023",,,,,,,,,12.0
6,2023-12-24,2023.0,Unprovoked,AUSTRALIA,Western Australia,Wedge Island,Foil Boarding,Troy Brown,M,46,...,"9 News, 12/25/2023",,,,,,,,,12.0
10,2023-12-08,2023.0,Unprovoked,AUSTRALIA,Queensland,1770,Snorkeling,Matteo Mariotti,M,20,...,"B. Myatt, GSAF",,,,,,,,,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6707,1832-06-04,1832.0,Unprovoked,AUSTRALIA,New South Wales,"South Head, Sydney",Fishing,Aboriginal female,F,,...,"Sydney Herald, 6/11/1832",1832.06.04-AboriginalWoman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1832.06.04,1832.06.04,224.0,,,6.0
6746,1807-01-12,1807.0,Unprovoked,AUSTRALIA,New South Wales,"Cockle Bay, Sydney Harbour",,male,M,,...,"J. Green, p.31",1807.01.12-Cockle-Bay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1807.01.12,1807.01.12,185.0,,,1.0
6751,1803-03-01,1803.0,Unprovoked,AUSTRALIA,Western Australia,"Hamelin Harbour, at Faure Island",,M. Lefevre & a sailor (rescuer),M,,...,F. Peron ref in G.P. Whitley (Fishes of Austr...,1803.03.00-Lefevre.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1803.03.00,1803.03.00,180.0,,,3.0
6757,1791-01-01,1791.0,Unprovoked,AUSTRALIA,New South Wales,Port Jackson,,"female, an Australian aboriginal",F,,...,"G.P. Whitley; D. Baldridge, p.162",1791.00.00-aboriginal woman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1791.00.00,1791.00.00,174.0,,,1.0


In [16]:
#State (Hanieh)
shark_attacks_australia.isnull().sum()
shark_attacks_australia["State"].mode()[0]
shark_attacks_australia = shark_attacks_australia.fillna(shark_attacks_australia["State"].mode()[0])
shark_attacks_australia["State"].isnull().sum()

0

In [17]:
#Injury (Erensu)
import re
def fatal_or_not(Injury):
    case = str(Injury).strip()
    if re.search(r'fatal', case, flags=re.IGNORECASE):
        return 1
    else:
        return 0
fatal_or_not_list=[]
for i in shark_attacks_australia.Injury:
    a=fatal_or_not(i)
    fatal_or_not_list.append(a)

shark_attacks_australia['fatal_or_not']=fatal_or_not_list

In [18]:
# pip install pandas
!pip install pandas




In [20]:
# Export as an excel file

shark_attacks_australia.to_csv("Shark_Attack_CSV.csv", index=False)



In [10]:
shark_attacks_australia.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1312 entries, 0 to 6758
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            1312 non-null   datetime64[ns]
 1   Year            1312 non-null   float64       
 2   Type            1312 non-null   object        
 3   Country         1312 non-null   object        
 4   State           1312 non-null   object        
 5   Location        1312 non-null   object        
 6   Activity        1312 non-null   object        
 7   Name            1312 non-null   object        
 8   Sex             1312 non-null   object        
 9   Age             1312 non-null   object        
 10  Injury          1312 non-null   object        
 11  Unnamed: 11     1312 non-null   object        
 12  Time            1312 non-null   object        
 13  Species         1312 non-null   object        
 14  Source          1312 non-null   object        
 15  pdf      