# Shark Data Frame | Importing & Creating

In [4]:
import pandas as pd

In [6]:
import re # Import ReGex module

In [8]:
url='https://www.sharkattackfile.net/spreadsheets/GSAF5.xls'

In [10]:
shark_df=pd.read_excel(url)

# Shark Data Frame | Cleaning

In [12]:
# drop all rowsthat have no values
shark_df = shark_df.dropna(how='all')

In [13]:
# Fixes error in "Species " column name.
shark_df.rename(columns={"Species ": "Species"}, inplace = True)

In [14]:
# Changing column name to a simplified version
shark_df.rename(columns={"Fatal Y/N": "Fatal"}, inplace=True)

In [15]:
# Dropping unnecessary columns
shark_df.drop(['Unnamed: 22',
               'Unnamed: 21',
               'original order',
               'Case Number.1',
               'href',
               'href formula',
               'pdf',
               'Case Number'
              ], axis=1, inplace=True)

In [16]:
# Changing Column "Type" entries: Provoked (2 --> 1) & Questionable (2 --> 1) & Unverified =
shark_df["Type"]=shark_df["Type"].apply(lambda t: "Provoked" if t==" Provoked" else "Questionable" if t=="?" else  t)

In [17]:
# Changing Column "Type" entries: Unverified = Unconfirmed, Invalid, Under investigation, Questionable & Watercraft = Boat
shark_df["Type"]=shark_df["Type"].apply(lambda t: "Watercraft" if t=="Boat" else "Unverified" if t=="Unconfirmed" else "Unverified" if t=="Invalid" else "Unverified" if t=="Under investigation" else "Unverified" if t=="Questionable" else t)

In [18]:
# Changing Column "Fatal" entries: unifying yes and no.
shark_df["Fatal"]=shark_df["Fatal"].apply(lambda f: True if f=="F" else False if f=="n" else False if f==" N" else False if f=="N " else False if f=="Nq" else True if f=="y" else True if f=="Y x 2" else False if f=="M" else f)

In [19]:
shark_df["Fatal"]=shark_df["Fatal"].apply(lambda f: True if f=="Y" else False if f=="N" else False)

# Shark Data Frame | Species Dataframe

In [21]:
shark_df = shark_df.dropna(subset=['Species'])

In [32]:
def map_species(value):
        if 'white' in value.lower():
            return 'white'
        elif 'tiger' in value.lower():
            return 'tiger'
        elif 'bull' in value.lower():
            return 'bull'
        else:
            return 'none'

In [34]:
shark_df['Species'] = shark_df['Species'].apply(map_species)

In [36]:
shark_df.reset_index(drop = True)

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source
0,2025-01-11 00:00:00,2025.0,Provoked,USA,Hawaii,Off Haleiwa Boat Harbour Oahu,Diving,Male not stated was a dive tour worker,M,23,Bitten on the arm,False,1340hrs,none,Kevin McMurray Trackingsharks.com
1,2025-01-02 00:00:00,2025.0,Unprovoked,New Caledonia,Grande Terre,Islet of Kendek near Koumac,Spearfishing,Robert Cuewapuru,M,40,Severe arm injury and delay in medical treatme...,True,1615hr,tiger,Johannes Marchand Todd Smith
2,2025-01-02 00:00:00,2025.0,Unprovoked,Australia,South Australia,Granites Beach near Westall Streaky Bay,Surfing,Lance Appleby,M,28,Body not recovered,True,1710hr,white,Glen Folkard: Simon De Marchi News.com.au: The...
3,2024-12-29 00:00:00,2024.0,Unprovoked,Egypt,North of Marsa Alam,Red Sea,SCUBA Diving,Peppino Fappani,M,69,Injuries to stomach preventing attack on friend,False,?,tiger,Todd Smith : Kevin McMurray Trackingsharks .co...
4,2024-12-29 00:00:00,2024.0,Unprovoked,Egypt,North of Marsa Alam,Red Sea,SCUBA Diving,Gianluca Di Gioia,M,48,Entire calf muscle removed and bitten both arm...,True,?,tiger,Todd Smith : Kevin McMurray Trackingsharks .co...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3853,Early 1930s,0.0,Unprovoked,BELIZE,,,Standing,a servant,M,16,FATAL,True,,tiger,Mitchell-Hedges
3854,Before 1906,0.0,Unprovoked,AUSTRALIA,,,Fishing,boy,M,,"FATAL, knocked overboard by tail of shark & ca...",True,,none,"NY Sun, 9/9/1906, referring to account by Loui..."
3855,Before 1906,0.0,Unprovoked,AUSTRALIA,,,Fishing,fisherman,M,,FATAL,True,,none,"NY Sun, 9/9/1906, referring to account by Loui..."
3856,Before 1906,0.0,Unprovoked,AUSTRALIA,,,Fishing,fisherman,M,,FATAL,True,,none,"NY Sun, 9/9/1906, referring to account by Loui..."


In [38]:
shark_df.rename(columns={"Type": "Total"}, inplace = True)

In [40]:
total_attacks_group = shark_df.groupby('Species')[['Total']].count().reset_index()

In [42]:
total_attacks_group

Unnamed: 0,Species,Total
0,bull,222
1,none,2546
2,tiger,337
3,white,746


In [44]:
shark_df["Species"].value_counts()

Species
none     2552
white     746
tiger     337
bull      223
Name: count, dtype: int64

In [46]:
fatal_filter = shark_df[(shark_df['Species'] != 'none') & (shark_df['Fatal'] == True)]

In [48]:
fatal_none = shark_df[(shark_df['Species'] == 'none') & (shark_df['Fatal'] == True)]

In [50]:
other_species_fatal = fatal_none.groupby("Species")[['Fatal']].count().reset_index()

In [52]:
other_species_fatal

Unnamed: 0,Species,Fatal
0,none,175


In [54]:
fatal_attacks = fatal_filter.groupby("Species")["Fatal"].count()

In [56]:
fatal_attacks

Species
bull      40
tiger     84
white    173
Name: Fatal, dtype: int64

In [58]:
fatal_attacks.sum()

297

In [60]:
total_filter = shark_df[(shark_df["Species"] != "none")]

In [62]:
total_attacks_fatal = fatal_filter.groupby("Species")[["Fatal"]].count().reset_index()

In [64]:
total_attacks_fatal

Unnamed: 0,Species,Fatal
0,bull,40
1,tiger,84
2,white,173


In [66]:
other_species_fatal

Unnamed: 0,Species,Fatal
0,none,175


In [68]:
total_attacks_group

Unnamed: 0,Species,Total
0,bull,222
1,none,2546
2,tiger,337
3,white,746


In [70]:
all_fatal_attacks = pd.concat([total_attacks_fatal, other_species_fatal], axis=0) 
all_fatal_attacks

Unnamed: 0,Species,Fatal
0,bull,40
1,tiger,84
2,white,173
0,none,175


In [72]:
total_vs_fatal = pd.merge(total_attacks_group, all_fatal_attacks, on='Species', how='right')
total_vs_fatal

Unnamed: 0,Species,Total,Fatal
0,bull,222,40
1,tiger,337,84
2,white,746,173
3,none,2546,175


In [74]:
total_vs_fatal['Percentage_fatal'] = (total_vs_fatal['Fatal']*100 // total_vs_fatal['Total']) 
total_vs_fatal

Unnamed: 0,Species,Total,Fatal,Percentage_fatal
0,bull,222,40,18
1,tiger,337,84,24
2,white,746,173,23
3,none,2546,175,6


# Shark Data Frame | Time Dataframe

In [101]:
shark_df["Fatal"].unique()

array([False,  True])

In [103]:
shark_df["Fatal"].unique()

array([False,  True])

In [105]:
shark_df["Time"].unique()

array(['1340hrs', '1615hr', '1710hr', '?', '1637hr', 'AM', '16.30hrs',
       '1600hr', '11.30hr', '1100hr', '1735hr', '11hr15', '16hr15',
       'after 1200hr', '1400hr', 15.5, '13h15', '9h', 1300, '14h',
       '15h30', 'Not stated', '13h30', '9h15', 'Not advised', '13h40',
       '12h30', '16h00', nan, '11h30', '06h30', '20h00', '13h00', '15h00',
       '02h00', '09h15', 'Early Morning', '14h00', '09h00', '10h20',
       '15h05', '17h00', '01h00', '10h00', 'Afternoon', '19h30', '08h45',
       '13h20', '"Midday"', '16h25', '13h55', '13h45', '14h35', 'Night',
       '1500h ', '10h10', '11h20', '07h15', '07h00', '08h00', '10h30',
       '14h50', '12h00', '11h15', '07h53', '17h45', '10jh45', '17h30',
       '13h12', '07h30', '10h15', '14h09', '10h40', '19h12', 'Morning',
       '15h20', '11h24', '16h30', '14h45', '06h40', '11h46', '20h30',
       '11h00', '16h39', '14h30', '16h45', '08h56', '11h45', '19h00',
       '18h30', '07h45', '07h58', '14h20', '08h40', 'Sunset', '10h45',
       

In [107]:
shark_df.dtypes

Date           object
Year          float64
Total          object
Country        object
State          object
Location       object
Activity       object
Name           object
Sex            object
Age            object
Injury         object
Fatal            bool
Time           object
Species        object
Source         object
clean_time     object
dtype: object

In [109]:
shark_df['Date']=pd.to_datetime(shark_df['Date'], infer_datetime_format=True, errors="ignore")

  shark_df['Date']=pd.to_datetime(shark_df['Date'], infer_datetime_format=True, errors="ignore")
  shark_df['Date']=pd.to_datetime(shark_df['Date'], infer_datetime_format=True, errors="ignore")


In [112]:
shark_df['Time']=pd.to_datetime(shark_df['Time'], infer_datetime_format=True, errors="ignore")

  shark_df['Time']=pd.to_datetime(shark_df['Time'], infer_datetime_format=True, errors="ignore")
  shark_df['Time']=pd.to_datetime(shark_df['Time'], infer_datetime_format=True, errors="ignore")
  shark_df['Time']=pd.to_datetime(shark_df['Time'], infer_datetime_format=True, errors="ignore")


In [113]:
shark_df["Time"].value_counts()

Time
Afternoon                               121
11h00                                   103
Morning                                  84
15h00                                    77
16h00                                    77
12h00                                    73
14h00                                    73
16h30                                    60
13h00                                    57
14h30                                    56
10h00                                    50
11h30                                    48
17h30                                    48
18h00                                    46
15h30                                    46
17h00                                    45
13h30                                    45
09h00                                    38
Night                                    36
10h30                                    35
12h30                                    33
08h00                                    28
09h30                      

In [114]:
# Ensure Pandas does not truncate large outputs
pd.set_option('display.max_rows', None)  # No limit on rows
pd.set_option('display.max_columns', None)  # No limit on columns

In [119]:
# Assuming your DataFrame is df and the column is 'column_name'
unique_values = shark_df['Time'].unique()
unique_values

array(['1340hrs', '1615hr', '1710hr', '?', '1637hr', 'AM', '16.30hrs',
       '1600hr', '11.30hr', '1100hr', '1735hr', '11hr15', '16hr15',
       'after 1200hr', '1400hr', 15.5, '13h15', '9h', 1300, '14h',
       '15h30', 'Not stated', '13h30', '9h15', 'Not advised', '13h40',
       '12h30', '16h00', nan, '11h30', '06h30', '20h00', '13h00', '15h00',
       '02h00', '09h15', 'Early Morning', '14h00', '09h00', '10h20',
       '15h05', '17h00', '01h00', '10h00', 'Afternoon', '19h30', '08h45',
       '13h20', '"Midday"', '16h25', '13h55', '13h45', '14h35', 'Night',
       '1500h ', '10h10', '11h20', '07h15', '07h00', '08h00', '10h30',
       '14h50', '12h00', '11h15', '07h53', '17h45', '10jh45', '17h30',
       '13h12', '07h30', '10h15', '14h09', '10h40', '19h12', 'Morning',
       '15h20', '11h24', '16h30', '14h45', '06h40', '11h46', '20h30',
       '11h00', '16h39', '14h30', '16h45', '08h56', '11h45', '19h00',
       '18h30', '07h45', '07h58', '14h20', '08h40', 'Sunset', '10h45',
       

In [121]:
# new time values
time_dict = {
    '1340hrs': '13:40',
    '1615hr': '16:15',
    '1710hr': '17:10',
    '?': '00:00',  # Assuming placeholder time
    '1637hr': '16:37',
    'AM': '00:00',  # Assuming as AM (midnight)
    '16.30hrs': '16:30',
    '1600hr': '16:00',
    '11.30hr': '11:30',
    '1100hr': '11:00',
    '1735hr': '17:35',
    '11hr15': '11:15',
    '16hr15': '16:15',
    'after 1200hr': '12:00',
    '1400hr': '14:00',
    '15.5': '15:30',  # Interpreted as 15:30
    '13h15': '13:15',
    '9h': '09:00',
    '1300': '13:00',
    '14h': '14:00',
    '15h30': '15:30',
    'Not stated': '00:00',  # Placeholder for unspecified time
    '13h30': '13:30',
    '9h15': '09:15',
    'Not advised': '00:00',  # Placeholder for unspecified time
    '13h40': '13:40',
    '12h30': '12:30',
    '16h00': '16:00',
    'nan': '00:00',  # Placeholder for NaN
    '11h30': '11:30',
    '06h30': '06:30',
    '20h00': '20:00',
    '13h00': '13:00',
    '11h12': '11:12',
    '16h30': '16:30',
    '15h00': '15:00',
    '02h00': '02:00',
    '09h15': '09:15',
    'Early Morning': '05:00',  # Interpreted as early morning
    '16h32': '16:32',
    '11h00': '11:00',
    'Morning': '06:00',  # Interpreted as morning
    '10h30': '10:30',
    '13h20': '13:20',
    '14h00': '14:00',
    '09h00': '09:00',
    '10h20': '10:20',
    '15h05': '15:05',
    '17h00': '17:00',
    '15h45': '15:45',
    '07h45': '07:45',
    '10h40': '10:40',
    '07h50': '07:50',
    '01h00': '01:00',
    '10h00': '10:00',
    'Afternoon': '12:00',  # Interpreted as afternoon
    '19h30': '19:30',
    'Evening': '18:00',  # Interpreted as evening
    '17h50': '17:50',
    '09h30': '09:30',
    '08h45': '08:45',
    '"Midday"': '12:00',
    '16h25': '16:25',
    '13h55': '13:55',
    '13h50': '13:50',
    '17h20': '17:20',
    '13h45': '13:45',
    '10h10': '10:10',
    '14h35': '14:35',
    'Night': '00:00',  # Placeholder for night
    '1500h ': '15:00',
    '19h15': '19:15',
    '11h20': '11:20',
    '07h15': '07:15',
    '07h00': '07:00',
    '18h00': '18:00',
    '08h00': '08:00',
    '14h20': '14:20',
    '17h30': '17:30',
    '07h20': '07:20',
    '14h50': '14:50',
    '-16h30': '16:30',
    '12h00': '12:00',
    '17h17': '17:17',
    '11h15': '11:15',
    '19h00': '19:00',
    '07h53': '07:53',
    '16h10': '16:10',
    '11h17': '11:17',
    '17h45': '17:45',
    '10jh45': '10:45',
    'Early  morning': '05:00',  # Interpreted as early morning
    '13h12': '13:12',
    '07h30': '07:30',
    '11hoo': '11:00',  # Interpreted as typo for 11:00
    '11h43': '11:43',
    '10h15': '10:15',
    '14h09': '14:09',
    '12h15': '12:15',
    '19h12': '19:12',
    '15h20': '15:20',
    '16h40': '16:40',
    '11h24': '11:24',
    '12h50': '12:50',
    '07h31': '07:31',
    '14h45': '14:45',
    '19h20': '19:20',
    'Dusk': '18:00',  # Interpreted as dusk
    '11h45': '11:45',
    '06h40': '06:40',
    '`17h00': '17:00',
    '07h51': '07:51',
    '11h46': '11:46',
    '20h30': '20:30',
    '12h23': '12:23',
    '07h07': '07:07',
    '16h39': '16:39',
    '15h57': '15:57',
    '14h30': '14:30',
    '16h45': '16:45',
    '10j30': '10:30',
    '08h15': '08:15',
    '08h56': '08:56',
    '15h40': '15:40',
    '18h30': '18:30',
    '07h58': '07:58',
    '17h40': '17:40',
    '09h00-10h00': '09:30',  # Approximation
    '17h10': '17:10',
    '09h36': '09:36',
    '08h40': '08:40',
    '06h00': '06:00',
    'Sunset': '18:00',  # Interpreted as sunset
    '10h45': '10:45',
    '1415': '14:15',
    '14h00-15h00': '14:30',  # Approximation
    '14h15': '14:15',
    '09h08': '09:08',
    '15h59': '15:59',
    '08h30': '08:30',
    '12h20': '12:20',
    '10h50': '10:50',
    'Midday': '12:00',
    '09h40': '09:40',
    '14h33': '14:33',
    '12h58': '12:58',
    '"Evening"': '18:00',
    '16h15': '16:15',
    '23h00': '23:00',
    '06h50': '06:50',
    '12h45': '12:45',
    '11h55': '11:55',
    '22h20': '22:20',
    '08h48': '08:48',
    '16h21': '16:21',
    '16h26': '16:26',
    '18h45': '18:45',
    '03h00': '03:00',
    '06h15': '06:15',
    'Before 10h00': '09:00',
    '06h45': '06:45',
    'Early afternoon': '13:00',
    '06h55': '06:55',
    '13h42': '13:42',
    '09h29': '09:29',
    '10h47': '10:47',
    '14h11': '14:11',
    '15h35': '15:35',
    '14h40': '14:40',
    '14h00  -15h00': '14:30',  # Approximation
    'Late afternoon': '16:00',  # Interpreted as late afternoon
    '16h50': '16:50',
    '21h50': '21:50',
    '17h35': '17:35',
    '19h00, Dusk': '19:00',  # Assuming dusk is around 19:00
    '15h01': '15:01',
    '1000': '10:00',
    '23h30': '23:30',
    '10h44': '10:44',
    '13h19': '13:19',
    'Shortly before 12h00': '11:30',  # Interpreted as shortly before noon
    '17h34': '17:34',
    '08h50': '08:50',
    '09h50': '09:50',
    '9h00': '09:00',
    '10h43': '10:43',
    'After noon': '12:00',  # Interpreted as after noon
    '15h15': '15:15',
    '19h05': '19:05',
    '14h30 / 15h30': '15:00',  # Approximation
    '22h00': '22:00',
    '16h20': '16:20',
    '14h34': '14:34',
    '15h25': '15:25',
    '14h55': '14:55',
    '17h46': '17:46',
    'Morning ': '06:00',  # Interpreted as morning
    '15h49': '15:49',
    'Midnight': '00:00',
    '09h30 / 10h00': '09:45',  # Approximation
    '18h15': '18:15',
    '04h00': '04:00',
    '10h25': '10:25',
    '10h45-11h15': '10:45',
    '15h52': '15:52',
    '19h45': '19:45',
    '12h10': '12:10',
    '18h05': '18:05',
    '11h41': '11:41',
    '12h25': '12:25',
    '17h51': '17:51',
    '16h12': '16:12',
    '09h45': '09:45',
    '05h00': '05:00',
    '03h30': '03:30',
    'Sometime between 06h00 & 08hoo': '07:00',  # Approximation
    '16h18': '16:18',
    '11h10': '11:10',
    '07h00 - 08h00': '07:30',  # Approximation
    '18h15-18h30': '18:22',  # Approximation
    '17h01': '17:01',
    '09h57': '09:57',
    '08h20': '08:20',
    '17h58': '17:58',
    '15h19': '15:19',
    '10h55': '10:55',
    '15h55': '15:55',
    '12h40': '12:40',
    '16h05': '16:05',
    '14h10': '14:10',
    '13h24': '13:24',
    '09h00 - 09h30': '09:15',  # Approximation
    '0830': '08:30',
    '11h40': '11:40',
    '08h10': '08:10',
    '15h56': '15:56',
    'Just before noon': '11:30',  # Interpreted as shortly before noon
    '07h56': '07:56',
    '1600': '16:00',
    '16h35': '16:35',
    '09h05': '09:05',
    '19h28': '19:28',
    '12h38': '12:38',
    '05h50': '05:50',
    '15h50': '15:50',
    '11h05': '11:05',
    'Early morning': '05:00',  # Interpreted as early morning
    'Dawn': '05:00',
    '05h45': '05:45',
    '13h25': '13:25',
    '13h26': '13:26',
    '09h11': '09:11',
    '18h20': '18:20',
    '13h51': '13:51',
    'A.M.': '00:00',  # Assuming A.M. (midnight)
    '08h05': '08:05',
    '10h35': '10:35',
    '15h44': '15:44',
    '21h00': '21:00',
    'Lunchtime': '12:00',  # Interpreted as lunchtime
    '15j45': '15:45',  # Typo for 15:45
    '09h35': '09:35',
    '10h27': '10:27',
    '10h16': '10:16',
    '0500': '05:00',
    'Before 07h00': '06:30',  # Interpreted as before 7:00
    '09h20': '09:20',
    '10h00 -- 11h00': '10:30',  # Approximation
    '12h05': '12:05',
    '14h21': '14:21',
    '18h50': '18:50',
    '15h53': '15:53',
    '"Just before 11h00"': '10:45',  # Approximation
    '11h115': '11:15',  # Typo for 11:15
    '20h15': '20:15',
    '12h39': '12:39',
    '07h05': '07:05',
    '  ': '00:00',  # Placeholder for empty space
    '13h05': '13:05',
    'N': '00:00',  # Placeholder for "N"
    '11h50': '11:50',
    'Just before sundown': '18:00',  # Approximation for sundown
    '17h55': '17:55',
    '22h30': '22:30',
    '17h15': '17:15',
    '11h30 ': '11:30',
    '06h10': '06:10',
    'Between 05h00 and 08h00': '06:30',  # Approximation
    '07h08': '07:08',
    '17h00 or 17h40': '17:20',  # Approximation
    '>08h00': '08:00',
    '--': '00:00',  # Placeholder for '--'
    '12h02': '12:02',
    '12h55': '12:55',
    '16h14': '16:14',
    '17h11': '17:11',
    '00h30': '00:30',
    '14h37': '14:37',
    '10h07': '10:07',
    '13h53': '13:53',
    '13h23': '13:23',
    'Just after 12h00': '12:05',
    '02h30': '02:30',
    '11h56': '11:56',
    ' ': '00:00',  # Placeholder for empty space
    'Shortly after midnight': '00:30',  # Approximation
    '14h25': '14:25',
    '13h345': '13:35',  # Typo for 13:35
    '\xa0 ': '00:00',  # Placeholder for special character
    '06h47': '06:47',
    '09h00 -10h00': '09:30',  # Approximation
    '20h45 (Sunset)': '20:45',  # Interpreted as sunset
    'Late morning': '10:00',  # Interpreted as late morning
    'P.M.': '12:00',  # Interpreted as P.M. (noon)
    '18h40': '18:40',
    '13h14': '13:14',
    '13h06': '13:06',
    'Shortly before 13h00': '12:45',  # Approximation
    '12h34': '12:34',
    '11h53': '11:53',
    '8:04 pm': '20:04',  # PM time converted
    '12h46': '12:46',
    '12h48': '12:48',
    '17h42': '17:42',
    '12h35': '12:35',
    'Possibly same incident as 2000.08.21': '00:00',  # Placeholder for unclear context
    'After Dusk': '18:00',  # Approximation for after dusk
    '11h57': '11:57',
    'Noon': '12:00',
    '11h25': '11:25',
    '18h25': '18:25',
    '10h28': '10:28',
    '14h16': '14:16',
    '09h55': '09:55',
    '2 hours after Opperman': '00:00',  # Placeholder for unclear context
    '09h30 ': '09:30',
    'Mid afternoon': '15:00',  # Interpreted as mid-afternoon
    'Mid morning': '09:00',  # Interpreted as mid-morning
    '11h48': '11:48',
    '11h00 / 11h30': '11:15',  # Approximation
    '07h19': '07:19',
    '13h37': '13:37',
    '11h06': '11:06',
    '"Night"': '00:00',  # Placeholder for "Night"
    '18h30?': '18:30',
    '11h58': '11:58',
    '11h51': '11:51',
    '18h12': '18:12',
    '07h10': '07:10',
    '07h40': '07:40',
    '12h33': '12:33',
    '30 minutes after 1992.07.08.a': '00:00',  # Placeholder for unclear context
    '>06h45': '06:45',
    '15h06': '15:06',
    '12h54': '12:54',
    'Between 06h00 & 07h20': '06:40',  # Approximation
    '16h55': '16:55',
    '05h40': '05:40',
    '<07h30': '07:00',  # Interpreted as before 7:30
    '21h30': '21:30',
    '17h00 Sunset': '17:00',  # Interpreted as sunset
    'Nightfall': '00:00',  # Placeholder for nightfall
    'X': '00:00',  # Placeholder for 'X'
    '08h57': '08:57',
    '18h30 (Sunset)': '18:30',  # Interpreted as sunset
    '06j00': '06:00',  # Typo for 06:00
    '08h35': '08:35',
    '10h22': '10:22',
    '02h45': '02:45',
    'Prior to 10h37': '10:00',  # Interpreted as prior to 10:37
    'Daybreak': '05:00',  # Interpreted as daybreak
    '18h10': '18:10',
    '>12h00': '12:00',  # Placeholder for after noon
    'Mid-morning': '09:00',  # Interpreted as mid-morning
    '08h55': '08:55',
    '16h30 or 18h00': '17:15',  # Approximation
    'Just before dawn': '05:00',  # Interpreted as just before dawn
    ' 14h00': '14:00',
    'Daytime': '12:00',  # Interpreted as daytime
    'Dark': '00:00',  # Placeholder for dark
    '10h00 / 11h00': '10:30',  # Approximation
    '"After lunch"': '14:00',  # Interpreted as after lunch
    '07h32': '07:32',
    '15h00 or 15h45': '15:23',  # Approximation
    '>17h00': '17:00',
    '19h00 / 20h00': '19:30',  # Approximation
    '12h45 / 13h45': '13:15',  # Approximation
    '14h00 - 15h00': '14:30',  # Approximation
    'night': '00:00',  # Placeholder for "night"
    '03h45 - 04h00': '03:52',  # Approximation
    '13h10': '13:10',
    '09h30 / 15h30': '12:30',  # Approximation
    '08h00 / 09h30': '08:45',  # Approximation
    '19h35': '19:35',
    '12h00 to 14h00': '13:00',  # Approximation
    '13h35': '13:35',
    'Late night': '00:00',  # Placeholder for late night
    '01h32': '01:32',
    '10h30 or 13h30': '12:00',  # Approximation
    '16h23': '16:23',
    '15h00j': '15:00',  # Typo for 15:00
    'Midday.': '12:00',
    '"After dark"': '18:00',  # Interpreted as after dark
    '10h00 or 14h00': '12:00',  # Approximation
    '19h10': '19:10',
    '2 hrs before sunset': '17:00',  # Approximation
    '18h15 to 21h30': '19:00',  # Approximation
    '1500': '15:00',
    '"shortly before dusk"': '18:00',  # Approximation for dusk
    '>17h30': '17:30',
    '>14h30': '14:30',
    'Between 11h00 & 12h00': '11:30',  # Approximation
    'After 04h00': '04:30',
    '11h01 -time of ship sinking': '00:00',  # Placeholder for unclear context
    'Ship abandoned at 03h10': '03:10',  # Exact time provided
    '19h55': '19:55',
    'After dusk': '18:00',  # Approximation for after dusk
    'FATAL  (Wire netting installed at local beaches after this incident.)': '00:00',  # Placeholder for unclear context
    '01h30': '01:30',
    'After midnight': '00:30',
    'Late afternoon': '16:00',  # Interpreted as late afternoon
    '05h30': '05:30',
    '08h58': '08:58',
    '"Early evening"': '18:00',  # Interpreted as early evening
    'Late Afternoon': '16:30',  # Interpreted as late afternoon
    '   ': '00:00',  # Placeholder for empty space
    'Before daybreak': '05:00',  # Interpreted as before daybreak
    'dusk': '18:00',  # Interpreted as dusk
    'Before 10h30': '10:00',  # Interpreted as before 10:30
    '06h00 -- 07h00': '06:30',  # Approximation
    '01h50': '01:50',
    '17h00-18h00': '17:30',  # Approximation
    '19h00-20h00': '19:30'  # Approximation
}








In [123]:
# new column with cleaned time added to list
shark_df['clean_time'] = shark_df['Time'].map(time_dict)

In [125]:
shark_df.head()

Unnamed: 0,Date,Year,Total,Country,State,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,clean_time
0,2025-01-11 00:00:00,2025.0,Provoked,USA,Hawaii,Off Haleiwa Boat Harbour Oahu,Diving,Male not stated was a dive tour worker,M,23,Bitten on the arm,False,1340hrs,none,Kevin McMurray Trackingsharks.com,13:40
1,2025-01-02 00:00:00,2025.0,Unprovoked,New Caledonia,Grande Terre,Islet of Kendek near Koumac,Spearfishing,Robert Cuewapuru,M,40,Severe arm injury and delay in medical treatme...,True,1615hr,tiger,Johannes Marchand Todd Smith,16:15
2,2025-01-02 00:00:00,2025.0,Unprovoked,Australia,South Australia,Granites Beach near Westall Streaky Bay,Surfing,Lance Appleby,M,28,Body not recovered,True,1710hr,white,Glen Folkard: Simon De Marchi News.com.au: The...,17:10
3,2024-12-29 00:00:00,2024.0,Unprovoked,Egypt,North of Marsa Alam,Red Sea,SCUBA Diving,Peppino Fappani,M,69,Injuries to stomach preventing attack on friend,False,?,tiger,Todd Smith : Kevin McMurray Trackingsharks .co...,00:00
4,2024-12-29 00:00:00,2024.0,Unprovoked,Egypt,North of Marsa Alam,Red Sea,SCUBA Diving,Gianluca Di Gioia,M,48,Entire calf muscle removed and bitten both arm...,True,?,tiger,Todd Smith : Kevin McMurray Trackingsharks .co...,00:00


In [127]:
# New column "time group" with three categories: morninh, afternoon, night
shark_df['time_group']=shark_df['clean_time'].apply(lambda x: "morning" if (str(x) > '05:59') and (str(x) < '12:01') else ("afternoon" if (str(x) > '12:00') and (str(x) < '18:01') else "night"))

In [128]:
# time group output
shark_df.groupby('time_group').size()

time_group
afternoon    1014
morning       960
night        1884
dtype: int64

In [133]:
# New Table time group vs. species vs. fatality
shark_time_eval=shark_df.pivot_table(index='time_group', columns='Species', values=['Fatal'], aggfunc='sum')

In [135]:
shark_time_eval

Unnamed: 0_level_0,Fatal,Fatal,Fatal,Fatal
Species,bull,none,tiger,white
time_group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
afternoon,14,38,24,77
morning,11,35,21,54
night,15,102,39,42
