In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

In [3]:
def sanitize(input_str: str) -> str:
    """
    Sanitizes the input string by replacing all special characters with '_'
    and converting the string to lowercase.

    Args:
        input_str (str): The string to be sanitized.

    Returns:
        str: The sanitized string.
    """
    if not isinstance(input_str, str):
        return str(input_str)
    sanitized_str = re.sub(r"[^a-zA-Z0-9]+", "_", input_str)
    sanitized_str = sanitized_str.lower()
    return sanitized_str

In [4]:
def sanitize_column_values(column_values: str, mapping: dict):
    if pd.isna(column_values):
        return "unknown"
        # return np.nan
    for clean_label, synonyms in mapping.items():
        if column_values in synonyms:
            return sanitize(clean_label)
    return sanitize(column_values)

In [5]:
def convert_to_numeric(series):
    """Attempts to convert a series to numeric, leaves values as is if conversion fails."""
    try:
        return pd.to_numeric(series, errors="ignore")
    except Exception as e:
        return series

In [6]:
list(range(0, 101, 10))

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [7]:
# def nan_stats(df: pd.DataFrame):
#     """Calculate NaN count, percentage, and bins for each column."""
#     nan_count = df.isna().sum()
#     nan_percentage = (df.isna().mean()) * 100
#     non_nan_count = df.shape[0] - nan_count
#     non_nan_percentage = 100 - nan_percentage

#     bins = list(range(0, 101, 10))
#     bin_labels = [
#         "[0-10%)",
#         "[10-20%)",
#         "[20-30%)",
#         "[30-40%)",
#         "[40-50%)",
#         "[50-60%)",
#         "[60-70%)",
#         "[70-80%)",
#         "[80-90%)",
#         "[90-100%]",
#     ]

#     binned_nan_percentage = pd.cut(
#         nan_percentage, bins=bins, labels=bin_labels, include_lowest=True
#     )
#     binned_values_present_percentage = pd.cut(
#         non_nan_percentage, bins=bins, labels=bin_labels, include_lowest=True
#     )

#     result = pd.DataFrame(
#         {
#             "column": df.columns,
#             "nan_count": nan_count.values,
#             "nan_percentage": nan_percentage.values,
#             "values_present": non_nan_count,
#             "values_present_percentage": non_nan_percentage,
#             "binned_nan_percentage": binned_nan_percentage.values,
#             "binned_values_present_percentage": binned_values_present_percentage.values,
#         }
#     )

#     return result.sort_values(["nan_count", "column"]).reset_index(drop=True)


def nan_stats(df: pd.DataFrame):
    """Calculate NaN count, percentage, and bins for each column with separate bins for 0% and 100%."""
    nan_count = df.isna().sum()
    nan_percentage = (df.isna().mean()) * 100
    non_nan_count = df.shape[0] - nan_count
    non_nan_percentage = 100 - nan_percentage

    bins = [-0.01, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    bin_labels = [
        "0%",
        "(0-10%)",
        "(10-20%)",
        "(20-30%)",
        "(30-40%)",
        "(40-50%)",
        "(50-60%)",
        "(60-70%)",
        "(70-80%)",
        "(80-90%)",
        "(90-100%)",
        "100%",
    ]

    def assign_bins(value, bins, labels):
        """Custom function to assign bins."""
        if value == 0:
            return "0%"
        elif value == 100:
            return "100%"
        else:
            for i in range(1, len(bins)):
                if bins[i - 1] < value <= bins[i]:
                    return labels[i - 1]
        return "Unknown"

    binned_nan_percentage = nan_percentage.apply(
        assign_bins, args=(bins, bin_labels)
    )
    binned_values_present_percentage = non_nan_percentage.apply(
        assign_bins, args=(bins, bin_labels)
    )

    result = pd.DataFrame(
        {
            "column": df.columns,
            "nan_count": nan_count.values,
            "non_nan_count": non_nan_count,
            "nan_percentage": nan_percentage.values,
            "non_nan_percentage": non_nan_percentage,
            "binned_nan_percentage": binned_nan_percentage.values,
            "binned_non_nan_percentage": binned_values_present_percentage.values,
        }
    )

    return result.sort_values(["nan_count", "column"]).reset_index(drop=True)

In [8]:
sanitize("Present.at.*&(time.of.bite")

'present_at_time_of_bite'

In [9]:
df = pd.read_excel(
    "/Users/paniket/TU_Eindhoven/2_Study/Q2_JBI100_Visualisation_4/4_Code/JBI100_Visualisation/data/Australian Shark-Incident Database Public Version.xlsx",
    sheet_name="ASID",
)

In [10]:
df.rename(columns=sanitize, inplace=True)

In [11]:
df.head(2)

Unnamed: 0,uin,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_c,personal_protective_device,deterrent_brand_and_type,data_source,reference,unnamed_59
0,1,1,1791,fatal,NSW,near sydney,-33.866667,151.2,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical location",,,,unprovoked,,,swimming,,,,,,,,,torso,major lacerations,female,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,book,"shark&survl, whitley 1958, book ref 1793",
1,2,3,1803,injured,WA,"hamelin bay, faure island",-25.833333,113.883333,coastal,,tiger shark,Galeocerdo cuvier,"bite analysis, shark behaviour, geographical location",,,,unprovoked,,1.0,swimming,,,,,,swam at victim,Y,,,,male,,,,,,,,pushed at shark,,,,,0.0,,1.0,,,,,,,,,,,,book,"balgridge,green,taylor,whitley 1940",


In [12]:
df["provoked_unprovoked"].unique()

array(['unprovoked', 'provoked', nan], dtype=object)

In [13]:
# df["shark_behaviour"].unique()

In [14]:
df.describe(include="all").T.sort_values("count", ascending=False)
# df.describe(include='all').T.sort_values("count", ascending=False).index

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
uin,1233.0,,,,617.0,356.080749,1.0,309.0,617.0,925.0,1233.0
incident_year,1233.0,,,,1968.518248,48.451842,1791.0,1933.0,1986.0,2011.0,2024.0
victim_injury,1233.0,6.0,injured,746.0,,,,,,,
state,1233.0,7.0,NSW,449.0,,,,,,,
latitude,1233.0,631.0,-10.566667,24.0,,,,,,,
longitude,1233.0,587.0,142.216667,30.0,,,,,,,
site_category,1233.0,8.0,coastal,800.0,,,,,,,
incident_month,1233.0,,,,5.939984,4.084692,1.0,2.0,5.0,10.0,12.0
location,1230.0,939.0,"thursday island, torre strait",20.0,,,,,,,
provoked_unprovoked,1226.0,2.0,unprovoked,824.0,,,,,,,


In [15]:
print(df["state"].unique())

state_mapping = {
    "NSW": "New South Wales",
    "WA": "Western Australia",
    "TAS": "Tasmania",
    "SA": "South Australia",
    "QLD": "Queensland",
    "VIC": "Victoria",
    "NT": "Northern Territory",
}

df["state_names"] = df["state"].replace(state_mapping)

['NSW' 'WA' 'TAS' 'SA' 'QLD' 'VIC' 'NT']


In [16]:
print(df["site_category"].unique())
df["site_category_cleaned"] = df["site_category"].apply(lambda x: sanitize(x))
df["site_category_cleaned"].unique()

['coastal' 'estuary/harbour' 'island open ocean' 'river' 'ocean/pelagic'
 'Coastal' 'Ocean/pelagic' 'other: fish farm']


array(['coastal', 'estuary_harbour', 'island_open_ocean', 'river',
       'ocean_pelagic', 'other_fish_farm'], dtype=object)

# Cleaning data


In [17]:
df.loc[
    (df["provoked_unprovoked"] == "provoked")
    & (df["provocative_act"].isna() == True),
    "provocative_act",
] = "unknown"

In [18]:
df.loc[
    (df["provoked_unprovoked"] == "unprovoked")
    & (df["provocative_act"].isna() == False)
]

Unnamed: 0,uin,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_c,personal_protective_device,deterrent_brand_and_type,data_source,reference,unnamed_59,state_names,site_category_cleaned
1167,1168,1,2021,fatal,SA,port macdonnell,-38.0522,140.7018,coastal,ocean beach,white shark,Carcharodon carcharias,,,,,unprovoked,snorkelling,1.0,snorkeling,no,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,South Australia,coastal


In [19]:
df["victim_injury"].unique()

array(['fatal', 'injured', 'uninjured', 'unknown', 'Injured', 'injury'],
      dtype=object)

In [20]:
# df["shark_behaviour"].unique()

In [21]:
df["victim_injury"] = df["victim_injury"].apply(lambda x: sanitize(x))
df["victim_injury"] = df["victim_injury"].replace(
    {"injury": "injured", "fatality": "fatal"}
)

df["injury_severity"] = df["injury_severity"].apply(lambda x: sanitize(x))
df["injury_severity"] = df["injury_severity"].replace(
    {"injury": "injured", "fatal": "fatality"}
)

In [22]:
df["injury_severity"].unique()

array(['major_lacerations', 'nan', 'minor_lacerations',
       'other_teeth_marks', 'abrasion', 'surface_wound', 'punctures',
       'fatality', 'lacerations'], dtype=object)

In [23]:
# df.loc[
#     (df["victim_injury"] == "uninjured")
#     & (df["shark_behaviour"].str.lower().str.contains("bit"))
# ]

In [24]:
# print(df["victim_activity"].unique())

In [25]:
# print(df["injury_location"].unique())

In [26]:
# df.describe(include='all').T.sort_values("count", ascending=False).index

In [27]:
# df.describe(include='all').T.sort_values("count", ascending=False)[["count", "unique"]]

In [28]:
mapping = {
    "bit arm": [
        "bit victim on arm",
        "bit victim's arm",
        "bit victim on arms",
        "bit arm",
        "bit victim on right arm",
        "bit victim on left arm",
        "bit forearm",
        "bit hand & arm",
        "bit victim on shoulder/arm",
        "bit arm & board while resting",
        "bite arm",
    ],
    "bit leg": [
        "bit victim on leg",
        "bit victim on leg - thigh",
        "bit victim on thigh",
        "bit victim on thigh & arm",
        "bit victim on thigh while sitting on board",
        "bit legs",
        "bit leg off",
        "bit leg - thigh",
        "bit leg - left",
        "bit victim on left leg",
        "bit victim on right thigh",
        "bit victim on upper thigh & torso",
        "bit leg twice",
        "bit victim s legs",
        "bit victims leg",
        "bit victims leg off",
        "bit victim on leg - knee",
    ],
    "bit torso": [
        "bit victim on torso",
        "bit torso",
        "bit victim on chest",
        "bit victim on chest ",
        "bit victim on torso & arm",
        "bit diver on torso",
    ],
    "bit shoulder": [
        "bit victim on shoulder",
        "bit victim on shoulder & head",
        "bit shoulder",
        "bit victim on left arm & shoulder",
    ],
    "bit head": [
        "bit head",
        "bit victims head",
        "bit victim on head",
        "bit victim on head ",
        "bit victim on head & chest",
    ],
    "bit foot": [
        "bit victim on foot",
        "bit victim on left foot",
        "bit victim on flipper / foot",
        "bit victim on ankle",
        "bit foot",
        "bit victim on knee",
    ],
    "bit multiple times": [
        "bit victim multiple times",
        "bit victim mutliple times",
        "multiple bites",
    ],
    "bit other": [
        "bit victim on back",
        "bit victim on body",
        "bit victim on body - buttock",
        "bit victim on hip",
        "bit victim on stomach",
        "bit victim on face",
        "bit victim on buttock",
    ],
    "grazed": [
        "grazed leg with fin",
        "grazed leg",
        "grazed victim with fin",
        "grazed victim on leg",
        "grazed victim skin",
        "grazed by close pass",
        "grazed victim w/skin",
    ],
    "bumped": [
        "bumped hull of craft",
        "bumped canoe with nose",
        "bumped victim with nose",
        "bumped victim",
        "bumped victim on arm",
        "bumped board with nose",
        "bumped board",
        "bumped surfboard",
        "bumped board from below",
        "bumped kayak",
        "bumped leg",
        "bumped off his board and bitten",
        "bumped ski",
        "bumped ski from below",
        "bumped into divers back",
        "bumped into victim",
        "bumped rowing scull",
        "bumped surfboard into air",
    ],
    "circled": [
        "circled diver",
        "circled sunken boat",
        "circled victim",
        "circled victim on ski",
        "circled & bumped board",
        "circled surfer and then came back and knocked off the paddle boarder. ",
    ],
    "swam at victim": [
        "swam at victim",
        "swam towards victim",
        "swam towards victim & horse",
        "swam towards diver",
        "swam between legs of victim",
        "swam towards spearfisherman",
        "swam from above in agitated manner, pectoral fins down",
    ],
    "swam away": [
        "swam away after being disturbed",
        "swam away - body in mouth",
    ],
    "attempted bite": [
        "attempted to bite victim",
        "attempted to bit legs",
        "attempted to bite leg",
        "attempted to bite arm",
        "attempted to bit divers camera",
    ],
    "attacked": [
        "attacked victim",
        "bit victim",
        "bit victim ",
        "bit another person first",
        "one shark bit the victim",
        "aggressive towards victim",
        "aggressive towards another person",
        "aggressive towards diver",
    ],
    "bit object": [
        "bit canoe",
        "bit dress",
        "bit lead shoe",
        "bit scull in 2 pieces",
        "bit clothing - pants",
        "bit shoe",
        "bit sock",
        "bit trouser leg",
        "bit spear held on victim",
        "bit collecting bag",
        "bit rubber dinghy",
        "bit surfboard",
        "bit board",
        "bit surf ski",
        "bit paddle board",
        "bit kayak",
        "bit spear & tangled in line",
        "bit surfboard leg rope",
        "bit boat hull",
        "bit swim fin",
        "bit body board being towed",
        "bit surfboard, leg & hand",
        "bit rear of surfboard",
        "bit spear & tangled in line",
    ],
    "miscellaneous": [
        "victim was bumped, not bitten",
        "victim never saw shark",
        "awaiting dpi investigation",
        "shark never sighted",
        "video online (graphic)",
        "bit fish & fingers",
        "bit fish on weight belt",
        "bit surf ski paddle",
        "bit scull near stern ",
        "shark got caught in leg rope ",
    ],
}

df["shark_behaviour_generic"] = df["shark_behaviour"].apply(
    lambda x: sanitize_column_values(x, mapping)
)

In [29]:
mapping = {
    "shark not seen": [
        "victim never saw shark",
        "awaiting dpi investigation",
        "shark never sighted",
        "victim was bumped, not bitten",
    ],
    "bit object": [
        "bit victim’s camera",
        "attempted to bit divers camera",
        "bit spear & tangled in line",
        "bit spear held on victim",
        "bit fish on weight belt",
        "bit collecting bag",
    ],
    "bit equipment": [
        "bit canoe",
        "bit kayak",
        "bit paddle board",
        "bit rubber dinghy",
        "bit surfboard",
        "bit surfboard leg rope",
        "bit scull near stern",
        "bit board",
        "bit body board being towed",
        "bit swim fin",
        "bit boat hull",
        "bit rear of surfboard",
        # "bit board & surfer",
    ],
    "bumped into object": [
        "bumped board with nose",
        "bumped canoe with nose",
        "bumped hull of craft",
        "bumped kayak",
        "bumped ski",
        "bumped rowing scull",
        "bumped surfboard",
        "bumped board",
        "bumped victim’s camera",
        "bumped into victim",
    ],
    "bit head & chest": ["bit victim on head & chest"],
    "bit torso & arm": ["bit victim on torso & arm"],
    "bit thigh & arm": ["bit victim on thigh & arm"],
    "bit leg & back": ["bit victim on leg & back"],
    "bit shoulder & head": ["bit victim on shoulder & head"],
    "bit leg & arm": ["bit victim on leg & arm"],
    "bit flipper / foot": ["bit victim on flipper / foot"],
    "bit arm": [
        "bit victim on arm",
        "bit victim's arm",
        "bit victim on arms",
        "bit arm",
        "bit forearm",
        "bit hand & arm",
        "bit victim on shoulder/arm",
        "bite arm",
    ],
    "bit leg": [
        "bit victim on leg",
        "bit legs",
        "bit leg off",
        "bit leg - thigh",
        "bit victim on left leg",
        "bit victim on right thigh",
        "bit victim on leg - knee",
        "bit leg - left",
        "bit victim s legs",
    ],
    "bit torso": [
        "bit victim on torso",
        "bit torso",
        "bit victim on chest",
        "bit victim on chest ",
    ],
    "bit shoulder": ["bit victim on shoulder", "bit shoulder"],
    "bit head": [
        "bit head",
        "bit victims head",
        "bit victim on head",
        "bit victim on head ",
    ],
    "bit foot": [
        "bit victim on foot",
        "bit victim on ankle",
        "bit victim on knee",
        "bit foot",
    ],
    "bit face": ["bit victim on face"],
    "bit back": ["bit victim on back"],
    "bit stomach": ["bit victim on stomach"],
    "bit buttock": ["bit victim on body - buttock", "bit victim on buttock"],
    "bit multiple times": [
        "bit victim multiple times",
        "bit victim mutliple times",
        "multiple bites",
    ],
    "bit victim in half": ["bit victim in half"],
    "bit victim & board": ["bit victim & board"],
    "bit arm & board": ["bit arm & board while resting"],
    "grazed": [
        "grazed leg with fin",
        "grazed leg",
        "grazed victim with fin",
        "grazed victim skin",
        "grazed victim w/skin",
    ],
    "bumped": [
        "bumped into divers back",
        "bumped victim on arm",
        "bumped victim",
    ],
    "swam at victim": [
        "swam at victim",
        "swam towards victim",
        "swam towards diver",
    ],
    "swam between legs": ["swam between legs of victim"],
    "swam away": [
        "swam away after being disturbed",
        "swam away - body in mouth",
    ],
    "attempted bite": [
        "attempted to bite victim",
        "attempted to bite arm",
        "attempted to bite leg",
    ],
    "attacked": [
        "attacked victim",
        "aggressive towards victim",
        "aggressive towards diver",
    ],
    "miscellaneous": ["video online (graphic)", "awaiting dpi investigation"],
}

df["shark_behaviour_specific"] = df["shark_behaviour"].apply(
    lambda x: sanitize_column_values(x, mapping)
)

In [30]:
# non_numeric_columns = df.select_dtypes(exclude=["number"]).columns

In [31]:
# df = df.apply(convert_to_numeric)

In [32]:
# non_numeric_columns = df.select_dtypes(
#     exclude=["number", "int", "float"]
# ).columns

In [33]:
df.describe(include="all").T.sort_values("count", ascending=False)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
uin,1233.0,,,,617.0,356.080749,1.0,309.0,617.0,925.0,1233.0
site_category,1233,8,coastal,800,,,,,,,
shark_behaviour_generic,1233,154,bit_leg,295,,,,,,,
site_category_cleaned,1233,6,coastal,812,,,,,,,
state_names,1233,7,New South Wales,449,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
clothing_pattern,13,3,stripe,8,,,,,,,
deterrent_brand_and_type,6,5,"Shark Pod, on",2,,,,,,,
fish_speared_,4,2,no,2,,,,,,,
unnamed_59,1.0,,,,415438758.0,,415438758.0,415438758.0,415438758.0,415438758.0,415438758.0


In [34]:
df["month_year"] = pd.to_datetime(
    df["incident_month"].astype(str).str.zfill(2)
    + "-"
    + df["incident_year"].astype(str)
)

  df["month_year"] = pd.to_datetime(


In [35]:
df_nan_stats = nan_stats(df)

In [36]:
df_nan_stats.head(4)

Unnamed: 0,column,nan_count,non_nan_count,nan_percentage,non_nan_percentage,binned_nan_percentage,binned_non_nan_percentage
0,incident_month,0,1233,0.0,100.0,0%,100%
1,incident_year,0,1233,0.0,100.0,0%,100%
2,injury_severity,0,1233,0.0,100.0,0%,100%
3,latitude,0,1233,0.0,100.0,0%,100%


In [37]:
df_nan_stats.head(30)
# print(df_nan_stats["binned_nan_percentage"].unique())
# print(df_nan_stats["binned_values_present_percentage"].unique())

Unnamed: 0,column,nan_count,non_nan_count,nan_percentage,non_nan_percentage,binned_nan_percentage,binned_non_nan_percentage
0,incident_month,0,1233,0.0,100.0,0%,100%
1,incident_year,0,1233,0.0,100.0,0%,100%
2,injury_severity,0,1233,0.0,100.0,0%,100%
3,latitude,0,1233,0.0,100.0,0%,100%
4,longitude,0,1233,0.0,100.0,0%,100%
5,month_year,0,1233,0.0,100.0,0%,100%
6,shark_behaviour_generic,0,1233,0.0,100.0,0%,100%
7,shark_behaviour_specific,0,1233,0.0,100.0,0%,100%
8,site_category,0,1233,0.0,100.0,0%,100%
9,site_category_cleaned,0,1233,0.0,100.0,0%,100%


# Filling NaN Categorical values with Unknown

### After we did select columns to avoid a whole lot of unknowns


In [38]:
columns_to_fill = [
    "victim_gender",
    "victim_activity",
    "shark_behaviour",
    "injury_severity",
    "data_source",
    "shark_identification_method",
    # "shark_common_name",
    # "shark_scientific_name",
    "location",
    "provoked_unprovoked",
    "injury_location",
]

df[columns_to_fill] = df[columns_to_fill].fillna(value="unknown")

df.loc[
    (df["shark_common_name"].isna() == True)
    & (df["shark_scientific_name"].isna() == True),
    ["shark_common_name", "shark_scientific_name"],
] = "shark_not_known"

In [39]:
df_selected_columns = df[df_nan_stats["column"].to_list()[:27]].drop(
    columns=[
        "uin",
        "state",
        "site_category",
    ]
)
# df_selected_columns.to_csv("df_shark_selected_columns.csv", index = False)

In [40]:
df_selected_columns.to_csv(
    "/Users/paniket/TU_Eindhoven/2_Study/Q2_JBI100_Visualisation_4/4_Code/JBI100_Visualisation/data/filtered_cleaned_shark_data.csv",
    index=False,
)
df.to_csv(
    "/Users/paniket/TU_Eindhoven/2_Study/Q2_JBI100_Visualisation_4/4_Code/JBI100_Visualisation/data/cleaned_engineered_shark_data.csv",
    index=False,
)

In [41]:
dfx = pd.read_csv(
    "/Users/paniket/TU_Eindhoven/2_Study/Q2_JBI100_Visualisation_4/4_Code/JBI100_Visualisation/data/filtered_cleaned_shark_data.csv"
)
dfx

Unnamed: 0,incident_month,incident_year,injury_severity,latitude,longitude,month_year,shark_behaviour_generic,shark_behaviour_specific,site_category_cleaned,state_names,victim_injury,location,provoked_unprovoked,victim_gender,victim_activity,shark_common_name,shark_scientific_name,no_sharks,reference,injury_location,data_source,shark_identification_method,shark_behaviour,victim_age
0,1,1791,major_lacerations,-33.86666666666667,151.2,1791-01-01,unknown,unknown,coastal,New South Wales,fatal,near sydney,unprovoked,female,swimming,white shark,Carcharodon carcharias,,"shark&survl, whitley 1958, book ref 1793",torso,book,"bite analysis, shark behaviour, geographical location",unknown,
1,3,1803,,-25.833333333333332,113.88333333333334,1803-03-01,swam_at_victim,swam_at_victim,coastal,Western Australia,injured,"hamelin bay, faure island",unprovoked,male,swimming,tiger shark,Galeocerdo cuvier,1.0,"balgridge,green,taylor,whitley 1940",unknown,book,"bite analysis, shark behaviour, geographical location",swam at victim,
2,1,1807,minor_lacerations,-33.86666666666667,151.2,1807-01-01,bit_victim_on_wrist,bit_victim_on_wrist,estuary_harbour,New South Wales,injured,"cockle bay, sydney harbour",unprovoked,male,swimming,bull shark,Carcharhinus leucas,1.0,sydney gazette 18.1.1807,"arm, hand",media outlet,"bite analysis, shark behaviour",bit victim on wrist,
3,1,1820,major_lacerations,-42.8,147.53333333333333,1820-01-01,bit_leg,bit_leg,coastal,Tasmania,fatal,"sweetwater point, pitt water",provoked,male,swimming,shark_not_known,shark_not_known,1.0,"shark&survl, c. black researcher",leg,witness account,unknown,bit victim on leg,
4,1,1825,minor_lacerations,-33.85,151.21666666666667,1825-01-01,bit_leg,bit_leg,estuary_harbour,New South Wales,injured,"kirribili point, sydney harbour",unprovoked,male,swimming,bull shark,Carcharhinus leucas,1.0,maitland daily mercury 13.11.1899,leg,media outlet,"bite analysis, shark behaviour, geographical location",bit legs,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1228,12,2023,fatality,-34.679356,137.684906.,2023-12-01,unknown,unknown,coastal,South Australia,fatal,"ethel beach, yorke penninsula",unprovoked,male,surfing,white shark,Carcharodon carcharias,1.0,,fatal,unknown,unknown,unknown,15.0
1229,1,2024,lacerations,-34.3333,135.7500.,2024-01-01,unknown,unknown,coastal,South Australia,injured,eyre penninsula,unprovoked,male,surfing,shark_not_known,shark_not_known,1.0,,leg,unknown,unknown,unknown,64.0
1230,1,2024,,-32.333,152.533,2024-01-01,unknown,unknown,estuary_harbour,New South Wales,injured,elizabeth bay,unprovoked,unknown,unknown,bull shark,Carcharhinus leucas,1.0,,unknown,unknown,unknown,unknown,
1231,2,2024,,-30.30591,115.03825,2024-02-01,unknown,unknown,coastal,Western Australia,injured,jurien bay,unprovoked,unknown,unknown,tiger shark,Galeocerdo cuvier,1.0,,unknown,unknown,unknown,unknown,


In [43]:
df_nan_stats.to_csv(
    "/Users/paniket/TU_Eindhoven/2_Study/Q2_JBI100_Visualisation_4/4_Code/JBI100_Visualisation/data/nan_information_df.csv",
    index=False,
)