# JBI100 Visualization

### Academic year 2024-2025

## Incidents and Accidents

Data sources:

- Australian Shark Incidents (https://github.com/cjabradshaw/AustralianSharkIncidentDatabase)
- Railroad Incidents (https://data.transportation.gov/Railroads/Railroad-Equipment-Accident-Incident-Source-Data-F/aqxq-n5hy/about_data)
- Work-related Injury and Illness (https://www.osha.gov/Establishment-Specific-Injury-and-Illness-Data)

Data dictionaries and additional info can be found in the respective data folders.
Note: you only need to select one dataset for your project.


# Using the initial imports as base


In [None]:
# Import libraries
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import os

# Do not truncate tables
# pd.set_option("display.max_columns", None)

# If you receive a 'ModuleNotFoundError', please install the corresponding library.
# This can be done from within the Jupyter environment with the command
# '!python3 -m pip install lib', where lib is the name of the missing library.

In [None]:
%matplotlib inline

In [None]:
import plotly.io as pio

# Make "browser" the default plotly renderer used by the fig.show() calls below.
pio.renderers.default = "browser"

In [None]:
# Alternative datasets (uncomment the one you want to work with instead).
#
# Australian Shark Incidents:
# df_shark = pd.read_excel(
#     "../Australian Shark Incidents/Australian Shark-Incident Database Public Version.xlsx",
#     index_col=0,
# )
#
# Railroad Incidents:
# df_railroad = pd.read_csv('../Railroad Incidents/Railroad_Equipment_Accident_Incident_Source_Data__Form_54__20241026.csv', delimiter=',', low_memory=False)

# Work-related Injury and Illness: this is the dataset loaded as `df`.
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype columns.
df = pd.read_csv(
    filepath_or_buffer="../Work-related Injury and Illness/ITA Case Detail Data 2023 through 8-31-2023.csv",
    sep=",",
    low_memory=False,
)

## Explore the loaded dataset (Work-related Injury and Illness)


In [None]:
# Dimensions of the loaded dataset as (number of rows, number of columns).
df.shape

In [None]:
# Peek at five randomly chosen rows to get a feel for the columns.
df.sample(n=5)

In [None]:
# Number of rows with no incident date recorded.
df["date_of_incident"].isnull().sum()

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

In [None]:
# Column dtypes and non-null counts. memory_usage="deep" inspects object
# columns to report their actual memory consumption instead of an estimate.
df.info(memory_usage="deep")

In [None]:
# Histogram of incident outcomes, with the bars split by color on the
# type of incident.
fig = px.histogram(
    data_frame=df,
    x="incident_outcome",
    color="type_of_incident",
    height=800,
    width=1000,
)
fig.show()

In [None]:
# Parse the incident dates into datetimes; with errors="coerce", values that
# cannot be parsed become NaT instead of raising.
df["date_of_incident"] = pd.to_datetime(df["date_of_incident"], errors="coerce")

In [None]:
# Distribution of incident outcomes.
# Aggregate explicitly so that bar heights are incident counts by
# construction and the figure holds one bar per outcome, instead of passing
# every row to px.bar without a y column.
outcome_counts = df.groupby("incident_outcome").size().reset_index(name="count")
fig1 = px.bar(
    outcome_counts,
    x="incident_outcome",
    y="count",
    title="Distribution of Incident Outcomes",
    labels={"incident_outcome": "Incident Outcome", "count": "Number of Incidents"},
)
fig1.show()

In [None]:
# Number of incidents per day.
# Build one row per date (groupby sorts the dates ascending) so the line has
# a single point per day in chronological order, rather than one point per
# incident row in file order.
# pd.to_datetime is a no-op if the column was already converted, and keeps
# the ordering chronological if it was not.
incident_dates = pd.to_datetime(df["date_of_incident"], errors="coerce")
daily_counts = (
    df.groupby(incident_dates)["case_number"]
    .count()
    .rename_axis("date_of_incident")
    .reset_index(name="incident_count")
)
fig2 = px.line(
    daily_counts,
    x="date_of_incident",
    y="incident_count",
    title="Incidents Over Time",
    # Label keys must be the plotted column names to take effect.
    labels={
        "date_of_incident": "Date of Incident",
        "incident_count": "Number of Incidents",
    },
)
fig2.show()

In [None]:
# For every row, store how many cases share that row's total_hours_worked
# value (non-null case numbers only).
df["incident_count"] = (
    df.groupby("total_hours_worked")["case_number"].transform("count")
)

# Scatter of hours worked against that count. One marker is drawn per row of
# df, so rows in the same group land on the same point.
fig3 = px.scatter(
    data_frame=df,
    x="total_hours_worked",
    y="incident_count",
    labels={
        "incident_count": "Number of Incidents",
        "total_hours_worked": "Total Hours Worked",
    },
    title="Total Hours Worked vs Number of Incidents",
)
fig3.show()

In [None]:
# Number of incidents per job role (SOC description).
# Aggregate explicitly so that bar heights are incident counts by
# construction and the figure holds one bar per role; the roles are ordered
# from most to fewest incidents for readability.
role_counts = (
    df.groupby("soc_description")
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)
fig4 = px.bar(
    role_counts,
    x="soc_description",
    y="count",
    title="Distribution of Job Roles Involved in Incidents",
    labels={"soc_description": "Job Role", "count": "Count of Incidents"},
)
fig4.show()

In [None]:
# Box plot of the soc_probability values, one box per job role.
fig5 = px.box(
    data_frame=df,
    x="soc_description",
    y="soc_probability",
    labels={"soc_probability": "SOC Probability", "soc_description": "Job Role"},
    title="Probability of SOC Review by Job Role",
)
fig5.show()

In [None]:
# Per-row count of cases recorded for the row's establishment. The column is
# kept on df so that later cells can keep using it.
df["incident_count_est"] = df.groupby("establishment_name")["case_number"].transform(
    "count"
)

# Plot one row per establishment. Passing df itself would repeat each
# establishment's count on every one of its rows, and because bars at the
# same x position stack, the bar height would be count * count.
establishment_counts = (
    df[["establishment_name", "incident_count_est"]]
    .dropna(subset=["establishment_name"])
    .drop_duplicates()
)
fig6 = px.bar(
    establishment_counts,
    x="establishment_name",
    y="incident_count_est",
    title="Number of Incidents per Establishment",
    labels={
        "establishment_name": "Establishment Name",
        "incident_count_est": "Number of Incidents",
    },
)
fig6.show()

In [None]:
# Share of incidents per industry.
# px.pie sums the `values` column within each name, so the sectors must be
# sized by a per-industry row count; case_number is an identifier, and
# summing it does not yield a number of incidents.
industry_counts = (
    df.groupby("industry_description").size().reset_index(name="incident_count")
)
fig7 = px.pie(
    industry_counts,
    values="incident_count",
    names="industry_description",
    title="Incident Counts by Industry",
    labels={
        "industry_description": "Industry",
        "incident_count": "Number of Incidents",
    },
)
fig7.show()

In [None]:
# Histogram of the recorded time of each incident.
fig8 = px.histogram(
    data_frame=df,
    x="time_of_incident",
    labels={"count": "Number of Incidents", "time_of_incident": "Time of Incident"},
    title="Time of Incident Analysis",
)
fig8.show()

In [None]:
# For every row, store how many cases share that row's
# annual_average_employees value (non-null case numbers only).
df["incident_count_role"] = (
    df.groupby("annual_average_employees")["case_number"].transform("count")
)

# Scatter of employee count against that incident count; marker size
# follows total_hours_worked. One marker is drawn per row of df.
fig9 = px.scatter(
    data_frame=df,
    x="annual_average_employees",
    y="incident_count_role",
    size="total_hours_worked",
    labels={
        "incident_count_role": "Number of Incidents",
        "annual_average_employees": "Annual Avg Employees",
    },
    title="Annual Average Employees vs Incident Count",
)
fig9.show()

In [None]:
# Incident outcome breakdown per job role.
# Count the rows for every (job role, outcome) pair and stack those counts.
# Using case_number as y would stack identifier values, not numbers of
# incidents.
role_outcome_counts = (
    df.groupby(["soc_description", "incident_outcome"])
    .size()
    .reset_index(name="incident_count")
)
fig10 = px.bar(
    role_outcome_counts,
    x="soc_description",
    y="incident_count",
    color="incident_outcome",
    barmode="stack",
    title="Incident Outcome Breakdown by Job Role",
    labels={
        "soc_description": "Job Role",
        "incident_count": "Number of Incidents",
        "incident_outcome": "Outcome",
    },
)
fig10.show()

# End here


In [None]:
# fig = px.scatter(df_work, x="Incident.year", y="Victim.age", width=1000, height=800)
# fig.show()

## Explore Railroad Incident data


In [None]:
# df_railroad.sample(5)

In [None]:
# df_railroad.describe()

In [None]:
# # Years are missing the centuries, so add them
# df_railroad['corrected_year'] = np.where(df_railroad['YEAR'] > 24.0, 1900+df_railroad['YEAR'], 2000+df_railroad['YEAR'])

# fig = px.histogram(df_railroad, x="corrected_year",
#                  width=1000, height=800)
# fig.show()

## Explore Work-related Injury and Illness data


In [None]:
# df_work.sample(5)

In [None]:
# df_work.describe()

In [None]:
# fig = px.histogram(df_work, x="incident_outcome",
#                  color="type_of_incident",
#                  width=1000, height=800)
# fig.show()