In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import plotly.graph_objects as go

import chart_studio.plotly as py
import cufflinks as cf
import seaborn as sns
import plotly.express as px
%matplotlib inline

# Enable Plotly's offline rendering inside the notebook.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather than
# embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)
# Switch cufflinks to offline mode so its plots render locally (no Chart Studio account).
cf.go_offline()

import datapane as dp


In [2]:
import copy

import plotly.io as pio

# Build the notebook-wide default look on top of the built-in "plotly_white" theme.
# A single-name lookup such as pio.templates["plotly_white"] returns the registered
# template object rather than a copy, so customise a deep copy: assigning to the
# original would silently rewrite the built-in theme for every figure that later
# asks for template="plotly_white" by name.
custom_template = copy.deepcopy(pio.templates["plotly_white"])
custom_template.layout.title.font.size = 16
custom_template.layout.title.x = 0.5  # Center titles
custom_template.layout.xaxis.title.font.size = 14
custom_template.layout.yaxis.title.font.size = 14
custom_template.layout.margin = dict(l=40, r=40, t=40, b=40)  # Uniform 40px margins
custom_template.layout.legend.font.size = 12

# Register the customised copy under its own name; "plotly_white" stays pristine.
pio.templates["custom_template"] = custom_template

# Make it the default for every figure created after this cell.
# custom_template already contains plotly_white, so the "plotly_white+" prefix is
# redundant; it is kept so the effective default string is unchanged.
pio.templates.default = "plotly_white+custom_template"

In [3]:
## Figure-wide defaults and example

# def set_plot_defaults(fig):
#     fig.update_layout(
#         font=dict(size=14),  # Default font size
#         title=dict(x=0.5),  # Center titles
#         margin=dict(l=50, r=50, t=50, b=50),  # Margins
#         legend=dict(font=dict(size=12))  # Legend font size
#     )
#     return fig

# fig = px.bar(df, x="incident_outcome", y="count", title="Incident Outcome Distribution")
# fig = set_plot_defaults(fig)
# fig.show()

In [4]:
# ## Notebook-wide defaults:
# import plotly.io as pio

# def set_plotly_defaults():
#     pio.renderers.default = "browser"
#     pio.templates.default = "plotly_white"


# # Import into other notebooks
# from plotly_defaults import set_plotly_defaults
# set_plotly_defaults()

In [5]:
# Load the ITA case-detail CSV into a single DataFrame.
# low_memory=False makes pandas read the file in one pass instead of in chunks,
# so each column's dtype is inferred from the whole column.
df = pd.read_csv(
    filepath_or_buffer="../Work-related Injury and Illness/ITA Case Detail Data 2023 through 8-31-2023.csv",
    low_memory=False,
    sep=",",
)

In [6]:
# Column dtypes and non-null counts; memory_usage="deep" asks pandas to measure
# the real size of object (string) columns instead of estimating it.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889447 entries, 0 to 889446
Data columns (total 33 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        889447 non-null  int64  
 1   establishment_id          889447 non-null  int64  
 2   establishment_name        889447 non-null  object 
 3   ein                       812922 non-null  object 
 4   company_name              840391 non-null  object 
 5   street_address            889446 non-null  object 
 6   city                      889447 non-null  object 
 7   state                     889447 non-null  object 
 8   zip_code                  889447 non-null  float64
 9   naics_code                889447 non-null  int64  
 10  naics_year                888174 non-null  float64
 11  industry_description      829648 non-null  object 
 12  establishment_type        887563 non-null  float64
 13  size                      889447 non-null  i

In [7]:
# Parse incident dates to datetime; errors="coerce" converts unparseable values
# to NaT instead of raising, so check the null count afterwards.
df["date_of_incident"] = pd.to_datetime(df["date_of_incident"], errors="coerce")

In [8]:
# Summary statistics, transposed so each described column becomes a row.
# NOTE(review): the zip_code and annual_average_employees maxima in the output
# look implausible — check for malformed entries before relying on these columns.
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
id,889447.0,451380.236701,680.0,226415.5,451770.0,675989.5,900940.0,259802.354363
establishment_id,889447.0,863122.366861,41940.0,651512.0,989369.0,1140101.0,1246187.0,357576.169569
zip_code,889447.0,25948429.910993,603.0,29601.0,53146.0,84663.0,997000000.0,132134803.602068
naics_code,889447.0,496292.148951,111110.0,444110.0,491110.0,622110.0,999999.0,135696.454961
naics_year,888174.0,2021.012598,2012.0,2022.0,2022.0,2022.0,2022.0,2.873469
establishment_type,887563.0,1.083898,0.0,1.0,1.0,1.0,3.0,0.362637
size,889447.0,9.833503,1.0,3.0,3.0,22.0,22.0,9.040229
annual_average_employees,889447.0,8200.306033,0.0,149.0,347.0,1323.0,172307584.0,1033831.036612
total_hours_worked,889447.0,3528374.320517,0.0,235685.0,584011.0,2125664.0,13833831664.0,24140354.185441
soc_reviewed,889447.0,0.838397,0.0,0.0,1.0,1.0,2.0,0.69882


In [9]:
# fig1 = px.bar(df, x='incident_outcome', title='Distribution of Incident Outcomes', labels={'incident_outcome': 'Incident Outcome'})
# fig1.show()

In [10]:
# List the column names available for the analysis below.
df.columns

Index(['id', 'establishment_id', 'establishment_name', 'ein', 'company_name',
       'street_address', 'city', 'state', 'zip_code', 'naics_code',
       'naics_year', 'industry_description', 'establishment_type', 'size',
       'annual_average_employees', 'total_hours_worked', 'case_number',
       'job_description', 'soc_code', 'soc_description', 'soc_reviewed',
       'soc_probability', 'date_of_incident', 'incident_outcome',
       'dafw_num_away', 'djtr_num_tr', 'type_of_incident', 'time_started_work',
       'time_of_incident', 'time_unknown', 'date_of_death',
       'created_timestamp', 'year_filing_for'],
      dtype='object')

In [11]:
# Distinct codes present in type_of_incident (stored as integers, not labels).
df["type_of_incident"].unique()

array([6, 1, 5, 2, 3, 4])

In [12]:
# Distinct codes present in incident_outcome (stored as integers, not labels).
df["incident_outcome"].unique()

array([4, 3, 2, 1])

In [13]:
# (rows, columns) of the loaded frame.
df.shape

(889447, 33)

In [14]:
# Count cases per incident_outcome code; the result has one row per code with
# columns "incident_outcome" and "count".
df_agg = (
    df.groupby("incident_outcome")
    .size()
    .reset_index(name="count")
)