Import Required Libraries

In [59]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize,stem
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
import lightgbm as lgb
import nltk
from nltk.util import ngrams
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
import shap
shap.initjs()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dinus\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Load the Data Set

In [60]:
# The CSV was uploaded to Google Drive and is read straight from its download URL,
# so running this cell requires network access.
df = pd.read_csv("https://drive.google.com/uc?id=1beItLRR7JOvyvs7AtvPvjH7GD9nVA1s2")
# Plain-text preview of the first five rows.
print(df.head())

   Unnamed: 0                 Data   Countries     Local Industry Sector  \
0           0  2016-01-01 00:00:00  Country_01  Local_01          Mining   
1           1  2016-01-02 00:00:00  Country_02  Local_02          Mining   
2           2  2016-01-06 00:00:00  Country_01  Local_03          Mining   
3           3  2016-01-08 00:00:00  Country_01  Local_04          Mining   
4           4  2016-01-10 00:00:00  Country_01  Local_04          Mining   

  Accident Level Potential Accident Level Genre Employee or Third Party  \
0              I                       IV  Male             Third Party   
1              I                       IV  Male                Employee   
2              I                      III  Male    Third Party (Remote)   
3              I                        I  Male             Third Party   
4             IV                       IV  Male             Third Party   

         Critical Risk                                        Description  
0              P

Drop Unnecessary Columns And Rename Columns

In [61]:
# Remove the leftover "Unnamed: 0" column and give four columns clearer names.
column_renames = {
    'Data': 'Date',
    'Countries': 'Country',
    'Genre': 'Gender',
    'Employee or Third Party': 'Employee type',
}
df = df.drop(columns="Unnamed: 0").rename(columns=column_renames)
df.head()

Unnamed: 0,Date,Country,Local,Industry Sector,Accident Level,Potential Accident Level,Gender,Employee type,Critical Risk,Description
0,2016-01-01 00:00:00,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...
1,2016-01-02 00:00:00,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...
2,2016-01-06 00:00:00,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...
3,2016-01-08 00:00:00,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...
4,2016-01-10 00:00:00,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...


Data Pre-Processing

In [62]:
# Parse the raw timestamps once, then derive the calendar features with the
# vectorised ``.dt`` accessor rather than a Python-level lambda per row.
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Weekday'] = df['Date'].dt.day_name()
# ISO week number: the first days of January can belong to week 52/53 of the
# previous ISO year (2016-01-01 is week 53).  Cast to a plain int64 column so
# downstream code does not receive pandas' nullable UInt32 dtype.
df['WeekofYear'] = df['Date'].dt.isocalendar().week.astype('int64')
df.head()

Unnamed: 0,Date,Country,Local,Industry Sector,Accident Level,Potential Accident Level,Gender,Employee type,Critical Risk,Description,Year,Month,Day,Weekday,WeekofYear
0,2016-01-01,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...,2016,1,1,Friday,53
1,2016-01-02,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...,2016,1,2,Saturday,53
2,2016-01-06,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...,2016,1,6,Wednesday,1
3,2016-01-08,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...,2016,1,8,Friday,1
4,2016-01-10,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...,2016,1,10,Sunday,1


In [63]:
def month2seasons(x):
    """Map a calendar month number to a season name.

    The mapping treats December-February as summer and June-August as winter,
    i.e. it follows the Southern Hemisphere calendar.

    Parameters
    ----------
    x : int
        Month number, 1 (January) to 12 (December).

    Returns
    -------
    str
        'Spring' (months 9-11), 'Summer' (12, 1, 2), 'Autumn' (3-5) or
        'Winter' (6-8).

    Raises
    ------
    ValueError
        If ``x`` is not a month number between 1 and 12 (for example NaN).
    """
    season_by_month = {
        9: 'Spring', 10: 'Spring', 11: 'Spring',
        12: 'Summer', 1: 'Summer', 2: 'Summer',
        3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
        6: 'Winter', 7: 'Winter', 8: 'Winter',
    }
    try:
        return season_by_month[x]
    except (KeyError, TypeError):
        # Fail with a clear message instead of an UnboundLocalError on bad input.
        raise ValueError(f"month must be an integer from 1 to 12, got {x!r}") from None

In [64]:
# Tag every record with the season that its month falls in.
df['Season'] = df['Month'].map(month2seasons)
df.head()

Unnamed: 0,Date,Country,Local,Industry Sector,Accident Level,Potential Accident Level,Gender,Employee type,Critical Risk,Description,Year,Month,Day,Weekday,WeekofYear,Season
0,2016-01-01,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...,2016,1,1,Friday,53,Summer
1,2016-01-02,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...,2016,1,2,Saturday,53,Summer
2,2016-01-06,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...,2016,1,6,Wednesday,1,Summer
3,2016-01-08,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...,2016,1,8,Friday,1,Summer
4,2016-01-10,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...,2016,1,10,Sunday,1,Summer


NLP Pre-processing

In [65]:
# Add ten extra short tokens to the STOPWORDS set imported from wordcloud.
# The set is mutated in place, so every later `in STOPWORDS` check sees them too.
# NOTE(review): "wa" and "da" look like lemmatizer/stemmer residue rather than real words — confirm.
STOPWORDS.update(["cm", "kg", "mr", "wa" ,"nv", "ore", "da", "pm", "am", "cx"])
# Prints the complete stop-word set.
print(STOPWORDS)

{'had', "here's", "weren't", 'she', 'above', 'can', 'otherwise', 'on', 'yourselves', "i'll", 'you', 'hence', 'these', 'off', 'which', 'been', "he's", 'all', 'myself', "we're", 'this', 'some', 'itself', 'himself', 'while', 'me', "can't", 'k', 'be', 'ours', "who's", 'cannot', 'there', 'very', 'else', 'those', 'to', 'through', 'get', 'their', 'com', 'further', 'we', 'kg', 'by', "it's", 'nor', 'www', 'he', 'most', 'how', 'both', 'also', "aren't", 'being', 'few', 'each', 'is', 'as', 'could', 'from', 'just', 'then', "wouldn't", 'over', "don't", 'too', "he'll", 'so', 'or', 'not', "mustn't", 'ore', "you'll", 'da', "i'd", 'do', 'our', "let's", "that's", "i'm", 'them', 'am', "how's", 'r', 'themselves', 'a', 'before', 'down', 'between', "he'd", "what's", "didn't", 'and', 'during', "you've", 'after', 'his', 'same', 'who', "couldn't", 'the', 'are', 'other', 'that', 'yours', 'since', "there's", 'only', 'for', 'until', "they'll", "hasn't", "why's", "i've", 'more', 'shall', 'him', 'yourself', 'any', '

In [66]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import STOPWORDS

# Download necessary NLTK resources
nltk.download('wordnet')
nltk.download('punkt')

# Built once and shared by every call instead of being re-created for each row.
_LEMMATIZER = WordNetLemmatizer()
_STEMMER = PorterStemmer()


def nlp_preprocesser(row):
    """Normalise the free-text ``Description`` of one record.

    The text is lower-cased, tokenised with ``word_tokenize``, lemmatised,
    stemmed and finally reduced to purely alphabetic tokens.  Members of
    ``STOPWORDS`` are filtered out at every stage, so a token that only becomes
    a stop word after lemmatising or stemming is still dropped.

    Parameters
    ----------
    row : object
        Anything exposing a ``Description`` attribute, e.g. a DataFrame row
        passed by ``df.apply(..., axis=1)``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when
        ``Description`` is not a string (for example a missing value).
    """
    sentence = row.Description
    if not isinstance(sentence, str):
        # Missing / non-text descriptions have nothing to tokenise.
        return ""

    tokens = word_tokenize(sentence.lower())
    lemmas = [_LEMMATIZER.lemmatize(tok) for tok in tokens if tok not in STOPWORDS]
    stems = [_STEMMER.stem(lemma) for lemma in lemmas if lemma not in STOPWORDS]

    # Keep alphabetic tokens only; this discards punctuation such as '(' or '.'.
    return " ".join(stem for stem in stems if stem.isalpha() and stem not in STOPWORDS)

print("Function updated successfully!")

Function updated successfully!


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dinus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dinus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
# Run nlp_preprocesser on every row (axis=1) and keep the cleaned text in a new column;
# the original 'Description' column is left untouched.
df['Description_processed'] = df.apply(nlp_preprocesser, axis=1)
df.head()

Unnamed: 0,Date,Country,Local,Industry Sector,Accident Level,Potential Accident Level,Gender,Employee type,Critical Risk,Description,Year,Month,Day,Weekday,WeekofYear,Season,Description_processed
0,2016-01-01,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...,2016,1,1,Friday,53,Summer,remov drill rod jumbo mainten supervisor proce...
1,2016-01-02,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...,2016,1,2,Saturday,53,Summer,activ sodium sulphid pump pipe uncoupl sulfid ...
2,2016-01-06,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...,2016,1,6,Wednesday,1,Summer,milpo locat level collabor excav work pick han...
3,2016-01-08,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...,2016,1,8,Friday,1,Summer,approxim personnel begin task unlock soquet bo...
4,2016-01-10,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...,2016,1,10,Sunday,1,Summer,approxim circumst mechan anthoni group leader ...


Sentiment Analysis

In [68]:
# Lazily created analyzer shared by all calls, so it is built once rather than
# once per scored text.
_SENTIMENT_ANALYZER = None


def sentiment2score(text):
    """Return the VADER compound sentiment score of ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float
        The ``"compound"`` entry of ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return float(_SENTIMENT_ANALYZER.polarity_scores(text)["compound"])

In [69]:
# Score the raw (unprocessed) description text of every record.
df['Description_sentiment_score'] = df['Description'].apply(sentiment2score)
df.head()

Unnamed: 0,Date,Country,Local,Industry Sector,Accident Level,Potential Accident Level,Gender,Employee type,Critical Risk,Description,Year,Month,Day,Weekday,WeekofYear,Season,Description_processed,Description_sentiment_score
0,2016-01-01,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...,2016,1,1,Friday,53,Summer,remov drill rod jumbo mainten supervisor proce...,0.7845
1,2016-01-02,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...,2016,1,2,Saturday,53,Summer,activ sodium sulphid pump pipe uncoupl sulfid ...,0.2732
2,2016-01-06,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...,2016,1,6,Wednesday,1,Summer,milpo locat level collabor excav work pick han...,0.0
3,2016-01-08,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...,2016,1,8,Friday,1,Summer,approxim personnel begin task unlock soquet bo...,0.0772
4,2016-01-10,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...,2016,1,10,Sunday,1,Summer,approxim circumst mechan anthoni group leader ...,-0.4215


Exploratory Data Analysis (EDA)

Univariate Analysis

In [70]:
# Share of records per country, rounded to whole percentages.
country_cnt = np.round(df['Country'].value_counts(normalize=True) * 100)

country_bars = hv.Bars(country_cnt).opts(
    title="Country Count", color="lightgreen", xlabel="Countries",
    ylabel="Percentage", yformatter='%d%%',
).opts(opts.Bars(width=500, height=300, tools=['hover'], show_grid=True))

# Overlay each country's percentage as a text label placed at y=15.
country_chart = country_bars
for country_name in ('Country_01', 'Country_02', 'Country_03'):
    country_label = f"{int(country_cnt.loc[country_name])}%"
    country_chart = country_chart * hv.Text(country_name, 15, country_label)
country_chart

In [71]:
# Share of records per 'Local' value, rounded to whole percentages.
local_cnt = np.round(df['Local'].value_counts(normalize=True) * 100)

local_bars = hv.Bars(local_cnt).opts(
    title="Local Count", color="lightblue", xlabel="Locals",
    ylabel="Percentage", yformatter='%d%%',
)
local_bars.opts(opts.Bars(width=700, height=300, tools=['hover'], show_grid=True))

Industry Sector

In [72]:
# Share of records per industry sector, rounded to whole percentages.
sector_cnt = np.round(df['Industry Sector'].value_counts(normalize=True) * 100)

sector_bars = hv.Bars(sector_cnt).opts(
    title="Industry Sector Count", color="lightpink", xlabel="Sectors",
    ylabel="Percentage", yformatter='%d%%',
).opts(opts.Bars(width=500, height=300, tools=['hover'], show_grid=True))

# Overlay each sector's percentage as a text label placed at y=15.
sector_chart = sector_bars
for sector_name in ('Mining', 'Metals', 'Others'):
    sector_label = f"{int(sector_cnt.loc[sector_name])}%"
    sector_chart = sector_chart * hv.Text(sector_name, 15, sector_label)
sector_chart

Accident Levels

In [73]:
# Percentage share (one decimal) of each severity level, actual vs. potential.
ac_level_cnt = (df['Accident Level'].value_counts(normalize=True) * 100).round(1)
pot_ac_level_cnt = (df['Potential Accident Level'].value_counts(normalize=True) * 100).round(1)

# Side-by-side table; a level missing from one of the two series gets 0.
ac_pot = pd.concat([ac_level_cnt, pot_ac_level_cnt], axis=1, sort=False).fillna(0)
ac_pot.columns = ['Accident', 'Potential']

# Long format: one row per (severity, kind of level) pair.
ac_pot = (
    ac_pot.reset_index()
    .melt(id_vars=['index'], value_vars=['Accident', 'Potential'])
    .rename(columns={'index': 'Severity', 'variable': 'Levels', 'value': 'Percentage'})
)

# Ordered categorical so the bars follow severity order.  The sort is
# lexicographic, which coincides with Roman-numeral order for I..VI.
severity_order = sorted(ac_pot['Severity'].unique())
ac_pot['Severity'] = pd.Categorical(ac_pot['Severity'], categories=severity_order, ordered=True)

# One colour per entry of 'Levels'.
color_palette = ['red', 'blue']

accident_bar_opts = opts.Bars(
    title="Accident Levels Count",
    width=700,
    height=300,
    tools=['hover'],
    show_grid=True,
    xrotation=45,
    ylabel="Percentage",
    yformatter='%d%%',
    color=hv.dim('Levels'),
    cmap=color_palette,
)
bar_plot = hv.Bars(ac_pot, kdims=['Severity', 'Levels'], vdims='Percentage').opts(accident_bar_opts)

bar_plot

  dataset.data.groupby(group_by, sort=False)]


Gender 

In [74]:
# Share of records per gender, rounded to whole percentages.
gender_cnt = np.round(df['Gender'].value_counts(normalize=True) * 100)

gender_bars = hv.Bars(gender_cnt).opts(
    title="Gender Count", color="lightyellow", xlabel="Gender",
    ylabel="Percentage", yformatter='%d%%',
)
gender_bars.opts(opts.Bars(width=500, height=300, tools=['hover'], show_grid=True))


Employee Type

In [75]:
# Share of records per employee type, rounded to whole percentages.
emp_type_cnt = np.round(df['Employee type'].value_counts(normalize=True) * 100)

emp_type_bars = hv.Bars(emp_type_cnt).opts(
    title="Employee type Count", color="cyan", xlabel="Employee Type",
    ylabel="Percentage", yformatter='%d%%',
)
emp_type_bars.opts(opts.Bars(width=500, height=300, tools=['hover'], show_grid=True))

Critical Risks

In [76]:
# Share of records per critical risk, rounded to whole percentages.
cr_risk_cnt = np.round(df['Critical Risk'].value_counts(normalize=True) * 100)
# The counts are reversed ([::-1]) before plotting and the chart is drawn with
# invert_axes=True.
# NOTE(review): the percent format is passed as `xformatter` while the labels are
# xlabel="Critical Risks" / ylabel="Percentage"; with invert_axes=True it is unclear
# from here which displayed axis each option ends up on — confirm against the rendered chart.
hv.Bars(cr_risk_cnt[::-1]).opts(title="Critical Risk Count", color="green", xlabel="Critical Risks", ylabel="Percentage", xformatter='%d%%')\
                .opts(opts.Bars(width=600, height=600,tools=['hover'],show_grid=True,invert_axes=True))

Calendar

In [77]:
# Rounded percentage share of records per year / month / day-of-month,
# kept in the order value_counts returns them (sort=False).
year_cnt, month_cnt, day_cnt = [
    np.round(df[unit].value_counts(normalize=True, sort=False) * 100)
    for unit in ('Year', 'Month', 'Day')
]

y = hv.Bars(year_cnt).opts(title="Year Count", color="green", xlabel="Years")
m = (
    hv.Bars(month_cnt).opts(title="Month Count", color="skyblue", xlabel="Months")
    * hv.Curve(month_cnt).opts(color='red', line_width=3)
)
d = (
    hv.Bars(day_cnt).opts(title="Day Count", color="skyblue", xlabel="Days")
    * hv.Curve(day_cnt).opts(color='red', line_width=3)
)

# Weekday shares, put into Monday..Sunday order through a helper 'week_num' column.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_cnt = np.round(
    df['Weekday'].value_counts(normalize=True, sort=False) * 100
).to_frame('Percentage')
weekday_cnt['week_num'] = [weekday_names.index(day) for day in weekday_cnt.index]
weekday_cnt = weekday_cnt.sort_values('week_num')

w = (
    hv.Bars((weekday_cnt.index, weekday_cnt['Percentage'])).opts(
        title="Weekday Count", color="green", xlabel="Weekdays"
    )
    * hv.Curve(weekday_cnt['Percentage']).opts(color='red', line_width=3)
)

# Arrange the four charts in a two-column grid with shared bar styling.
calendar_bar_opts = opts.Bars(
    width=400, height=300, tools=['hover'], show_grid=True,
    ylabel="Percentage", yformatter='%d%%',
)
(y + m + d + w).opts(calendar_bar_opts).cols(2)


Season

In [78]:
# Rounded percentage share of records per season, as a two-column frame.
season_cnt = np.round(df['Season'].value_counts(normalize=True, sort=False) * 100).reset_index()
season_cnt.columns = ['Season', 'Percentage']

# Order the rows Spring -> Summer -> Autumn -> Winter via a throw-away helper
# column, then index the frame by season name.
season_names = ['Spring', 'Summer', 'Autumn', 'Winter']
season_cnt = (
    season_cnt
    .assign(season_order=season_cnt['Season'].map(season_names.index))
    .sort_values('season_order')
    .set_index('Season')
    .drop(columns='season_order')
)

season_bars = hv.Bars(season_cnt).opts(
    title="Season Count", color="pink", xlabel="Season",
    ylabel="Percentage", yformatter='%d%%',
)
season_bars.opts(opts.Bars(width=500, height=300, tools=['hover'], show_grid=True))

Multivariate Analysis

In [79]:
def f(x):
    """Express every entry of ``x`` as a rounded percentage of the total of ``x``."""
    return np.round(x / x.sum() * 100)


# Records per (country, sector) pair, pivoted to one row per country and turned
# into row-wise percentages.
sector_counts = df.groupby(['Country', 'Industry Sector'])['Industry Sector'].count()
con_sector = sector_counts.unstack().apply(f, axis=1)

# Long format for plotting: one row per (country, sector) with its percentage in 'value'.
con_sector_long = con_sector.reset_index().melt(id_vars=['Country'])

con_sector_opts = opts.Bars(
    title="Industry Sector by Countries Count",
    width=800,
    height=300,
    tools=['hover'],
    show_grid=True,
    xrotation=0,
    ylabel="Percentage",
    yformatter='%d%%',
)
hv.Bars(con_sector_long, ['Country', 'Industry Sector'], 'value').opts(con_sector_opts)

Employee Type By Gender

In [80]:
def f(x):
    """Express every entry of ``x`` as a rounded percentage of the total of ``x``."""
    return np.round(x / x.sum() * 100)


# Records per (gender, employee type) pair, pivoted to one row per gender and
# turned into row-wise percentages.
emp_type_counts = df.groupby(['Gender', 'Employee type'])['Employee type'].count()
em_gen = emp_type_counts.unstack().apply(f, axis=1)

# Long format for plotting: one row per (gender, employee type) with its percentage in 'value'.
em_gen_long = em_gen.reset_index().melt(id_vars=['Gender'])

em_gen_opts = opts.Bars(
    title="Employee type by Gender Count",
    width=800,
    height=300,
    tools=['hover'],
    show_grid=True,
    xrotation=0,
    ylabel="Percentage",
    yformatter='%d%%',
)
hv.Bars(em_gen_long, ['Gender', 'Employee type'], 'value').opts(em_gen_opts)

Industry Sector by Gender

In [81]:
def f(x):
    """Express every entry of ``x`` as a rounded percentage of the total of ``x``."""
    return np.round(x / x.sum() * 100)


# Records per (gender, sector) pair, pivoted to one row per gender and turned
# into row-wise percentages.  (The result reuses the name `em_gen`.)
sector_gender_counts = df.groupby(['Gender', 'Industry Sector'])['Industry Sector'].count()
em_gen = sector_gender_counts.unstack().apply(f, axis=1)

# Long format for plotting: one row per (gender, sector) with its percentage in 'value'.
sector_gender_long = em_gen.reset_index().melt(id_vars=['Gender'])

sector_gender_opts = opts.Bars(
    title="Industry Sector by Gender Count",
    width=800,
    height=300,
    tools=['hover'],
    show_grid=True,
    xrotation=0,
    ylabel="Percentage",
    yformatter='%d%%',
)
hv.Bars(sector_gender_long, ['Gender', 'Industry Sector'], 'value').opts(sector_gender_opts)

Accident Level By Gender

In [82]:
def f(x):
    """Express every entry of ``x`` as a rounded percentage of the total of ``x``."""
    return np.round(x / x.sum() * 100)


# Row-wise percentage of each accident level within every gender.
ac_gen = (
    df.groupby(['Gender', 'Accident Level'])['Accident Level']
    .count()
    .unstack()
    .apply(f, axis=1)
)
ac_gen_long = ac_gen.reset_index().melt(id_vars=['Gender'])
ac = hv.Bars(ac_gen_long, ['Gender', 'Accident Level'], 'value').opts(
    opts.Bars(title="Accident Level by Gender Count")
)

# Same computation for the potential accident level.
pot_ac_gen = (
    df.groupby(['Gender', 'Potential Accident Level'])['Potential Accident Level']
    .count()
    .unstack()
    .apply(f, axis=1)
)
pot_ac_gen_long = pot_ac_gen.reset_index().melt(id_vars=['Gender'])
pot_ac = hv.Bars(pot_ac_gen_long, ['Gender', 'Potential Accident Level'], 'value').opts(
    opts.Bars(title="Potential Accident Level by Gender Count")
)

# Show both charts side by side with shared styling.
gender_level_opts = opts.Bars(
    width=400,
    height=300,
    tools=['hover'],
    show_grid=True,
    xrotation=0,
    ylabel="Percentage",
    yformatter='%d%%',
)
(ac + pot_ac).opts(gender_level_opts)

Accident Level by Employee Types

In [83]:
def f(x):
    """Express every entry of ``x`` as a rounded percentage of the total of ``x``."""
    return np.round(x / x.sum() * 100)


# Row-wise percentage of each accident level within every employee type.
ac_em = (
    df.groupby(['Employee type', 'Accident Level'])['Accident Level']
    .count()
    .unstack()
    .apply(f, axis=1)
)
ac_em_long = ac_em.reset_index().melt(id_vars=['Employee type'])
ac = hv.Bars(ac_em_long, ['Employee type', 'Accident Level'], 'value').opts(
    opts.Bars(title="Accident Level by Employee type Count")
)

# Same computation for the potential accident level.
pot_ac_em = (
    df.groupby(['Employee type', 'Potential Accident Level'])['Potential Accident Level']
    .count()
    .unstack()
    .apply(f, axis=1)
)
pot_ac_em_long = pot_ac_em.reset_index().melt(id_vars=['Employee type'])
pot_ac = hv.Bars(pot_ac_em_long, ['Employee type', 'Potential Accident Level'], 'value').opts(
    opts.Bars(title="Potential Accident Level by Employee type Count")
)

# Show both charts side by side; the title font is reduced to size 9.
employee_level_opts = opts.Bars(
    width=400,
    height=300,
    tools=['hover'],
    show_grid=True,
    xrotation=0,
    ylabel="Percentage",
    yformatter='%d%%',
    fontsize={'title': 9},
)
(ac + pot_ac).opts(employee_level_opts)


Accident Level By Month

In [84]:
def f(x):
    """Express every entry of ``x`` as a rounded percentage of the total of ``x``."""
    return np.round(x / x.sum() * 100)


# Row-wise percentage of each accident level within every month; level/month
# combinations that never occur become 0.
ac_mo = (
    df.groupby(['Month', 'Accident Level'])['Accident Level']
    .count()
    .unstack()
    .apply(f, axis=1)
    .fillna(0)
)

# One curve per accident level, overlaid in severity order.
ac = hv.Curve(ac_mo['I'], label='I')
for level in ('II', 'III', 'IV', 'V'):
    ac = ac * hv.Curve(ac_mo[level], label=level)
ac = ac.opts(opts.Curve(title="Accident Level by Month Count"))

# Same computation for the potential accident level, which also has a level VI.
pot_ac_mo = (
    df.groupby(['Month', 'Potential Accident Level'])['Potential Accident Level']
    .count()
    .unstack()
    .apply(f, axis=1)
    .fillna(0)
)

pot_ac = hv.Curve(pot_ac_mo['I'], label='I')
for level in ('II', 'III', 'IV', 'V', 'VI'):
    pot_ac = pot_ac * hv.Curve(pot_ac_mo[level], label=level)
pot_ac = pot_ac.opts(opts.Curve(title="Potential Accident Level by Month Count"))

# Stack the two line charts in a single column with shared styling.
month_curve_opts = opts.Curve(
    width=800,
    height=300,
    tools=['hover'],
    show_grid=True,
    ylabel="Percentage",
    yformatter='%d%%',
)
(ac + pot_ac).opts(month_curve_opts).cols(1)

Accident Level By Weekday

In [85]:
# Percentage helper: each entry of x as a rounded share (0-100) of the total of x
f = lambda x: np.round(x / x.sum() * 100)

# Count records per ('Weekday', 'Accident Level') pair, unstack so that levels become
# columns, convert every row (axis=1) to percentages -- i.e. the level mix within each
# weekday -- and fill combinations that never occur with 0
ac_weekday = df.groupby(['Weekday', 'Accident Level'])['Accident Level'].count().unstack().apply(f, axis=1).fillna(0)

# Add a temporary column holding each weekday's Monday-first position, used only for sorting
ac_weekday['week_num'] = [['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(i) for i in ac_weekday.index]
ac_weekday.sort_values('week_num', inplace=True)  # Sort by weekday order
ac_weekday.drop('week_num', axis=1, inplace=True)  # Remove the temporary column

# Create a line chart for accident levels by weekday
ac = (
    # Plot each accident level as a separate curve
    hv.Curve(ac_weekday['I'], label='I') * 
    hv.Curve(ac_weekday['II'], label='II') * 
    hv.Curve(ac_weekday['III'], label='III') * 
    hv.Curve(ac_weekday['IV'], label='IV') * 
    hv.Curve(ac_weekday['V'], label='V')
).opts(
    # Set the title of the plot
    opts.Curve(title="Accident Level by Weekday Count")
)

# Same grouping for 'Potential Accident Level', but here the percentages are taken per
# column (axis=0) -- i.e. how each level's records are spread across the weekdays.
# NOTE(review): this differs from the axis=1 normalisation used for ac_weekday above, so
# the two panels of this figure show different kinds of percentage under the same
# "Percentage" axis label -- confirm which normalisation is intended.
pot_ac_weekday = df.groupby(['Weekday', 'Potential Accident Level'])['Potential Accident Level'].count().unstack().apply(f, axis=0).fillna(0)

# Add a temporary column holding each weekday's Monday-first position, used only for sorting
pot_ac_weekday['week_num'] = [['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(i) for i in pot_ac_weekday.index]
pot_ac_weekday.sort_values('week_num', inplace=True)  # Sort by weekday order
pot_ac_weekday.drop('week_num', axis=1, inplace=True)  # Remove the temporary column

# Create a line chart for potential accident levels by weekday
pot_ac = (
    # Plot each potential accident level as a separate curve
    hv.Curve(pot_ac_weekday['I'], label='I') * 
    hv.Curve(pot_ac_weekday['II'], label='II') * 
    hv.Curve(pot_ac_weekday['III'], label='III') * 
    hv.Curve(pot_ac_weekday['IV'], label='IV') * 
    hv.Curve(pot_ac_weekday['V'], label='V') * 
    hv.Curve(pot_ac_weekday['VI'], label='VI')
).opts(
    # Set the title of the plot
    opts.Curve(title="Potential Accident Level by Weekday Count")
)

# Combine the two plots and customize their appearance
(ac + pot_ac).opts(
    # Customize the plot's appearance
    opts.Curve(
        width=800,  # Set the width of the plot
        height=300,  # Set the height of the plot
        tools=['hover'],  # Enable hover tool for interactive data display
        show_grid=True,  # Display grid lines
        ylabel="Percentage",  # Set y-axis label
        yformatter='%d%%'  # Format y-axis values as percentages
    )
).cols(1)  # Display the plots in a single column


Accident Level By Season

In [86]:
# Percentage helper: each entry of x as a rounded share (0-100) of the total of x
f = lambda x: np.round(x / x.sum() * 100)

# Count records per ('Season', 'Accident Level') pair, unstack so that levels become
# columns, convert every row (axis=1) to percentages -- i.e. the level mix within each
# season -- and fill combinations that never occur with 0
ac_season = df.groupby(['Season', 'Accident Level'])['Accident Level'].count().unstack().apply(f, axis=1).fillna(0)

# Add a temporary column holding each season's position in Spring..Winter order, used only for sorting
ac_season['season_num'] = [['Spring', 'Summer', 'Autumn', 'Winter'].index(i) for i in ac_season.index]
ac_season.sort_values('season_num', inplace=True)  # Sort by season order
ac_season.drop('season_num', axis=1, inplace=True)  # Remove the temporary column

# Create a line chart for accident levels by season
ac = (
    # Plot each accident level as a separate curve
    hv.Curve(ac_season['I'], label='I') * 
    hv.Curve(ac_season['II'], label='II') * 
    hv.Curve(ac_season['III'], label='III') * 
    hv.Curve(ac_season['IV'], label='IV') * 
    hv.Curve(ac_season['V'], label='V')
).opts(
    # Set the title of the plot
    opts.Curve(title="Accident Level by Season Count")
)

# Same grouping for 'Potential Accident Level', but here the percentages are taken per
# column (axis=0) -- i.e. how each level's records are spread across the seasons.
# NOTE(review): this differs from the axis=1 normalisation used for ac_season above, so
# the two panels of this figure show different kinds of percentage under the same
# "Percentage" axis label -- confirm which normalisation is intended.
pot_ac_season = df.groupby(['Season', 'Potential Accident Level'])['Potential Accident Level'].count().unstack().apply(f, axis=0).fillna(0)

# Add a temporary column holding each season's position in Spring..Winter order, used only for sorting
pot_ac_season['season_num'] = [['Spring', 'Summer', 'Autumn', 'Winter'].index(i) for i in pot_ac_season.index]
pot_ac_season.sort_values('season_num', inplace=True)  # Sort by season order
pot_ac_season.drop('season_num', axis=1, inplace=True)  # Remove the temporary column

# Create a line chart for potential accident levels by season
pot_ac = (
    # Plot each potential accident level as a separate curve
    hv.Curve(pot_ac_season['I'], label='I') * 
    hv.Curve(pot_ac_season['II'], label='II') * 
    hv.Curve(pot_ac_season['III'], label='III') * 
    hv.Curve(pot_ac_season['IV'], label='IV') * 
    hv.Curve(pot_ac_season['V'], label='V') * 
    hv.Curve(pot_ac_season['VI'], label='VI')
).opts(
    # Set the title of the plot
    opts.Curve(title="Potential Accident Level by Season Count")
)

# Combine the two plots and customize their appearance
(ac + pot_ac).opts(
    # Customize the plot's appearance
    opts.Curve(
        width=800,  # Set the width of the plot
        height=300,  # Set the height of the plot
        tools=['hover'],  # Enable hover tool for interactive data display
        show_grid=True,  # Display grid lines
        ylabel="Percentage",  # Set y-axis label
        yformatter='%d%%'  # Format y-axis values as percentages
    )
).cols(1)  # Display the plots in a single column

Analysis Using Accident Description

In [87]:
def ngram_func(ngram, trg='', trg_value=''):
    """Return the 30 most frequent n-grams found in ``df['Description_processed']``.

    Reads the notebook-level DataFrame ``df``. The selected descriptions are
    joined into one token stream before counting, so an n-gram may span the
    boundary between two adjacent descriptions.

    Parameters
    ----------
    ngram : int
        Size of the n-grams (1 = unigram, 2 = bigram, ...).
    trg : str, optional
        Column of ``df`` used to filter rows. With the default ``''`` every
        row is used.
    trg_value : list-like or str, optional
        Values of ``trg`` to keep. A single string is treated as a one-element
        list. With the default ``''`` every row is used.

    Returns
    -------
    pandas.DataFrame
        One ``count`` column, indexed by the space-joined n-gram, ordered from
        most to least frequent. Empty when no tokens are selected.
    """
    no_filter = (trg == '') or (isinstance(trg_value, str) and trg_value == '')
    if no_filter:
        texts = df['Description_processed']
    else:
        if isinstance(trg_value, str):
            # isin() needs a list-like; accept a bare string for convenience
            trg_value = [trg_value]
        texts = df.loc[df[trg].isin(trg_value), 'Description_processed']

    # Join with an explicit space: Series.sum() glues the descriptions together
    # with no separator, which fuses the last word of one description with the
    # first word of the next, and returns a non-string for an empty selection.
    string_filterd = ' '.join(texts.dropna().astype(str)).split()

    dic = nltk.FreqDist(nltk.ngrams(string_filterd, ngram)).most_common(30)
    ngram_df = pd.DataFrame(dic, columns=['ngram', 'count'])
    ngram_df.index = [' '.join(i) for i in ngram_df.ngram]
    return ngram_df.drop('ngram', axis=1)

Gender

In [88]:
# Top-15 n-grams per gender. ngram_func returns the most frequent n-grams
# first; every table is reversed before plotting so the bars are handed to
# HoloViews in ascending order of count.
uni_ma = hv.Bars(
    ngram_func(1, 'Gender', ['Male']).head(15).iloc[::-1]
).opts(title="Unigram with Male", color="red", xlabel="Unigrams", ylabel="Count")
uni_fe = hv.Bars(
    ngram_func(1, 'Gender', ['Female']).head(15).iloc[::-1]
).opts(title="Unigram with Female", color="red", xlabel="Unigrams", ylabel="Count")

bi_ma = hv.Bars(
    ngram_func(2, 'Gender', ['Male']).head(15).iloc[::-1]
).opts(title="Bigram with Male", color="yellow", xlabel="Bigrams", ylabel="Count")
bi_fe = hv.Bars(
    ngram_func(2, 'Gender', ['Female']).head(15).iloc[::-1]
).opts(title="Bigram with Female", color="yellow", xlabel="Bigrams", ylabel="Count")

tri_ma = hv.Bars(
    ngram_func(3, 'Gender', ['Male']).head(15).iloc[::-1]
).opts(title="Trigram with Male", color="blue", xlabel="Trigrams", ylabel="Count")
tri_fe = hv.Bars(
    ngram_func(3, 'Gender', ['Female']).head(15).iloc[::-1]
).opts(title="Trigram with Female", color="blue", xlabel="Trigrams", ylabel="Count")

# Horizontal bars (invert_axes) in a two-column grid, each panel keeping its
# own axes
gender_bar_opts = opts.Bars(
    width=400, height=300, tools=['hover'], show_grid=True,
    invert_axes=True, shared_axes=False,
)
(
    uni_ma + uni_fe + bi_ma + bi_fe + tri_ma + tri_fe
).opts(gender_bar_opts).opts(shared_axes=False).cols(2)

Accident Level

In [89]:
# Top-15 n-grams split by accident level. The *_lo variables cover levels
# I-II and the *_hi variables cover levels III-V; the titles now follow that
# same split (they were previously swapped, labelling I-II as "High" and
# III-V as "Low").
uni_ac_lo=hv.Bars(ngram_func(1, 'Accident Level', ['I','II'])[0:15][::-1]).opts(title="Unigram with Low Accident Level", color="red", xlabel="Unigrams", ylabel="Count")
uni_ac_hi=hv.Bars(ngram_func(1, 'Accident Level', ['III','IV','V'])[0:15][::-1]).opts(title="Unigram with High Accident Level", color="red", xlabel="Unigrams", ylabel="Count")

bi_ac_lo=hv.Bars(ngram_func(2, 'Accident Level', ['I','II'])[0:15][::-1]).opts(title="Bigram with Low Accident Level", color="yellow", xlabel="Bigrams", ylabel="Count")
bi_ac_hi=hv.Bars(ngram_func(2, 'Accident Level', ['III','IV','V'])[0:15][::-1]).opts(title="Bigram with High Accident Level", color="yellow", xlabel="Bigrams", ylabel="Count")

tri_ac_lo=hv.Bars(ngram_func(3, 'Accident Level', ['I','II'])[0:15][::-1]).opts(title="Trigram with Low Accident Level", color="blue", xlabel="Trigrams", ylabel="Count")
tri_ac_hi=hv.Bars(ngram_func(3, 'Accident Level', ['III','IV','V'])[0:15][::-1]).opts(title="Trigram with High Accident Level", color="blue", xlabel="Trigrams", ylabel="Count")

# Two-column grid: low-level panel on the left, high-level panel on the right
(uni_ac_lo + uni_ac_hi + bi_ac_lo + bi_ac_hi + tri_ac_lo + tri_ac_hi).opts(opts.Bars(width=400, height=300,tools=['hover'],show_grid=True,invert_axes=True, shared_axes=False)).opts(shared_axes=False).cols(2)

Industry Sector

In [90]:
# Top-15 n-grams per industry sector (Mining / Metals / Others). Each table is
# reversed before plotting so the bars are passed in ascending order of count.
uni_mine = hv.Bars(
    ngram_func(1, 'Industry Sector', ['Mining']).head(15).iloc[::-1]
).opts(title="Unigram with Mining Sector", color="red", xlabel="Unigrams", ylabel="Count")
uni_metal = hv.Bars(
    ngram_func(1, 'Industry Sector', ['Metals']).head(15).iloc[::-1]
).opts(title="Unigram with Metal Sector", color="red", xlabel="Unigrams", ylabel="Count")
uni_others = hv.Bars(
    ngram_func(1, 'Industry Sector', ['Others']).head(15).iloc[::-1]
).opts(title="Unigram with Other Sector", color="red", xlabel="Unigrams", ylabel="Count")

bi_mine = hv.Bars(
    ngram_func(2, 'Industry Sector', ['Mining']).head(15).iloc[::-1]
).opts(title="Bigram with Mining Sector", color="yellow", xlabel="Bigrams", ylabel="Count")
bi_metal = hv.Bars(
    ngram_func(2, 'Industry Sector', ['Metals']).head(15).iloc[::-1]
).opts(title="Bigram with Metal Sector", color="yellow", xlabel="Bigrams", ylabel="Count")
bi_others = hv.Bars(
    ngram_func(2, 'Industry Sector', ['Others']).head(15).iloc[::-1]
).opts(title="Bigram with Other Sector", color="yellow", xlabel="Bigrams", ylabel="Count")

tri_mine = hv.Bars(
    ngram_func(3, 'Industry Sector', ['Mining']).head(15).iloc[::-1]
).opts(title="Trigram with Mining Sector", color="blue", xlabel="Trigrams", ylabel="Count")
tri_metal = hv.Bars(
    ngram_func(3, 'Industry Sector', ['Metals']).head(15).iloc[::-1]
).opts(title="Trigram with Metal Sector", color="blue", xlabel="Trigrams", ylabel="Count")
tri_others = hv.Bars(
    ngram_func(3, 'Industry Sector', ['Others']).head(15).iloc[::-1]
).opts(title="Trigram with Other Sector", color="blue", xlabel="Trigrams", ylabel="Count")

# Three narrow panels per row, so the fonts are shrunk to fit
sector_bar_opts = opts.Bars(
    width=265, height=300, tools=['hover'], show_grid=True,
    invert_axes=True, shared_axes=False,
    fontsize={'title':6.5,'labels':7,'yticks':8.5},
)
(
    uni_mine + uni_metal + uni_others
    + bi_mine + bi_metal + bi_others
    + tri_mine + tri_metal + tri_others
).opts(sector_bar_opts).opts(shared_axes=False).cols(3)

Employee Type

In [91]:
# Top-15 n-grams for employees versus third parties; both third-party
# categories ('Third Party' and 'Third Party (Remote)') are pooled together.
third_party_values = ['Third Party','Third Party (Remote)']

uni_emp = hv.Bars(
    ngram_func(1, 'Employee type', ['Employee']).head(15).iloc[::-1]
).opts(title="Unigram with Employee", color="red", xlabel="Unigrams", ylabel="Count")
uni_third = hv.Bars(
    ngram_func(1, 'Employee type', third_party_values).head(15).iloc[::-1]
).opts(title="Unigram with Third Party", color="red", xlabel="Unigrams", ylabel="Count")

bi_emp = hv.Bars(
    ngram_func(2, 'Employee type', ['Employee']).head(15).iloc[::-1]
).opts(title="Bigram with Employee", color="yellow", xlabel="Bigrams", ylabel="Count")
bi_third = hv.Bars(
    ngram_func(2, 'Employee type', third_party_values).head(15).iloc[::-1]
).opts(title="Bigram with Third Party", color="yellow", xlabel="Bigrams", ylabel="Count")

tri_emp = hv.Bars(
    ngram_func(3, 'Employee type', ['Employee']).head(15).iloc[::-1]
).opts(title="Trigram with Employee", color="blue", xlabel="Trigrams", ylabel="Count")
tri_third = hv.Bars(
    ngram_func(3, 'Employee type', third_party_values).head(15).iloc[::-1]
).opts(title="Trigram with Third Party", color="blue", xlabel="Trigrams", ylabel="Count")

# Horizontal bars in a two-column grid, each panel keeping its own axes
employee_bar_opts = opts.Bars(
    width=400, height=300, tools=['hover'], show_grid=True,
    invert_axes=True, shared_axes=False,
)
(
    uni_emp + uni_third + bi_emp + bi_third + tri_emp + tri_third
).opts(employee_bar_opts).opts(shared_axes=False).cols(2)


Sentiment Trend

In [93]:
# Mean description sentiment score per month
v1 = hv.Curve(
    df.groupby('Month')["Description_sentiment_score"].mean()
).opts(
    opts.Curve(
        xlabel="Month", ylabel="Sentiment Score",
        width=800, height=300, tools=['hover'], show_grid=True,
        title='Month Average Sentiment Score',
    )
)

# Mean description sentiment score per weekday, reindexed so the x-axis runs
# Monday..Sunday rather than in the alphabetical group order
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
v2 = hv.Curve(
    df.groupby('Weekday')["Description_sentiment_score"].mean().reindex(index=weekday_order)
).opts(
    opts.Curve(
        xlabel="Weekday", ylabel="Sentiment Score",
        width=800, height=300, tools=['hover'], show_grid=True,
        title='Weekday Average Sentiment Score',
    )
)

# Monthly chart on top, weekday chart below
(v1 + v2).cols(1)

Modeling

Feature Engineering

TF-IDF

In [None]:
# Build TF-IDF features from the processed descriptions: one block of columns
# for unigrams, one for bigrams and one for trigrams. The blocks are collected
# in a list and joined to `df` in a single concat at the end.
tfidf_frames = []

for i in [1, 2, 3]:
    vec_tfidf = TfidfVectorizer(
        max_features=10,  # Keep at most 10 vocabulary terms for this n-gram size
        norm='l2',  # Normalize vectors using L2 norm
        stop_words='english',  # Remove common English stop words
        lowercase=True,  # Convert text to lowercase
        use_idf=True,  # Use inverse document frequency
        ngram_range=(i, i)  # Extract n-grams of size i only
    )

    # Fit the vectorizer on the 'Description_processed' column and transform it
    X = vec_tfidf.fit_transform(df['Description_processed'])

    # Densify the (at most 10-column) matrix and label the columns.
    # index=df.index is essential: without it the frame gets a fresh 0..n-1
    # index and the column-wise concat below would align on labels, producing
    # NaN-filled rows whenever df's index is not exactly 0..n-1.
    tfs = pd.DataFrame(
        X.toarray(),
        index=df.index,
        columns=["TFIDF_{}g_{}".format(i, n) for n in vec_tfidf.get_feature_names_out()]
    )
    tfidf_frames.append(tfs)

# Original columns followed by the TF-IDF columns, aligned row by row
feature_df = pd.concat([df] + tfidf_frames, axis=1)

In [None]:
# Display the first five rows (the head() default) of the resulting DataFrame
feature_df.head()

In [None]:
# Replace every categorical column with integer codes. Each column gets its
# own freshly fitted LabelEncoder and the codes are stored as int8.
categorical_columns = [
    'Country',
    'Local',
    'Industry Sector',
    'Accident Level',
    'Potential Accident Level',
    'Gender',
    'Employee type',
    'Critical Risk',
    'Weekday',
    'Season',
]
for column in categorical_columns:
    feature_df[column] = LabelEncoder().fit_transform(feature_df[column]).astype(np.int8)

# Remove the columns that are not used as model inputs: the raw 'Date' and the
# two text columns, whose content is represented by the TF-IDF features
feature_df = feature_df.drop(columns=['Date', 'Description', 'Description_processed'])

# Display the first three rows of the resulting DataFrame
feature_df.head(3)

In [None]:
# Target: the encoded accident level. Features: everything else except the two
# accident-level columns.
y_series = feature_df['Accident Level']
x_df = feature_df.drop(columns=['Accident Level', 'Potential Accident Level'])

# 80/20 train/validation split, stratified on the target so both parts keep
# the same class proportions; random_state fixes the shuffle.
# (train_test_split is already imported in the notebook's first cell.)
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_df,
    y_series,
    test_size=0.2,
    random_state=0,
    stratify=y_series,
)


In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority accident levels with SMOTE on the TRAINING split
# only. Resampling the full data set (x_df / y_series) would synthesise points
# from rows that also sit in the validation set and leak them into training.
# SMOTE needs more samples in a class than k_neighbors (default 5), so shrink
# k_neighbors when the rarest class in the training split is smaller than that.
smallest_class_size = int(Y_train.value_counts().min())
smt = SMOTE(random_state=0, k_neighbors=max(1, min(5, smallest_class_size - 1)))
X_train_sm, y_train_sm = smt.fit_resample(X_train, Y_train)

In [None]:
# Class counts of the resampled target returned by fit_resample
y_train_sm.value_counts()

Import Required Libraries for Modeling

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time

In [None]:
# Import LightGBM
from lightgbm import LGBMClassifier

# Candidate classifiers, keyed by the display name used in the results table.
# - Models that rely on distances or gradient descent (logistic regression,
#   k-NN, SVM, MLP) are wrapped in a pipeline with StandardScaler so features
#   on very different scales do not dominate; tree-based models and Naive
#   Bayes are used as-is.
# - Every estimator with a stochastic component gets a fixed random_state so
#   the comparison is reproducible from one run to the next.
models = {
    "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=500)),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": make_pipeline(StandardScaler(), KNeighborsClassifier()),
    # probability=True is required so predict_proba is available for AUC-ROC
    "Support Vector Machine": make_pipeline(StandardScaler(), SVC(probability=True, random_state=42)),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Neural Network": make_pipeline(StandardScaler(), MLPClassifier(max_iter=1000, random_state=42)),
    "LightGBM": LGBMClassifier(
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        n_estimators=500,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
}


In [None]:
# Accumulator for the evaluation results: the training loop appends one dict
# of timings and metrics per model
results = []

In [None]:
# Train every candidate model on the training split and score it on the
# validation split. Start from an empty list so that re-running this cell does
# not append a second copy of every model's row.
results = []

for name, model in models.items():
    print(f"\nProcessing model: {name}")

    # --- Training (timed) -------------------------------------------------
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    train_start_time = time.perf_counter()
    try:
        model.fit(X_train, Y_train)
    except Exception as e:
        # Best effort: report the failure and carry on with the next model
        print(f"Error training model {name}: {e}")
        continue
    train_time = time.perf_counter() - train_start_time

    # --- Prediction (timed) -----------------------------------------------
    predict_start_time = time.perf_counter()
    try:
        y_pred = model.predict(X_valid)  # Predicted class labels
        # Full (n_samples, n_classes) probability matrix, as required by the
        # multiclass AUC below. It is computed once only: a second call would
        # be wasted work and would inflate the measured prediction time.
        y_proba = model.predict_proba(X_valid)
    except Exception as e:
        print(f"Error predicting with model {name}: {e}")
        continue
    predict_time = time.perf_counter() - predict_start_time

    # --- Metrics ------------------------------------------------------------
    # Support-weighted averages for the multiclass target. zero_division=0
    # scores a class that is never predicted as 0 rather than 1, so a model
    # that ignores rare classes is not rewarded with perfect precision on them.
    accuracy = accuracy_score(Y_valid, y_pred)
    precision = precision_score(Y_valid, y_pred, average="weighted", zero_division=0)
    recall = recall_score(Y_valid, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(Y_valid, y_pred, average="weighted", zero_division=0)
    # One-vs-rest AUC-ROC, weighted by class support
    auc_roc = roc_auc_score(Y_valid, y_proba, multi_class="ovr", average="weighted")

    # One row per successfully evaluated model
    results.append({
        "Model": name,
        "Training Time (s)": train_time,
        "Prediction Time (s)": predict_time,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "AUC-ROC": auc_roc
    })

In [None]:
# Collect the per-model result dictionaries into one table (one row per model)
results_df = pd.DataFrame(results)

# Show the table once, as the cell's rich output. Printing it as well would
# only repeat the same table in plain text directly above.
print("\nModel evaluation results:")
results_df

In [None]:
# Display names for the five accident-level classes
ac_label = [f'Accident Level : {numeral}' for numeral in ('I', 'II', 'III', 'IV', 'V')]

# Hyperparameters for a fresh LightGBM classifier that is fitted here for the
# SHAP analysis
lgbm_settings = dict(
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=500,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Build the classifier and fit it on the training split
gbm_ac = LGBMClassifier(**lgbm_settings)
gbm_ac.fit(X_train, Y_train)

# Explain the fitted model with SHAP: the explainer is built from the model
# plus the training data, then evaluated on that same training data
explainer = shap.Explainer(gbm_ac, X_train)
shap_values_ac = explainer(X_train)

In [None]:
# Bar-style SHAP summary limited to the 30 top-ranked features, with one
# legend entry per accident level
shap.summary_plot(
    shap_values_ac,
    features=X_train,
    feature_names=x_df.columns,
    plot_type="bar",
    max_display=30,
    class_names=ac_label,
)


In [None]:
# Alternative LightGBM configuration. This cell only constructs the estimator;
# it is not fitted here.
gbm_ac = LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=50,  # Increase leaves to allow more splits
    learning_rate=0.05,
    n_estimators=500,
    max_depth=10,  # Restrict depth to prevent overfitting
    # Minimum samples per leaf. Spelled with the scikit-learn wrapper's own
    # parameter name; the alias `min_data_in_leaf` collides with the wrapper's
    # `min_child_samples` default and makes LightGBM warn that one of the two
    # is ignored. 20 is LightGBM's documented default, so this keeps (rather
    # than reduces) the default leaf size.
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)


In [None]:
from sklearn.feature_selection import VarianceThreshold

# Learn on the training split which features have variance above 0.01, then
# keep exactly those columns in both the training and the validation data
selector = VarianceThreshold(threshold=0.01)
selector.fit(X_train)
X_train_filtered = selector.transform(X_train)
X_test_filtered = selector.transform(X_valid)


In [None]:
# Rebuild the accident-level target/feature split and the stratified 80/20
# train/validation partition
y_series = feature_df['Accident Level']
x_df = feature_df.drop(columns=['Accident Level', 'Potential Accident Level'])
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_df,
    y_series,
    test_size=0.2,
    random_state=0,
    stratify=y_series,
)

# Wrap both partitions as LightGBM datasets; the validation set references the
# training set
lgb_train = lgb.Dataset(X_train, Y_train)
lgb_valid = lgb.Dataset(X_valid, Y_valid, reference=lgb_train)

In [None]:
# Native LightGBM configuration for the 5-class accident-level model:
# multiclass objective evaluated with multi-class log loss, no feature or row
# subsampling (fractions of 1.0, bagging_freq 0)
params = dict(
    task='train',
    boosting='gbdt',
    objective='multiclass',
    num_class=5,
    metric='multi_logloss',
    num_leaves=200,
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=5,
)

# 100 boosting rounds, with the validation dataset passed as valid_sets
gbm_ac = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=lgb_valid,
)

In [None]:
# Display names for the five accident-level classes
ac_label = [f'Accident Level : {numeral}' for numeral in ('I', 'II', 'III', 'IV', 'V')]

# SHAP values of the trained booster on the training split
explainer = shap.TreeExplainer(model=gbm_ac)
shap_values_ac = explainer.shap_values(X=X_train)

# Bar-style summary limited to the 30 top-ranked features
shap.summary_plot(
    shap_values=shap_values_ac,
    features=X_train,
    feature_names=X_train.columns,
    plot_type="bar",
    max_display=30,
    class_names=ac_label,
)

In [None]:
# Drop the rows whose encoded potential accident level equals 5
# NOTE(review): presumably code 5 is level 'VI' after label encoding - confirm
_feature_df = feature_df.loc[feature_df['Potential Accident Level'] != 5]

# Target is now the potential accident level; both level columns are excluded
# from the features
y_series = _feature_df['Potential Accident Level']
x_df = _feature_df.drop(columns=['Accident Level', 'Potential Accident Level'])
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_df,
    y_series,
    test_size=0.2,
    random_state=0,
    stratify=y_series,
)

# Wrap both partitions as LightGBM datasets; the validation set references the
# training set
lgb_train = lgb.Dataset(X_train, Y_train)
lgb_valid = lgb.Dataset(X_valid, Y_valid, reference=lgb_train)

In [None]:
# Native LightGBM configuration for the 5-class potential-accident-level
# model: multiclass objective evaluated with multi-class log loss, no feature
# or row subsampling (fractions of 1.0, bagging_freq 0)
params = dict(
    task='train',
    boosting='gbdt',
    objective='multiclass',
    num_class=5,
    metric='multi_logloss',
    num_leaves=200,
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=5,
)

# 100 boosting rounds, with the validation dataset passed as valid_sets
gbm_pac = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=lgb_valid,
)


In [None]:
# Display names for the five potential-accident-level classes
pac_label = [f'Potential Accident Level : {numeral}' for numeral in ('I', 'II', 'III', 'IV', 'V')]

# SHAP values of the trained booster on the training split
explainer = shap.TreeExplainer(model=gbm_pac)
shap_values_pac = explainer.shap_values(X=X_train)

# Bar-style summary limited to the 30 top-ranked features
shap.summary_plot(
    shap_values=shap_values_pac,
    features=X_train,
    feature_names=X_train.columns,
    plot_type="bar",
    max_display=30,
    class_names=pac_label,
)