In [1]:
pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [2]:
#Import necessary libraries, pandas, numpy and mlxtend
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
# Read merged_data.csv into a DataFrame; the relative path is resolved against the current working directory
data=pd.read_csv("merged_data.csv")

In [4]:
#Check your current working directory
import os
os.getcwd()

#If you need to change your current working directory, you can call os.chdir("<path to the project folder>")


'/Users/dharunsomalingam/Documents/Semester_5/CITS3401/Project'

In [5]:
# Keep only the six attributes that take part in the association-rule mining
selected_columns = [
    "Crash Type",
    "Speed Limit",
    "Gender",
    "Road User",
    "Age Group",
    "Time of Day",
]
df = data[selected_columns]

In [6]:
# Cast every column to str so that each cell value can be used as an item label
# NOTE(review): astype(str) would turn a missing value into the literal string "nan" — confirm the data has none
new_df = df.astype(str)

In [7]:
# Confirm the conversion: every column should now report dtype object
new_df.dtypes

Crash Type     object
Speed Limit    object
Gender         object
Road User      object
Age Group      object
Time of Day    object
dtype: object

In [8]:
# Turn every row into one transaction: a list holding that row's string values.
# The name `transactions` is used so that the built-in `list` type is not shadowed
# for the remainder of the notebook.
transactions = new_df.to_numpy().tolist()

# Convert the transactions to a one-hot encoded boolean NumPy array.
# The apriori function accepts boolean data only, such as 1 and 0, or False and True.
te = TransactionEncoder()
array_te = te.fit_transform(transactions)

# The apriori function works on DataFrames only, so wrap the array in one;
# te.columns_ supplies the item label belonging to each encoded column.
arm_df = pd.DataFrame(array_te, columns=te.columns_)

In [9]:
# Mine the frequent itemsets, using a minimum support of 0.01
frequent_itemsets = apriori(arm_df, min_support=0.01, use_colnames=True)

# Record how many items each frequent itemset contains
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Show only the itemsets of length 2 whose support is at least 0.3
is_pair = frequent_itemsets['length'] == 2
is_common = frequent_itemsets['support'] >= 0.3
frequent_itemsets[is_pair & is_common]


Unnamed: 0,support,itemsets,length
167,0.382019,"(Male, Day)",2
169,0.306315,"(Day, Multiple)",2
175,0.342553,"(Male, Driver)",2
185,0.306769,"(Male, Multiple)",2
186,0.336024,"(Male, Night)",2
190,0.411691,"(Male, Single)",2


In [10]:
# Generate association rules from the frequent itemsets, using a minimum confidence threshold of 0.7
rules_con = association_rules(frequent_itemsets, metric="confidence",min_threshold=0.7)

In [11]:
# Build a second rule set from the same frequent itemsets,
# this time thresholding on lift (minimum threshold of 1) instead of confidence
rules_lift = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)


In [12]:
# Road-user labels to look for on the consequent side of a rule
road_users = [
    'Driver',
    'Passenger',
    'Motorcycle rider',
    'Pedestrian',
    'Pedal cyclist',
    'Motorcycle pillion passenger',
]


def _mentions_road_user(consequent):
    """Return True when at least one road-user label is contained in `consequent`."""
    for user in road_users:
        if user in consequent:
            return True
    return False


# Keep only the confidence-based rules whose consequents contain a road user
has_road_user = rules_con['consequents'].apply(_mentions_road_user)
filtered_rules = rules_con[has_road_user]


In [13]:
# Narrow the filtered rules down to the rule itself (antecedents, consequents)
# and its three quality measures (support, confidence, lift)
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
result_arm = filtered_rules[rule_columns]


In [14]:
# Display the filtered rules together with their support, confidence and lift
result_arm

Unnamed: 0,antecedents,consequents,support,confidence,lift
20,"(100, 0_to_16)",(Passenger),0.01908,0.761216,3.424197
154,"(Day, 100, 0_to_16)",(Passenger),0.012569,0.762376,3.429418
179,"(Male, 65_to_74, 100)",(Driver),0.012442,0.710881,1.561808
184,"(Male, 75_or_older, 100)",(Driver),0.0109,0.758838,1.667171
327,"(Single, 65_to_74, 60)",(Pedestrian),0.011027,0.794771,5.157174
334,"(Single, 75_or_older, 60)",(Pedestrian),0.019425,0.832168,5.399836
382,"(Male, 75_or_older, Multiple)",(Driver),0.019842,0.717848,1.577114
443,"(Male, Day, 65_to_74, 100)",(Driver),0.01003,0.710797,1.561623
511,"(Day, 75_or_older, Single, 60)",(Pedestrian),0.014582,0.812121,5.269756
515,"(Male, Single, 75_or_older, 60)",(Pedestrian),0.011825,0.827411,5.36897


In [15]:
# Number of highest-ranked rules to keep
top_k = 7

# Rank the rules from highest to lowest confidence, using lift to order equal confidences,
# then keep the first top_k rows
ranked_rules = result_arm.sort_values(by=['confidence', 'lift'], ascending=False)
top_k_rules = ranked_rules.head(top_k)


In [16]:
# Display the top_k rules ranked by confidence, then lift
top_k_rules

Unnamed: 0,antecedents,consequents,support,confidence,lift
334,"(Single, 75_or_older, 60)",(Pedestrian),0.019425,0.832168,5.399836
515,"(Male, Single, 75_or_older, 60)",(Pedestrian),0.011825,0.827411,5.36897
511,"(Day, 75_or_older, Single, 60)",(Pedestrian),0.014582,0.812121,5.269756
327,"(Single, 65_to_74, 60)",(Pedestrian),0.011027,0.794771,5.157174
154,"(Day, 100, 0_to_16)",(Passenger),0.012569,0.762376,3.429418
20,"(100, 0_to_16)",(Passenger),0.01908,0.761216,3.424197
184,"(Male, 75_or_older, 100)",(Driver),0.0109,0.758838,1.667171
